In [138]:
import pandas as pd
import numpy as np
import re, string
from sklearn.model_selection import GridSearchCV

In [3]:
pretrained = "data\\crawl-300d-2M.vec"

In [5]:
word_vector = get_pretrained(pretrained)

In [4]:
train = pd.read_csv("data\\train.csv").fillna(' ')
test = pd.read_csv("data\\test.csv").fillna(' ')

In [10]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(row):
    return re_tok.sub(r' \1 ', row).lower().split()

In [11]:
train_features = train['comment_text'].apply(tokenize)

In [13]:
test_features = test['comment_text'].apply(tokenize)

In [69]:
from nlp_pipeline import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [70]:
class_labels = [column for column in train.columns[2:8]]
feature_funcs = [len, asterix_freq, uppercase_freq, line_change_freq, rep_freq, question_freq]
transforms = [tokenize]
logreg = LogisticRegression(C=0.2, class_weight='balanced', solver='newton-cg', max_iter=10)
logreg.name = "Logistic regression newton"
models = [logreg]

In [71]:
pipe = NlpPipeline(train, test, "comment_text", class_labels, feature_funcs, transforms, models, word_vectors=word_vector, pretrained=pretrained)
print(pipe)

Train: (159571, 8)
Test: (153164, 2)
Train features: (0,)
Test features: (0,)
Input column: comment_text
Class labels: toxic severe_toxic obscene threat insult identity_hate
Models: Logistic regression newton: 0.2 balanced False True 1 10 ovr 1 l2 None newton-cg 0.0001 0 False | 
Transforms:  tokenize
Feature functions:  function asterix_freq uppercase_freq line_change_freq rep_freq question_freq
Metric: roc_auc
CV scores: {'Logistic regression newton': -1}


In [107]:
pipe

Train: (159571, 8)
Test: (153164, 2)
Train features: (159571, 6)
Test features: (153164, 6)
Input column: comment_text
Class labels: toxic severe_toxic obscene threat insult identity_hate
Models: Logistic regression newton: 0.2 balanced False True 1 10 ovr 1 l2 None newton-cg 0.0001 0 False | 
Transforms:  tokenize
Feature functions:  function asterix_freq uppercase_freq line_change_freq rep_freq question_freq
Metric: roc_auc
CV scores: {'Logistic regression newton': -1}

In [108]:
pipe.apply_transforms()

Applying transforms


In [111]:
pipe.create_embeddings()

Creating embeddings


In [112]:
import lightgbm as lgb

In [162]:
lgb_train = lgb.Dataset(pipe.train_features[:130000], train["toxic"][:130000])
lgb_eval = lgb.Dataset(pipe.train_features[130000:], train["toxic"][130000:], reference=lgb_train)

In [181]:
print("Starting grid search")
for lr in [0.1,0.2]:
    print("Learning rate:", lr, "Number of leaves:", nl)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 31,
        'learning_rate': lr,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0,
        'reg_lambda': 0.5
    }
    gbm = lgb.train(params,
            lgb_train,
            num_boost_round=200,
            valid_sets=lgb_eval,
            early_stopping_rounds=5)
    print(gbm.best_score)

Starting grid search
Learning rate: 0.1 Number of leaves: 50
[1]	valid_0's auc: 0.851814
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.884587
[3]	valid_0's auc: 0.889251
[4]	valid_0's auc: 0.893293
[5]	valid_0's auc: 0.895416
[6]	valid_0's auc: 0.900161
[7]	valid_0's auc: 0.903824
[8]	valid_0's auc: 0.905876
[9]	valid_0's auc: 0.910826
[10]	valid_0's auc: 0.915669
[11]	valid_0's auc: 0.917323
[12]	valid_0's auc: 0.918783
[13]	valid_0's auc: 0.920056
[14]	valid_0's auc: 0.921344
[15]	valid_0's auc: 0.922308
[16]	valid_0's auc: 0.924224
[17]	valid_0's auc: 0.925308
[18]	valid_0's auc: 0.927062
[19]	valid_0's auc: 0.928703
[20]	valid_0's auc: 0.929459
[21]	valid_0's auc: 0.93071
[22]	valid_0's auc: 0.931601
[23]	valid_0's auc: 0.932446
[24]	valid_0's auc: 0.933201
[25]	valid_0's auc: 0.934619
[26]	valid_0's auc: 0.936215
[27]	valid_0's auc: 0.937074
[28]	valid_0's auc: 0.938123
[29]	valid_0's auc: 0.939621
[30]	valid_0's auc: 0.940724
[31]	valid_0's auc

KeyboardInterrupt: 

In [167]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'lambda': 0.5
}

In [182]:
gbm = lgb.LGBMClassifier(metric="auc", num_leaves=31, boosting_type="gbdt", learning_rate=0.1, feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.5)

In [186]:
for label in pipe.class_labels:
    print(label)
    print("Fit")
    gbm.fit(pipe.train_features, train[label])
    print("Predict")
    pipe.predictions[label] = gbm.predict_proba(pipe.test_features)[:,1]

toxic
Fit
Predict
severe_toxic
Fit
Predict
obscene
Fit
Predict
threat
Fit
Predict
insult
Fit
Predict
identity_hate
Fit
Predict


In [187]:
pipe.cv_scores

{'Logistic regression newton': -1}

In [188]:
pipe.cv_scores["LightGBM"] = -1

In [189]:
pipe.models[0].name

'LightGBM'

In [195]:
submission = pipe.test[pipe.id_column].to_frame()
for label in pipe.class_labels:
    submission[label] = pipe.predictions[label]

filename = 'submissions\\submission23.csv'
submission.to_csv(filename, index=False)

In [193]:
pipe.predictions

{'identity_hate': array([9.55569199e-01, 2.39053048e-04, 1.78022777e-03, ...,
        3.31452501e-04, 2.11595090e-03, 4.51964093e-03]),
 'insult': array([0.94767719, 0.00432006, 0.01133886, ..., 0.00166285, 0.00209999,
        0.29476007]),
 'obscene': array([0.99431291, 0.00482181, 0.01335664, ..., 0.00308492, 0.00266341,
        0.22345165]),
 'severe_toxic': array([2.34768439e-01, 2.08802172e-04, 7.07685689e-03, ...,
        1.78179018e-04, 6.10784256e-04, 3.50254694e-03]),
 'threat': array([1.83904419e-01, 1.41904852e-04, 2.89499988e-04, ...,
        5.48113233e-05, 1.98012367e-04, 2.79884538e-04]),
 'toxic': array([0.99600831, 0.00662445, 0.03037946, ..., 0.00592981, 0.00517886,
        0.73509946])}

In [196]:
len(word_vector)

1999996

In [197]:
list_sentences_train = train["comment_text"].values

In [198]:
list_sentences_test = test["comment_text"].values

In [199]:
type(list_sentences_train)

numpy.ndarray

In [210]:
np.concatenate([list_sentences_train, list_sentences_test]).shape

(312735,)

In [213]:
type(word_vector)

dict

In [214]:
from sklearn.model_selection import KFold

In [251]:
folds = KFold(n_splits=5, shuffle=True, random_state=23)

In [252]:
for fold, fold2 in folds.split(train["id"]):
    print(fold2)

[     1      5     10 ... 159554 159557 159565]
[     6      8      9 ... 159547 159549 159559]
[     3     15     17 ... 159545 159552 159560]
[     2     11     18 ... 159563 159567 159568]
[     0      4      7 ... 159566 159569 159570]


In [261]:
indices = [idx for idx in folds.split(train["id"])]

In [266]:
def get_indices(fold):
    folds = KFold(n_splits=5, shuffle=True, random_state=23)
    indices = [idx for idx in folds.split(train["id"])]
    train_idx = indices[fold][0]
    pred_idx = indices[fold][1]
    return train_idx, pred_idx    

In [271]:
train_idx, pred_idx = get_indices(4)

In [273]:
embed_size = 300
max_features = 394787
maxlen = 100

list_sentences_train = train["comment_text"].values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].values

print("Tokenizing")
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(np.concatenate([list_sentences_train, list_sentences_test])))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

Tokenizing


In [275]:
embeddings_index = word_vector

In [276]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector 

In [282]:
class LstmNet2():
    
    def __init__(self, embed_size, max_features, maxlen, embedding_matrix):
        inp = Input(shape=(maxlen,))
        x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
        x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
        x = GlobalMaxPool1D()(x)
        x = Dense(50, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(6, activation="sigmoid")(x)
        self.model = Model(inputs=inp, outputs=x)
        # optimizer = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0, amsgrad=False)
        self.model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    
    def fit(self, train_features, train_labels):
        early = [callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=0, verbose=0, mode='auto')]
        self.model.fit(train_features, train_labels, batch_size=32, epochs=2, callbacks=early)

    def predict_proba(self, features):
        self.predictions = self.model.predict([features], batch_size=1024, verbose=1)
        return self.predictions

    def submit(self):
        sub = pd.read_csv('data\\sample_submission.csv')
        sub[list_classes] = self.predictions
        sub.to_csv('submissions\\lstm5.csv', index=False)

In [283]:
fold = 0
train_idx, pred_idx = get_indices(fold)
net = LstmNet2(embed_size, max_features, maxlen, embedding_matrix)

In [286]:
net.fit(X_t[train_idx], y[train_idx])
y_test = net.predict_proba(X_te)

  "This may consume a large amount of memory." % num_elements)


Epoch 1/2


ResourceExhaustedError: OOM when allocating tensor with shape[394787,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: training_1/Adam/mul_3 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Adam_1/beta_2/read, training_1/Adam/Variable_11/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: metrics_1/acc/Mean_1/_335 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_4629_metrics_1/acc/Mean_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'training_1/Adam/mul_3', defined at:
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "C:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-286-7f674e174aaa>", line 1, in <module>
    net.fit(X_t[train_idx], y[train_idx])
  File "<ipython-input-282-133755313139>", line 17, in fit
    self.model.fit(train_features, train_labels, batch_size=32, epochs=2, callbacks=early)
  File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1646, in fit
    self._make_train_function()
  File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 970, in _make_train_function
    loss=self.total_loss)
  File "C:\ProgramData\Anaconda3\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\keras\optimizers.py", line 456, in get_updates
    v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\variables.py", line 775, in _run_op
    return getattr(ops.Tensor, operator)(a._AsTensor(), *args)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 907, in binary_op_wrapper
    return func(x, y, name=name)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1131, in _mul_dispatch
    return gen_math_ops._mul(x, y, name=name)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 3100, in _mul
    "Mul", x=x, y=y, name=name)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[394787,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: training_1/Adam/mul_3 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Adam_1/beta_2/read, training_1/Adam/Variable_11/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: metrics_1/acc/Mean_1/_335 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_4629_metrics_1/acc/Mean_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



In [None]:
sub_oof = pd.read_csv('data\\sample_submission.csv')
sub_oof[list_classes] = y_test
sub_oof.to_csv('submission\\lstm_ft_oof' + fold + '.csv', index=False)

In [288]:
gc.collect()

NameError: name 'gc' is not defined

In [289]:
from keras import backend as K