In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, PReLU, Add, CuDNNGRU, CuDNNLSTM, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, BatchNormalization, SpatialDropout1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import LearningRateScheduler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from attlayer import AttentionWeightedAverage
from nlp_pipeline import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
N_DIMS = 300

def get_coefs(row):
    row = row.strip().split()
    word, arr = " ".join(row[:-N_DIMS]), row[-N_DIMS:]
    return word, np.asarray(arr, dtype='float32')

def get_pretrained(text_file):
    return dict(get_coefs(row) for row in open(text_file, encoding="utf-8"))

def get_indices(fold):
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    indices = [idx for idx in folds.split(train["id"])]
    train_idx = indices[fold][0]
    pred_idx = indices[fold][1]
    return train_idx, pred_idx  

def halve(epoch):
    base = 0.002
    return base/(2**epoch)

def decay07(epoch):
    base = 0.001
    return base*(0.7**epoch)

In [12]:
class LstmNet():
    
    def __init__(self, embed_size, max_features, maxlen, embedding_matrix, num_features):
        input1 = Input(shape=(maxlen,))
        model1 = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(input1)
        # model1 = SpatialDropout1D(0.5)(model1)
        model1 = SpatialDropout1D(0.4)(model1)
        model1 = Bidirectional(GRU(300, return_sequences=True))(model1)
        # model1 = AttentionWeightedAverage()(model1)
        model1 = GlobalMaxPooling1D()(model1)

        out = Dense(6, activation="sigmoid")(model1)
        self.model = Model(inputs=input1, outputs=out)
        self.model.compile(loss='binary_crossentropy', optimizer='Nadam', metrics=['accuracy'])
    
    def fit(self, train_features, train_labels):
        # early = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=0, verbose=0, mode='auto')
        # file_path="weights_base.best.hdf5"
        # checkpoint = callbacks.ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        lrate = LearningRateScheduler(decay07)
        # self.model.fit(train_features, y=train_labels, batch_size=32, epochs=3, validation_split=0.1, shuffle=True, callbacks=[lrate])
        self.model.fit(train_features, y=train_labels, batch_size=32, epochs=3, callbacks=[lrate])
        # self.model.fit(train_features, train_labels, batch_size=32, epochs=2)

    def predict_proba(self, X):
        self.predictions = self.model.predict(X, batch_size=1024, verbose=1)
        return self.predictions

    def submit(self):
        sub = pd.read_csv('data\\sample_submission.csv')
        sub[list_classes] = self.predictions
        sub.to_csv('submissions\\lstm22.csv', index=False)

In [7]:
def train_lstm(oof=True, fold=0):
    if oof:
        train_idx, pred_idx = get_indices(fold)
        net = LstmNet(embed_size, max_features, maxlen, embedding_matrix, num_features)
        net.fit(X_t[train_idx], y[train_idx])
        y_oof = net.predict_proba(X_t[pred_idx])
        
        sub_oof = pd.read_csv('submissions\\oof_trainlstm22.csv', encoding="utf-8")
        for i in range(0,len(list_classes)):
            sub_oof[list_classes[i]][pred_idx] = y_oof[:,i]
        sub_oof.to_csv('submissions\\oof_trainlstm22.csv', index=False, encoding="utf-8")
    
    else:
        net = LstmNet(embed_size, max_features, maxlen, embedding_matrix, num_features)
        net.fit(X_t, y)
        y_test = net.predict_proba(X_te)
        net.submit()

In [4]:
# pretrained = "data\\crawl-300d-2M.vec"
pretrained = 'data\\glove.840B.300d.txt'

embed_size = 300
# max_features = 394787
max_features = 200000
# maxlen = 500
maxlen = 150
num_features = 12

In [5]:
train = pd.read_csv('data\\train.csv').fillna(' ')
test = pd.read_csv('data\\test.csv').fillna(' ')

In [8]:
list_sentences_train = train["comment_text"].values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].values

print("Tokenizing")
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(np.concatenate([list_sentences_train, list_sentences_test])))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

print("Getting", pretrained)
embeddings_index = get_pretrained(pretrained)

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

Tokenizing
Getting data\glove.840B.300d.txt


In [9]:
class_labels = [column for column in train.columns[2:8]]
feature_funcs = [len, asterix_freq, uppercase_freq, line_change_freq, rep_freq, question_freq, has_ip, has_talk_tag, link_count, starts_with_i, starts_with_you, about_image]
transforms = [tokenize]
gbm = lgb.LGBMClassifier(metric="auc", max_depth=3, num_leaves=10, boosting_type="gbdt", learning_rate=0.1, feature_fraction=0.5, colsample_bytree=0.5, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)
gbm.name = "LightGBM stacker"
models = [gbm]

pipe = NlpPipeline(train, test, "comment_text", class_labels, feature_funcs, transforms, models, word_index=word_index, pretrained=pretrained)

In [10]:
pipe.engineer_features()

Engineering features


In [13]:
train_lstm(oof=False)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
train_lstm()

Epoch 1/3
Epoch 2/3
Epoch 3/3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
train_lstm(fold=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
train_lstm(fold=2)

In [None]:
train_lstm(fold=3)

In [None]:
train_lstm(fold=4)