In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
import re, string

Using TensorFlow backend.


# Begin with analysing the data

In [2]:
# Load the labelled comments and keep only the first 50,000 rows.
train = pd.read_csv('./train.csv').iloc[:50000]

In [56]:
# Show the raw text of one comment (row label 192).
train.loc[192, 'comment_text']

'"\n, editors don\'t care about your ""explanations"" if they\'re not accompanied by reliable published sources. I could ""explain"" why I thought dogs ate cats but I would get the same reception if I didn\'t provide sources. Stop edit warring and present sources.  talk to me "'

In [26]:
# Preview the first five rows of the training frame.
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# First, we analyse the comments with a Naive Bayes model

We can see that our training data consists of Wikipedia comments, each annotated with several labels.

In [35]:
# Tokenizer used for the bag-of-words / n-gram features: every punctuation
# character (ASCII punctuation plus a few typographic symbols) is captured
# so it can be split off as its own token.
re_tok = re.compile('([' + string.punctuation + '“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` on whitespace after padding each punctuation mark with spaces."""
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()
# The label columns must be defined at notebook level before they are used
# here; otherwise this line raises NameError on a fresh kernel.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# 'none' is one minus the row-wise maximum over the label columns
# (i.e. 1 for a comment with no label set, assuming 0/1 labels).
train['none'] = 1 - train[label_cols].max(axis=1)

In [83]:
def Naive_Bayes(test_path, result):
    """Score the comments of a CSV file with one model per label and save the result.

    A TF-IDF vectorizer (unigrams and bigrams, tokenized with ``tokenize``) is
    fitted on the notebook-level ``train['comment_text']``. For every label
    column, ``get_mdl`` returns a fitted classifier together with a feature
    scaling vector; the test features are scaled by that vector before the
    probability of the positive class is predicted.

    Parameters
    ----------
    test_path : str
        CSV file to score. It must provide the columns ``Comment`` and
        ``Comment ID``; missing values are replaced by a single space.
    result : str
        Path of the CSV file to write. Its columns are ``id``, ``comment``
        and one probability column per label.
    """
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    test = pd.read_csv(test_path).fillna(' ')

    # Boolean flags are passed as real booleans: recent scikit-learn versions
    # validate parameter types and reject the integers 1/0 here.
    vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                          min_df=3, max_df=0.9, strip_accents='unicode',
                          use_idf=True, smooth_idf=True, sublinear_tf=True)
    x = vec.fit_transform(train['comment_text'])
    test_x = vec.transform(test['Comment'])

    # One column of predicted probabilities per label.
    preds = np.zeros((len(test), len(label_cols)))
    for i, j in enumerate(label_cols):
        print('fit', j)
        m, r = get_mdl(train[j], x)
        preds[:, i] = m.predict_proba(test_x.multiply(r))[:, 1]

    submid = pd.DataFrame({'id': test["Comment ID"]})
    subcommet = pd.DataFrame({'comment': test["Comment"]})
    submission = pd.concat([submid, subcommet, pd.DataFrame(preds, columns=label_cols)], axis=1)
    submission.to_csv(result, index=False)

In [84]:
# Smoothed per-feature statistic used by the Naive Bayes weighting.
def pr(y_i, y, x):
    """Return (column sums of the rows of ``x`` where ``y == y_i``, plus 1)
    divided by (number of such rows, plus 1)."""
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    return (feature_totals + 1) / (in_class.sum() + 1)

# Fit a model for one dependent variable (label) at a time.
def get_mdl(y, x):
    """Fit a logistic regression on features scaled by a log ratio of ``pr`` values.

    Parameters
    ----------
    y : pandas.Series
        Label column; its ``.values`` are used.
    x : sparse matrix
        Document-term matrix with one row per entry of ``y``.

    Returns
    -------
    (model, r)
        The fitted ``LogisticRegression`` and the vector ``r`` that must also
        be multiplied into any features passed to the model at prediction time.
    """
    y = y.values
    # Element-wise log of the ratio between the class-1 and class-0 statistics.
    r = np.log(pr(1, y, x) / pr(0, y, x))
    # The dual formulation is only implemented by the liblinear solver; name
    # it explicitly because newer scikit-learn versions default to lbfgs,
    # which raises an error when dual=True.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [85]:
# Score the alt-right subreddit comments with the Naive Bayes pipeline.
Naive_Bayes(test_path='./input/politic/altRightSubredditBannedComments.csv',
            result='altRightNB.csv')

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [86]:
# Score the abortion comments with the Naive Bayes pipeline.
Naive_Bayes(test_path='./input/politic/abortionComments.csv',
            result='abortionCommentsNB.csv')

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [87]:
# Score the election-win comments with the Naive Bayes pipeline.
Naive_Bayes(test_path='./input/politic/donaldTrumpElectionWinComments.csv',
            result='DTLRNB.csv')

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [82]:
# Score the gun-control comments with the Naive Bayes pipeline.
Naive_Bayes(test_path='./input/politic/gunControlComments.csv',
            result='gunCNB.csv')

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


# Then we can try logistic regression

In [95]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


# Logistic regression on word-level and character-level TF-IDF features.
def linear_regression_train_model(test_path, result):
    """Train one logistic-regression classifier per label and score a comment file.

    Despite its name (kept for existing callers), the model fitted here is
    ``LogisticRegression``, not a linear regression.

    The first 50,000 rows of ``./train.csv`` are used for training. Word and
    character TF-IDF vectorizers are fitted on the training and test text
    together, so the vocabulary and IDF weights (and therefore the printed CV
    scores) depend on the test file.

    Parameters
    ----------
    test_path : str
        CSV file to score. It must provide the columns ``Comment`` and
        ``Username``; missing values are replaced by a single space.
    result : str
        Path of the CSV file to write. Its columns are ``comment``, ``id``
        and one probability column per entry of ``class_names``.
    """
    train = pd.read_csv('./train.csv').fillna(' ')
    test = pd.read_csv(test_path).fillna(' ')
    train = train.iloc[0:50000]
    train_text = train['comment_text']
    test_text = test['Comment']
    all_text = pd.concat([train_text, test_text])

    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        stop_words='english',
        ngram_range=(1, 1),
        max_features=10000)
    word_vectorizer.fit(all_text)
    train_word_features = word_vectorizer.transform(train_text)
    test_word_features = word_vectorizer.transform(test_text)

    # No stop_words here: the option only applies to the 'word' analyzer and
    # is ignored for character n-grams.
    char_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='char',
        ngram_range=(2, 6),
        max_features=50000)
    char_vectorizer.fit(all_text)
    train_char_features = char_vectorizer.transform(train_text)
    test_char_features = char_vectorizer.transform(test_text)

    # Convert to CSR once; hstack yields COO, which would otherwise be
    # converted again on every cross-validation and fit call below.
    train_features = hstack([train_char_features, train_word_features]).tocsr()
    test_features = hstack([test_char_features, test_word_features]).tocsr()

    scores = []
    # NOTE(review): the Naive Bayes pipeline takes its id from "Comment ID",
    # this one from "Username" - confirm which column is intended.
    submission = pd.DataFrame.from_dict({'id': test['Username']})
    for class_name in class_names:
        train_target = train[class_name]
        classifier = LogisticRegression(C=0.1, solver='sag')

        # 3-fold ROC-AUC on the training data, reported for information only.
        cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
        scores.append(cv_score)
        print('CV score for class {} is {}'.format(class_name, cv_score))

        # Refit on all training rows before predicting the test comments.
        classifier.fit(train_features, train_target)
        submission[class_name] = classifier.predict_proba(test_features)[:, 1]

    print('Total CV score is {}'.format(np.mean(scores)))

    subcommet = pd.DataFrame({'comment': test["Comment"]})
    submission = pd.concat([subcommet, submission], axis=1)

    submission.to_csv(result, index=False)

In [9]:
# Read from ./input/politic/, the directory every other run in this notebook
# uses for this file.
linear_regression_train_model('./input/politic/altRightSubredditBannedComments.csv', 'altRightLR.csv')

CV score for class toxic is 0.9596636113784185
CV score for class severe_toxic is 0.9859851084008343
CV score for class obscene is 0.977594990154138
CV score for class threat is 0.9832288967021358
CV score for class insult is 0.971418750360742
CV score for class identity_hate is 0.9650141645863023
Total CV score is 0.9738175869304285


In [10]:
# Read from ./input/politic/, the directory every other run in this notebook
# uses for this file. The output file name is left unchanged.
linear_regression_train_model('./input/politic/abortionComments.csv', 'abbortionCommentsLR.csv')

CV score for class toxic is 0.9597384235909963
CV score for class severe_toxic is 0.9859677900792415
CV score for class obscene is 0.9776991351208171
CV score for class threat is 0.9831342936375068
CV score for class insult is 0.9714706555451095
CV score for class identity_hate is 0.9652301024325013
Total CV score is 0.9738734000676955


In [102]:
# Score the election-win comments with the logistic-regression pipeline.
linear_regression_train_model(test_path='./input/politic/donaldTrumpElectionWinComments.csv',
                              result='DTLR.csv')

CV score for class toxic is 0.9588854163155663
CV score for class severe_toxic is 0.9860032958677424
CV score for class obscene is 0.9767734828122561
CV score for class threat is 0.9836402748118425
CV score for class insult is 0.9709402822376224
CV score for class identity_hate is 0.9647623193377864
Total CV score is 0.9735008452304695


In [12]:
# Read from ./input/politic/, the directory every other run in this notebook
# uses for this file.
linear_regression_train_model('./input/politic/gunControlComments.csv', 'gunCLR.csv')

CV score for class toxic is 0.959762972885201
CV score for class severe_toxic is 0.985969822436429
CV score for class obscene is 0.9777064440396698
CV score for class threat is 0.9831557940417873
CV score for class insult is 0.97148863653939
CV score for class identity_hate is 0.9652188230524432
Total CV score is 0.9738837488324866


# LSTM

In [3]:
max_features = 20000  # tokenizer vocabulary size and embedding input dimension
maxlen = 100          # every comment is padded/truncated to this many tokens
# Shuffle the training rows. The seed is fixed so that a restart-and-run-all
# reproduces the same row order (and hence the same validation split).
train = train.sample(frac=1, random_state=42)

def get_model():
    """Build and compile the bidirectional-LSTM classifier.

    The network takes ``maxlen`` token ids per comment, embeds them, runs a
    bidirectional LSTM, max-pools over the sequence and ends in six sigmoid
    outputs. It is compiled with binary cross-entropy and the Adam optimizer.
    """
    embed_size = 128
    # Layers are created in the same order in which they are applied.
    layer_stack = [
        Embedding(max_features, embed_size),
        Bidirectional(LSTM(50, return_sequences=True)),
        GlobalMaxPool1D(),
        Dropout(0.1),
        Dense(50, activation="relu"),
        Dropout(0.1),
        Dense(6, activation="sigmoid"),
    ]

    inp = Input(shape=(maxlen, ))
    tensor = inp
    for layer in layer_stack:
        tensor = layer(tensor)

    network = Model(inputs=inp, outputs=tensor)
    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return network

In [4]:
model = get_model()
batch_size = 32
epochs = 2
def LSTM_training(test_path, result):
    """Train the LSTM on the notebook-level ``train`` frame and score a comment file.

    A fresh network is built on every call. Reusing one network across calls
    would continue training from the previous call's weights, so the
    predictions would depend on how often and in which order the function had
    been called. The notebook-level ``model`` is rebound to the new network,
    so it always refers to the most recently trained model.

    Parameters
    ----------
    test_path : str
        CSV file to score. It must provide the columns ``Comment`` and
        ``Username``; missing values are replaced by the string ``'NAN'``.
    result : str
        Path of the CSV file to write. Its columns are ``id``, ``comment``
        and one probability column per label.
    """
    global model
    model = get_model()

    test = pd.read_csv(test_path).fillna('NAN')
    list_sentences_train = train["comment_text"].fillna("CVxTz").values
    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    y = train[list_classes].values
    # Missing test comments were already replaced when the file was read,
    # so no second fillna is needed here.
    list_sentences_test = test["Comment"].values

    # The vocabulary is built from the training comments only.
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(list_sentences_train))
    list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
    list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
    X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
    X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

    # Keep the weights of the epoch with the lowest validation loss.
    file_path = "weights_base.best.hdf5"
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    # With patience=20 this callback cannot trigger while epochs stays below 20.
    early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

    callbacks_list = [checkpoint, early]
    model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)

    # Predict with the best checkpoint rather than the final-epoch weights.
    model.load_weights(file_path)

    y_test = model.predict(X_te)
    submid = pd.DataFrame({'id': test["Username"]})
    subcommet = pd.DataFrame({'comment': test["Comment"]})
    submission = pd.concat([submid, subcommet, pd.DataFrame(y_test, columns=list_classes)], axis=1)
    submission.to_csv(result, index=False)

In [6]:
# Score the alt-right subreddit comments with the LSTM pipeline.
LSTM_training(test_path='./input/politic/altRightSubredditBannedComments.csv',
              result='altRightLSTM.csv')

Train on 45000 samples, validate on 5000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.05953, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.05953 to 0.05851, saving model to weights_base.best.hdf5


In [103]:
# Score the abortion comments with the LSTM pipeline.
LSTM_training(test_path='./input/politic/abortionComments.csv',
              result='abbortionCommentsLSTM.csv')

Train on 18000 samples, validate on 2000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.06141, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss did not improve


In [94]:
# Score the election-win comments with the LSTM pipeline.
LSTM_training(test_path='./input/politic/donaldTrumpElectionWinComments.csv',
              result='DTwinLSTM.csv')

Train on 18000 samples, validate on 2000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.06305, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.06305 to 0.05970, saving model to weights_base.best.hdf5


In [104]:
# Score the gun-control comments with the LSTM pipeline.
LSTM_training(test_path='./input/politic/gunControlComments.csv',
              result='gunControlLSTM.csv')

Train on 18000 samples, validate on 2000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.06291, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss did not improve


In [105]:
# Score the marijuana comments with the LSTM pipeline.
LSTM_training(test_path='./input/politic/marijuanaJustlikecigar.csv',
              result='marijuanaJustlikecigarLSTM.csv')

Train on 18000 samples, validate on 2000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.07331, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.07331 to 0.06791, saving model to weights_base.best.hdf5


In [106]:
# Score the Obama comments with the LSTM pipeline.
LSTM_training(test_path='./input/politic/ObamabetterthanAllRep.csv',
              result='ObamabetterthanAllRepLSTM.csv')

Train on 18000 samples, validate on 2000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.07006, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss did not improve


In [6]:
# Score the Italy World Cup comments with the LSTM pipeline.
LSTM_training(test_path='./input/sports/ItalyFailWC.csv',
              result='ItalyFailWCLSTM.csv')

Train on 45000 samples, validate on 5000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.05333, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.05333 to 0.04997, saving model to weights_base.best.hdf5


In [7]:
# Score the Roma-Barca comments with the LSTM pipeline.
LSTM_training(test_path='./input/sports/RomareverseBarca.csv',
              result='RomareverseBarcaLSTM.csv')

Train on 45000 samples, validate on 5000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.05194, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss did not improve


In [8]:
# Score the Ronaldo comments with the LSTM pipeline.
LSTM_training(test_path='./input/sports/RonaldotoMonaco.csv',
              result='RonaldotoMonacoLSTM.csv')

Train on 45000 samples, validate on 5000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.05762, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.05762 to 0.05570, saving model to weights_base.best.hdf5
