In [1]:
#pandas
import pandas as pd
from pandas import Series,DataFrame

#numpy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from datetime import datetime
import math
from sklearn.preprocessing import LabelEncoder
#machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.cluster import KMeans
from sklearn.linear_model import Lasso
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve,accuracy_score,mean_squared_log_error
from sklearn.metrics import roc_curve, auc,roc_auc_score,mean_squared_error
import xgboost as xgb
from numpy.linalg import inv
from xgboost.sklearn import XGBClassifier
from sklearn.kernel_ridge import KernelRidge
import matplotlib.dates
import pylab as p
#import matplotlib.axes3d as p3
import mpl_toolkits.mplot3d.axes3d as p3
import missingno as msno



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D,LSTM,GlobalMaxPool1D,Dropout,Dense,Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.preprocessing.text import Tokenizer

Using Theano backend.


In [3]:
# Read the three competition CSVs from the working directory, in this order.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [4]:
#new approach - using lemmatization first - kaggle kernel
def lemmatize_all(sentence):
    """Yield the tokens of ``sentence`` one by one, lemmatized by POS tag.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. A token whose tag starts with ``NN``, ``VB``, ``JJ`` or ``R``
    is passed to ``WordNetLemmatizer.lemmatize`` with ``pos`` set to ``'n'``,
    ``'v'``, ``'a'`` or ``'r'`` respectively; every other token is yielded
    unchanged.
    """
    lemmatizer = WordNetLemmatizer()
    # Tag prefix -> ``pos`` argument handed to the lemmatizer (checked in order).
    prefix_to_pos = (("NN", 'n'), ("VB", 'v'), ("JJ", 'a'), ("R", 'r'))
    for token, tag in pos_tag(word_tokenize(sentence)):
        wordnet_pos = next(
            (pos for prefix, pos in prefix_to_pos if tag.startswith(prefix)),
            None,
        )
        if wordnet_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=wordnet_pos)
            

In [5]:
# Raw comment text for both splits; missing comments become the literal string "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values for frame in (train, test)
)
# One binary target column per label, in this fixed order.
Y_train = train[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].values

In [6]:
# Containers for the lemmatized train / test comments (two distinct lists).
X_train1, X_test1 = [], []

In [7]:
# Lemmatize every comment of both splits.
# - The lists are rebuilt from scratch in this cell, so running it again
#   cannot append a second copy of the corpus.
# - The Series are iterated by position instead of being looked up with
#   ``series[i]``, which is a label lookup and breaks on a non-default index.
# - Missing comments use the same "fillna" placeholder as X_train / X_test
#   instead of turning into the string "nan".
print ("Train data lemmatization begins")
X_train1 = [" ".join(lemmatize_all(str(comment)))
            for comment in train["comment_text"].fillna("fillna")]
print ("Train data lemmatization ends")
print ("Test data lemmatization begins")
X_test1 = [" ".join(lemmatize_all(str(comment)))
           for comment in test["comment_text"].fillna("fillna")]
print ("Test data lemmatization ends")

Train data lemmatization begins
Train data lemmatization ends
Test data lemmatization begins
Test data lemmatization ends


In [8]:
# Sizes shared by the tokenizer, the padding step and the embedding layer.
embed_size = 300      # columns of the embedding matrix
maxlen = 100          # length each token sequence is padded to
max_features = 30000  # num_words given to the Tokenizer

In [9]:
# Fit the vocabulary on the lemmatized train and test comments together.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train1, *X_test1])

# Convert each split to integer sequences and pad them to maxlen.
# These assignments replace the raw-text X_train / X_test arrays.
X_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_train1), maxlen=maxlen
)
X_test = sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_test1), maxlen=maxlen
)

In [10]:
# Path (relative to the notebook's working directory) of the word-vector text
# file that is parsed into embeddings_index.
EMBEDDING_FILE = '../crawl-300d-2M.vec'

In [11]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` with ``arr`` converted to a float32 array.

    ``arr`` holds the remaining fields of one line of the vector file; an
    empty ``arr`` gives an empty float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [12]:

# Parse the vector file into {word: float32 vector}.
# The file is opened in a ``with`` block so its handle is closed as soon as
# the dict has been built, instead of staying open until garbage collection.
with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file
    )

In [13]:
# Allocate the embedding matrix, one row per token index.
# Keras Tokenizer indices start at 1 (index 0 is reserved), so a vocabulary
# of N words uses rows 0..N and needs N + 1 rows. Without the "+ 1", a
# vocabulary smaller than max_features leaves no row for the last word.
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))

In [14]:
# Copy each known word's vector into its row of the embedding matrix.
# Rows are bounded by the matrix itself, so an index can never fall outside
# it even when the vocabulary is smaller than max_features.
n_rows = embedding_matrix.shape[0]
vector_shape = embedding_matrix.shape[1:]
for word, i in word_index.items():
    if i >= n_rows:
        continue
    embedding_vector = embeddings_index.get(word)
    # Skip entries whose length does not match a matrix row (e.g. a
    # "count dim" header line parsed as a 1-element vector): NumPy would
    # otherwise broadcast that single value across the whole row.
    if embedding_vector is not None and embedding_vector.shape == vector_shape:
        embedding_matrix[i] = embedding_vector

In [15]:
#important stuff to have an evaluation metric
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on held-out data during training.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair used for scoring. It must unpack into exactly
        two items; the default empty tuple raises ``ValueError``.
    interval : int
        The score is computed when ``epoch % interval == 0``, where ``epoch``
        is the zero-based epoch number passed in by Keras.
    """

    def __init__(self, validation_data=(), interval=1):
        # ``super(Callback, self)`` would start the lookup *after* Callback and
        # skip Callback.__init__; the zero-argument form runs it.
        super().__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score."""
        # ``logs`` is accepted for the Keras callback signature but not used;
        # ``None`` replaces the shared mutable ``{}`` default.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [16]:
# IPython help query: displays the documentation of GlobalAveragePooling1D.
# It assigns nothing, so no later cell depends on it.
GlobalAveragePooling1D?

In [17]:
def get_model():
    """Build and compile the multi-label comment classifier.

    Layers, in order: an embedding initialised from ``embedding_matrix``,
    spatial dropout, a bidirectional GRU returning full sequences, a 1-D
    convolution, then global average and global max pooling concatenated
    into a 6-unit sigmoid output.

    Returns
    -------
    The Keras ``Model``, compiled with binary cross-entropy loss, the Adam
    optimizer and the accuracy metric.
    """
    inp = Input(shape=(maxlen, ))
    # Size the embedding layer from the weight matrix itself so the layer and
    # its initial weights cannot disagree (they would if the matrix had fewer
    # rows than max_features).
    vocab_rows, vector_dim = embedding_matrix.shape
    x = Embedding(vocab_rows, vector_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(128,dropout=0.1,recurrent_dropout=0.1, return_sequences=True))(x)
    x = Conv1D(64,kernel_size=3,padding="valid",kernel_initializer="glorot_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer="Adam",
                  metrics=['accuracy'])

    return model

In [18]:
# Build the compiled network returned by get_model().
model = get_model()

  'RNN dropout is no longer supported with the Theano backend '


In [19]:
# Training settings used by model.fit.
epochs = 4
batch_size = 127

# Split off a validation set (train_size=0.9) with a fixed random_state.
X_tra, X_val, Y_tra, y_val = train_test_split(
    X_train,
    Y_train,
    train_size=0.9,
    random_state=233,
)

# Callback that scores the validation split every epoch (interval=1).
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)



In [20]:
# Train the model; the ROC-AUC callback runs alongside the validation pass.
hist = model.fit(
    X_tra,
    Y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
)



Train on 143613 samples, validate on 15958 samples
Epoch 1/4

 ROC-AUC - epoch: 1 - score: 0.986699 

Epoch 2/4

 ROC-AUC - epoch: 2 - score: 0.986448 

Epoch 3/4

 ROC-AUC - epoch: 3 - score: 0.986118 

Epoch 4/4

 ROC-AUC - epoch: 4 - score: 0.985080 



In [21]:
# Run the trained model on the padded test sequences, 1024 rows per batch.
y_test = model.predict(X_test,batch_size=1024,verbose=1)



In [22]:
# Start the submission frame from the test ids.
# ``.copy()`` makes it an independent DataFrame, so adding the prediction
# columns later does not trigger pandas' SettingWithCopyWarning.
hamse_na_ho_payega = test[['id']].copy()

In [24]:
# Label names in the same order as the columns of Y_train.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [25]:
# Column i of y_test holds the predictions for list_classes[i];
# enumerate keeps the column index and the label name in step.
for i, topic in enumerate(list_classes):
    hamse_na_ho_payega[topic] = y_test[:, i]

In [26]:
# Write the submission frame to disk without the DataFrame index column.
hamse_na_ho_payega.to_csv('hamse_na_ho_payega.csv',index=False)