In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import chi2,SelectKBest
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.pipeline import Pipeline

from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
stemmer = SnowballStemmer('english')
words = stopwords.words('english')

In [8]:
def cleanup(string):
    
    replaced_string = re.sub("[^a-zA-Z]"," ",string)
    for i in replaced_string.split():
        if i.lower() not in words:
            i = i #stemmer.stem(i)
    
    return ''.join(replaced_string.lower())

In [9]:
train['comment_text2'] = train['comment_text'].apply(cleanup)

In [10]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text2
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d aww he matches this background colour i m s...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man i m really not trying to edit war it...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i can t make any real suggestions on im...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember...


In [11]:
def get_columns(s):

    for col in train.columns:
        if s[col]==1:
            return col

In [12]:
train['class'] = train.apply(get_columns,axis = 1)

In [13]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text2,class
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...,
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d aww he matches this background colour i m s...,
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man i m really not trying to edit war it...,
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i can t make any real suggestions on im...,
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember...,


In [14]:
np.random.seed(625)
X_train,X_test,Y_train,Y_test = train_test_split(train['comment_text2'],train[['toxic', 'severe_toxic', 'obscene','threat','insult'
,'identity_hate']],test_size = 0.3)

In [17]:
pipeline = Pipeline([('vectorize',TfidfVectorizer(max_features=50000,ngram_range=(1,3),stop_words='english',sublinear_tf= True)),
                    ('best_feat',SelectKBest(chi2, k = 40000)),
                    ('clf',OneVsRestClassifier(
                        CalibratedClassifierCV(
                            LinearSVC(C=1.0, multi_class='ovr', penalty= 'l1', max_iter=3000,dual = False),cv=15)))])

In [18]:
model = pipeline.fit(train['comment_text2'],train[['toxic', 'severe_toxic', 'obscene','threat','insult'
,'identity_hate']])

In [19]:
model = pipeline.fit(X_train,Y_train)

In [20]:
print("Accuracy Score: " + str(model.score(X_test,Y_test)))

Accuracy Score: 0.91715407754


In [21]:
test = pd.read_csv('test.csv')

test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [22]:
test['comment_text2'] = test['comment_text'].apply(cleanup)

In [23]:
test.head()

Unnamed: 0,id,comment_text,comment_text2
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,yo bitch ja rule is more succesful then you ll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,from rfc the title is fine as it is ...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",sources zawe ashton on lapland...
3,00017563c3f7919a,":If you have a look back at the source, the in...",if you have a look back at the source the in...
4,00017695ad8997eb,I don't anonymously edit articles at all.,i don t anonymously edit articles at all


In [24]:
result = pipeline.predict_proba(test['comment_text2'])

In [25]:
result

array([[  9.99941921e-01,   1.48537190e-01,   9.99830534e-01,
          6.12239146e-03,   9.27148615e-01,   3.41050316e-02],
       [  3.09080361e-03,   3.87677321e-03,   7.84346050e-03,
          1.23600947e-03,   1.32239063e-02,   2.64121160e-03],
       [  1.35458913e-02,   3.76618574e-03,   8.70277174e-03,
          6.72049377e-04,   1.45495081e-02,   2.77449814e-03],
       ..., 
       [  5.18914337e-03,   1.80873690e-03,   5.11085794e-03,
          5.26487634e-04,   5.14381610e-03,   2.60580378e-04],
       [  5.88596897e-03,   2.73640688e-03,   7.29682800e-03,
          1.45269546e-03,   1.04253405e-02,   5.71806511e-03],
       [  9.97682361e-01,   2.43657918e-03,   9.27006537e-01,
          6.61948778e-04,   5.25875875e-01,   1.01689117e-02]])

In [26]:
sub = test.copy() 

In [27]:
sub.drop(['comment_text','comment_text2'],inplace = True,axis = 1)

In [28]:
sub.head()

Unnamed: 0,id
0,00001cee341fdb12
1,0000247867823ef7
2,00013b17ad220c46
3,00017563c3f7919a
4,00017695ad8997eb


In [29]:
submit = pd.DataFrame(result,columns=['toxic', 'severe_toxic', 'obscene','threat','insult','identity_hate'])

In [30]:
submit.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.999942,0.148537,0.999831,0.006122,0.927149,0.034105
1,0.003091,0.003877,0.007843,0.001236,0.013224,0.002641
2,0.013546,0.003766,0.008703,0.000672,0.01455,0.002774
3,0.005956,0.002693,0.007859,0.001154,0.005657,0.000203
4,0.014909,0.001289,0.005992,0.000903,0.010742,0.003349


In [42]:
final = pd.concat([sub, submit],axis = 1)

In [44]:
final.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999942,0.148537,0.999831,0.006122,0.927149,0.034105
1,0000247867823ef7,0.003091,0.003877,0.007843,0.001236,0.013224,0.002641
2,00013b17ad220c46,0.013546,0.003766,0.008703,0.000672,0.01455,0.002774
3,00017563c3f7919a,0.005956,0.002693,0.007859,0.001154,0.005657,0.000203
4,00017695ad8997eb,0.014909,0.001289,0.005992,0.000903,0.010742,0.003349


In [45]:
final.to_csv("submit1.csv",index=False)

In [None]:
# Score = 0.9696  // Rank = 2065