In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import chi2,SelectKBest
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.pipeline import Pipeline

from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier

In [2]:
from sklearn import metrics

In [3]:
train = pd.read_csv('train.csv')

In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
stemmer = SnowballStemmer('english')
words = stopwords.words('english')

In [6]:
a = ["i'd","we'd","i'm","i ain't"]

arr = re.sub(r'\'d')',"would",a)

SyntaxError: EOL while scanning string literal (<ipython-input-6-ba07a332bab6>, line 3)

In [7]:
def cleanup(string):
    
    replaced_string = re.sub("[^a-zA-Z]"," ",string)
    for i in replaced_string.split():
        if i.lower() not in words:
            i = i #stemmer.stem(i)
    
    return ''.join(replaced_string.lower())

In [8]:
train['comment_text2'] = train['comment_text'].apply(cleanup)

In [9]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text2
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d aww he matches this background colour i m s...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man i m really not trying to edit war it...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i can t make any real suggestions on im...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember...


In [10]:
def get_columns(s):

    for col in train.columns:
        if s[col]==1:
            return col

In [11]:
train['class'] = train.apply(get_columns,axis = 1)

In [12]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text2,class
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...,
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d aww he matches this background colour i m s...,
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man i m really not trying to edit war it...,
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i can t make any real suggestions on im...,
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember...,


In [13]:
np.random.seed(625)
X_train,X_test,Y_train,Y_test = train_test_split(train['comment_text2'],train[['toxic', 'severe_toxic', 'obscene','threat','insult'
,'identity_hate']],test_size = 0.3)

In [14]:
pipeline = Pipeline([('vectorize',TfidfVectorizer(max_features=75000,ngram_range=(1,3),stop_words='english',sublinear_tf= True)),
                    ('best_feat',SelectKBest(chi2, k = 5000)),
                    ('clf',OneVsRestClassifier(
                        CalibratedClassifierCV(
                            LinearSVC(C=1.0, multi_class='ovr', penalty= 'l1', max_iter=3000,dual = False),cv=15)))])

In [15]:
model = pipeline.fit(train['comment_text2'],train[['toxic', 'severe_toxic', 'obscene','threat','insult'
,'identity_hate']])

In [None]:
# model = pipeline.fit(X_train,Y_train)

In [16]:
Y_pred_prob = model.predict_proba(X_test)

In [17]:
print("Accuracy Score: " + str(model.score(X_test,Y_test)))

Accuracy Score: 0.9296875


In [18]:
metrics.roc_auc_score(Y_test,Y_pred_prob)

0.98919684158633781

In [19]:
test = pd.read_csv('test.csv')

test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [None]:
test['comment_text2'] = test['comment_text'].apply(cleanup)

In [None]:
test.head()

In [None]:
result = pipeline.predict_proba(test['comment_text2'])

In [None]:
result

In [None]:
sub = test.copy() 

In [None]:
sub.drop(['comment_text','comment_text2'],inplace = True,axis = 1)

In [None]:
submit = pd.DataFrame(result,columns=['toxic', 'severe_toxic', 'obscene','threat','insult','identity_hate'])

In [None]:
submit.head()

In [None]:
final = pd.concat([sub, submit],axis = 1)

In [None]:
final.head()

In [None]:
final.to_csv("submit5.001.csv",index=False)

In [None]:
# submit5 Score = 0.9750 // Rank = 1753 Max_feats = 75000, K-Best = 5000
# submit4 Score = 0.9714 // Rank = NA Max_feats = 75000, K-Best = 2000
# submit3 Score = 0.9733 // Rank = 1917 Max_feats = 75000, K-Best = 10000
# submit2 Score = 0.9710 // Rank = 2020 Max_feats = 50000, K-Best = 25000
# submit1 Last Score = 0.9696  // Rank = 2065

In [65]:
#Additional Steps

In [103]:
features = pipeline.named_steps.vectorize.get_feature_names()

len(features)

75000

In [104]:
features

['aa',
 'aaa',
 'aah',
 'aaliyah',
 'aap',
 'aardvark',
 'aardvark books',
 'aaron',
 'aave',
 'ab',
 'aba',
 'abad',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abbas',
 'abbey',
 'abbott',
 'abbottabad',
 'abbrev',
 'abbreviated',
 'abbreviation',
 'abbreviations',
 'abc',
 'abc news',
 'abcnews',
 'abcnews com',
 'abd',
 'abducted',
 'abduction',
 'abdul',
 'abdullah',
 'abe',
 'abel',
 'abhira',
 'abhiras',
 'abhiras foreigners',
 'abhishek',
 'abhor',
 'abhorrent',
 'abi',
 'abide',
 'abide agreements',
 'abide agreements reached',
 'abide rules',
 'abide wikipedia',
 'abiding',
 'abilities',
 'ability',
 'ability create',
 'ability create articles',
 'ability customize',
 'ability customize appearance',
 'ability detect',
 'ability edit',
 'ability rename',
 'ability rename pages',
 'ability start',
 'ability upload',
 'ability upload images',
 'ability view',
 'ability view contributions',
 'abiogenesis',
 'abit',
 'abject',
 'abkhazia',
 'abkhazia south',
 'abkha

In [139]:
final_pipeline = Pipeline([('vectorize',TfidfVectorizer(max_features=75000,ngram_range=(1,3),stop_words='english',sublinear_tf= True)),
                    ('best_feat',SelectKBest(chi2, k = 5000)),
                    ('clf',OneVsRestClassifier(
                            LinearSVC(C=1.0, multi_class='ovr', penalty= 'l1', max_iter=3000,dual = False)))])

In [140]:
final_model = final_pipeline.fit(train['comment_text2'],train[['toxic', 'severe_toxic', 'obscene','threat','insult'
,'identity_hate']])

In [None]:
result2 = pipeline.predict_proba(test['comment_text2'])

In [None]:
submit2 = pd.DataFrame(result2,columns=['toxic', 'severe_toxic', 'obscene','threat','insult','identity_hate'])
final_final = pd.concat([sub, submit2],axis = 1)
final_final.to_csv("submit5final.csv",index=False)