In [1]:
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

In [2]:
from sklearn import metrics

In [3]:
# Load the labelled training data from the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# PREPROCESSING PART
# Token -> replacement table consumed by preprocess(), which looks up each
# lower-cased, whitespace-delimited token as an exact dictionary key.
#
# The previous literal listed many keys twice (e.g. ":)" first as " good " and
# later as " smile "). Python keeps only the last value given for a repeated
# key, so the earlier entries were silently dead. Every key now appears exactly
# once with the value that was actually in effect; the resulting mapping is
# unchanged.
repl = {
    # --- Emoticons ("&lt;3" and ":&gt;" are the HTML-escaped spellings of "<3" and ":>") ---
    "&lt;3": " heart ",
    ":d": " smile ",
    ":dd": " smile ",
    ":p": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",

    # --- Regex-style keys ---
    # NOTE(review): these look like regular expressions, but preprocess() does a
    # plain `word in repl_dict` lookup, so each one only matches a token that is
    # literally this text (backslashes included). Kept so the mapping stays
    # identical; the plain-word entries below do the real work.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",

    # --- Abbreviations and contractions (exact whole-token matches) ---
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll": "i will",
    # NOTE(review): "its" (possessive) is expanded the same way as "it's".
    "its": "it is",
    "it's": "it is",
    # Only fires when the whole token is "'s"; it does not strip a trailing 's.
    "'s": " is",
    "that's": "that is",
    "weren't": "were not",
}

In [6]:
def preprocess(arr,repl_dict):
    """Lower-case each text, drop URL-like tokens and apply exact-token replacements.

    Parameters
    ----------
    arr : sequence of str
        Texts to clean. Anything exposing ``tolist()`` (e.g. a pandas Series or
        a NumPy array) is accepted, as is any plain iterable such as a list.
    repl_dict : dict
        Maps a lower-cased, whitespace-delimited token to its replacement text.
        Tokens are looked up as exact keys; no pattern matching is performed.

    Returns
    -------
    list of str
        One cleaned string per input entry, in input order. Every kept token is
        followed by a single space, so a non-empty result ends with a space.
        Missing entries (``None`` or float NaN) yield the empty string instead
        of raising, which keeps the output aligned with the input.
    """
    lines = arr.tolist() if hasattr(arr, "tolist") else list(arr)

    output = []
    for line in lines:
        # A float that is not equal to itself is NaN (how pandas marks an
        # empty cell); such values have no .split() and used to crash here.
        if line is None or (isinstance(line, float) and line != line):
            output.append("")
            continue

        kept = []
        for word in str(line).split():
            word = word.lower()

            # Drop anything that looks like a link.
            if word.startswith(("http", "www")):
                continue

            kept.append(repl_dict.get(word, word))

        # Same layout as before: each token followed by one space.
        output.append("".join(token + " " for token in kept))

    return output

In [7]:
def remove_not_alphabets(string):
    """Return `string` with every character removed except ASCII letters, spaces, '?' and '!'."""
    disallowed = re.compile("[^a-zA-Z ?!]+")
    return disallowed.sub("", string)

In [8]:
# Clean the training comments: token-level normalisation via preprocess(),
# then strip every character that is not a letter, space, '?' or '!'.
train["new_comment_text"] = list(
    map(remove_not_alphabets, preprocess(train["comment_text"], repl))
)

In [9]:
# def get_columns(s):

#     for col in train.columns:
#         if s[col]==1:
#             return col
        
# train['class'] = train.apply(get_columns,axis = 1)

In [10]:
# Hold out 30% of the rows for validation. No random_state is passed to
# train_test_split, so the global NumPy seed set here fixes the shuffle.
np.random.seed(625)
X_train, X_test, Y_train, Y_test = train_test_split(
    train['new_comment_text'],
    train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
    test_size=0.3)

In [11]:
# Model: TF-IDF over 1- to 3-grams (capped at 75000 terms), chi-squared
# selection of the 5000 best features, then one calibrated L1-penalised
# LinearSVC per label via one-vs-rest.
pipeline = Pipeline(steps=[
    ('vectorize', TfidfVectorizer(max_features=75000,
                                  ngram_range=(1, 3),
                                  stop_words='english',
                                  sublinear_tf=True)),
    ('best_feat', SelectKBest(chi2, k=5000)),
    ('clf', OneVsRestClassifier(
        CalibratedClassifierCV(
            LinearSVC(C=1.0, multi_class='ovr', penalty='l1', max_iter=3000, dual=False),
            cv=15))),
])

In [12]:
# Fit the whole pipeline on the training split.
model = pipeline.fit(X=X_train, y=Y_train)

In [13]:
# Per-label probabilities for the held-out comments.
Y_pred_prob = model.predict_proba(X=X_test)

In [14]:
# NOTE(review): with multi-label targets this is presumably exact-match
# accuracy over all six labels - confirm against the scikit-learn docs.
held_out_score = model.score(X_test, Y_test)
print("Accuracy Score: " + str(held_out_score))

Accuracy Score: 0.91805230615


In [15]:
# ROC AUC of the predicted probabilities against the held-out labels.
metrics.roc_auc_score(y_true=Y_test, y_score=Y_pred_prob)

0.96991485184487958

In [16]:
# Load the unlabelled comments to be scored and preview the first five rows.
test = pd.read_csv(filepath_or_buffer='test.csv')

test.head(n=5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [17]:
# Apply exactly the same cleaning to the test comments as was applied to the
# training comments, so the fitted vectoriser sees text in the same form.
test['new_comment_text'] = list(
    map(remove_not_alphabets, preprocess(test["comment_text"], repl))
)

In [18]:
# Score the cleaned test comments with the fitted pipeline.
result = pipeline.predict_proba(X=test['new_comment_text'])

In [19]:
# Wrap the probability matrix in a DataFrame with one named column per label.
result = pd.DataFrame(
    data=result,
    columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
result.head(n=5)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.999126,0.07988,0.997531,0.002663,0.921082,0.02128
1,0.003319,0.00431,0.008715,0.000803,0.016013,0.003053
2,0.008722,0.004119,0.011336,0.000803,0.016005,0.003141
3,0.006755,0.001319,0.005367,0.000569,0.006814,0.000693
4,0.033195,0.002772,0.007094,0.000803,0.013362,0.004464


In [20]:
# Put the comment ids next to their predicted probabilities (column-wise concat).
final = pd.concat(objs=[test['id'], result], axis=1)

In [21]:
# Preview the first five rows of the submission table.
final.head(n=5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999126,0.07988,0.997531,0.002663,0.921082,0.02128
1,0000247867823ef7,0.003319,0.00431,0.008715,0.000803,0.016013,0.003053
2,00013b17ad220c46,0.008722,0.004119,0.011336,0.000803,0.016005,0.003141
3,00017563c3f7919a,0.006755,0.001319,0.005367,0.000569,0.006814,0.000693
4,00017695ad8997eb,0.033195,0.002772,0.007094,0.000803,0.013362,0.004464


In [22]:
# Write the submission file without the DataFrame index column.
final.to_csv(path_or_buf="submit6.csv", index=False)

In [23]:
# submit7 Score = 0.9728 // Rank = NS Max_feats = 75000, K-Best = 5000
# submit6 Score = 0.9706 // Rank = NS Max_feats = 75000, K-Best = 5000
# submit5 Score = 0.9750 // Rank = 1753 Max_feats = 75000, K-Best = 5000
# submit4 Score = 0.9714 // Rank = NA Max_feats = 75000, K-Best = 2000
# submit3 Score = 0.9733 // Rank = 1917 Max_feats = 75000, K-Best = 10000
# submit2 Score = 0.9710 // Rank = 2020 Max_feats = 50000, K-Best = 25000
# submit1 Last Score = 0.9696  // Rank = 2065

In [24]:
# Additional steps: refit the pipeline on the full training set and write a second submission file.

In [25]:
# Refit on the full training set for the final submission.
#
# An unfitted clone of `pipeline` is fitted here instead of `pipeline` itself.
# `pipeline.fit(...)` refits that same estimator in place, so calling it on
# `pipeline` directly would overwrite the model fitted on the 70% split:
# `model`, `pipeline` and `final_model` would all refer to one refitted
# estimator, and re-running the validation cells afterwards would score a
# model that had already seen the held-out rows.
final_model = clone(pipeline).fit(
    train['new_comment_text'],
    train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])

In [26]:
# Score the test comments with the model trained on all labelled data and
# name the probability columns after the labels.
final_results = pd.DataFrame(
    data=final_model.predict_proba(test['new_comment_text']),
    columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

In [27]:
# Attach the comment ids to the full-data predictions and write the submission file.
final = pd.concat(objs=[test['id'], final_results], axis=1)
final.to_csv(path_or_buf="submit7.csv", index=False)