In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import chi2,SelectKBest
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, roc_auc_score

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.pipeline import Pipeline

from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics

In [3]:
# Load the labelled training set; per the preview below it holds `id`,
# `comment_text` and six binary label columns.
train = pd.read_csv('train.csv')

In [4]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# PREPROCESSING PART
# Token -> replacement table consumed by `preprocess`, which looks up each
# lower-cased, whitespace-delimited token by exact dictionary membership.
#
# The previous literal listed many keys twice. In a dict literal the last
# occurrence silently wins, so the first " good " / " bad " emoticon group was
# dead text. Every key now appears exactly once, with the value that was
# actually in effect and in the same insertion order, so the resulting dict is
# equal to the old one.
repl = {
    # Emoticons and interjections (replacement values carry surrounding spaces).
    "&lt;3": " heart ",
    ":d": " smile ",
    ":dd": " smile ",
    ":p": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    # NOTE(review): the keys below are written as regular expressions, but the
    # lookup in `preprocess` is a plain `word in repl_dict` test, so they only
    # match a token that is literally e.g. `\br\b`. They are kept so the table
    # stays equal to the original; either drop them or apply them via re.sub.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    # Chat shorthand and contractions, matched against whole tokens.
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll": "i will",
    # NOTE(review): "its" is usually the possessive, not "it is" — confirm intent.
    "its": "it is",
    "it's": "it is",
    "'s": " is",
    "that's": "that is",
    "weren't": "were not",
}

In [6]:
def preprocess(arr, repl_dict, stop_words=None):
    """Lower-case and normalise a collection of raw comments.

    Each comment is split on whitespace and every token is lower-cased, then:

    * a token found in ``stop_words`` is blanked (it still contributes one
      space to the output, exactly as before);
    * otherwise, a token starting with ``http`` or ``www`` is dropped;
    * otherwise, a token that is a key of ``repl_dict`` is replaced by the
      mapped value.

    Every emitted token is followed by a single space, so a non-empty result
    ends with a space.

    Parameters
    ----------
    arr : iterable of str
        Raw comments. Objects exposing ``tolist()`` (pandas Series, NumPy
        arrays) are converted through it; any other iterable is accepted too.
        ``None`` and float NaN entries are treated as empty comments; other
        non-string entries are converted with ``str()``.
    repl_dict : mapping of str to str
        Exact-match token replacements.
    stop_words : iterable of str, optional
        Tokens to blank out. Defaults to ``stopwords.words('english')``.

    Returns
    -------
    list of str
        One cleaned string per input comment, in input order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Membership is tested once per token; a set makes that O(1) instead of a
    # linear scan over the stop-word list.
    stop_words = frozenset(stop_words)

    lines = arr.tolist() if hasattr(arr, "tolist") else list(arr)

    output = []
    for line in lines:
        if not isinstance(line, str):
            # Missing values would otherwise crash on `.split()`; `x != x`
            # is true only for NaN.
            is_missing = line is None or (isinstance(line, float) and line != line)
            line = "" if is_missing else str(line)

        pieces = []
        for word in line.split():
            word = word.lower()
            if word in stop_words:
                # Keep the placeholder so spacing matches the historical output.
                word = ""
            elif word.startswith(('http', 'www')):
                continue
            elif word in repl_dict:
                word = repl_dict[word]
            pieces.append(word + " ")

        output.append("".join(pieces))

    return output

In [7]:
def remove_not_alphabets(string):
    """Return ``string`` with every character removed that is not an ASCII
    letter, a space, ``?`` or ``!``."""
    disallowed_run = re.compile("[^a-zA-Z ?!]+")
    return disallowed_run.sub("", string)

In [8]:
# Clean the training comments: token-level normalisation via `preprocess`,
# then drop every character `remove_not_alphabets` does not allow.
train["new_comment_text"] = [
    remove_not_alphabets(cleaned)
    for cleaned in preprocess(train["comment_text"], repl)
]

In [9]:
# Load the unlabelled test set; per the preview below it holds only `id` and
# `comment_text`.
test = pd.read_csv('test.csv')

# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [10]:
# Apply the same two-step cleaning to the test comments as to the training
# comments, so both feed the vectorizers in the same form.
test['new_comment_text'] = [
    remove_not_alphabets(cleaned)
    for cleaned in preprocess(test["comment_text"], repl)
]

In [None]:
# def get_columns(s):

#     for col in train.columns:
#         if s[col]==1:
#             return col
        
# train['class'] = train.apply(get_columns,axis = 1)

In [None]:
# from textblob import TextBlob

# zpolarity = {0:'zero',1:'one',2:'two',3:'three',4:'four',5:'five',6:'six',7:'seven',8:'eight',9:'nine',10:'ten'}
# zsign = {-1:'negative',  0.: 'neutral', 1:'positive'}

# train['polarity'] = train['new_comment_text'].map(lambda x: int(TextBlob(x).sentiment.polarity * 10))
# test['polarity'] = test['new_comment_text'].map(lambda x: int(TextBlob(x).sentiment.polarity * 10))

# train['new_comment_text'] = train.apply(lambda r: str(r['new_comment_text']) + ' polarity' +  zsign[np.sign(r['polarity'])] + zpolarity[np.abs(r['polarity'])], axis=1)
# test['new_comment_text'] = test.apply(lambda r: str(r['new_comment_text']) + ' polarity' +  zsign[np.sign(r['polarity'])] + zpolarity[np.abs(r['polarity'])], axis=1)

In [12]:
# Word-level TF-IDF over unigrams and bigrams; a token is any run of one or
# more word characters. The vocabulary is capped at 50,000 terms.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=50000)

# Fit on the training comments only; the test comments are projected onto the
# vocabulary learned from training.
train_word_features = word_vectorizer.fit_transform(train["new_comment_text"])
print('Word TFIDF 1/2')
test_word_features = word_vectorizer.transform(test["new_comment_text"])
print('Word TFIDF 2/2')

# Character-level TF-IDF over 2- to 6-character n-grams, same 50,000 cap.
# `stop_words='english'` used to be passed here, but scikit-learn only applies
# stop words when analyzer='word'; with a char analyzer it was ignored (newer
# releases warn about it). Dropping it leaves the features unchanged and
# removes a misleading setting.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)
train_char_features = char_vectorizer.fit_transform(train["new_comment_text"])
print('Char TFIDF 1/2')
test_char_features = char_vectorizer.transform(test["new_comment_text"])
print('Char TFIDF 2/2')

Word TFIDF 1/2
Word TFIDF 2/2
Char TFIDF 1/2
Char TFIDF 2/2


NameError: name 'hstack' is not defined

In [17]:
from scipy.sparse import csr_matrix, hstack

# Join the char and word TF-IDF blocks column-wise (char columns first, the
# same order for train and test).
# `hstack` returns a COO matrix unless told otherwise, and COO does not
# support row indexing. Ask for CSR once here so the train/validation split,
# cross-validation and prediction steps can index the result directly.
train_features = hstack([train_char_features, train_word_features], format='csr')
print('HStack 1/2')
test_features = hstack([test_char_features, test_word_features], format='csr')
print('HStack 2/2')

HStack 1/2
HStack 2/2


In [18]:
# Seed NumPy's global generator, then hold out 30% of the labelled rows.
# The splitter is given no random_state of its own, so it relies on this seed.
np.random.seed(625)
label_frame = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
X_train, X_test, Y_train, Y_test = train_test_split(
    train_features,
    label_frame,
    test_size=0.3,
)

In [30]:
# Model for the six-label problem, built step by step:
#   1. keep the 10,000 features with the highest chi-squared statistic;
#   2. an L1-penalised LinearSVC (primal form) as the base learner;
#   3. wrapped in CalibratedClassifierCV with 15 folds;
#   4. wrapped in OneVsRestClassifier.
# (An earlier variant of this cell also ran a TfidfVectorizer with
#  max_features=75000 and ngram_range=(1,3) as the first pipeline step; the
#  features are now computed beforehand.)
chi2_selector = SelectKBest(chi2, k=10000)
sparse_svc = LinearSVC(penalty='l1', dual=False, C=1.0, multi_class='ovr', max_iter=3000)
calibrated_svc = CalibratedClassifierCV(sparse_svc, cv=15)

pipeline = Pipeline([
    ('best_feat', chi2_selector),
    ('clf', OneVsRestClassifier(calibrated_svc)),
])

In [31]:
# Fit feature selection and the classifier on the training split.
# NOTE(review): a later cell predicts through `pipeline` rather than `model`,
# which only works if `fit` trains the pipeline in place — presumably `model`
# and `pipeline` are the same object; confirm.
model = pipeline.fit(X_train,Y_train)

In [32]:
# Predicted probabilities for the held-out validation split.
Y_pred_prob = model.predict_proba(X_test)

In [33]:
# Score the fitted model on the held-out split.
# NOTE(review): with six label columns this is presumably exact-match (subset)
# accuracy rather than per-label accuracy — confirm against scikit-learn's
# multilabel `score` semantics before quoting this number.
print("Accuracy Score: " + str(model.score(X_test,Y_test)))

Accuracy Score: 0.919827874332


In [34]:
# ROC AUC of the predicted probabilities against the held-out labels.
# NOTE(review): how the six per-label AUCs are combined depends on
# roc_auc_score's default `average` setting — confirm it is the intended one.
metrics.roc_auc_score(Y_test,Y_pred_prob)

0.97644505723890529

In [None]:
#Start Prediction

In [35]:
# Predict probabilities for the unlabelled test set using the fitted pipeline.
result = pipeline.predict_proba(test_features)

In [36]:
# Wrap the predictions in a DataFrame, naming the columns in the same order as
# the label columns passed to train_test_split.
result = pd.DataFrame(result,columns=['toxic', 'severe_toxic', 'obscene','threat','insult','identity_hate'])
result.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.999997,0.243474,0.999996,0.007066,0.991639,0.060182
1,0.003324,0.006103,0.007289,0.000612,0.016609,0.001746
2,0.01211,0.00246,0.013469,0.001071,0.021577,0.004189
3,0.006833,0.001434,0.005731,0.000811,0.003761,0.000516
4,0.02686,0.001074,0.00393,0.000795,0.00752,0.003052


In [37]:
# Put the test ids next to their predictions, side by side.
# NOTE(review): column-wise concat pairs rows by index label; this assumes
# `test` and `result` both still carry their default 0..n-1 index — confirm.
final = pd.concat([test['id'], result],axis = 1)

In [38]:
# Preview the submission frame (id followed by the six label probabilities).
final.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999997,0.243474,0.999996,0.007066,0.991639,0.060182
1,0000247867823ef7,0.003324,0.006103,0.007289,0.000612,0.016609,0.001746
2,00013b17ad220c46,0.01211,0.00246,0.013469,0.001071,0.021577,0.004189
3,00017563c3f7919a,0.006833,0.001434,0.005731,0.000811,0.003761,0.000516
4,00017695ad8997eb,0.02686,0.001074,0.00393,0.000795,0.00752,0.003052


In [39]:
# Write the submission file without the index column.
final.to_csv("submit10.2.csv",index=False)

In [None]:
# submit10.2 Score = 0.9760 // Rank = 2007 Max_feats = 75000, K-Best = 10000+preprocessing+removeStpwords+chartfidf
# submit10.1 Score = 0.9760 // Rank = 2007 Max_feats = 75000, K-Best = 5000+preprocessing+removeStpwords+chartfidf
# submit8.2 Score = 0.9728 // Rank = NS Max_feats = 75000, K-Best = 5000+preprocessing+removeStpwords
# submit8.1 Score = 0.9704 // Rank = NS Max_feats = 75000, K-Best = 5000+preprocessing+removeStpwords
# submit7 Score = 0.9728 // Rank = NS Max_feats = 75000, K-Best = 5000+preprocessing
# submit6 Score = 0.9706 // Rank = NS Max_feats = 75000, K-Best = 5000+preprocessing
# submit5 Score = 0.9750 // Rank = 1753 Max_feats = 75000, K-Best = 5000
# submit4 Score = 0.9714 // Rank = NA Max_feats = 75000, K-Best = 2000
# submit3 Score = 0.9733 // Rank = 1917 Max_feats = 75000, K-Best = 10000
# submit2 Score = 0.9710 // Rank = 2020 Max_feats = 50000, K-Best = 25000
# submit1 Last Score = 0.9696  // Rank = 2065

In [None]:
#lightgbm

In [42]:
# Logistic regression model

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [44]:
# The six target labels, in the same order as the label columns used for the
# train/validation split.
class_names = ['toxic', 'severe_toxic', 'obscene','threat','insult','identity_hate']

In [61]:
# One independent logistic-regression model per label.
# For each label: estimate ROC AUC with 3-fold cross-validation on the
# training split, refit on that whole split, and store the positive-class
# probability for every test comment.
logsubmit = pd.DataFrame.from_dict({'id': test['id']})
scores = []

for label in class_names:
    label_target = Y_train[label]
    logreg = LogisticRegression(solver = 'sag')

    fold_aucs = cross_val_score(logreg, X_train, label_target, cv=3, scoring='roc_auc')
    label_auc = np.mean(fold_aucs)
    scores.append(label_auc)
    print('CV score for class {} is {}'.format(label, label_auc))

    logreg.fit(X_train, label_target)
    # Column 1 of predict_proba is taken as the probability of the label being set.
    positive_proba = logreg.predict_proba(test_features)[:, 1]
    logsubmit[label] = positive_proba

print('Total CV score is {}'.format(np.mean(scores)))

logsubmit.to_csv('Log_reg_submit1.csv', index=False)

CV score for class toxic is 0.9744912865364111
CV score for class severe_toxic is 0.988284413135033
CV score for class obscene is 0.9889499646636776
CV score for class threat is 0.9851726137854763
CV score for class insult is 0.9801525297305455
CV score for class identity_hate is 0.9813090191513857
Total CV score is 0.9830599711670881


In [62]:
# Preview the logistic-regression submission frame.
logsubmit.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999561,0.185367,0.999357,0.02119,0.979949,0.183551
1,0000247867823ef7,0.006316,0.003635,0.005686,0.001587,0.010635,0.003144
2,00013b17ad220c46,0.011331,0.001918,0.007878,0.000889,0.010867,0.002175
3,00017563c3f7919a,0.003751,0.002322,0.00288,0.000714,0.002232,0.000493
4,00017695ad8997eb,0.013589,0.001554,0.003797,0.001225,0.008421,0.002534
