In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [2]:
# Names of the binary label columns in the training frame; the modelling loops
# below train and score one classifier per entry, in this order.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [4]:
# Read the training set from the working directory and replace every missing
# value in the frame with a single space.
train = (
    pd.read_csv('train.csv')
    .fillna(' ')
)

In [5]:
# Raw comment column of the training frame; this is the corpus the TF-IDF
# vectorizer is fitted on.
train_text = train['comment_text']

In [79]:
# Word-level TF-IDF features over the raw comment text.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,          # scale term frequency as 1 + log(tf)
    strip_accents='unicode',
    token_pattern=r'\w{1,}',    # one or more word characters, so 1-char tokens are kept
    stop_words='english',
    ngram_range=(1, 3),         # unigrams, bigrams and trigrams
    max_features=10000)         # cap the vocabulary at 10,000 terms

# fit_transform learns the vocabulary and builds the document-term matrix in a
# single pass; calling fit() and then transform() tokenizes every comment twice
# for the same result.
train_word_features = word_vectorizer.fit_transform(train_text)

In [80]:
# Vocabulary terms, positionally aligned with the columns of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps `words` a plain Python list, as the old method returned.
if hasattr(word_vectorizer, 'get_feature_names_out'):
    words = list(word_vectorizer.get_feature_names_out())
else:
    words = word_vectorizer.get_feature_names()

In [12]:
from collections import defaultdict

In [107]:
%%time
# Mean 3-fold ROC-AUC per label, in class_names order.
scores = []
# label name -> words carrying the largest positive logistic-regression weights.
sentiment = defaultdict(list)
top_k = 25  # number of words kept per label

for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.1)

    fold_scores = cross_val_score(
        classifier, train_word_features, train_target, cv=3, scoring='roc_auc')
    cv_score = np.mean(fold_scores)
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set so the per-word weights can be read off.
    classifier.fit(train_word_features, train_target)

    # Rank vocabulary *indices* by weight. Building a dict keyed on the float
    # weights (as before) makes words with identical weights overwrite each
    # other, silently dropping them from the ranking. The stable sort keeps
    # tied words in vocabulary order.
    weights = classifier.coef_[0]
    top_indices = np.argsort(-weights, kind='stable')[:top_k]
    sentiment[class_name].extend(words[i] for i in top_indices)

CV score for class toxic is 0.956936798640719
CV score for class severe_toxic is 0.9835714479411412
CV score for class obscene is 0.9798017272978011
CV score for class threat is 0.9759418970884361
CV score for class insult is 0.9689866422554662
CV score for class identity_hate is 0.9658976080748308
CPU times: user 47 s, sys: 2.07 s, total: 49.1 s
Wall time: 12.6 s


In [105]:
from sklearn.naive_bayes import GaussianNB

In [108]:
%%time
# Mean 3-fold ROC-AUC per label, in class_names order.
scores = []
# label name -> words whose mean TF-IDF value rises most in the positive class.
sentiment = defaultdict(list)
top_k = 25  # number of words kept per label

# GaussianNB does not accept scipy sparse input, so both cross-validation and
# the final fit need a dense array. Densify once instead of once per label.
# NOTE: the dense matrix holds n_comments x n_features values, which is very
# large for this corpus; lower max_features or subsample rows if memory is tight.
train_word_dense = train_word_features.toarray()

for class_name in class_names:
    train_target = train[class_name]
    classifier = GaussianNB()

    fold_scores = cross_val_score(
        classifier, train_word_dense, train_target, cv=3, scoring='roc_auc')
    cv_score = np.mean(fold_scores)
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Fit on the dense array; passing the sparse matrix here raised an error.
    classifier.fit(train_word_dense, train_target)

    # GaussianNB has no coef_ attribute. Score each word by how much its mean
    # feature value (theta_) in the positive class exceeds the negative class.
    class_order = list(classifier.classes_)
    mean_gap = (classifier.theta_[class_order.index(1)]
                - classifier.theta_[class_order.index(0)])

    # Rank by index rather than keying a dict on float scores, so words with
    # equal scores cannot overwrite one another.
    top_indices = np.argsort(-mean_gap, kind='stable')[:top_k]
    sentiment[class_name].extend(words[i] for i in top_indices)

KeyboardInterrupt: 

In [97]:
# Select the label explicitly instead of reusing `class_name`, which is only a
# leftover loop variable: after a full top-to-bottom run it would hold the last
# label in class_names, not the 'toxic' column shown in the recorded output.
train_target = train['toxic']

In [109]:
# Display the selected label column to check which label it is and what
# values it holds.
train_target

0         0
1         0
2         0
3         0
4         0
5         0
6         1
7         0
8         0
9         0
10        0
11        0
12        1
13        0
14        0
15        0
16        1
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
159541    1
159542    0
159543    0
159544    0
159545    0
159546    1
159547    0
159548    0
159549    0
159550    0
159551    0
159552    0
159553    0
159554    1
159555    0
159556    0
159557    0
159558    0
159559    0
159560    0
159561    0
159562    0
159563    0
159564    0
159565    0
159566    0
159567    0
159568    0
159569    0
159570    0
Name: toxic, Length: 159571, dtype: int64

In [102]:
# One-off check: logistic regression on the currently selected label, scored by
# ROC-AUC with 4 folds; cv_score is the mean over the folds.
classifier = LogisticRegression(C=0.1)
fold_aucs = cross_val_score(
    classifier,
    train_word_features,
    train_target,
    cv=4,
    scoring='roc_auc',
)
cv_score = np.mean(fold_aucs)

In [103]:
# Show the mean cross-validated ROC-AUC computed in the previous cell.
cv_score

0.966473584917949

In [89]:
# Sanity check: list the labels that have an entry in `sentiment`.
sentiment.keys()

dict_keys(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

In [90]:
# NOTE(review): scratch cell; trivial arithmetic that nothing else uses.
# Presumably a kernel-responsiveness check; safe to delete.
1+1

2

In [83]:
# Top-ranked words stored for the 'toxic' label.
print(sentiment['toxic'])

['fuck', 'fucking', 'shit', 'stupid', 'idiot', 'ass', 'suck', 'asshole', 'bitch', 'dick', 'hell', 'gay', 'faggot', 'crap', 'bullshit', 'cunt', 'pathetic', 'penis', 'sucks', 'hate', 'moron', 'shut', 'bastard', 'f', 'fag']


In [84]:
# Top-ranked words stored for the 'severe_toxic' label.
print(sentiment['severe_toxic'])

['fuck', 'fucking', 'bitch', 'suck', 'shit', 'ass', 'dick', 'asshole', 'cunt', 'faggot', 'u', 'cock', 'fuckin', 'die', 'fucker', 'f', 'motherfucker', 'nigger', 'gay', 'stupid', 'fuck fuck', 'bastard', 'suck dick', 'fat', 'mother']


In [85]:
# Top-ranked words stored for the 'obscene' label.
print(sentiment['obscene'])

['fuck', 'fucking', 'shit', 'ass', 'bitch', 'asshole', 'suck', 'dick', 'cunt', 'faggot', 'stupid', 'bullshit', 'idiot', 'cock', 'bastard', 'f', 'penis', 'damn', 'sucks', 'fucker', 'crap', 'hell', 'fag', 'fucked', 'fuckin']


In [86]:
# Top-ranked words stored for the 'insult' label.
# NOTE(review): no visible cell prints sentiment['threat'], although 'threat'
# is one of the labels in class_names. Confirm whether that is intentional.
print(sentiment['insult'])

['fuck', 'fucking', 'idiot', 'stupid', 'bitch', 'asshole', 'ass', 'faggot', 'suck', 'shit', 'cunt', 'dick', 'bastard', 'moron', 'dumb', 'gay', 'fat', 'idiots', 'pathetic', 'loser', 'fag', 'nigger', 'f', 'cock', 'jerk']


In [88]:
# Top-ranked words stored for the 'identity_hate' label.
print(sentiment['identity_hate'])

['gay', 'nigger', 'faggot', 'fuck', 'fucking', 'jew', 'shit', 'homosexual', 'nigga', 'ass', 'nazi', 'bitch', 'niggers', 'fag', 'jews', 'u', 'racist', 'hate', 'stupid', 'homo', 'suck', 'black', 'cunt', 'fat', 'muslim']
