In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc

from scipy import sparse
import pickle


In [2]:
# Read the training and test tables from CSV files in the working directory.
training_data, testing_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)


In [3]:
# Replace missing values with a single space so the TF-IDF vectorizers only
# ever receive strings (they reject NaN documents).
# The frame is reassigned rather than mutated with `inplace=True`.
testing_data = testing_data.fillna(' ')
# Apply the same clean-up to the training text. Only `comment_text` is touched,
# so the label columns stay exactly as loaded.
training_data = training_data.assign(
    comment_text=training_data['comment_text'].fillna(' ')
)

In [4]:
# Word-level TF-IDF: 1- to 3-word n-grams with English stop words removed,
# vocabulary capped at 20,000 features, float32 output to save memory.
V_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words='english', ngram_range=(1, 3), dtype=np.float32)
# Character-level TF-IDF: 3- to 6-character n-grams, capped at 40,000 features.
# `stop_words` is deliberately not passed here: scikit-learn only applies it
# when analyzer == 'word', so it had no effect on the character features
# (recent scikit-learn releases warn about the unused parameter).
V_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        ngram_range=(3, 6), dtype=np.float32)

In [5]:
# Learn each vocabulary on the training comments only, then reuse the fitted
# vectorizers to encode the test comments with the same feature columns.
train_vec, train_vec_ch = (
    vectorizer.fit_transform(training_data['comment_text'])
    for vectorizer in (V_word, V_char)
)
test_vec, test_vec_ch = (
    vectorizer.transform(testing_data['comment_text'])
    for vectorizer in (V_word, V_char)
)

In [6]:
# Put the word-level and character-level TF-IDF blocks side by side so each
# row holds both feature sets for one comment.
# CSR is requested explicitly: hstack's default output format is not fixed,
# whereas the later cells slice rows for cross-validation and fit linear
# models, both of which operate on CSR. The stored values are unchanged.
X = sparse.hstack([train_vec, train_vec_ch], format='csr')
X_test = sparse.hstack([test_vec, test_vec_ch], format='csr')

In [7]:
# Label columns to predict; a separate classifier is trained for each one.
target_col = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
y = training_data.loc[:, target_col]
# The per-vectorizer matrices were already combined into X / X_test, so drop
# these references to them.
del train_vec, train_vec_ch
del test_vec, test_vec_ch

In [8]:
# One column of test-set probabilities per label, filled in label order.
n_test_rows, n_labels = X_test.shape[0], y.shape[1]
prd = np.zeros((n_test_rows, n_labels))
scores = []
for i, col in enumerate(target_col):
    labels = y[col]

    # Fit on the full training matrix and keep the positive-class probability
    # for every test row.
    lr = LogisticRegression(C=2, random_state=i, class_weight='balanced')
    lr.fit(X, labels)
    positive_proba = lr.predict_proba(X_test)[:, 1]
    prd[:, i] = positive_proba

    # 3-fold ROC-AUC estimate for this label, averaged over the folds.
    # NOTE(review): the TF-IDF vocabularies were fitted on all training rows
    # before this split, so the folds are not fully independent.
    fold_aucs = cross_val_score(lr, X, labels, cv=3, scoring='roc_auc')
    cv_score = np.mean(fold_aucs)
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(col, cv_score))
    




CV score for class toxic is 0.975592585703
CV score for class severe_toxic is 0.984939778231
CV score for class obscene is 0.987940421369
CV score for class threat is 0.985969833657
CV score for class insult is 0.980048832962
CV score for class identity_hate is 0.978486019726


In [9]:
# Wrap the probability matrix in a frame whose columns are the label names.
prd_1 = pd.DataFrame(data=prd, columns=y.columns)
# Prepend the test-set identifiers; the two pieces are matched on row index.
id_column = testing_data['Id']
submit = pd.concat([id_column, prd_1], axis='columns')
# Write the submission file without the row index, then preview it.
submit.to_csv('toxic_lr.csv', index=False)
submit.head()

Unnamed: 0,Id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,149643,0.382367,0.008437,0.005932,0.004701,0.075779,0.033236
1,47534,0.002829,0.000658,0.001691,0.000131,0.002312,0.00216
2,85727,0.10354,0.024833,0.066729,0.000263,0.087957,0.002677
3,113289,0.001294,0.000284,0.001299,0.000181,0.000546,6.6e-05
4,74735,0.042102,0.001592,0.001836,0.000736,0.002011,0.002183
