- Day 44 : 21/03/30
- https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams

# Logistic regression with words and char n-grams

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [2]:
# Label columns of the training frame; each one is used as a separate
# binary target further down.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [3]:
# Read both competition files; every missing cell is replaced by a single
# space so no NaN values remain in either frame.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [4]:
# Raw comment strings of each split, plus their concatenation (train first,
# then test), which the vectorizers are fitted on.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [7]:
# Display the training comment stored under index label 0.
train_text.loc[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [8]:
# Word-level TF-IDF features: unigrams only, English stop words removed,
# vocabulary capped at 10,000 terms.
# NOTE: the vectorizer is fitted on all_text (train and test comments
# together), so test text contributes to the vocabulary and IDF weights.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
)
word_vectorizer.fit(all_text)
train_word_features, test_word_features = (
    word_vectorizer.transform(corpus) for corpus in (train_text, test_text)
)

In [10]:
# Character-level TF-IDF features: character n-grams of length 2 to 6,
# vocabulary capped at 50,000 n-grams.
# `stop_words` is intentionally not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it has no effect.
# NOTE: the vectorizer is fitted on all_text (train and test comments
# together), so test text contributes to the vocabulary and IDF weights.
char_vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char',
                                 ngram_range=(2,6), max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)



In [11]:
# Concatenate char and word TF-IDF blocks column-wise (char columns first).
# hstack returns COO by default, which does not support row indexing;
# requesting CSR here builds the row-sliceable format once instead of leaving
# the conversion to every downstream cross-validation / fit / predict call.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

In [12]:
# For each label: estimate ROC AUC with 3-fold cross-validation, then refit on
# the full training matrix and store the positive-class probability for the
# test rows in the submission frame.
scores = []
submission = pd.DataFrame.from_dict({'id':test['id']})
for class_name in class_names:
    train_target = train[class_name]
    # 'sag' is a stochastic solver; a fixed random_state makes the CV scores
    # and the submitted probabilities reproducible from run to run.
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=42)

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    # Column 1 of predict_proba is the probability of the positive class.
    submission[class_name] = classifier.predict_proba(test_features)[:,1]

# Unweighted mean of the per-label CV scores.
print('Total CV score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.9692181530018756
CV score for class severe_toxic is 0.9875919388189441
CV score for class obscene is 0.9838683748017404
CV score for class threat is 0.98337685453807
CV score for class insult is 0.9774237157135239
CV score for class identity_hate is 0.9739427492741198
Total CV score is 0.9792369643580456


In [13]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)