In [1]:
#Import packages
import time
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import f1_score, make_scorer, roc_auc_score
start_time=time.time()

class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

#Read the training and testing data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#Obtain the text and labels for training
train_text = train['comment_text']
test_text = test['comment_text']
#Create the corpus constructed from both the training text and testing text.
all_text = pd.concat([train_text, test_text])

#Use the word unigram as features.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

#Use the character n-grams as features.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

#Stack the features horizontally
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

#Define the range of the parameters to be tuned
tuned_parameters = {'penalty': ['l2'], 
                    'C': [0.01, 0.1, 1, 10, 100, 1000],
                    'solver': ['newton-cg', 'lbfgs', 'sag']}

scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})

end_time=time.time()
print("total time till check point 1",end_time-start_time)

total time till check point 1 944.5621819496155


In [2]:
#Train the models using GridSearchCV approach on each of the classes
for class_name in class_names:
    train_target = np.array(train[class_name])
    grid = GridSearchCV(LogisticRegression(), tuned_parameters, cv=5, scoring='roc_auc')
    grid.fit(train_features, train_target)
    
    print('Training LogisticRegression Classifier for {} is complete!!'.format(class_name))
    print(grid.best_estimator_)
    scores.append(grid.best_score_)
    print('CV score for class {} is {}'.format(class_name, grid.best_score_))
    submission[class_name] = grid.best_estimator_.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

end_time=time.time()
print("total time till check point 2",end_time-start_time)

submission.to_csv('submission.csv', index=False)

end_time=time.time()
print("total time till check point 3",end_time-start_time)



Training LogisticRegression Classifier for toxic is complete!!
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)
CV score for class toxic is 0.979040432062354




Training LogisticRegression Classifier for severe_toxic is complete!!
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)
CV score for class severe_toxic is 0.9883871292325412




Training LogisticRegression Classifier for obscene is complete!!
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)
CV score for class obscene is 0.9904676188834843




Training LogisticRegression Classifier for threat is complete!!
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)
CV score for class threat is 0.9903387676542358




Training LogisticRegression Classifier for insult is complete!!
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)
CV score for class insult is 0.9829860024616485




Training LogisticRegression Classifier for identity_hate is complete!!
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)
CV score for class identity_hate is 0.9828253664940856
Total CV score is 0.9856742194647249
total time till check point 2 50702.81738996506
total time till check point 3 50705.46449351311
