In [23]:
TRAIN_FP = 'train.csv'
TEST_FP = 'test.csv'
MAX_NUM_WORDS = 1500

import keras
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import re

train_data, test_data = pd.read_csv(TRAIN_FP), pd.read_csv(TEST_FP)
'''
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text
train_data['comment_text'] = train_data['comment_text'].map(lambda com : clean_text(com))
'''
texts = train_data.values[:,1]
labels = np.array(train_data.values[:,2:], dtype = int)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from time import time

num_samples = len(texts)
tfidf_vectorizer = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
t0 = time()
train_tfidf_data = tfidf_vectorizer.fit_transform(texts[0:int(num_samples*0.7)])
test_tfidf_data = tfidf_vectorizer.transform(texts[int(num_samples*0.7):])
print("done in %0.3fs." % (time() - t0))

done in 58.797s.


In [12]:
tf_vectorizer = CountVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
t0 = time()
train_tf_data = tf_vectorizer.fit_transform(texts[0:int(num_samples*0.7)])
test_tf_data = tf_vectorizer.transform(texts[int(num_samples*0.7):])
print("done in %0.3fs." % (time() - t0))

done in 62.019s.


## Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

train_label = labels[0:int(num_samples*0.7)]
test_label = labels[int(num_samples*0.7):]

roc_pred = np.zeros((test_tfidf_data.shape[0],6))
y_pred = np.zeros((test_tfidf_data.shape[0],6))
parameters = {'C':[.01, .1, 1, 10, 100, 1000]}

for i in range(6):
    lr = LogisticRegression(class_weight = 'balanced')
    clf = GridSearchCV(lr, parameters)
    clf.fit(train_tfidf_data, train_label[:,i])
    # compute the training accuracy
    lr = LogisticRegression(C = clf.best_params_['C'],class_weight = 'balanced')
    lr.fit(train_tfidf_data, train_label[:,i])
    y_pred[:,i] = lr.predict(test_tfidf_data)
    print 'accuracy for label'+str(i)+'is:', accuracy_score(test_label[:,i], y_pred[:,i])
    roc_pred[:,i] = lr.predict_proba(test_tfidf_data)[:,1]

def compute_acc(y_pred, y_val):
    result = 0.0
    num_sample = y_pred.shape[0]
    for i in range(num_sample):
        if np.array_equal(y_pred[i], y_val[i]):
            result = result + 1
    return result/num_sample
        
    

print 'accuracy:',compute_acc(y_pred, test_label)
print 'overall Roc score is :', roc_auc_score(test_label, roc_pred)

 accuracy for label0is: 0.937061330214
accuracy for label1is: 0.979612299465
accuracy for label2is: 0.97347092246
accuracy for label3is: 0.995947526738
accuracy for label4is: 0.961564171123
accuracy for label5is: 0.980656751337
accuracy: 0.878425802139
overall Roc score is : 0.962313205572


## Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier

roc_pred = np.zeros((test_tfidf_data.shape[0],6))
y_pred = np.zeros((test_tfidf_data.shape[0],6))
parameters = {'C':[.01, .1, 1, 10, 100, 1000]}
train_label = labels[0:int(num_samples*0.7)]
test_label = labels[int(num_samples*0.7):]


for i in range(6):

    clf = RandomForestClassifier(
            max_depth=100, max_features=1000,
            min_samples_leaf=3, min_samples_split=10,
            n_estimators=10)
    
    clf.fit(train_tfidf_data, train_label[:,i])
    # compute the training accuracy
    #svc = SVC(C = clf.best_params_['C'], class_weight = 'balanced')
    #svc.fit(train_tfidf_data, train_label[:,i])
    y_pred[:,i] = clf.predict(test_tfidf_data)
    print 'accuracy for label'+str(i)+'is:', accuracy_score(test_label[:,i], y_pred[:,i])
    roc_pred[:,i] = clf.predict_proba(test_tfidf_data)[:,1]

print 'accuracy:',compute_acc(y_pred, test_label)
print 'overall Roc score is :', roc_auc_score(test_label, roc_pred)

accuracy for label0is: 0.952811664439
accuracy for label1is: 0.989847927807
accuracy for label2is: 0.980092747326
accuracy for label3is: 0.997284425134
accuracy for label4is: 0.969355782086
accuracy for label5is: 0.991477272727
accuracy: 0.914835394385
overall Roc score is : 0.934777300828


## LDA

* _**Using Term-Frequency Feature**_

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.decomposition import NMF, LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=2, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
texts = train_data.values[:,1]
labels = np.asarray(train_data.values[:,2],dtype='int')
train_label = labels[0:int(num_samples*0.7)]
test_label = labels[int(num_samples*0.7):]

t0 = time()
i = 0
while True:
    if i*2000+2000 > (num_samples*0.7):
        lda.partial_fit(train_tf_data[i*2000:])
        result = lda.transform(test_tf_data[0:1000])
        print accuracy_score(test_label[0:1000], np.asarray(result[:,0]<result[:,1],dtype = 'int'))
        break
    lda.partial_fit(train_tf_data[i*2000:i*2000+2000])
    result = lda.transform(test_tf_data[0:1000])
    #print accuracy_score(test_label[0:1000], np.asarray(result[:,0]<result[:,1],dtype = 'int'))
    i = i + 1
print("done in %0.3fs." % (time() - t0))

0.862
done in 19.169s.


In [14]:
result = lda.transform(test_tf_data)
print accuracy_score(test_label, np.asarray(result[:,0]<result[:,1],dtype = 'int'))

0.860544786096


* _**Using Term-Frequency-IDF Feature**_

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.decomposition import NMF, LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=2, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
texts = train_data.values[:,1]
labels = np.asarray(train_data.values[:,2],dtype='int')
train_label = labels[0:int(num_samples*0.7)]
test_label = labels[int(num_samples*0.7):]

t0 = time()
i = 0
while True:
    if i*2000+2000 > (num_samples*0.7):
        lda.partial_fit(train_tfidf_data[i*2000:])
        result = lda.transform(test_tfidf_data[0:1000])
        print 1 - accuracy_score(test_label[0:1000], np.asarray(result[:,0]>result[:,1],dtype = 'int'))
        break
    lda.partial_fit(train_tfidf_data[i*2000:i*2000+2000])
    result = 1 - lda.transform(test_tfidf_data[0:1000])
    #print accuracy_score(test_label[0:1000], np.asarray(result[:,0]>result[:,1],dtype = 'int'))
    i = i + 1
print("done in %0.3fs." % (time() - t0))

0.573
done in 17.600s.


In [16]:
result = lda.transform(test_tf_data)
print accuracy_score(test_label, np.asarray(result[:,0]<result[:,1],dtype = 'int'))

0.540232286096


## XGBoost

In [32]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

roc_pred = np.zeros((test_tfidf_data.shape[0],6))
y_pred = np.zeros((test_tfidf_data.shape[0],6))
parameters = {'C':[.01, .1, 1, 10, 100, 1000]}
train_label = labels[0:int(num_samples*0.7)]
test_label = labels[int(num_samples*0.7):]

for i in range(6):
    xgb_params = {'eta': 0.3, 
                  'max_depth': 5, 
                  'subsample': 0.8, 
                  'colsample_bytree': 0.8, 
                  'objective': 'binary:logistic', 
                  'eval_metric': 'auc', 
                  'seed': 23
                 }

    d_train = xgb.DMatrix(train_tfidf_data, train_label[:,i])
    d_valid = xgb.DMatrix(test_tfidf_data, test_label[:,i])

    watchlist = [(d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 200, watchlist, verbose_eval=False, early_stopping_rounds=30)
    roc_pred[:,i] = model.predict(d_valid)

print 'Overall accuracy:',compute_acc(roc_pred>0.5, test_label)
print 'Overall Roc score is :', roc_auc_score(test_label, roc_pred)

Overall accuracy: 0.919222092246
overall Roc score is : 0.966978861974
