In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from datetime import datetime
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.svm import LinearSVC
from collections import Counter
from sklearn.svm import SVC
from sklearn import svm
from sklearn.cross_validation import cross_val_predict

from sklearn import cross_validation
from sklearn.model_selection import cross_val_score



# Read 100000 random sample

In [3]:
data = pd.read_csv('video_products_100000_sample.csv',
                   encoding='utf-8',  
                    engine='python',
                   memory_map=True,
                   error_bad_lines=False,
                    na_values='NaN')

In [None]:
data.head()

In [33]:
X=data['review_body'].values

In [34]:
y=data['star_rating']

# TfidfVectorizer: ngram_range=(1,2),stop_words='english'

In [35]:
#TfidfVectorizer
t1 = datetime.now()
vect=TfidfVectorizer(ngram_range=(1,2),stop_words='english')
train_dtm = vect.fit_transform(X)
print(train_dtm.shape)
print(datetime.now() - t1)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(100000, 3429605)
0:00:42.242206


# Split the original data sample

In [36]:
# Split the original data sample
X_train, X_test, y_train, y_test = train_test_split(train_dtm, y ,
                                                    test_size=0.3, random_state=100)

In [37]:
X_train

<70000x3429605 sparse matrix of type '<class 'numpy.float64'>'
	with 7685159 stored elements in Compressed Sparse Row format>

# Resampling on training data

In [38]:
# Perform SMOTE on training set only
t1 = datetime.now()
print('Original dataset shape {}'.format(Counter(y_train)))
sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_sample(X_train, y_train) 
print('Resampled dataset shape {}'.format(Counter(y_res)))
print(datetime.now() - t1)

Original dataset shape Counter({5: 41824, 4: 13372, 3: 6452, 1: 4959, 2: 3393})
Resampled dataset shape Counter({5: 41824, 2: 41824, 1: 41824, 4: 41824, 3: 41824})
0:01:29.649104


# Test on original imbalanced data sample

# Test on balanced data sample

# 10-fold cross validation on resample data

In [42]:
# Naive Bayes 10-fold cross validation 
# balanced
t1 = datetime.now()
nb = MultinomialNB()
nb = nb.fit(X_res, y_res)
y_pred_class = cross_validation.cross_val_predict(nb,X_test,y_test,cv=10)

print("10-fold cross validation average accuracy: %.3f" %accuracy_score(y_test,y_pred_class))
print(classification_report(y_test,y_pred_class))
print(confusion_matrix(y_test,y_pred_class))

print(datetime.now() - t1)

10-fold cross validation average accuracy: 0.596
             precision    recall  f1-score   support

          1       0.00      0.00      0.00      2157
          2       0.00      0.00      0.00      1487
          3       0.00      0.00      0.00      2723
          4       0.00      0.00      0.00      5739
          5       0.60      1.00      0.75     17894

avg / total       0.36      0.60      0.45     30000

[[    0     0     0     0  2157]
 [    0     0     0     0  1487]
 [    0     0     0     0  2723]
 [    0     0     0     0  5739]
 [    0     0     0     0 17894]]
0:00:11.794410


  'precision', 'predicted', average, warn_for)


In [40]:
#LinearSVC 10-fold cross validation 
# balanced
t1 = datetime.now()
lin_clf = LinearSVC()
clf=lin_clf.fit(X_res, y_res)
y_pred_class = cross_validation.cross_val_predict(clf,X_test,y_test,cv=10)

print("10-fold cross validation average accuracy: %.3f" %accuracy_score(y_test,y_pred_class))
print(classification_report(y_test,y_pred_class))
print(confusion_matrix(y_test,y_pred_class))

print(datetime.now() - t1)

10-fold cross validation average accuracy: 0.638
             precision    recall  f1-score   support

          1       0.67      0.42      0.51      2157
          2       0.40      0.03      0.06      1487
          3       0.36      0.06      0.11      2723
          4       0.35      0.12      0.18      5739
          5       0.67      0.97      0.79     17894

avg / total       0.56      0.64      0.55     30000

[[  900    39    58   116  1044]
 [  237    46   115   220   869]
 [   99    22   176   522  1904]
 [   39     2   101   710  4887]
 [   68     6    45   474 17301]]
0:03:18.053622


In [41]:
#LogisticRegression 10-fold cross validation 
# balanced
t1 = datetime.now()
logreg = LogisticRegression()

logreg = logreg.fit(X_res, y_res)

y_pred_class = cross_validation.cross_val_predict(logreg,X_test,y_test,cv=10)

print("10-fold cross validation average accuracy: %.3f" %accuracy_score(y_test,y_pred_class))
print(classification_report(y_test,y_pred_class))
print(confusion_matrix(y_test,y_pred_class))

print(datetime.now() - t1)

10-fold cross validation average accuracy: 0.611
             precision    recall  f1-score   support

          1       0.80      0.15      0.25      2157
          2       0.00      0.00      0.00      1487
          3       0.36      0.01      0.02      2723
          4       0.32      0.04      0.07      5739
          5       0.62      0.99      0.76     17894

avg / total       0.52      0.61      0.49     30000

[[  327     0     9    45  1776]
 [   48     0    13   109  1317]
 [   18     0    21   189  2495]
 [    6     0     7   219  5507]
 [   11     0     8   116 17759]]
0:09:34.857134


  'precision', 'predicted', average, warn_for)
