In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB

import pandas
# Load the Yelp review data from a CSV path relative to the working directory.
dataset = pandas.read_csv('data/yelp.csv')
# Summary statistics of the numeric columns; as the last expression of the
# cell this is what gets displayed.
dataset.describe()


Unnamed: 0,stars,cool,useful,funny
count,10000.0,10000.0,10000.0,10000.0
mean,3.7775,0.8768,1.4093,0.7013
std,1.214636,2.067861,2.336647,1.907942
min,1.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,1.0,0.0
75%,5.0,1.0,2.0,1.0
max,5.0,77.0,76.0,57.0


In [2]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the legacy module only on installations too old to provide the new one.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out part of the reviews for evaluation: the review text is the input,
# the star rating is the target. `random_state` pins the shuffle so that a
# re-run produces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.text, dataset.stars, random_state=1)



In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features are used rather than raw term counts; the author's note is
# that TF-IDF works better with the SVMs than with naive Bayes.
# NOTE: despite its name, `cvect` is a TfidfVectorizer. The name is kept
# because other cells refer to it.
cvect = TfidfVectorizer(stop_words='english', max_features=2000)

# Learn the vocabulary and IDF weights from the training text only and
# vectorise it in a single pass (equivalent to fit() followed by transform()).
X = cvect.fit_transform(X_train)
y = y_train

nb = MultinomialNB()
nb.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [4]:
h = .02  # step size in the mesh -- NOTE(review): not referenced by any cell visible here; confirm before removing
C = 1.0  # SVM regularization parameter
# Support-vector classifier with a linear kernel, fitted on the training features.
svc = svm.SVC(kernel='linear', C=C).fit(X, y)

In [5]:
# Support-vector classifier with an RBF kernel (gamma=0.7), using the shared C.
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)

In [6]:
# Support-vector classifier with a degree-3 polynomial kernel, using the shared C.
# NOTE(review): in the recorded evaluation output this model assigns every
# test review to a single class; gamma and coef0 are left at their defaults
# here, so the kernel settings probably need tuning -- confirm.
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)

In [7]:
# Linear support-vector classifier via svm.LinearSVC, using the shared C.
lin_svc = svm.LinearSVC(C=C).fit(X, y)

In [8]:
# Linear support-vector regression on the star ratings with default settings.
# Its predictions are real-valued, so the evaluation loop rounds them before
# scoring.
svr = svm.LinearSVR().fit(X, y)

In [9]:
# Vectorise the held-out text with the vectoriser that was fitted on the
# training text (transform only; nothing is refitted on the test data).
X_test_dtm = cvect.transform(X_test)

In [10]:
# Display names for the fitted models, printed as headings by the evaluation
# loop. The order must match the order in which that loop iterates the models.
titles = ['naive bayes',
          'SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3) kernel',
          'LinearSVR']

In [11]:
from sklearn import metrics
import math
def accuracy_for_rating_prob(y_test, y_pred):
    """Score predictions so that near misses cost less than far misses.

    Each test sample is penalised by the squared distance between the
    confusion-matrix row (true class) and column (predicted class) it falls
    into. The mean penalty is divided by the number of classes and subtracted
    from 1, so a perfect prediction scores 1.0.

    Parameters
    ----------
    y_test : array-like
        True labels. Any sequence accepted by ``metrics.confusion_matrix``
        works; no ``.size`` attribute is required.
    y_pred : array-like
        Predicted labels, same length as ``y_test``.

    Returns
    -------
    float
        ``1 - mean_squared_class_distance / n_classes``. The value is not
        bounded below by 0: the largest possible penalty is
        ``(n_classes - 1) ** 2``, which exceeds ``n_classes`` as soon as
        there are more than two classes.

    Notes
    -----
    Distance is measured between positions in the confusion matrix. That
    equals the difference between rating values only when the labels that
    occur in ``y_test``/``y_pred`` are consecutive integers.
    """
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)

    # penalty[r, c] == (r - c) ** 2: big deviations are punished more.
    positions = np.arange(conf_matrix.shape[0])
    penalty = (positions[:, np.newaxis] - positions[np.newaxis, :]) ** 2
    deviation = int((penalty * conf_matrix).sum())

    # Every sample is counted exactly once in the matrix, so its total is the
    # sample count regardless of the container type of y_test.
    n_samples = int(conf_matrix.sum())
    deviation_mean = float(deviation) / float(n_samples)

    # sqrt(size) of the square matrix is the number of classes.
    return 1 - (deviation_mean / math.sqrt(conf_matrix.size))

In [12]:

# Evaluate every fitted model on the held-out reviews. Models are paired with
# their display names by position, so `classifiers` must follow the order of
# `titles`.
classifiers = (nb, svc, lin_svc, rbf_svc, poly_svc, svr)
for title, clf in zip(titles, classifiers):
    y_pred = clf.predict(X_test_dtm)
    # The SVR returns a float array; round it half-up to the nearest integer
    # and clamp the result to the valid star range 1..5.
    if clf is svr:
        y_pred = np.clip(np.floor(y_pred + 0.5), 1, 5).astype(int)
    print(title)
    print(metrics.accuracy_score(y_test, y_pred))
    print("rating problem accuracy is " + str(accuracy_for_rating_prob(y_test, y_pred)))
    print(metrics.confusion_matrix(y_test, y_pred))
    print('\n')

naive bayes
0.4632
rating problem accuracy is 0.70424
[[ 43  16   6  69  51]
 [ 16   6   6 173  33]
 [  4   0   9 295  57]
 [  3   0   2 630 249]
 [  2   0   0 360 470]]


SVC with linear kernel
0.52
rating problem accuracy is 0.77416
[[ 79  38  16  25  27]
 [ 44  70  51  54  15]
 [ 15  32  93 170  55]
 [ 13  15  61 557 238]
 [ 13  10  18 290 501]]


LinearSVC (linear kernel)
0.496
rating problem accuracy is 0.74768
[[ 80  40  17  20  28]
 [ 40  72  46  44  32]
 [ 21  33  93 149  69]
 [ 19  18  94 481 272]
 [ 16  12  21 269 514]]


SVC with RBF kernel
0.5244
rating problem accuracy is 0.7736
[[ 68  31  11  49  26]
 [ 29  49  40  95  21]
 [ 12  12  65 218  58]
 [  3   2  24 620 235]
 [  7   2   4 310 509]]


SVC with polynomial (degree 3) kernel
0.3536
rating problem accuracy is 0.69616
[[  0   0   0 185   0]
 [  0   0   0 234   0]
 [  0   0   0 365   0]
 [  0   0   0 884   0]
 [  0   0   0 832   0]]


LinearSVR
0.4476
rating problem accuracy is 0.81096
[[ 22  60  72  31   0]
 [  8  63 