In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
import glob

Getting the data

In [3]:
pos_fl = glob.glob('review_polarity/txt_sentoken/pos/*txt')
neg_fl = glob.glob('review_polarity/txt_sentoken/neg/*txt')

In [4]:
print 'There are ' + str(len(pos_fl)) + ' files with positive examples and ' + str(len(neg_fl)) + ' files with negative examples'

There are 1000 files with positive examples and 1000 files with negative examples


Splitting data set in train, validation and test (80,10,10)

In [5]:
np.random.shuffle(pos_fl)
np.random.shuffle(neg_fl)

In [6]:
test_pos_files = pos_fl[:100]
test_neg_files = neg_fl[:100]
val_pos_files = pos_fl[100:200]
val_neg_files = neg_fl[100:200]
train_pos_files = pos_fl[200:300]
train_neg_files = neg_fl[200:300]

In [7]:
X_test = []
y_test = []
for file_name in test_pos_files:
    f = open(file_name)
    data = [line[:-1] for line in f]
    labels = [1 for i in range(len(data))]
    X_test = X_test + data
    y_test = y_test + labels
    f.close()
for file_name in test_neg_files:
    f = open(file_name)
    data = [line[:-1] for line in f]
    labels = [-1 for i in range(len(data))]
    X_test = X_test + data
    y_test = y_test + labels
    f.close()

X_test, y_test = shuffle(np.array(X_test), np.array(y_test), random_state=2016)

In [8]:
X_val = []
y_val = []
for file_name in val_pos_files:
    f = open(file_name)
    data = [line[:-1] for line in f]
    labels = [1 for i in range(len(data))]
    X_val = X_val + data
    y_val = y_val + labels
    f.close()
for file_name in val_neg_files:
    f = open(file_name)
    data = [line[:-1] for line in f]
    labels = [-1 for i in range(len(data))]
    X_val = X_val + data
    y_val = y_val + labels
    f.close()
X_val, y_val = shuffle(np.array(X_val), np.array(y_val), random_state=2016)

In [9]:
X_train = []
y_train = []
for file_name in train_pos_files:
    f = open(file_name)
    data = [line[:-1] for line in f]
    labels = [1 for i in range(len(data))]
    X_train = X_train + data
    y_train = y_train + labels
    f.close()
for file_name in train_neg_files:
    f = open(file_name)
    data = [line[:-1] for line in f]
    labels = [-1 for i in range(len(data))]
    X_train = X_train + data
    y_train = y_train + labels
    f.close()
X_train, y_train = shuffle(np.array(X_train), np.array(y_train), random_state=2016)

Transforming text to vectors

In [10]:
vectorizer = TfidfVectorizer(min_df=5,
                                 max_df = 0.8,
                                 sublinear_tf=True,
                                 use_idf=True)

In [11]:
V_train = vectorizer.fit_transform(X_train)
V_val = vectorizer.transform(X_val)
V_test = vectorizer.transform(X_test)

Exploring different configurations with validation and training

Starting with a rbf (radial basis function) kernel: gaussian on a absolute value of vector difference.

In [34]:
clf_rbf = svm.SVC()
clf_rbf.fit(V_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [35]:
p_rbf_val =clf_rbf.predict(V_val)

In [36]:
print classification_report(y_val, p_rbf_val)

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00      3020
          1       0.53      1.00      0.69      3433

avg / total       0.28      0.53      0.37      6453



Pretty bad! Predicting all negatives

Next, linear kernel

In [38]:
clf_linear = svm.SVC(kernel='linear')
clf_linear.fit(V_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [39]:
p_linear_val =clf_linear.predict(V_val)

In [40]:
print classification_report(y_val, p_linear_val)

             precision    recall  f1-score   support

         -1       0.57      0.59      0.58      3020
          1       0.62      0.60      0.61      3433

avg / total       0.60      0.60      0.60      6453



Better!

Once we are satisfied with the training, we predict and measure accuray on the test set

In [41]:
p_test =clf.predict(V_test)

In [42]:
print classification_report(y_test, p_test)

             precision    recall  f1-score   support

         -1       0.58      0.57      0.58      3278
          1       0.57      0.58      0.58      3193

avg / total       0.58      0.58      0.58      6471



Similar performance as before