In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv("/Users/terminus/Github/Springboard/Sample Data/labeledTrainData.tsv", delimiter="\t")
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
print("Samples per class: {}".format(np.bincount(data.sentiment)))

Samples per class: [12500 12500]


In [4]:
def simple_split(data,y,length,split_mark=0.7):
    if split_mark > 0. and split_mark < 1.0:
        n = int(split_mark*length)
    else:
        n = int(split_mark)
    x_train = data[:n].copy()
    x_test = data[n:].copy()
    y_train = y[:n].copy()
    y_test = y[n:].copy()
    return x_train,x_test,y_train,y_test

In [5]:
vectorizer = CountVectorizer()

In [6]:
x_train,x_test,y_train,y_test = simple_split(data.review,data.sentiment,len(data))
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
print(x_train)

(17500,) (7500,) (17500,) (7500,)


In [7]:
print("Samples per class: {}".format(np.bincount(y_train)))
print("Samples per class: {}".format(np.bincount(y_test)))

Samples per class: [8761 8739]
Samples per class: [3739 3761]


In [8]:
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.fit_transform(x_test)

In [9]:
feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20: {}".format(feature_names[:20]))
print("Features 19500 to 19530: {}".format(feature_names[19500:19530]))
print("Every 2000th Feature: {}".format(feature_names[::2000]))


Number of features: 45538
First 20: ['00', '000', '00001', '001', '007', '0080', '00am', '00s', '01', '02', '020410', '03', '04', '05', '06', '07', '08', '089', '09', '0ne']
Features 19500 to 19530: ['houghton', 'hound', 'hounded', 'hounding', 'hounds', 'hour', 'hourglass', 'hourly', 'hours', 'house', 'houseboat', 'housebound', 'housecleaning', 'housed', 'household', 'households', 'housekeeper', 'housekeepers', 'housekeeping', 'housemaid', 'houseman', 'housemann', 'housemates', 'houses', 'housesitter', 'housewife', 'housewives', 'housework', 'housing', 'houston']
Every 2000th Feature: ['00', 'ands', 'beija', 'byron', 'collaborates', 'damsels', 'donnybrook', 'exams', 'freight', 'hades', 'imaginative', 'kaminsky', 'lord', 'minette', 'nun', 'perverse', 'purses', 'riget3', 'sewer', 'splices', 'tantric', 'ugh', 'warriors']


In [10]:
vectorizer.vocabulary_

{'james': 21424,
 'bishop': 4458,
 'matt': 25164,
 'stasi': 38392,
 'goes': 17161,
 'to': 40949,
 'mental': 25620,
 'illness': 19962,
 'facility': 14433,
 'for': 15651,
 'medical': 25451,
 'residence': 33571,
 'assignment': 2777,
 'with': 44708,
 'dr': 12182,
 'mccort': 25283,
 'bruce': 5594,
 'paynes': 29610,
 'there': 40549,
 'he': 18505,
 'realizes': 32689,
 'that': 40480,
 'many': 24809,
 'interns': 20953,
 'are': 2472,
 'being': 4002,
 'killed': 22324,
 'by': 5988,
 'the': 40489,
 'ripper': 34074,
 'who': 44429,
 'takes': 39897,
 'their': 40505,
 'souls': 37672,
 'devil': 11049,
 'in': 20216,
 'cult': 9727,
 'promoted': 31586,
 'this': 40625,
 'story': 38708,
 'is': 21248,
 'so': 37408,
 'absurd': 738,
 'and': 1969,
 'imbecile': 20013,
 'it': 21304,
 'impossible': 20159,
 'write': 44999,
 'summary': 39256,
 'dialogs': 11122,
 'ridiculous': 33968,
 'specially': 37826,
 'when': 44334,
 'character': 6971,
 'of': 28262,
 'helen': 18685,
 'blonde': 4701,
 'fiancé': 14963,
 'arrives': 2

In [11]:
i = 45000
j = 10
words = vectorizer.get_feature_names()[i:i+10]
pd.DataFrame(x_train[j:j+7,i:i+10].todense(), columns=words)

Unnamed: 0,writer,writers,writes,writeup,writhe,writhed,writhing,writing,writings,written
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,0,0,0
4,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0


In [12]:
scores = cross_val_score(LogisticRegression(), x_train, y_train, cv=5)
print("MEan CV acc: {:.2f}".format(np.mean(scores)))



MEan CV acc: 0.88


In [15]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(x_test, y_test)))

Training set score: 0.999


ValueError: X has 45538 features per sample; expecting 65005

In [None]:
pred_logreg = logreg.predict(x_test)
confusion = confusion_matrix(y_test, pred_logreg)
print("confusion matrix: \n {}".format(confusion))

In [None]:
nb = MultinomialNB()
nb.fit(x_train, y_train)
print("Training set score: {:.3f}".format(nb.score(x_train, y_train)))
print("Test set score: {:.3f}".format(nb.score(x_test, y_test)))

In [None]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
print("Training set score: {:.3f}".format(rf.score(x_train, y_train)))
print("Test set score: {:.3f}".format(rf.score(x_test, y_test)))

In [16]:
sample = "We got this GPS for my husband who is an (OTR) over the road trucker.  Very Impressed with the shipping time, it arrived a few days earlier than expected...  within a week of use however it started freezing up... could of just been a glitch in that unit.  Worked great when it worked!  Will work great for the normal person as well but does have the \"trucker\" option. (the big truck routes - tells you when a scale is coming up ect...)  Love the bigger screen, the ease of use, the ease of putting addresses into memory.  Nothing really bad to say about the unit with the exception of it freezing which is probably one in a million and that's just my luck.  I contacted the seller and within minutes of my email I received a email back with instructions for an exchange! VERY impressed all the way around!"

print(logreg.predict(vectorizer.transform([sample]))[0])
print(rf.predict(vectorizer.transform([sample]))[0])
print(nb.predict(vectorizer.transform([sample]))[0])

ValueError: X has 45538 features per sample; expecting 65005