In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Read the labelled text data; the first CSV column is used as the row index.
df = pd.read_csv('out.csv', index_col=0)

# Every print below passes a single argument in parentheses, which produces
# the same text on Python 2 and Python 3 (the bare `print x, y` statement
# form is a SyntaxError on Python 3).
print(df.head())
print('\n')
print('Length of Data: %s Files' % len(df))
print('\n')
print('Number of Positive & Negative Labelled Data:')
print(df.Polarity.value_counts())

  Polarity                                               Text
0      pos  ["films adapted comic books plenty success whe...
1      pos  starters created alan moore eddie campbell bro...
2      pos  say moore campbell thoroughly researched subje...
3      pos  book graphic novel pages long includes nearly ...
4      pos                        words dismiss film source .


Length of Data: 59836 Files


Number of Positive & Negative Labelled Data:
pos    37484
neg    22352
Name: Polarity, dtype: int64


In [3]:
# Encode the text labels as integers: 'pos' -> 1, 'neg' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: with
# only the string keys, a second execution would map the already-encoded
# 1/0 values to NaN. Any other value still becomes NaN, as before.
POLARITY_CODES = {'pos': 1, 'neg': 0, 1: 1, 0: 0}
df['Polarity'] = df['Polarity'].map(POLARITY_CODES)

In [4]:
# Split the data into train and test sets.
# 'Text' is the independent variable and 'Polarity' is the dependent variable.
# 25% of the rows are held out for testing; random_state fixes the shuffle so
# the split is the same on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    df.Text, df.Polarity, test_size=0.25, random_state=10)

# Single-argument print(...) gives the same output on Python 2 and Python 3.
for label, part in [('X Training:', X_train), ('X Testing:', X_test),
                    ('Y Training:', Y_train), ('Y Testing:', Y_test)]:
    print('%s %s' % (label, part.shape))

X Training: (44877L,)
X Testing: (14959L,)
Y Training: (44877L,)
Y Testing: (14959L,)


In [9]:
# Build TF-IDF (Term Frequency - Inverse Document Frequency) document-term
# matrices for the train and test texts.
# 'TfidfVectorizer' combines the functionality of 'CountVectorizer' and
# 'TfidfTransformer'.

from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level unigrams only.
cv = TfidfVectorizer(analyzer=u'word', ngram_range=(1, 1))

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are transformed with that already-fitted vectorizer.
X_train_dtm = cv.fit_transform(X_train)
X_test_dtm = cv.transform(X_test)

<h1>Creating Classification Models</h1>

<h3>Multinomial Naive Bayes</h3>

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, fitted on the
# training TF-IDF matrix.
mnb = MultinomialNB()
mnb.fit(X_train_dtm, Y_train)

# Predicted labels for the test documents.
mnb_predict = mnb.predict(X_test_dtm)

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('Accuracy: %s' % mnb.score(X_test_dtm, Y_test))

Accuracy: 0.776923591149


<h3>Logistic Regression</h3>

In [11]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters, fitted on the training
# TF-IDF matrix.
LR = LogisticRegression()
LR.fit(X_train_dtm, Y_train)

# Predicted labels for the test documents.
LR_predict = LR.predict(X_test_dtm)

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('Accuracy: %s' % LR.score(X_test_dtm, Y_test))

Accuracy: 0.827127481784


<h3>Stochastic Gradient Descent (SGD) Classifier</h3>

In [12]:
from sklearn.linear_model import SGDClassifier

# SGD training is stochastic, so an unseeded classifier can report a different
# accuracy on every run. Fix the seed (same value as the train/test split) so
# the result is reproducible.
sgd = SGDClassifier(random_state=10)

# The value returned by fit() is kept under its own name, as in the original cell.
sgd_model = sgd.fit(X_train_dtm, Y_train)

# Predicted labels for the test documents.
sgd_predict = sgd.predict(X_test_dtm)

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('Accuracy: %s' % sgd.score(X_test_dtm, Y_test))

Accuracy: 0.795440871716


<h3>Bernoulli Naive Bayes</h3>

In [13]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes with default hyper-parameters.
# NOTE(review): BernoulliNB binarizes its input (default threshold 0.0 per the
# scikit-learn docs), so the TF-IDF weights presumably act as word
# presence/absence flags here - confirm this is intended.
bnb = BernoulliNB()

# The value returned by fit() is kept under its own name, as in the original cell.
bnb_model = bnb.fit(X_train_dtm, Y_train)

# Predicted labels for the test documents.
bnb_predict = bnb.predict(X_test_dtm)

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('Accuracy: %s' % bnb.score(X_test_dtm, Y_test))

Accuracy: 0.845043117855


<h3>Passive Aggressive Classifier</h3>

In [14]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Training is stochastic, so an unseeded classifier can report a different
# accuracy on every run. Fix the seed (same value as the train/test split) so
# the result is reproducible.
pac = PassiveAggressiveClassifier(random_state=10)

# The value returned by fit() is kept under its own name, as in the original cell.
pac_model = pac.fit(X_train_dtm, Y_train)

# Predicted labels for the test documents.
pac_predict = pac.predict(X_test_dtm)

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('Accuracy: %s' % pac.score(X_test_dtm, Y_test))

Accuracy: 0.855605321211


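<h3>Voting Classifier</h3>
<i>A Voting Classifier combines multiple classification models into a single ensemble, so they can be fitted and used for prediction together without having to run each of them individually. With hard voting, the predicted label is the majority vote of the individual models.</i>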

In [15]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the five classifiers built in the cells above.
vclf = VotingClassifier(
    estimators=[
        ('lr', LR),
        ('MultinomialNB', mnb),
        ('SGDClassifier', sgd),
        ('BernoulliNB', bnb),
        ('PassiveAggressiveClassifier', pac),
    ],
    voting='hard',
)

vclf.fit(X_train_dtm, Y_train)

# Keep the ensemble's predictions, as every other model cell does; the
# original call discarded them.
vclf_predict = vclf.predict(X_test_dtm)

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('Accuracy: %s' % vclf.score(X_test_dtm, Y_test))

Accuracy: 0.834213516946
