In [1]:
import pandas as pd

# train Data
trainData = pd.read_csv("https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv")

# test Data
testData = pd.read_csv("https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv")

trainData.sample(frac=1).head(5)

Unnamed: 0,Content,Label
383,""" the end of the affair "" is a dark and moody...",pos
425,one year has passed since the last time we saw...,pos
736,"as the film opens up , expectant unwed mother ...",pos
1461,""" first rule of fight club is , don't talk ab...",neg
1295,""" there's nothing new under the sun "" is a ph...",neg


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

train_vectors = vectorizer.fit_transform(trainData['Content'])
test_vectors = vectorizer.transform(testData['Content'])

In [3]:
import time
from sklearn import svm
from sklearn.metrics import classification_report

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()

classifier_linear.fit(train_vectors, trainData['Label'])
t1 = time.time()

prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()

time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(testData['Label'], prediction_linear, output_dict=True)
print('positive: ', report['pos'])
print('negative: ', report['neg'])

Training time: 5.555349s; Prediction time: 0.536156s
positive:  {'precision': 0.9191919191919192, 'recall': 0.91, 'f1-score': 0.9145728643216081, 'support': 100}
negative:  {'precision': 0.9108910891089109, 'recall': 0.92, 'f1-score': 0.9154228855721394, 'support': 100}


In [4]:
review = """SUPERB, I AM IN LOVE IN THIS PHONE"""

review_vector = vectorizer.transform([review]) # vectorizing

print(classifier_linear.predict(review_vector))

['pos']


In [5]:
# -------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------

In [6]:
from numpy.random import RandomState

fileTest = pd.read_csv('train.txt', sep='\t')
fileTest.to_csv('train.csv', index=True)
print(len(fileTest.index))

rng = RandomState()
train = fileTest.sample(frac=0.9, random_state=rng)
test = fileTest.loc[~fileTest.index.isin(train.index)]

fileTest.head()

9999


Unnamed: 0,=VeryGood=,Stamp and Scrape,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,=Unsatisfactory=,Did not even tingle for ant aging effect,,,
1,=VeryGood=,"Well, I like it",,,
2,=Excellent=,Much better than what I was using.,,,
3,=Unsatisfactory=,Disappointed with Usage and Dosage,,,
4,=Poor=,Don't work?,,,


In [15]:
from numpy.random import RandomState
import pandas as pd

textFile = pd.read_csv('train.txt', sep='\t', usecols=[0,1], header=None, names=["Label", "Review"])

rng = RandomState()
train = textFile.sample(frac=0.9, random_state=rng)
test = textFile.loc[~textFile.index.isin(train.index)]

print(len(textFile.index))

textFile.head()

print(test['Label'])

10000
7            =Excellent=
23           =Excellent=
30      =Unsatisfactory=
61            =VeryGood=
65            =VeryGood=
              ...       
9945              =Good=
9953         =Excellent=
9961    =Unsatisfactory=
9966    =Unsatisfactory=
9971         =Excellent=
Name: Label, Length: 1000, dtype: object


In [10]:
# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

train_vectors = vectorizer.fit_transform(train['Review'])
test_vectors = vectorizer.transform(test['Review'])

  (1, 736)	1.0
  (2, 878)	0.5324049218213973
  (2, 477)	0.48124755059642865
  (2, 300)	0.3619145515741788
  (2, 208)	0.5949484445093968
  (3, 898)	0.3042967944074409
  (3, 859)	0.4789985093845005
  (3, 501)	0.24070654030600436
  (3, 436)	0.4668434021359808
  (3, 361)	0.3337522184676491
  (3, 68)	0.4248578082787837
  (3, 22)	0.3319439526023478
  (4, 671)	0.743943394075088
  (4, 274)	0.6682426403725209
  (5, 708)	0.9294201126143076
  (5, 300)	0.36902337902632665
  (6, 423)	0.8961605720928977
  (6, 37)	0.4437299054899617
  (7, 898)	0.37317645966020935
  (7, 895)	0.5546022716519308
  (7, 775)	0.4955521874762619
  (7, 509)	0.5546022716519308
  (8, 794)	0.3265165122250394
  (8, 620)	0.6303756972298689
  (8, 541)	0.385824224660968
  :	:
  (990, 899)	0.4011715291989211
  (990, 869)	0.3876785654852581
  (990, 731)	0.572043995315692
  (990, 158)	0.6012756451468428
  (991, 743)	0.8370580944268312
  (991, 335)	0.5471140160464928
  (992, 541)	0.34476213411921103
  (992, 515)	0.3242791001795409
  (9

In [9]:
# Perform classification with SVM, kernel=linear
#classifier_linear = svm.SVC(kernel='linear')
classifier_linear = svm.SVC(kernel='rbf')
#classifier_linear = svm.SVC(kernel='poly')
#classifier_linear = svm.SVC(kernel='sigmoid')
t0 = time.time()

classifier_linear.fit(train_vectors, train['Label'])
t1 = time.time()

prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()

time_linear_train = t1-t0
time_linear_predict = t2-t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
#print(prediction_linear)

count = 0
for p, t in zip(prediction_linear, test['Label']):
    if p == t:
        count = count + 1

print("Accuracy:",count / len(prediction_linear) * 100,"%")




report = classification_report(test['Label'], prediction_linear, output_dict=True)

#print(prediction_linear)
print('1: ', report['=Excellent='])
print('2: ', report['=VeryGood='])
print('3: ', report['=Good='])
print('4: ', report['=Unsatisfactory='])
print('5: ', report['=Poor='])

Training time: 3.653224s; Prediction time: 0.238998s
Accuracy: 46.300000000000004 %
1:  {'precision': 0.6086956521739131, 'recall': 0.5971563981042654, 'f1-score': 0.6028708133971292, 'support': 211}
2:  {'precision': 0.405, 'recall': 0.42857142857142855, 'f1-score': 0.416452442159383, 'support': 189}
3:  {'precision': 0.4772727272727273, 'recall': 0.4097560975609756, 'f1-score': 0.4409448818897638, 'support': 205}
4:  {'precision': 0.4473684210526316, 'recall': 0.3881278538812785, 'f1-score': 0.41564792176039117, 'support': 219}
5:  {'precision': 0.3832599118942731, 'recall': 0.4943181818181818, 'f1-score': 0.43176178660049624, 'support': 176}


In [22]:
review = "This was the best thing I bought"

review_vector = vectorizer.transform([review]) # vectorizing

print(classifier_linear.predict(review_vector))

['=Excellent=']
