## Using Enums for sentiment detection

In [89]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"
    
class Review:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <=2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

class ReviewContainer:
    def __init__(self,reviews):
        self.reviews=reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
        
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE,self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE,self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

# Data Import and listing

In [1]:
!git init

Initialized empty Git repository in /mnt/.git/


In [2]:
!git clone https://github.com/KeithGalli/sklearn.git

Cloning into 'sklearn'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 45 (delta 8), reused 38 (delta 3), pack-reused 0[K
Unpacking objects: 100% (45/45), done.


In [3]:
import json

In [77]:
file_name = '/mnt/sklearn/data/sentiment/Books_small_10000.json'
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))

len(reviews)


10000

## Prep data

In [90]:
from sklearn.model_selection import train_test_split

training,test=train_test_split (reviews,test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)
len(cont.reviews)

872

In [92]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

### Bags of words

In [93]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

In [111]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

### Classifcation

#### Linear SVM

In [112]:
from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(train_x_vectors,train_y)
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### decision Tree

In [113]:
from sklearn.tree import DecisionTreeClassifier

clf_dec =  DecisionTreeClassifier()
clf_dec.fit(train_x_vectors,train_y)
clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### naive Bayers

In [114]:
from sklearn.naive_bayes import GaussianNB

clf_gnb =  GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(),train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

### Logistic Regressin

In [115]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors,train_y)
clf_log.predict(test_x_vectors[0])



array(['POSITIVE'], dtype='<U8')

## Evaluation

In [116]:
print("Mean Accuracy SVM Model:",clf_svm.score(test_x_vectors,test_y))
print("Mean Accuracy Decision Tree Model:",clf_dec.score(test_x_vectors,test_y))
print("Mean Accuracy Naive Bayes Model:",clf_gnb.score(test_x_vectors.toarray(),test_y))
print("Mean Accuracy Logistic Regression Model:",clf_log.score(test_x_vectors,test_y))

Mean Accuracy SVM Model: 0.8076923076923077
Mean Accuracy Decision Tree Model: 0.6514423076923077
Mean Accuracy Naive Bayes Model: 0.6610576923076923
Mean Accuracy Logistic Regression Model: 0.8028846153846154


In [117]:
from  sklearn.metrics import f1_score

print("Mean F1 Score SVM Model:",f1_score(test_y,clf_svm.predict(test_x_vectors),average=None, labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE]))
print("Mean F1 Score Decision tree Model:",f1_score(test_y,clf_dec.predict(test_x_vectors),average=None, labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE]))
print("Mean F1 Score Naive Bayes Model:",f1_score(test_y,clf_gnb.predict(test_x_vectors.toarray()),average=None, labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE]))
print("Mean F1 Score Logistic Regression Model:",f1_score(test_y,clf_log.predict(test_x_vectors),average=None, labels=[
    Sentiment.NEGATIVE,Sentiment.POSITIVE]))


Mean F1 Score SVM Model: [0.80952381 0.80582524]
Mean F1 Score Decision tree Model: [0.66042155 0.64197531]
Mean F1 Score Naive Bayes Model: [0.66508314 0.65693431]
Mean F1 Score Logistic Regression Model: [0.8047619  0.80097087]


## Tuning our Model ( with Grid Search)

In [121]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear','rbf'),'C':(1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc,parameters,cv=5)
clf.fit(train_x_vectors,train_y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ('linear', 'rbf'), 'C': (1, 4, 8, 16, 32)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [122]:
print("Mean Accuracy SVM Model:",clf_svm.score(test_x_vectors,test_y))
print("Mean Accuracy Tuned SVM Model:",clf.score(test_x_vectors,test_y))

Mean Accuracy SVM Model: 0.8076923076923077
Mean Accuracy Tuned SVM Model: 0.8076923076923077


## Saving the Model

In [None]:
# removing complex than tbfif
# remving punctuations

#### Save Model

In [124]:
import pickle

with open ('/mnt/sklearn/Sentiment_classifier.pkl','wb') as f:
    pickle.dump(clf,f)

#### Load Model

In [128]:
with open ('/mnt/sklearn/Sentiment_classifier.pkl','rb') as f:
    loaded_clf = pickle.load(f)s

SyntaxError: invalid syntax (<ipython-input-128-7943e3ff96e6>, line 2)