In [1]:
import json
import pandas as pd
import random

In [2]:
class Sentiment:
    Positive = 'POSITIVE'
    Negative = 'NEGATIVE'

class  Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment =self.get_sentiment()


    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.Negative
        else:
            return Sentiment.Positive


class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]


    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.Negative, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.Positive, self.reviews))
        positive_less = positive[:len(negative)]
        self.reviews = negative + positive_less
        random.shuffle(self.reviews)



In [3]:
file_name = "C:/Users/Hp Pc/Downloads/Books_small_10000.json"


reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
    

In [4]:
reviews[0].text

"I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with."

PREP DATA

In [5]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

In [6]:
train, test = train_test_split(reviews, test_size=0.3, random_state=42)

train_cont = ReviewContainer(train)
test_cont = ReviewContainer(test)


In [7]:
train_cont

<__main__.ReviewContainer at 0x1e4a5cd8450>

In [8]:
train_cont.evenly_distribute()
train_x = train_cont.get_text()
train_y = train_cont.get_sentiment()

test_cont.evenly_distribute()
test_x = test_cont.get_text()
test_y = test_cont.get_sentiment()

In [9]:
pd.value_counts(test_y)

  pd.value_counts(test_y)
  pd.value_counts(test_y)


POSITIVE    183
NEGATIVE    183
Name: count, dtype: int64

In [10]:
pd.value_counts(train_y)
#the positive and negative results are greatly not balance

  pd.value_counts(train_y)
  pd.value_counts(train_y)


POSITIVE    461
NEGATIVE    461
Name: count, dtype: int64

In [11]:
#Bag of Words Tokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [12]:
vectorizer = TfidfVectorizer(binary=True, ngram_range=(1,2))

train_vector = vectorizer.fit_transform(train_x)
test_vector = vectorizer.transform(test_x)

CLASSIFICATION

In [13]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [14]:
model_params = {
    'svm': {
        'model' : LinearSVC(random_state=42, dual=False),
        'params': {
           'penalty': ['l1', 'l2'],
            'loss': ['squared_hinge'],
            'C': [1.0, 3.0, 5.0]
        }
    },

    'rf_clf': {

        'model': RandomForestClassifier(random_state=42),
        'params': {
            "n_estimators" : [50, 100, 200, 300],
            "max_depth": [5,10,15,20],
            "max_leaf_nodes": [2,5,7],
            "min_samples_leaf": [2,4,7,10]
        }
    } 
}

In [15]:
scores = []

for model_name, mp in model_params.items():
    clf = RandomizedSearchCV(mp['model'], mp['params'], cv=5, random_state=42, n_iter=3, error_score='raise')
    clf.fit(train_vector.toarray(), train_y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

In [16]:
scores

[{'model': 'svm',
  'best_score': 0.8329847238542891,
  'best_params': {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 5.0}},
 {'model': 'rf_clf',
  'best_score': 0.715834312573443,
  'best_params': {'n_estimators': 50,
   'min_samples_leaf': 7,
   'max_leaf_nodes': 7,
   'max_depth': 15}}]

In [17]:
rf_clf = RandomForestClassifier(min_samples_leaf=10, max_leaf_nodes =7, max_depth=5, random_state=42 )

svc_clf = LinearSVC()

In [18]:
rf_clf.fit(train_vector, train_y)

In [19]:
svc_clf.fit(train_vector,train_y)

In [20]:
print(rf_clf.score(test_vector, test_y))     #RandomForest Accuracy on test sets

print(svc_clf.score(test_vector,test_y))

0.7459016393442623
0.8306010928961749


In [21]:
rf_f1 = f1_score(test_y, rf_clf.predict(test_vector), labels = [Sentiment.Positive, Sentiment.Negative], average=None)
svc_f1 = f1_score(test_y, svc_clf.predict(test_vector), average=None, labels = [Sentiment.Positive, Sentiment.Negative])


print(f"Random Forest f1_score:{rf_f1}")
print(f"Linear SVC f1_score: {svc_f1}")

Random Forest f1_score:[0.74520548 0.74659401]
Linear SVC f1_score: [0.82681564 0.8342246 ]


QUALITATIVE TEST

In [22]:
test_words = ["The books weren't so great", 'the novel was educational', 'the story of lagod taught me alot', "the novel wasn't so bad"]

word_vector = vectorizer.transform(test_words)

rf_clf.predict(word_vector)

array(['POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

In [23]:
svc_clf.predict(word_vector)

#both model are not able to recognize abridged words = weren't and wasn't

array(['POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

In [24]:
test_words_2 = ['Do not buy this book', ' A total waste of time', 'a horrible story line']
word_2_vector = vectorizer.transform(test_words_2)

svc_clf.predict(word_2_vector)

array(['NEGATIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

In [25]:
rf_clf.predict(word_2_vector)

array(['NEGATIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

WORD VECTORS

using spaCy lib to add more meaning and understanding to the texts

In [26]:
import spacy

nlp = spacy.load("en_core_web_md")

In [36]:
doc = [nlp(text) for text in train_x]
#spacy tokenization

In [38]:
print(doc[0].vector)

[-1.9769517e+00 -3.7911931e-01 -1.4299647e+00 -6.9928634e-01
  3.7751863e+00  9.5846702e-04  1.0943570e+00  3.8990650e+00
 -6.4050573e-01 -3.3185150e-02  5.2561693e+00  1.3900437e+00
 -3.1311791e+00  7.7817738e-01  1.4483868e+00  8.5233271e-01
  1.1721945e+00  2.6502523e-01 -9.8066849e-01 -1.3427315e+00
  1.8632922e+00 -2.5373206e-01 -1.3582107e+00 -2.4934994e-01
 -5.2286011e-01 -1.5173619e+00 -2.5139141e+00 -7.6657653e-01
 -2.4675578e-01  1.3773446e+00 -9.1742892e-03 -9.3250585e-01
 -7.6963544e-01 -1.5242305e+00 -2.8177266e+00 -6.5145808e-01
 -6.1500531e-01  8.9944804e-01  1.3936496e+00 -4.5966458e-02
  4.0933782e-01  6.6783255e-01 -3.1039110e-01 -5.2270925e-01
 -5.7781345e-01  1.6231258e+00  2.6500836e-01 -1.8500817e+00
 -5.4270148e-01  7.9655826e-01 -1.1330127e+00  1.4959468e+00
  1.6185006e-01 -3.8872788e+00 -1.8941984e-01  2.7217975e-01
 -3.9780352e-01  1.2766662e+00  1.0161492e+00 -1.3207974e+00
  3.5519412e-01 -1.4050127e+00 -5.5476668e-04 -1.1443893e+00
  1.0989255e+00  1.95822

In [39]:
train_wv = [x.vector for x in doc]

In [40]:
#using the better model from above

svc_clf.fit(train_wv, train_y)

In [42]:
#Testing
doc = [nlp(text) for text in test_words]
test_wv = [x.vector for x in doc]


print(test_words)
svc_clf.predict(test_wv) #work on abridged and biagram words

["The books weren't so great", 'the novel was educational', 'the story of lagod taught me alot', "the novel wasn't so bad"]


array(['POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

In [43]:
doc = [nlp(text) for text in test_words_2]
test_wv2 = [x.vector for x in doc]

print(test_words_2)
svc_clf.predict(test_wv2)

['Do not buy this book', ' A total waste of time', 'a horrible story line']


array(['NEGATIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

SAVING THE MODEL

In [28]:
import pickle

In [29]:
with open('C:/Users/Hp Pc/natural_language_processing/svc_model.pkl', 'wb') as f:
    pickle.dump(svc_clf, f)

In [30]:
with open('C:/Users/Hp Pc/natural_language_processing/svc_model.pkl', 'rb') as f:
    classifier = pickle.load(f, encoding='utf-8')

In [31]:
classifier