# Sentiment Analysis Training and Testing

### Importing required libraries

In [587]:
import pandas as pd
import numpy as np
import sqlite3
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [569]:
con = sqlite3.connect('training data/database.sqlite')
                      
data = pd.read_sql("""
SELECT * FROM Reviews
""",con)

In [480]:
data.shape

(568454, 10)

In [481]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [482]:
sorted_data = data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [501]:
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False,ignore_index=True)
final.shape

(393933, 10)

In [502]:
final = final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [503]:
final = final[['Text','Score']]

In [575]:
final.shape

(393931, 2)

In [576]:
final['Score'].value_counts()

5    250966
4     56086
1     36307
3     29772
2     20800
Name: Score, dtype: int64

In [577]:
train = final[:315144]
test = final[315145:]

In [578]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [579]:
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [581]:
preprocessed_train_reviews = []
# tqdm is for printing the status bar
for sentance in tqdm(train['Text'].values):
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stopwords)
    preprocessed_train_reviews.append(sentance.strip())

100%|████████████████████████████████████████████████████████████████████████| 315144/315144 [03:05<00:00, 1700.08it/s]


In [584]:
preprocessed_train_reviews[:5]

['junei saw charming groupof roses beginto droopi pepped upwith chicken soup sprinkle oncesprinkle twicesprinkle chicken soupwith ricethis great book teach children months year repetition phrases funny little stories accompanying pictures make ideal bedtime read not nearly good sendak books like wild things pierre boy not care still carries unique brand charm',
 'fun way children learn months year learn poems throughout school year like handmotions invent poem',
 'grew reading sendak books watching really rosie movie incorporates love son loves however miss hard cover version paperbacks seem kind flimsy takes two hands keep pages open',
 'get movie sound track sing along carol king great stuff whole extended family knows songs heart quality kids storytelling music',
 'entertaining rhyming story cleaver catchy illustrations imaginative fit right however paperback somewhat small flimsy would opt bigger edition']

### Vectoize the training data

In [586]:
tf_idf_vect = TfidfVectorizer(ngram_range=(2,2), min_df=10)
tf_idf_vect.fit(preprocessed_train_reviews)
print("some sample features(unique words in the corpus)",tf_idf_vect.get_feature_names()[0:10])
print('='*50)

final_tf_idf = tf_idf_vect.transform(preprocessed_train_reviews)
print("the type of count vectorizer ",type(final_tf_idf))
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_tf_idf.get_shape()[1])

some sample features(unique words in the corpus) ['abdominal pain', 'ability buy', 'ability get', 'ability make', 'able achieve', 'able actually', 'able add', 'able afford', 'able bake', 'able beat']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (315144, 153801)
the number of unique words including both unigrams and bigrams  153801


### Training the model

In [589]:
randomClassifier = RandomForestClassifier( criterion = 'entropy',n_estimators = 200)
randomClassifier.fit(final_tf_idf,train['Score'])

KeyboardInterrupt: 

### Testing model

In [None]:
preprocessed_test_reviews = []
# tqdm is for printing the status bar
for sentance in tqdm(test['Text'].values):
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stopwords)
    preprocessed_test_reviews.append(sentance.strip())

In [None]:
test_tf_idf = tf_idf_vect.transform(preprocessed_test_reviews)

In [None]:
predictions = randomClassifier.predict(test_tf_idf)

In [None]:
print(=*50)
print("Confusion Matrix: ")
matrix = confusion_matrix(test['Score'],predictions)
print(matrix)
print(=*50)
print("Accuracy_score : ")
score = accuracy_score(test['Score'],predictions)
print(score)
print(=*50)
print("Classification_report : ")
report = classification_report(test['Score'],predictions)
print(report)