In [1]:
import re
import time
import random
import numpy as np 
import pandas as pd
import pickle
import sklearn
from collections import defaultdict
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from sklearn import metrics
from sklearn import svm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Data Preperation

In [2]:
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer() 
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

In [3]:
class Sentiment_Analyzer():
    def __init__(self):
        self.load_LDA_Weights()
        self.load_SA_Model()
        self.load_vectorizer()
    
    @staticmethod
    def apply_negation(review):
        review = re.sub(' +', ' ', review)
        negation, delims, result, deleted_delim, words = False, "?.,!:;", [], False, review.split(" ")
        only_letter_pattern = re.compile('[^a-zA-Z]')
        for word in words:
            if (word not in stop_words) or (word == 'not' or word == 'no'):
                if any(delim in word for delim in delims):
                    deleted_delim = True
                cleaned_word = only_letter_pattern.sub('', word).lower()
                negated = "not_" + cleaned_word if negation else cleaned_word
                if ((len(negated) >= 3 and 'not_' not in negated) or ('not_' in negated and len(negated) >= 7)) and len(negated) <= 25:
                    result.append(negated)
                if any(neg in word for neg in ["not", "no"]):
                    negation = not negation
                if deleted_delim:
                    negation = False
                    deleted_delim = False
        return result
    
    @staticmethod
    def edit(review):
        only_letter_pattern, words, results = re.compile('[^a-zA-Z]'), review.split(" "), []
        for word in words:
            if word not in stop_words:
                if len(word) >= 3 and len(word) <= 25:
                    cleaned_word = only_letter_pattern.sub('', word).lower()
                    results.append(cleaned_word)
        return results
        
    
    def normalize_text(self, list_of_reviews, Negate = False):
        results = []
        for i, review in enumerate(list_of_reviews):
            token_roots = []
            lower_cased_review = review.lower()
            tokenized_review = lower_cased_review.split(" ")          
            for token in tokenized_review:
                token_roots.append(lemmatizer.lemmatize(stemmer.stem(token)))
            decontracted_review = contractions.fix(" ".join(token_roots))
            if Negate:
                decontracted_review = self.apply_negation(decontracted_review)
            else:
                decontracted_review = self.edit(decontracted_review)
            results.append(decontracted_review)
        return results
    
    def load_LDA_Weights(self):
        with open("NormalizedLDAW.pickle",'rb') as f:
            self.ldaw = pickle.load(f)
        return None
    
    def load_SA_Model(self):
        with open("LRNegModel.pickle",'rb') as f:
            self.SentimentAnalyzer = pickle.load(f)
        return None
    
    def load_vectorizer(self):
        with open("vectorizer.pickle",'rb') as f:
            self.vectorizer = pickle.load(f)
        return None
        
    
    def find_topic(self, review):
        normalized_review = self.normalize_text(review)[0]
        topic_scores, topic_names = [0,0,0,0,0],['iphone','battery&charge','screen','phone case','product&usability']
        for token in normalized_review:
            results = self.ldaw.loc[self.ldaw['word'] == token]
            for i, score in enumerate(list(results['relevance'])):
                topic_scores[i] += score
        maxValueIndex = np.argmax(topic_scores)
        return topic_names[maxValueIndex]
    
    def find_sentiment_Score(self, review):
        normalized_review = self.normalize_text(review, Negate = True)[0]
        transformed_vectorized_review = self.vectorizer.transform([" ".join(normalized_review)])
        sentiment_result = list(self.SentimentAnalyzer.predict(transformed_vectorized_review))[0]
        sentiment_probabilities = self.SentimentAnalyzer.predict_proba(transformed_vectorized_review)
        print(sentiment_probabilities)
        case_prob, sentiment_score = None, None
        if sentiment_result == 'negative':
            case_prob = sentiment_probabilities[0][0]
            sentiment_score = 5 - (4*case_prob)
        else:
            case_prob = sentiment_probabilities[0][1]
            sentiment_score = 1 + (4*case_prob)
        return sentiment_result, sentiment_score
            
        

In [66]:
df = pd.read_json('reviews_Cell_Phones_and_Accessories_5.json', lines = True)

In [67]:
df.head(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [68]:
needed_information = pd.DataFrame(df[['reviewText','overall']])
del(df)

In [69]:
needed_information

Unnamed: 0,reviewText,overall
0,They look good and stick good! I just don't li...,4
1,These stickers work like the review says they ...,5
2,These are awesome and make my phone look so st...,5
3,Item arrived in great time and was in perfect ...,4
4,"awesome! stays on, and looks great. can be use...",5
...,...,...
194434,Works great just like my original one. I reall...,5
194435,Great product. Great packaging. High quality a...,5
194436,"This is a great cable, just as good as the mor...",5
194437,I really like it becasue it works well with my...,5


In [70]:
sentiment_scores = pd.DataFrame(needed_information['overall'])

In [9]:
a = Sentiment_Analyzer()

In [71]:
t1 = time.time()
negated_data =a.normalize_text(needed_information['reviewText'], Negate = True)
t2 = time.time()
print(t2-t1)

318.1023437976837


In [72]:
t1 = time.time()
edited_data = a.normalize_text(needed_information['reviewText'])
t2 = time.time()
print(t2-t1)

302.7244338989258


In [12]:
del(needed_information)

# Topic Modeling Vectorization

In [13]:
vectorizer = CountVectorizer()

In [14]:
sentences = []
for tokens in edited_data:
    sentences.append(" ".join(tokens))

In [15]:
vectorized = vectorizer.fit_transform(sentences)

In [16]:
print(vectorized)

  (0, 87595)	1
  (0, 63508)	2
  (0, 147615)	1
  (0, 85936)	2
  (0, 132087)	1
  (0, 137462)	1
  (0, 13596)	1
  (0, 4496)	1
  (0, 19944)	1
  (0, 139975)	1
  (0, 82012)	1
  (0, 117198)	1
  (0, 78518)	1
  (0, 20751)	1
  (0, 120828)	1
  (1, 147615)	1
  (1, 85936)	1
  (1, 147620)	1
  (1, 175029)	1
  (1, 130346)	1
  (1, 133662)	1
  (1, 43462)	1
  (1, 64664)	1
  (1, 147296)	1
  (1, 112731)	1
  :	:
  (194437, 13591)	1
  (194438, 85936)	1
  (194438, 20751)	1
  (194438, 120828)	1
  (194438, 175029)	2
  (194438, 112731)	2
  (194438, 175917)	1
  (194438, 39619)	1
  (194438, 170683)	1
  (194438, 95229)	1
  (194438, 167308)	1
  (194438, 84339)	1
  (194438, 5784)	1
  (194438, 88140)	1
  (194438, 116035)	1
  (194438, 26729)	1
  (194438, 34033)	2
  (194438, 131486)	1
  (194438, 78949)	1
  (194438, 140029)	1
  (194438, 62535)	1
  (194438, 106810)	1
  (194438, 34145)	1
  (194438, 161758)	1
  (194438, 42702)	1


In [17]:
negated_sentences = []
for tokens in negated_data:
    negated_sentences.append(" ".join(tokens))

In [18]:
negation_vectorizer = CountVectorizer()

In [19]:
negate_vectorized = negation_vectorizer.fit_transform(negated_sentences)

# LDA and Topic Modeling

In [20]:
lda_model = LatentDirichletAllocation(n_components = 5,random_state = 10,evaluate_every = -1,n_jobs = -1)
lda_output = lda_model.fit_transform(vectorized)

In [22]:
topic_names = ["Topic" + str(i) for i in range(1, lda_model.n_components + 1)]
topic_names

['Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']

In [23]:
lda_output = np.round(lda_output, 2)

In [24]:
lda_output[0:10]

array([[0.01, 0.53, 0.25, 0.19, 0.01],
       [0.01, 0.95, 0.01, 0.01, 0.01],
       [0.01, 0.95, 0.01, 0.01, 0.01],
       [0.01, 0.57, 0.01, 0.01, 0.41],
       [0.01, 0.01, 0.52, 0.01, 0.44],
       [0.01, 0.8 , 0.16, 0.01, 0.01],
       [0.02, 0.93, 0.02, 0.02, 0.02],
       [0.02, 0.72, 0.02, 0.21, 0.02],
       [0.01, 0.01, 0.27, 0.71, 0.01],
       [0.25, 0.17, 0.57, 0.01, 0.01]])

In [25]:
lda_output.shape

(194439, 5)

In [26]:
review_topics = np.argmax(lda_output, axis=1)+1

In [27]:
review_topics

array([2, 2, 2, ..., 4, 3, 4])

In [28]:
df_topic_keywords = pd.DataFrame(lda_model.components_)

In [29]:
df_topic_keywords.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,178256,178257,178258,178259,178260,178261,178262,178263,178264,178265
0,0.203109,7.29267,0.201077,0.200044,0.200107,0.200107,0.200049,0.200013,0.713476,0.227651,...,0.202074,0.200667,0.20157,1.199766,1.199999,0.2,0.200001,1.198788,0.202472,0.200002
1,0.20184,1.762023,0.200021,0.200133,0.200098,0.200098,1.199835,0.213151,0.205464,0.2386,...,0.202374,0.200001,0.200001,0.2,0.2,0.2,1.196412,0.2,0.200003,0.206578
2,0.201615,0.208595,0.200389,0.200278,0.200063,0.200063,0.200029,0.200007,0.200001,0.205028,...,1.197513,0.200135,0.200001,0.200186,0.2,0.200362,0.2,0.2,0.200002,0.200001
3,54.070182,77.709664,1.81513,7.199478,1.199592,1.199592,0.200021,1.186812,0.681058,1.128719,...,3.197952,2.198268,1.198427,0.200047,0.2,4.199637,0.200935,0.201211,1.186475,1.193415
4,1.323254,1.027049,1.583383,0.200066,0.200141,0.200141,0.200065,0.200017,0.200002,0.200001,...,0.200087,0.200929,0.200002,0.2,0.200001,0.2,0.202652,0.2,0.211047,0.200003


In [30]:
df_topic_keywords = df_topic_keywords.T

In [31]:
df_topic_keywords.columns = topic_names
df_topic_keywords.index = vectorizer.get_feature_names()

In [32]:
df_topic_keywords.head(5)

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5
aa,0.203109,0.20184,0.201615,54.070182,1.323254
aaa,7.29267,1.762023,0.208595,77.709664,1.027049
aaaa,0.201077,0.200021,0.200389,1.81513,1.583383
aaaaa,0.200044,0.200133,0.200278,7.199478,0.200066
aaaaaa,0.200107,0.200098,0.200063,1.199592,0.200141


In [34]:
word_dominant_topic = []
for i in range(df_topic_keywords.shape[0]):
    if i % 20000 == 0:
        print(i)
    word_dominant_topic.append(np.argmax(df_topic_keywords.iloc[i]))
word_dominant_topic = pd.DataFrame(word_dominant_topic, index = df_topic_keywords.index)

0
20000
40000
60000
80000
100000
120000
140000
160000


In [35]:
word_dominant_topic = word_dominant_topic.apply(lambda x: x+1)

In [36]:
word_dominant_topic.columns = ['dominant topic']
word_dominant_topic.head(5)

Unnamed: 0,dominant topic
aa,4
aaa,4
aaaa,4
aaaaa,4
aaaaaa,4


In [37]:
df_topic_keywords = df_topic_keywords.T

In [38]:
df_topic_keywords.head(5)

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaaa,aaaaaaaaa,aaaaaal,aaaaabatteryst,aaaaalllllllllll,...,zx,zxr,zxrs,zynga,zyxel,zz,zzare,zzz,zzzap,zzzs
Topic1,0.203109,7.29267,0.201077,0.200044,0.200107,0.200107,0.200049,0.200013,0.713476,0.227651,...,0.202074,0.200667,0.20157,1.199766,1.199999,0.2,0.200001,1.198788,0.202472,0.200002
Topic2,0.20184,1.762023,0.200021,0.200133,0.200098,0.200098,1.199835,0.213151,0.205464,0.2386,...,0.202374,0.200001,0.200001,0.2,0.2,0.2,1.196412,0.2,0.200003,0.206578
Topic3,0.201615,0.208595,0.200389,0.200278,0.200063,0.200063,0.200029,0.200007,0.200001,0.205028,...,1.197513,0.200135,0.200001,0.200186,0.2,0.200362,0.2,0.2,0.200002,0.200001
Topic4,54.070182,77.709664,1.81513,7.199478,1.199592,1.199592,0.200021,1.186812,0.681058,1.128719,...,3.197952,2.198268,1.198427,0.200047,0.2,4.199637,0.200935,0.201211,1.186475,1.193415
Topic5,1.323254,1.027049,1.583383,0.200066,0.200141,0.200141,0.200065,0.200017,0.200002,0.200001,...,0.200087,0.200929,0.200002,0.2,0.200001,0.2,0.202652,0.2,0.211047,0.200003


In [39]:
topic_word_names = df_topic_keywords.idxmax(axis=1)

In [40]:
topic_word_names

Topic1     phone
Topic2     phone
Topic3      case
Topic4     charg
Topic5    screen
dtype: object

In [41]:
df_topic_keywords = df_topic_keywords.T

In [42]:
df_topic_keywords.head()

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5
aa,0.203109,0.20184,0.201615,54.070182,1.323254
aaa,7.29267,1.762023,0.208595,77.709664,1.027049
aaaa,0.201077,0.200021,0.200389,1.81513,1.583383
aaaaa,0.200044,0.200133,0.200278,7.199478,0.200066
aaaaaa,0.200107,0.200098,0.200063,1.199592,0.200141


In [43]:
df_topic_keywords['Topic1'].unique()

array([0.20310893, 7.29266995, 0.20107699, ..., 0.20000025, 1.19878825,
       0.20000236])

In [44]:
new_df = pd.DataFrame({'word': df_topic_keywords.index, 'Topic':'Topic1', 'relevance':df_topic_keywords['Topic1']})

In [45]:
new_df = new_df.append(pd.DataFrame({'word': df_topic_keywords.index,'Topic':'Topic2', 'relevance':df_topic_keywords['Topic2']}), ignore_index = True)
new_df = new_df.append(pd.DataFrame({'word': df_topic_keywords.index,'Topic':'Topic3', 'relevance':df_topic_keywords['Topic3']}), ignore_index = True)
new_df = new_df.append(pd.DataFrame({'word': df_topic_keywords.index,'Topic':'Topic4', 'relevance':df_topic_keywords['Topic4']}), ignore_index = True)
new_df = new_df.append(pd.DataFrame({'word': df_topic_keywords.index,'Topic':'Topic5', 'relevance':df_topic_keywords['Topic5']}), ignore_index = True)

### Loads the best weights that have been learned, others shouldnt run the next cell

In [48]:
with open("LDA_CV_unibigram_relevanceWeights.pickle",'rb') as f:
    new_df = pickle.load(f)

### Continuing

In [49]:
list(new_df.loc[new_df['word']=='aa aaa']['relevance'])

[0.2000010672638451,
 0.2054899632530272,
 0.20000027712947194,
 0.20000017643262982,
 1.1945085136737106]

In [50]:
new_df.loc[new_df['Topic'] == 'Topic1']['relevance'] = (new_df.loc[new_df['Topic'] == 'Topic1']['relevance']/np.max(new_df.loc[new_df['Topic'] == 'Topic1']['relevance']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [55]:
new_df.loc[new_df['Topic'] == 'Topic5'].sort_values('relevance', ascending=False).head(10)

Unnamed: 0,word,Topic,relevance
12673117,use,Topic5,22977.327362
11901484,phone,Topic5,22155.77068
11227470,great,Topic5,19225.236456
12803278,work,Topic5,15508.370299
12819028,would,Topic5,14811.717304
12699542,veri,Topic5,14043.624336
11793885,one,Topic5,13527.707301
12018930,product,Topic5,12755.964985
11519774,like,Topic5,11422.412373
11424121,it,Topic5,11276.878054


In [56]:
new_df.loc[new_df['Topic'] == 'Topic1']['relevance'] = (new_df.loc[new_df['Topic'] == 'Topic1']['relevance']/np.max(new_df.loc[new_df['Topic'] == 'Topic1']['relevance']))
new_df.loc[new_df['Topic'] == 'Topic2']['relevance'] = (new_df.loc[new_df['Topic'] == 'Topic2']['relevance']/np.max(new_df.loc[new_df['Topic'] == 'Topic2']['relevance']))
new_df.loc[new_df['Topic'] == 'Topic3']['relevance'] = (new_df.loc[new_df['Topic'] == 'Topic3']['relevance']/np.max(new_df.loc[new_df['Topic'] == 'Topic3']['relevance']))
new_df.loc[new_df['Topic'] == 'Topic4']['relevance'] = (new_df.loc[new_df['Topic'] == 'Topic4']['relevance']/np.max(new_df.loc[new_df['Topic'] == 'Topic4']['relevance']))
new_df.loc[new_df['Topic'] == 'Topic5']['relevance'] = (new_df.loc[new_df['Topic'] == 'Topic5']['relevance']/np.max(new_df.loc[new_df['Topic'] == 'Topic5']['relevance']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [57]:
new_df.loc[new_df['Topic'] == 'Topic5'].sort_values('relevance', ascending=False).head(10)

Unnamed: 0,word,Topic,relevance
12673117,use,Topic5,22977.327362
11901484,phone,Topic5,22155.77068
11227470,great,Topic5,19225.236456
12803278,work,Topic5,15508.370299
12819028,would,Topic5,14811.717304
12699542,veri,Topic5,14043.624336
11793885,one,Topic5,13527.707301
12018930,product,Topic5,12755.964985
11519774,like,Topic5,11422.412373
11424121,it,Topic5,11276.878054


### If you want to save the result weights, Run the next cell.

In [67]:
with open("LDA_CV_unibigram_relevanceWeights.pickle", 'wb') as f:
    pickle.dump(new_df, f)

# Training Sentiment Analysis Models

### Making Labels

In [73]:
list(sentiment_scores['overall'])

[4,
 5,
 5,
 4,
 5,
 3,
 5,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 1,
 5,
 5,
 3,
 4,
 4,
 5,
 2,
 4,
 2,
 4,
 5,
 5,
 1,
 5,
 4,
 2,
 4,
 5,
 5,
 4,
 2,
 5,
 5,
 4,
 2,
 5,
 1,
 5,
 4,
 4,
 4,
 5,
 5,
 4,
 5,
 3,
 5,
 5,
 4,
 4,
 5,
 5,
 4,
 2,
 4,
 5,
 4,
 4,
 5,
 2,
 1,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 4,
 4,
 5,
 4,
 5,
 1,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 4,
 3,
 5,
 4,
 5,
 1,
 5,
 2,
 2,
 5,
 1,
 5,
 4,
 5,
 5,
 5,
 5,
 3,
 5,
 5,
 1,
 5,
 5,
 5,
 1,
 5,
 5,
 4,
 3,
 3,
 1,
 2,
 5,
 2,
 5,
 1,
 3,
 5,
 4,
 1,
 5,
 4,
 2,
 4,
 3,
 5,
 5,
 1,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 1,
 5,
 5,
 5,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 1,
 4,
 5,
 5,
 2,
 4,
 1,
 5,
 5,
 4,
 5,
 5,
 1,
 3,
 5,
 3,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 5,
 5,
 5,
 5,
 3,
 1,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 1,
 5,
 5,
 3,
 2,
 5,
 5,
 1,
 3,
 4,
 1,
 3,
 5,
 3,
 1,
 4,
 5,
 5,
 4,
 5,
 3,


In [74]:
sentiments = []
tm = defaultdict(lambda: 0)
for score in sentiment_scores['overall']:
    tm[score] += 1

In [75]:
for key in tm.keys():
    print(key, " : ", tm[key])

4  :  39993
5  :  108664
3  :  21439
1  :  13279
2  :  11064


In [76]:
sentiments = []
for score in sentiment_scores['overall']:
    if int(score) > 3:
        sentiments.append('positive')
    else:
        sentiments.append('negative')

In [78]:
def get_reviews(texts,sentiments,count, label):
    label_reviews ,reviews = [], [[None,None] for i in range(len(texts))]
    for i, text in enumerate(texts):
        reviews[i][0] = text
    for i, lable in enumerate(sentiments):
        reviews[i][1] = lable
        
    random.shuffle(reviews)
    for review in reviews:
        if review[1] == label:
            label_reviews.append(review)
            if len(label_reviews) == count:
                break
    for review in reviews:
        if review[1] != label:
            label_reviews.append(review)
    reviewText, reviewSentiment = [s[0] for s in label_reviews], [s[1] for s in label_reviews]
    return reviewText, reviewSentiment
            


def calculate_sentiment_percentage(sentiments):
    pos , neg = 0,0
    for label in sentiments:
        if label == 'positive':
            pos += 1
        elif label == 'negative':
            neg += 1
    return pos/len(sentiments),  neg/len(sentiments)

In [79]:
print(calculate_sentiment_percentage(sentiments))

(0.7645431214931161, 0.2354568785068839)


In [116]:
reviews_subset, labels_subset = get_reviews(edited_data,sentiments,50000, 'positive')
print(calculate_sentiment_percentage(labels_subset))

(0.5220187509135328, 0.47798124908646716)


In [81]:
neg_reviews_subset, neg_labels_subset = get_reviews(negated_data,sentiments,50000, 'positive')
print(calculate_sentiment_percentage(labels_subset))

(0.5220187509135328, 0.47798124908646716)


## Vectorization

In [82]:
sentences = []
for tokens in reviews_subset:
    sentences.append(" ".join(tokens))

In [83]:
model_edited_vectorizer = TfidfVectorizer()
ed_vectorized = model_edited_vectorizer.fit_transform(sentences)

In [84]:
neg_sentences = []
for tokens in neg_reviews_subset:
    neg_sentences.append(" ".join(tokens))

In [85]:
model_negated_vectorizer = TfidfVectorizer()
negated_vectorized = model_negated_vectorizer.fit_transform(neg_sentences)

## Naive Bayes

### Normal

In [89]:
X_train, X_test, y_train, y_test = train_test_split(ed_vectorized, labels_subset, test_size=0.2, random_state=42)

In [263]:
clf = MultinomialNB()

In [264]:
clf.fit(X_train, y_train)

MultinomialNB()

In [265]:
y_pred = clf.predict(X_test)

In [266]:
acc = metrics.accuracy_score(y_test,y_pred)

In [267]:
acc

0.7772093751631257

In [268]:
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

    negative       0.81      0.69      0.75      9079
    positive       0.75      0.86      0.80     10078

    accuracy                           0.78     19157
   macro avg       0.78      0.77      0.77     19157
weighted avg       0.78      0.78      0.78     19157

[[6257 2822]
 [1446 8632]]


### Negated

In [88]:
Xn_train, Xn_test, yn_train, yn_test = train_test_split(negated_vectorized, neg_labels_subset, test_size=0.23, random_state=0)

In [270]:
nClf = MultinomialNB()

In [271]:
nClf.fit(Xn_train, yn_train)

MultinomialNB()

In [272]:
yn_pred = nClf.predict(Xn_test)

In [273]:
nAcc = metrics.accuracy_score(yn_test,yn_pred)

In [274]:
nAcc

0.802265490421256

In [275]:
print(classification_report(yn_test,yn_pred))
print(confusion_matrix(yn_test,yn_pred))

              precision    recall  f1-score   support

    negative       0.84      0.73      0.78      9146
    positive       0.78      0.87      0.82     10011

    accuracy                           0.80     19157
   macro avg       0.81      0.80      0.80     19157
weighted avg       0.81      0.80      0.80     19157

[[6669 2477]
 [1311 8700]]


# Logistic Regression

## Normal

In [113]:
logreg = LogisticRegression(C=1e5)

In [114]:
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=100000.0)

In [115]:
lry_pred = logreg.predict(X_test)

In [116]:
acc = metrics.accuracy_score(y_test,lry_pred)

In [117]:
acc

0.7745993631570706

In [118]:
print(classification_report(y_test,lry_pred))
print(confusion_matrix(y_test,lry_pred))

              precision    recall  f1-score   support

    negative       0.76      0.76      0.76      9079
    positive       0.79      0.78      0.79     10078

    accuracy                           0.77     19157
   macro avg       0.77      0.77      0.77     19157
weighted avg       0.77      0.77      0.77     19157

[[6942 2137]
 [2181 7897]]


In [130]:
neglogreg = LogisticRegression()

In [131]:
neglogreg.fit(Xn_train, yn_train)

LogisticRegression()

In [132]:
lry_pred = neglogreg.predict(Xn_test)

In [133]:
acc = metrics.accuracy_score(yn_test,lry_pred)

In [134]:
acc

0.8301861098502042

In [135]:
print(classification_report(yn_test,lry_pred))
print(confusion_matrix(yn_test,lry_pred))

              precision    recall  f1-score   support

    negative       0.83      0.82      0.82     10562
    positive       0.83      0.84      0.84     11468

    accuracy                           0.83     22030
   macro avg       0.83      0.83      0.83     22030
weighted avg       0.83      0.83      0.83     22030

[[8621 1941]
 [1800 9668]]


# Random Forest

## normal

In [281]:
erfclf = RandomForestClassifier(n_estimators=101,max_depth=5, random_state=0)

In [282]:
erfclf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, n_estimators=101, random_state=0)

In [283]:
erf_ypred = erfclf.predict(X_test)

In [284]:
acc = metrics.accuracy_score(y_test,erf_ypred)

In [285]:
acc

0.5654852012319257

In [286]:
print(classification_report(y_test,erf_ypred))
print(confusion_matrix(y_test,erf_ypred))

              precision    recall  f1-score   support

    negative       0.94      0.09      0.16      9079
    positive       0.55      0.99      0.71     10078

    accuracy                           0.57     19157
   macro avg       0.74      0.54      0.43     19157
weighted avg       0.73      0.57      0.45     19157

[[  808  8271]
 [   53 10025]]


## negated

In [90]:
nrfclf = RandomForestClassifier(n_estimators=101,max_depth=2, random_state=0)

In [91]:
nrfclf.fit(Xn_train, yn_train)

RandomForestClassifier(max_depth=2, n_estimators=101, random_state=0)

In [93]:
nrf_ypred = nrfclf.predict(Xn_test)

In [94]:
acc = metrics.accuracy_score(yn_test,nrf_ypred)

In [95]:
acc

0.5222877893781207

## SVM

### Normal

In [96]:
esvmclf = svm.SVC(kernel = 'linear', C=2)

In [97]:
esvmclf.fit(X_train, y_train)

SVC(C=2, kernel='linear')

In [98]:
svm_ypred = esvmclf.predict(X_test)

In [99]:
acc = metrics.accuracy_score(y_test,svm_ypred)

In [101]:
acc

0.8068591115519131

In [103]:
print(classification_report(y_test,svm_ypred))
print(confusion_matrix(y_test,svm_ypred))

              precision    recall  f1-score   support

    negative       0.79      0.80      0.80      9079
    positive       0.82      0.81      0.82     10078

    accuracy                           0.81     19157
   macro avg       0.81      0.81      0.81     19157
weighted avg       0.81      0.81      0.81     19157

[[7261 1818]
 [1882 8196]]


### Negated

In [104]:
nsvmclf = svm.SVC(kernel = 'linear', C=2)

In [105]:
nsvmclf.fit(Xn_train, yn_train)

SVC(C=2, kernel='linear')

In [107]:
nsvm_ypred = nsvmclf.predict(Xn_test)

In [108]:
acc = metrics.accuracy_score(yn_test,nsvm_ypred)

In [109]:
acc

0.826781661370858

In [110]:
print(classification_report(yn_test,nsvm_ypred))
print(confusion_matrix(yn_test,nsvm_ypred))

              precision    recall  f1-score   support

    negative       0.82      0.81      0.82     10562
    positive       0.83      0.84      0.83     11468

    accuracy                           0.83     22030
   macro avg       0.83      0.83      0.83     22030
weighted avg       0.83      0.83      0.83     22030

[[8582 1980]
 [1836 9632]]


In [18]:
def get_topic(normalized_review):
    topic_scores, topic_names = [0,0,0,0,0],['','','','','']
    for token in normalized_review:
        results = self.ldaw.loc[self.ldaw['word'] == token]
        for i, score in enumretate(list(results['relevance'])):
            topic_scores[i] += score
    maxValueIndex = np.argmax(topic_scores)
    return topic_names[maxValueIndex]

In [28]:
a = Sentiment_Analyzer()

In [161]:
u = a.normalize_text(["This is the first battery case I have had for my Galaxy S4. The S4 fits very well, is slim and doesn't add much weight to the Galaxy S4. It doubles the battery life. You can charge either the battery, the phone or both. There is a handy on-off switch with leds to indicate the level of charge.The battery case came on time and was packaged well. Well worth the price."], Negate=True)

0


In [40]:
print(a.find_topic(u[0]))

phone case


In [44]:
print(len(reviews_subset), len(labels_subset))
print(reviews_subset[0])

95782 95782
['tri', 'hammer', 'cell', 'phone', 'layer', 'commando', 'provid', 'protect', 'phone', 'without', 'case', 'it', 'see', 'ani', 'ding', 'anything', 'even', 'drop', 'time', 'without', 'ani', 'scratches']


In [134]:
count = defaultdict(lambda: defaultdict(lambda : 0))

In [36]:
for key in count.keys():
    for k in count[key].keys():
        print(key, ":", k, ":", count[key][k])

battery&charge : positive : 176
product&usability : positive : 2324
iphone : positive : 576
phone case : positive : 273
screen : positive : 156


In [38]:
count['battery&charge']['negative'] = 80
count['product&usability']['negative'] = 1758
count['iphone']['negative'] =  179
count['phone case']['negative'] =  321
count['screen']['negative'] = 103

In [121]:
tmp = [(x[0],x[1]) for x in zip(reviews_subset,labels_subset)]

In [122]:
random.shuffle(tmp)

In [123]:
for i , x in enumerate(tmp):
    if i > 10:
        break
    print(x[1])

negative
negative
positive
positive
positive
negative
negative
negative
negative
positive
negative


In [135]:
for i,pair in enumerate(tmp):
    if i == 1500:
        break
    if i% 100 == 0:
        print(i)
    revtopic = a.find_topic(pair[0])
    count[revtopic][pair[1]] += 1

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400


In [136]:
for key in count.keys():
    for k in count[key].keys():
        print(key, ":", k, ":", count[key][k])

product&usability : negative : 474
product&usability : positive : 521
phone case : negative : 83
phone case : positive : 52
iphone : positive : 123
iphone : negative : 112
screen : negative : 30
screen : positive : 27
battery&charge : positive : 36
battery&charge : negative : 42


## Test

In [162]:
revvec = model_negated_vectorizer.transform([" ".join(u[0])])

In [152]:
revvec.shape

(1, 129733)

In [73]:
ed_vectorized.shape

(95782, 110025)

In [None]:
model_negated_vectorizer = TfidfVectorizer()
negated_vectorized = model_negated_vectorizer.fit_transform(neg_sentences)

In [214]:
u = a.normalize_text(["This is the first battery case I have had for my Galaxy S4. The S4 fits very well, is slim and doesn't add much weight to the Galaxy S4. It doubles the battery life. You can charge either the battery, the phone or both. There is a handy on-off switch with leds to indicate the level of charge.The battery case came on time and was packaged well. Well worth the price."], Negate = True)[0]

0


In [215]:
revvec = model_negated_vectorizer.transform([" ".join(u)])

In [216]:
neglogreg.predict(revvec)

array(['positive'], dtype='<U8')

In [217]:
neglogreg.predict_proba(revvec)

array([[0.01993882, 0.98006118]])

In [158]:
negated_vectorized[0]

<1x129733 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [195]:
with open("vectorizer.pickle", 'wb') as f:
    pickle.dump(model_negated_vectorizer, f)

In [196]:
with open("LRNegModel.pickle", 'wb') as f:
    pickle.dump(neglogreg, f)

In [271]:
a = Sentiment_Analyzer()

In [223]:
print(a.find_sentiment_Score(["This case for some reason is peeling, there isn't much left of the orginal skin, i  loved the case with pink being my favorite color but i wouldn't recommend this specific one for anyone."]))

0
[[0.72743339 0.27256661]]
('negative', 2.09026645506371)


In [224]:
print(a.find_sentiment_Score(["This is the first battery case I have had for my Galaxy S4. The S4 fits very well, is slim and doesn't add much weight to the Galaxy S4. It doubles the battery life. You can charge either the battery, the phone or both. There is a handy on-off switch with leds to indicate the level of charge.The battery case came on time and was packaged well. Well worth the price."]))

0
[[0.01993882 0.98006118]]
('positive', 4.92024472494391)


In [272]:
a.ldaw

Unnamed: 0,word,Topic,relevance
0,aa,Topic1,0.000143
1,aa aaa,Topic1,0.000143
2,aa batteri,Topic1,0.000143
3,aa batteries,Topic1,0.000143
4,aa batterypowered,Topic1,0.000143
...,...,...,...
12846945,zzz doe,Topic5,0.000009
12846946,zzzap,Topic5,0.000009
12846947,zzzap refundrevis,Topic5,0.000009
12846948,zzzs,Topic5,0.000009


In [274]:
a.find_topic(["This case for some reason is peeling, there isn't much left of the orginal skin, i  loved the case with pink being my favorite color but i wouldn't recommend this specific one for anyone."])

0
[0.5415686063592394, 0.8967462101018147, 1.0581275315356513, 2.8650851822266263, 2.7302151023993138]


'phone case'

# Normalizing LDA Weights

In [233]:
tmp = pd.DataFrame(a.ldaw)

In [236]:
tmp_list = []
for i in range(1,6):
    tmp_list.append(tmp.loc[tmp['Topic'] == 'Topic'+str(i)])
    

In [247]:
for i in range(5):
    v = np.max(list(tmp_list[i]['relevance']))
    tmp_list[i]['relevance'] = (tmp_list[i]['relevance']/v)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [244]:
list(tmp_list[4]['relevance'])/z

array([5.60159676e-05, 5.19863992e-05, 2.11207061e-05, ...,
       8.70431165e-06, 8.70426339e-06, 8.70426339e-06])

In [242]:
z = np.max(list(tmp_list[4]['relevance']))

In [252]:
tmp_list[1].sort_values('relevance', ascending=False).head(10)

Unnamed: 0,word,Topic,relevance
2947815,charg,Topic2,1.0
4964947,use,Topic2,0.562245
2956516,charger,Topic2,0.546971
2756010,batteri,Topic2,0.515879
5095108,work,Topic2,0.379402
4193314,phone,Topic2,0.378801
4085715,one,Topic2,0.346064
4962310,usb,Topic2,0.339843
3160685,devic,Topic2,0.300153
4272374,power,Topic2,0.283719


In [266]:
new_df = pd.DataFrame({'word': tmp_list[0]['word'], 'Topic':'Topic1', 'relevance':tmp_list[0]['relevance']})

In [267]:
new_df = new_df.append(pd.DataFrame({'word': tmp_list[1]['word'],'Topic':'Topic2', 'relevance':tmp_list[1]['relevance']}), ignore_index = True)
new_df = new_df.append(pd.DataFrame({'word': tmp_list[2]['word'],'Topic':'Topic3', 'relevance':tmp_list[2]['relevance']}), ignore_index = True)
new_df = new_df.append(pd.DataFrame({'word': tmp_list[3]['word'],'Topic':'Topic4', 'relevance':tmp_list[3]['relevance']}), ignore_index = True)
new_df = new_df.append(pd.DataFrame({'word': tmp_list[4]['word'],'Topic':'Topic5', 'relevance':tmp_list[4]['relevance']}), ignore_index = True)

In [268]:
new_df

Unnamed: 0,word,Topic,relevance
0,aa,Topic1,0.000143
1,aa aaa,Topic1,0.000143
2,aa batteri,Topic1,0.000143
3,aa batteries,Topic1,0.000143
4,aa batterypowered,Topic1,0.000143
...,...,...,...
12846945,zzz doe,Topic5,0.000009
12846946,zzzap,Topic5,0.000009
12846947,zzzap refundrevis,Topic5,0.000009
12846948,zzzs,Topic5,0.000009


In [269]:
with open('NormalizedLDAW.pickle','wb') as f:
    pickle.dump(new_df, f)