In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score 
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from collections import Counter

In [2]:
#import data in csv file
review_static = pd.read_csv('Amazon_Unlocked_Mobile.csv')
review = review_static.copy(deep=True)
review = pd.DataFrame(review)

In [3]:
review.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0
5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,I already had a phone with problems... I know ...,1.0
6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,The charging port was loose. I got that solder...,0.0
7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,"Phone looks good but wouldn't stay charged, ha...",0.0
8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I originally was using the Samsung S2 Galaxy f...,0.0
9,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,3,It's battery life is great. It's very responsi...,0.0


In [4]:
data_dropped = review.drop(['Product Name','Brand Name'], axis = 1)
cleaned_data = data_dropped.drop(data_dropped[data_dropped['Rating'].isnull() | data_dropped['Reviews'].isnull() | data_dropped['Review Votes'].isnull() | data_dropped['Price'].isnull()].index)
cleaned_data.head(10)
punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{}~' 
translations = str.maketrans(dict.fromkeys(punctuations, ''))

cleaned_data['Reviews'] = '|aa'.join(cleaned_data['Reviews'].tolist()).translate(translations).split('|aa')

In [5]:
cleaned_data.head()

Unnamed: 0,Price,Rating,Reviews,Review Votes
0,199.99,5,I feel so LUCKY to have found this used phone ...,1.0
1,199.99,4,nice phone nice up grade from my pantach revue...,0.0
2,199.99,5,Very pleased,0.0
3,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,199.99,4,Great phone to replace my lost phone The only ...,0.0


In [6]:
#stopwords removal
best = cleaned_data.loc[cleaned_data['Rating'] == 5]['Reviews'].str.cat(sep=' ')
worst = cleaned_data.loc[cleaned_data['Rating'] == 1]['Reviews'].str.cat(sep=' ')
bestlist = Counter(best.split()).most_common()
worstlist = Counter(worst.split()).most_common()
def stop_words(i):
    common_words = []
    num = i
    while len(common_words) < num:
        common_words = []
        b = [x[0] for x in bestlist][:i]
        w = [x[0] for x in worstlist][:i]
        for x in range(i): 
            for y in range(i): 
                if b[x] == w[y]: 
                    common_words.append(b[x])
        i += 1
    return common_words
common_words = stop_words(13)


In [57]:
common_words

['the',
 'I',
 'and',
 'phone',
 'a',
 'it',
 'to',
 'is',
 'for',
 'this',
 'of',
 'in',
 'was',
 'that']

In [7]:
removal = cleaned_data

for i, row in removal.iterrows():
    templist = row['Reviews'].split(" ")
    resultwords  = [word for word in templist if word not in common_words]
    str1 = ' '.join(resultwords)
    cleaned_data.at[i,'Reviews'] = str1
    

In [8]:
dataset = {}
dataset['ratings'] = cleaned_data['Rating']
dataset['reviews'] = cleaned_data['Reviews']
df = pd.DataFrame(dataset)
print(df.head())

   ratings                                            reviews
0        5  feel so LUCKY have found used us  not used har...
1        4  nice nice up grade from my pantach revue Very ...
2        5                                       Very pleased
3        4  It works good but goes slow sometimes but its ...
4        4  Great replace my lost The only thing volume up...


In [9]:
x_train, x_test, y_train, y_test = train_test_split(df.reviews, df.ratings, test_size=0.3, random_state=1)

ratings = [1,2,3,4,5]


In [10]:
#df['predictions'] = np.empty((len(df), 0)).tolist()

LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])

LogReg_pipeline.fit(df.reviews, df.ratings)
prediction = LogReg_pipeline.predict_proba(df.reviews)
predicted = []
for item in prediction:
    maximum = np.argmax(item) + 1
    predicted.append(maximum)
df['predictions'] = np.array(predicted)

acs = accuracy_score(df.ratings,df.predictions,normalize=True)




#df.to_csv('prediction.csv', sep='\t', encoding='utf-8')

In [11]:
df['predictions'] = np.array(predicted)

In [12]:
df

Unnamed: 0,ratings,reviews,predictions
0,5,feel so LUCKY to have found this used to us n...,5
1,4,nice nice up grade from my pantach revue Very ...,5
2,5,Very pleased,5
3,4,It works good but it goes slow sometimes but i...,4
4,4,Great to replace my lost The only thing is vol...,4
5,1,already had a with problems know it stated it ...,1
6,2,The charging port was loose got that soldered ...,1
7,2,Phone looks good but wouldnt stay charged had ...,1
8,5,originally was using Samsung S2 Galaxy for Spr...,5
9,3,Its battery life is great Its very responsive ...,4


In [11]:
acs

0.7772266093878325

In [13]:
wordarray = []
for i, row in cleaned_data.iterrows():
    templist = row['Reviews'].split(" ")
    resultwords2  = [word for word in templist if word not in common_words]
    wordarray.append(resultwords2)
wordarray

[['feel',
  'so',
  'LUCKY',
  'have',
  'found',
  'used',
  'us',
  '',
  'not',
  'used',
  'hard',
  'at',
  'all',
  'on',
  'line',
  'from',
  'someone',
  'who',
  'upgraded',
  'sold',
  'one',
  'My',
  'Son',
  'liked',
  'his',
  'old',
  'one',
  'finally',
  'fell',
  'apart',
  'after',
  '25',
  'years',
  'didnt',
  'want',
  'an',
  'upgrade',
  'Thank',
  'you',
  'Seller',
  'we',
  'really',
  'appreciate',
  '',
  'your',
  'honesty',
  're',
  'said',
  'used',
  'phoneI',
  'recommend',
  'seller',
  'very',
  'highly',
  '',
  'would',
  'but',
  'from',
  'them',
  'again'],
 ['nice',
  'nice',
  'up',
  'grade',
  'from',
  'my',
  'pantach',
  'revue',
  'Very',
  'clean',
  'set',
  'up',
  'easy',
  'set',
  'up',
  'never',
  'had',
  'an',
  'android',
  'but',
  'they',
  'are',
  'fantastic',
  'say',
  'least',
  'perfect',
  'size',
  'surfing',
  'social',
  'media',
  'great',
  'samsung'],
 ['Very', 'pleased'],
 ['It',
  'works',
  'good',
  'but'

In [25]:
wordarray =np.array(wordarray)
flatwa = wordarray.tolist()
flat_list = [item for sublist in flatwa for item in sublist]# flatten
flat_list = list(set(flat_list))#remove duplicates

flat_list

['',
 'BLUETOOTH',
 'etcCompared',
 'performanceSD',
 'loyal',
 'shortMust',
 'Bofore',
 'mill',
 'outProsCheapStylishSound',
 'PHONEmy',
 'hour4',
 'lockedDont',
 'featureHeres',
 'httpswwwyoutubecomwatchvcYYi4YHtPQYYou',
 'fettuccine',
 'Simm',
 'carriesThis',
 'Sumintra',
 'Boostphone',
 'JcPenny',
 'choppyBut',
 'regulate',
 'rodeo',
 'X2Save',
 'moneyDo',
 'grease',
 'itOrder',
 'browserI',
 'iPhoneHARDWARE',
 'displeased',
 'eitheryet',
 'phoneAnd',
 'zio',
 'devicesProxy',
 'sizeSony',
 'comer',
 'PEROOOO',
 'purcahseget',
 'anythinh',
 'VENEZUELA',
 'Joyed',
 'strucks',
 'CANV',
 'chargekept',
 'foldersyoure',
 'cluewhat',
 'megapxls',
 'nowApple',
 'PURCHESS',
 'coolspeaker',
 '418',
 'Properties4',
 'itcheap',
 'traslado',
 'timeBut',
 'phonesIt',
 'supone',
 'reallyreallyreally',
 'sendend',
 'everWaterproofing',
 'spectacular',
 'teenaged',
 'amerito',
 'heak',
 'Flows',
 'canVideo',
 'SOMTHINGOR',
 'needes',
 'centrale',
 'situation',
 'Octa',
 'cum',
 'quality5',
 'phoneP

In [26]:
#make into df
flat_list_df = np.array(flat_list)
dataset2 = {}
dataset2['word'] = flat_list_df
df2 = pd.DataFrame(dataset2)

In [38]:
prediction2 = LogReg_pipeline.predict_proba(df2.word)
predicted2 = []
for item in prediction2:
    maximum = np.argmax(item) + 1
    predicted2.append(maximum)
df2['predictions'] = np.array(predicted2)
df2

Unnamed: 0,word,predictions
0,,5
1,BLUETOOTH,5
2,etcCompared,5
3,performanceSD,5
4,loyal,5
5,shortMust,5
6,Bofore,5
7,mill,5
8,outProsCheapStylishSound,5
9,PHONEmy,5


In [40]:
groupeddf2 = df2.groupby('predictions')
groupeddf2.describe()

Unnamed: 0_level_0,word,word,word,word
Unnamed: 0_level_1,count,unique,top,freq
predictions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,4486,4486,Rim,1
2,477,477,LENTO,1
3,943,943,ite,1
4,2883,2883,covers,1
5,145330,145330,experienceFINGERPRINT,1


In [55]:
one_w = df2.loc[df2['predictions']==1]
two_w = df2.loc[df2['predictions']==2]
three_w = df2.loc[df2['predictions']==3]
four_w = df2.loc[df2['predictions']==4]
five_w = df2.loc[df2['predictions']==5]

In [56]:
one_w

Unnamed: 0,word,predictions
29,displeased,1
69,situation,1
108,suffered,1
181,aftermarketed,1
191,CHEAPLY,1
199,BADIm,1
269,Greatly,1
352,Hell,1
367,REFURBISHED,1
408,warrantee,1


In [58]:
five_w

Unnamed: 0,word,predictions
0,,5
1,BLUETOOTH,5
2,etcCompared,5
3,performanceSD,5
4,loyal,5
5,shortMust,5
6,Bofore,5
7,mill,5
8,outProsCheapStylishSound,5
9,PHONEmy,5


In [None]:
#this model is trained on sentences, with 13 stopwords removed as previously and then predicts each word's
#individual rating. the lists at the end one_w are all words with rating = 1, five_w are all words with rating = 5 and 
#so on so forth for the rest.

In [59]:
def predict(word):
    dataset3 = {}
    dataset3['word'] = np.array([word])
    df3 = pd.DataFrame(dataset3)
    prediction = LogReg_pipeline.predict_proba(df3.word)
    p = np.argmax(prediction) + 1
    return p   

In [60]:
print(predict("this is a terrible phone"))

1


In [61]:
print(predict("I like this phone a lot"))

5


In [64]:
print(predict("this is okay. It's decent"))

3
