In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', None)

In [3]:
df = pd.read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv',usecols = ['reviews.text','reviews.rating'])

In [4]:
df.head()

Unnamed: 0,reviews.rating,reviews.text
0,3,I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.
1,4,Bulk is always the less expensive way to go for products like these
2,5,Well they are not Duracell but for the price i am happy.
3,5,Seem to work as well as name brand batteries at a much better price
4,5,These batteries are very long lasting the price is great.


In [5]:
df['reviews.rating'].unique()

array([3, 4, 5, 1, 2])

In [6]:
def rating_class(rating):
    if rating <3:
        return 'Negative'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Positive'

In [7]:
df['reviews.class'] = df.apply(lambda row: rating_class(row['reviews.rating']),axis = 1)

In [8]:
display(df.head())

Unnamed: 0,reviews.rating,reviews.text,reviews.class
0,3,I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.,Neutral
1,4,Bulk is always the less expensive way to go for products like these,Positive
2,5,Well they are not Duracell but for the price i am happy.,Positive
3,5,Seem to work as well as name brand batteries at a much better price,Positive
4,5,These batteries are very long lasting the price is great.,Positive


In [9]:
df.tail()

Unnamed: 0,reviews.rating,reviews.text,reviews.class
28327,5,I got 2 of these for my 8 yr old twins. My 11 yr old has one but this one is better. Perfect way to get them to read,Positive
28328,4,I bought this for my niece for a Christmas gift.she is 9 years old and she love it.,Positive
28329,5,"Very nice for light internet browsing, keeping on top of email, viewing videos, and reading e books, which I like to get free from the library. Good browser, good battery power, fast charge. Very relaxing to sit in a comfy chair and read or browse. Has given me hours of enjoyment and information for a minimal price.",Positive
28330,5,"This Tablet does absolutely everything I want! I can watch TV Shows or Movies, check my Mail, Facebook, Google.......pay all my bills. It processes fast and has a beautiful screen. As I said: Everything I want in a Tablet for less than $100!",Positive
28331,4,"At ninety dollars, the expectionations are low, but this is still a very good table, it's good for light use like watching videos or web browsing, but the camera is a bit lacking, and at certain times the device can freeze or lag. Overall this is a good tablet for $90, I would recommend this to anyone on a budget.",Positive


In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lujain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lujain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lujain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
for item in ["no","not","don't","didn't","couldn't","hasn't","hadn't","haven't","isn't","shouldn't","mustn't","won't","wouldn't"]:
    print(item, item in stopwords.words('english'))

no True
not True
don't True
didn't True
couldn't True
hasn't True
hadn't True
haven't True
isn't True
shouldn't True
mustn't True
won't True
wouldn't True


In [12]:
def tokenize(text):
    url_regex = "https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    

    tokens = word_tokenize(text)
    
    remove_list = ["no","not","don't","didn't","couldn't","hasn't","hadn't","haven't","isn't","shouldn't","mustn't","won't","wouldn't","as"]
    custom_stop_words = [x for x in stopwords.words('english') if (x not in remove_list)]
    
    lemmatizer = WordNetLemmatizer()

    
    #This lambda function is to keep words in 'all caps only' otherwise lowercase
    custom_lower = (lambda x: x if x.isupper() else x.lower())
    
    clean_tokens = []
    for tok in tokens:
        if tok.lower() not in custom_stop_words:
            clean_tok = custom_lower(lemmatizer.lemmatize(tok).strip())
            clean_tokens.append(clean_tok)

    return clean_tokens

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score

In [14]:
df = df.sample(frac=1)

In [15]:
X = df['reviews.text']
y = df['reviews.class']

In [16]:
pipeline = Pipeline([
    ('vect',CountVectorizer(tokenizer=tokenize)),
    ('tfidf',TfidfTransformer()),
    ('clf',RandomForestClassifier())
])

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)

In [18]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)

In [19]:
labels = np.unique(y_pred)
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
print('f1 score:',f1_score(y_test,y_pred,average = None))
print('accuracy',accuracy_score(y_test,y_pred))

f1 score: [0.63469676 0.58148148 0.97098597]
accuracy 0.9445882352941176


In [20]:
print('Labels:',labels)
print(confusion_mat)

Labels: ['Negative' 'Neutral' 'Positive']
[[ 225    5  243]
 [   9  157  188]
 [   2   24 7647]]
