# Problem

### This is a list of over 34,000 consumer reviews for Amazon products like the Kindle, Fire TV Stick, and more provided by Datafiniti's Product Database. The dataset includes basic product information, rating, review text, and more for each product.

#### Setup Library

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import re, string
import warnings
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

warnings.filterwarnings("ignore")
%matplotlib inline

[nltk_data] Downloading package stopwords to C:\Users\Mohamed
[nltk_data]     Gamal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Mohamed
[nltk_data]     Gamal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mohamed Gamal\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Mohamed
[nltk_data]     Gamal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Read Dataset

In [2]:
dataset = pd.read_csv('1429_1.csv')

### Data Exploration

In [3]:
print("Number of features: {}".format(dataset.shape[1]))
print("Number of samples: {}".format(dataset.shape[0]))

Number of features: 21
Number of samples: 34660


In [4]:
dataset.head(3)

Unnamed: 0,id,name,asins,brand,categories,keys,manufacturer,reviews.date,reviews.dateAdded,reviews.dateSeen,...,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username
0,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,This product so far has not disappointed. My c...,Kindle,,,Adapter
1,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,great for beginner or experienced person. Boug...,very fast,,,truman
2,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,Inexpensive tablet for him to use and learn on...,Beginner tablet for our 9 year old son.,,,DaveZ


In [5]:
dataset.columns

Index(['id', 'name', 'asins', 'brand', 'categories', 'keys', 'manufacturer',
       'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen',
       'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.userCity',
       'reviews.userProvince', 'reviews.username'],
      dtype='object')

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews.sourceURLs 

### From above dataset we choose two columns 'reviews.text' , 'reviews.rating' for NLP job

In [7]:
df = dataset[['reviews.text','reviews.rating']]

In [8]:
df.head()

Unnamed: 0,reviews.text,reviews.rating
0,This product so far has not disappointed. My c...,5.0
1,great for beginner or experienced person. Boug...,5.0
2,Inexpensive tablet for him to use and learn on...,5.0
3,I've had my Fire HD 8 two weeks now and I love...,4.0
4,I bought this for my grand daughter when she c...,5.0


#### Dealing with missing values

In [9]:
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)

Unnamed: 0,Total,Percent
reviews.rating,33,0.000952
reviews.text,1,2.9e-05


In [10]:
df.dropna(inplace=True)

In [11]:
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)

Unnamed: 0,Total,Percent
reviews.rating,0,0.0
reviews.text,0,0.0


### Data Preprocessing

### Data Clean

In [12]:
sample = df.iloc[0,0]

##### Lowercase

In [13]:
sample = str(sample).lower()
sample

'this product so far has not disappointed. my children love to use it and i like the ability to monitor control what content they see with ease.'

##### Strip white space

In [14]:
sample = str(sample).strip()
sample

'this product so far has not disappointed. my children love to use it and i like the ability to monitor control what content they see with ease.'

##### Remove HTML tags/markups

In [15]:
sample = re.compile('<.*?>').sub('', str(sample))
sample

'this product so far has not disappointed. my children love to use it and i like the ability to monitor control what content they see with ease.'

##### remove punctuation

In [16]:
sample = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', str(sample))
sample

'this product so far has not disappointed  my children love to use it and i like the ability to monitor control what content they see with ease '

##### Remove extra space and tabs

In [17]:
sample = re.sub('\s+', ' ', str(sample))
sample

'this product so far has not disappointed my children love to use it and i like the ability to monitor control what content they see with ease '

#### Stop word removal

In [18]:
filtered_sentence = []
# Stop word lists can be adjusted for your problem
stop_words = set(stopwords.words('english'))


# Tokenize the sentence
words = word_tokenize(sample)
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
sample = " ".join(filtered_sentence)

sample

'product far disappointed children love use like ability monitor control content see ease'

#### Stemming

In [19]:
# Initialize the stemmer
snow = SnowballStemmer('english')
stemmed_sentence = []

# Tokenize the sentence
words = word_tokenize(sample)
for w in words:
    # Stem the word/token
    stemmed_sentence.append(snow.stem(w))
sample = " ".join(stemmed_sentence)
sample

'product far disappoint children love use like abil monitor control content see eas'

#### Lemmatization

In [20]:
# Initialize the lemmatizer
wl = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

lemmatized_sentence = []

# Tokenize the sentence
words = word_tokenize(sample)
# Get position tags
word_pos_tags = nltk.pos_tag(words)
# Map the position tag and lemmatize the word/token
for idx, tag in enumerate(word_pos_tags):
    lemmatized_sentence.append(wl.lemmatize(tag[0], get_wordnet_pos(tag[1])))

sample = " ".join(lemmatized_sentence)
sample

'product far disappoint child love use like abil monitor control content see ea'

## Clean Data Preprocessing step

In [21]:
filtered_sentence = []
stemmed_sentence = []
lemmatized_sentence = []
stop_words = set(stopwords.words('english'))
snow = SnowballStemmer('english')
wl = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


for i in range (df.shape[0]):
    Text = str(df.iloc[i,0])
    Text = Text.lower()
    Text = Text.strip()
    Text = re.compile('<.*?>').sub('', Text)
    Text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', Text)
    Text = re.sub('\s+', ' ', Text)
    #tokenize
    words = word_tokenize(Text)
    for w in words:
        if w not in stop_words:
            filtered_sentence.append(w)
    Text = " ".join(filtered_sentence)
    filtered_sentence.clear()
    #stemming
    words = word_tokenize(Text)
    for w in words:
        stemmed_sentence.append(snow.stem(w))
    Text = " ".join(stemmed_sentence)
    stemmed_sentence.clear()
    #Lemmatization
    words = word_tokenize(Text)
    # Get position tags
    word_pos_tags = nltk.pos_tag(words)
    # Map the position tag and lemmatize the word/token
    for idx, tag in enumerate(word_pos_tags):
        lemmatized_sentence.append(wl.lemmatize(tag[0], get_wordnet_pos(tag[1])))
    Text = " ".join(lemmatized_sentence)
    lemmatized_sentence.clear()
    df.iloc[i,0]=Text

## Vectorizer

### Separating label and Feature

In [22]:
X = df['reviews.text']
y = df['reviews.rating']

In [23]:
# Map class 1 , 2 as Negative (-1) ---- class 3 as Neutral (0) ---- class 4 , 5 as Positive (1)
y[y==1]=0
y[y==2]=0
y[y==3]=0
y[y==4]=1
y[y==5]=1
y

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
34655    0.0
34656    0.0
34657    0.0
34658    0.0
34659    0.0
Name: reviews.rating, Length: 34626, dtype: float64

### TF - IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = X

vectorizer = TfidfVectorizer()
vectorizer.fit(texts)
features = vectorizer.transform(texts)

X_N = pd.DataFrame(features.toarray(), columns=vectorizer.get_feature_names())

X_N

Unnamed: 0,00,000,000s,01,04,05,06,0ff,0ghz,0ne,...,zigbe,zinio,zip,zipper,zippi,zombi,zone,zoo,zoom,zwave
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model

In [25]:
from sklearn.model_selection import train_test_split

# Get 80% of the data for training; the remaining 20% will be for validation and test
X_train, X_test, y_train, y_test = train_test_split(X_N, y, test_size=0.2,random_state=42)

print(f"Length of train_features is: {X_train.shape}")
print(f"Length of train_labels is: {y_train.shape}")
print(f"Length of test_features is: {X_test.shape}")
print(f"Length of test_labels is: {y_test.shape}")

Length of train_features is: (27700, 9439)
Length of train_labels is: (27700,)
Length of test_features is: (6926, 9439)
Length of test_labels is: (6926,)


## XGB

In [26]:
from xgboost import XGBClassifier
classifier_XGB = XGBClassifier()
classifier_XGB.fit(X_train, y_train)

# Predicting the Test set results
y_pred_XGB = classifier_XGB.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_XGB)



array([[  79,  356],
       [  42, 6449]], dtype=int64)

## Model Result

In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred_XGB)

0.9425353739532197

In [28]:
from sklearn.metrics import f1_score, precision_score, recall_score

print("F1-Score : {} ".format(f1_score(y_test, y_pred_XGB, average="macro")))
print("Precision : {} ".format(precision_score(y_test, y_pred_XGB, average="macro")))
print("Recall : {} " .format(recall_score(y_test, y_pred_XGB, average="macro")))  

F1-Score : 0.6271194235946982 
Precision : 0.8002890436662395 
Recall : 0.5875693488951103 
