STEPS:

1) Data Cleaning

2) Exploratory Data Analysis

3) Data Preprocessing

4) Model Building

5) Model Evaluation

6) Selecting the Best ML model

In [1]:
# importing libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
from wordcloud import WordCloud

In [2]:
# importing natural language toolkit libraries

import re
import string
import missingno
import pandas_profiling
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import FreqDist


In [3]:
# other imports

from scipy import stats
from scipy.stats import zscore
from scipy.sparse import hstack
import scikitplot as skplt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, precision_score, confusion_matrix, accuracy_score, classification_report

In [4]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

In [5]:
# warning imports

import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
import joblib


In [6]:
lemmatizer = nltk.stem.WordNetLemmatizer()

DATA CLEANING AND EDA

In [7]:
df = pd.read_csv("Review_Rating_Datafile.csv")

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,Review_title,Review_text,Ratings
0,0,Suitable for School kids,\n If you are a College student or a professi...,2.0 out of 5 stars
1,1,Misrepresentation on MS Office 2019 license - ...,\n Update after one month usage - MS Office 2...,2.0 out of 5 stars
2,2,The sold me renewed laptop,\n It’s look like renewed laptop because lapt...,2.0 out of 5 stars
3,3,Amazon dupes with specification/ battery sucks,\n &nbsp;I had seen the specifications and bo...,2.0 out of 5 stars
4,4,Display back light issue,\n Display gone with 2 months.. But anyway th...,2.0 out of 5 stars


In [9]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Review_title,Review_text,Ratings
77545,77545,Nice product,good product,4
77546,77546,Awesome,Very good as expected and happy with the purchase,5
77547,77547,Awesome,I love it! No complaint!,5
77548,77548,Nice product,good product,4
77549,77549,Awesome,Very good as expected and happy with the purchase,5


In [10]:
# deleting the column 'unnamed'

df.drop(columns = 'Unnamed: 0', inplace = True)

In [11]:
df.head(20)

Unnamed: 0,Review_title,Review_text,Ratings
0,Suitable for School kids,\n If you are a College student or a professi...,2.0 out of 5 stars
1,Misrepresentation on MS Office 2019 license - ...,\n Update after one month usage - MS Office 2...,2.0 out of 5 stars
2,The sold me renewed laptop,\n It’s look like renewed laptop because lapt...,2.0 out of 5 stars
3,Amazon dupes with specification/ battery sucks,\n &nbsp;I had seen the specifications and bo...,2.0 out of 5 stars
4,Display back light issue,\n Display gone with 2 months.. But anyway th...,2.0 out of 5 stars
5,Battery life,\n Battery backup give only one hour . What d...,2.0 out of 5 stars
6,No for professional works,\n Blue screen while power on the linux Opera...,2.0 out of 5 stars
7,monitor power issue,\n monitor is not turning on from 2 weeks of ...,2.0 out of 5 stars
8,waste,\n waste of money sometimes screen was marks\n,2.0 out of 5 stars
9,Not able to download warranty card,\n Good\n,2.0 out of 5 stars


In [12]:
# more information about our data

df.shape

(77550, 3)

we have 77550 rows and 3 columns. More data results in better results.

we can see we have some missing data in our datasets, we will have to treat that.

In [13]:
# checking for missing values

df.isna().sum()

Review_title    9029
Review_text     8085
Ratings         9027
dtype: int64

In [14]:
#visual representation of missing values in the dataset

missingno.matrix(df, figsize = (15,8), color=(0.25, 0.75, 0.25), fontsize=18)

<AxesSubplot:>

In [15]:
# removing rows with missing data

df.dropna(inplace=True)

In [16]:
df.shape

(68294, 3)

earlier we had 77550 rows now after deleting the missing data we have 68294 rows

In [17]:
# checking to see if there are more null values

df.isnull().sum()

Review_title    0
Review_text     0
Ratings         0
dtype: int64

In [18]:
df['Review_title'].unique()

array(['Suitable for School kids',
       'Misrepresentation on MS Office 2019 license - it is one month trial',
       'The sold me renewed laptop', ..., 'sufficient for gaming',
       'Design looks very good',
       'Great Monitor in this Price Range, with all Features.'],
      dtype=object)

In [19]:
df['Ratings'].unique()

array(['2.0 out of 5 stars', '3.0 out of 5 stars', '1.0 out of 5 stars',
       '5.0 out of 5 stars', '4.0 out of 5 stars', '5', '1', '4', '3',
       '2'], dtype=object)

In [20]:
df['Ratings'] = df['Ratings'].replace('1.0 out of 5 stars',1)
df['Ratings'] = df['Ratings'].replace('2.0 out of 5 stars',2)
df['Ratings'] = df['Ratings'].replace('3.0 out of 5 stars',3)
df['Ratings'] = df['Ratings'].replace('4.0 out of 5 stars',4)
df['Ratings'] = df['Ratings'].replace('5.0 out of 5 stars',5)
df['Ratings'] = df['Ratings'].astype('int')
df['Ratings'].unique()

array([2, 3, 1, 5, 4])

In [21]:
# Now combining the "Review_title" and "Review_text" columns into one single column called "Review"

df['Review'] = df['Review_title'].map(str)+' '+df['Review_text']
df

Unnamed: 0,Review_title,Review_text,Ratings,Review
0,Suitable for School kids,\n If you are a College student or a professi...,2,Suitable for School kids \n If you are a Coll...
1,Misrepresentation on MS Office 2019 license - ...,\n Update after one month usage - MS Office 2...,2,Misrepresentation on MS Office 2019 license - ...
2,The sold me renewed laptop,\n It’s look like renewed laptop because lapt...,2,The sold me renewed laptop \n It’s look like ...
3,Amazon dupes with specification/ battery sucks,\n &nbsp;I had seen the specifications and bo...,2,Amazon dupes with specification/ battery sucks...
4,Display back light issue,\n Display gone with 2 months.. But anyway th...,2,Display back light issue \n Display gone with...
...,...,...,...,...
77545,Nice product,good product,4,Nice product good product
77546,Awesome,Very good as expected and happy with the purchase,5,Awesome Very good as expected and happy with t...
77547,Awesome,I love it! No complaint!,5,Awesome I love it! No complaint!
77548,Nice product,good product,4,Nice product good product


In [22]:
df['Review'][0]


'Suitable for School kids \n  If you are a College student or a professional who depends heavily on laptop for pretty much everyday then  this laptop is not for you. It hangs more often than it runs. Cannot install essential computer science software such as Eclipse or android studio because then this laptop just dies.<br><br>I took this laptop thinking that it will have good performance based on its configuration.<br><br>But if you are someone who wants to attend online classes or just browse, then you may go for this laptop.\n'

In [23]:
df['Review'][1]


'Misrepresentation on MS Office 2019 license - it is one month trial \n  Update after one month usage - MS Office 2019 preinstalled edition became unlicensed version within 30 days and refused to activate. HP support shared their product sheet that stated that this laptop came only with a trial version of MS Office 2019. Now I am chasing seller Appario Retail to honour the commitment.<br><br>Bad experience after paying high price (around ₹67,000) for preinstalled genuine software.<br><br>Other wise product has sleek looks, superfast fingerprint scanner, very good display, very responsive touchpad, decent (but tinny) speakers, USB C and barrel plug charging options and under 10 second boot up (SSD drive). Preloaded with Win10 Home and MS Office (not sure whether trial or full version). Drawbacks - weighs 1.42 kg, base gets warm rather quickly and so does the power adapter. Decent for a college student.\n'

In [24]:
df['Review'][2]


'The sold me renewed laptop \n  It’s look like renewed laptop because laptop charging jack is loose and left side speaker sounds like it’s tear. Bad sound quality  and finger sensor quality look like 2011 touch phone of wing and any Chinese phone touch. Waste of money and one more thing day by day Amazon product quality getting poor\n'

In [25]:
def decontracted(text):
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don’t", "do not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"im ", "i am", text)
    text = re.sub(r"yo ", "you ",text)
    text = re.sub(r"doesn’t", "does not",text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"<br>", " ", text)
    text = re.sub(r'http\S+', '', text) #removing urls
    return text

# Lowercasing the alphabets
df['Review'] = df['Review'].apply(lambda x : x.lower())
df['Review'] = df['Review'].apply(lambda x : decontracted(x))

# Removing punctuations from the review
df['Review'] = df['Review'].str.replace('[^\w\s]','')
df['Review'] = df['Review'].str.replace('\n',' ')

In [26]:
# Removing all the stopwords

stop = stopwords.words('english')
df['Review'] = df['Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [27]:
df['Review'][0]


'suitable school kids college student professional depends heavily laptop pretty much everyday laptop hangs often runs cannot install essential computer science software eclipse android studio laptop dies took laptop thinking good performance based configuration someone wants attend online classes browse may go laptop'

In [28]:
# Defining functiom to convert nltk tag to wordnet tags
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
# Defining function to lemmatize our text
def lemmatize_sentence(sentence):
    # tokenize the sentence and find the pos_tag
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    # tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x : (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatize_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatize_sentence.append(word)
        else:
            lemmatize_sentence.append(lemmatizer.lemmatize(word,tag))
    return " ".join(lemmatize_sentence)    

df['Review'] = df['Review'].apply(lambda x : lemmatize_sentence(x))

In [29]:
df['Review'][0]

'suitable school kid college student professional depend heavily laptop pretty much everyday laptop hang often run can not install essential computer science software eclipse android studio laptop dy take laptop thinking good performance base configuration someone want attend online class browse may go laptop'

In [30]:
# Noise removal function
def scrub_words(text):
    # remove HTML markup
    text = re.sub("(<.*?>)", "", text)
    # remove non-ascii and digits
    text = re.sub("(\\W)", " ", text)
    text = re.sub("(\\d)", "", text)
    # remove white space
    text = text.strip()
    return text

df['Review'] = df['Review'].apply(lambda x : scrub_words(x))

In [31]:
df['Review'][0]

'suitable school kid college student professional depend heavily laptop pretty much everyday laptop hang often run can not install essential computer science software eclipse android studio laptop dy take laptop thinking good performance base configuration someone want attend online class browse may go laptop'

In [32]:
# Creating column for word counts in the review text

df['Review_WC'] = df['Review'].apply(lambda x: len(str(x).split(' ')))
df[['Review_WC', 'Review']].head(10)

Unnamed: 0,Review_WC,Review
0,44,suitable school kid college student profession...
1,103,misrepresentation m office license one month ...
2,43,sell renew laptop look like renew laptop lapto...
3,21,amazon dupe specification battery suck nbspi s...
4,16,display back light issue display go month any...
5,7,battery life battery backup give one hour
6,18,professional work blue screen power linux oper...
7,8,monitor power issue monitor turn week purchase
8,6,waste waste money sometimes screen mark
9,5,able download warranty card good


In [33]:
# Density plot and histogram of all word count
plt.figure(figsize=(10,7))
sns.distplot(df['Review_WC'], hist = True, kde = True,
            bins = int(180/5), color = 'darkblue',
            hist_kws = {'edgecolor':'black'},
            kde_kws = {'linewidth':4})
plt.show()

In [34]:
# Creating column for character counts in the review text
df['Review_CC'] = df['Review'].str.len()
df[['Review_CC','Review']].head(10)

Unnamed: 0,Review_CC,Review
0,309,suitable school kid college student profession...
1,654,misrepresentation m office license one month ...
2,250,sell renew laptop look like renew laptop lapto...
3,126,amazon dupe specification battery suck nbspi s...
4,102,display back light issue display go month any...
5,41,battery life battery backup give one hour
6,125,professional work blue screen power linux oper...
7,47,monitor power issue monitor turn week purchase
8,39,waste waste money sometimes screen mark
9,32,able download warranty card good


In [35]:
# Density plot and histogram of all character count
plt.figure(figsize=(10,7))
sns.distplot(df['Review_CC'], hist = True, kde = True,
            bins = int(180/5), color = 'darkblue',
            hist_kws = {'edgecolor':'black'},
            kde_kws = {'linewidth':4})
plt.show()