## CUSTOMER REVIEW CLASSIFICATION

In [2]:
# import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [3]:
# Load the tab-separated restaurant reviews into a DataFrame
reviews_path = "Restaurant_Reviews.tsv"
data = pd.read_csv(reviews_path, sep="\t")

In [4]:
# Peek at the first five rows to see the column layout
data.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# Checking the shape of the dataset: (number of rows, number of columns)
data.shape

(1000, 2)

In [6]:
# Count missing values per column. `isnull().sum()` already yields a
# one-dimensional Series (one entry per column), so no transpose is needed.
data.isnull().sum()

Review    0
Liked     0
dtype: int64

In [7]:
# Class balance: how many reviews carry each `Liked` label
liked_labels = data['Liked']
liked_labels.value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

In [8]:
# New column: length of each review in characters
data['Character_Count'] = data['Review'].map(len)

In [10]:
# New column: number of whitespace-separated words in each review
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


data["word_count"] = data['Review'].map(_count_words)

In [13]:
import nltk

# Fetch the NLTK resources this notebook relies on:
#   - 'punkt'     : tokenizer models used by nltk.sent_tokenize
#   - 'stopwords' : corpus read via stopwords.words("english"); without it that
#                   call raises LookupError on a fresh environment
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\espym.LAPTOP-
[nltk_data]     41F90NSA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
# New column: number of sentences per review, as segmented by nltk.sent_tokenize
data['sentence_count'] = [
    len(nltk.sent_tokenize(str(text))) for text in data['Review']
]

In [18]:
# Average review length in characters among positive reviews (Liked == 1)
is_positive = data['Liked'] == 1
data.loc[is_positive, 'Character_Count'].mean()

55.88

In [19]:
# Average review length in characters among negative reviews (Liked == 0)
is_negative = data['Liked'] == 0
data.loc[is_negative, 'Character_Count'].mean()

60.75

## Text Cleaning

In [20]:
import re

In [24]:
# Show the raw text of the review labelled 0
data.loc[0, "Review"]

'Wow... Loved this place.'

Some of the reviews contain punctuation and other symbols, so we will replace every non-alphabetic character with a space.

In [81]:
# Replace every character that is not an ASCII letter with a space
non_letter = re.compile("[^a-zA-Z]")
review = non_letter.sub(' ', data['Review'][1])
review

'Crust is not good '

In [82]:
# Convert the cleaned review text to lower case
review =  review.lower()

In [83]:
# Split on whitespace into a list of tokens. Note: `review` changes from str to
# list here, so re-running this cell on its own raises AttributeError.
review = review.split()

In [84]:
# Display the tokenised review
review

['crust', 'is', 'not', 'good']

In [85]:
from nltk.corpus import stopwords

In [86]:
# English stop words from NLTK with "not" taken out, so the word "not"
# survives stop-word filtering.
all_stop_words = list(stopwords.words("english"))
del all_stop_words[all_stop_words.index("not")]

In [87]:
# Drop stop words from the token list. The lookup set is built once, outside the
# comprehension, instead of being rebuilt from the list for every token.
stop_word_lookup = set(all_stop_words)
review = [word for word in review if word not in stop_word_lookup]
review

['crust', 'not', 'good']

## Stemming

In [54]:
from nltk.stem.porter import PorterStemmer

In [55]:
# Porter stemmer instance; its `stem` method is applied to each token below
ps = PorterStemmer()

In [56]:
# Stem every token, keeping the original order
review = list(map(ps.stem, review))

In [57]:
# Display the stemmed tokens.
# NOTE(review): the saved output below (['crust', 'good']) lacks 'not', although the
# stop-word filtering cell yields ['crust', 'not', 'good'], and this section's
# execution counts (54-57) come before that cell's (81-87). The output looks
# stale; restart the kernel and run all cells to confirm.
review

['crust', 'good']