In [32]:
# importing necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
import pickle

In [2]:
# Read the tab-separated review file into a DataFrame.
# quoting = 3 is csv.QUOTE_NONE, so quote characters inside reviews stay literal text.
data = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep = '\t',
    quoting = 3,
)

## Exploratory Data Analysis

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(1000, 2)

In [5]:
# Column dtypes and non-null counts, used here to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [6]:
# Column labels of the loaded frame.
data.columns

Index(['Review', 'Liked'], dtype='object')

In [7]:
# Tally the 'Liked' labels once, then look up the counts for labels 0 and 1.
liked_counts = data['Liked'].value_counts()
zero, one = liked_counts.get(0), liked_counts.get(1)

In [8]:
# Show both class counts, one per line, to check the label balance.
print(zero, one, sep='\n')

500
500


## Data Preprocessing

In [9]:
# Download the NLTK stopword corpus that the text-cleaning step reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bgaut\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Inspect the English stopword list that will be filtered out of each review.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
# Create the Porter stemmer instance; the stemming itself is applied later,
# word by word, inside the cleaning function.
ps = PorterStemmer()

In [12]:
def stemming(content):
    """Clean one review and return its stemmed, stopword-free words as a string.

    The text is processed in this order: every character that is not an ASCII
    letter is replaced by a space, the result is lower-cased and split on
    whitespace, NLTK English stopwords are dropped, and each remaining word is
    reduced with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    content : str
        Raw review text.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when no
        word survives the filtering.

    Notes
    -----
    NOTE(review): the NLTK English stopword list contains negations such as
    'no', 'nor' and 'not', so they are removed here (for example
    "Crust is not good." becomes "crust good"). Confirm that this is intended
    for a sentiment task.
    """
    # Build the lookup once per call: the stopword list does not change between
    # words, and set membership avoids scanning the whole list for every word.
    stop_words = set(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    words = letters_only.lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in stop_words)

In [13]:
# Replace every raw review with its cleaned, stemmed form (overwrites the column).
data['Review'] = data['Review'].map(stemming)

In [14]:
# Preview the reviews after cleaning and stemming.
data.head()

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust good,0
2,tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1


In [15]:
# Pull the review text (X) and the labels (Y) out of the frame as NumPy arrays.
X, Y = (data[column].values for column in ('Review', 'Liked'))

In [16]:
# Display the cleaned review strings that will be vectorized.
print(X)

['wow love place' 'crust good' 'tasti textur nasti'
 'stop late may bank holiday rick steve recommend love'
 'select menu great price' 'get angri want damn pho' 'honeslti tast fresh'
 'potato like rubber could tell made ahead time kept warmer' 'fri great'
 'great touch' 'servic prompt' 'would go back'
 'cashier care ever say still end wayyy overpr'
 'tri cape cod ravoli chicken cranberri mmmm'
 'disgust pretti sure human hair' 'shock sign indic cash'
 'highli recommend' 'waitress littl slow servic'
 'place worth time let alon vega' 'like' 'burritto blah' 'food amaz'
 'servic also cute' 'could care less interior beauti' 'perform'
 'right red velvet cake ohhh stuff good' 'never brought salad ask'
 'hole wall great mexican street taco friendli staff'
 'took hour get food tabl restaur food luke warm sever run around like total overwhelm'
 'worst salmon sashimi' 'also combo like burger fri beer decent deal'
 'like final blow' 'found place accid could happier'
 'seem like good quick place gr

In [17]:
# Display the label array taken from the 'Liked' column.
print(Y)

[1 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1
 0 1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 0
 0 0 0 1 1 0 0 0 0 1 0 1 0 1 1 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0
 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0
 0 0 1 1 0 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0 1
 1 0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 0 0
 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 1 0 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1
 0 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1
 0 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1
 0 1 1 0 0 1 0 0 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 0 1 1 1
 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0 1
 1 1 0 1 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 0 1 1 0
 1 1 0 0 1 1 0 1 0 0 0 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 1 0
 1 0 0 1 1 1 0 0 1 1 1 0 

In [18]:
# Convert the cleaned review text into a sparse TF-IDF feature matrix.
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split that follows, so its vocabulary and IDF weights also reflect the test
# rows. Fitting on the training rows only would avoid that leakage.
vectorizer = TfidfVectorizer()
# Rebinds X from the array of strings to the feature matrix.
X = vectorizer.fit_transform(X)

In [20]:
# Show the sparse TF-IDF matrix as "(row, feature index)  weight" entries.
print(X)

  (0, 1028)	0.37891311005388717
  (0, 802)	0.5161133452017592
  (0, 1547)	0.7681483384958535
  (1, 594)	0.44530826402032975
  (1, 324)	0.895377322694293
  (2, 900)	0.6095408468238828
  (2, 1374)	0.6095408468238828
  (2, 1362)	0.5068726783980841
  (3, 1115)	0.25693127784313774
  (3, 1304)	0.37344829688459497
  (3, 1149)	0.37344829688459497
  (3, 667)	0.37344829688459497
  (3, 92)	0.37344829688459497
  (3, 834)	0.31658892883770084
  (3, 762)	0.3524631959077566
  (3, 1311)	0.31658892883770084
  (3, 802)	0.22681356327865146
  (4, 1062)	0.49789374877517795
  (4, 608)	0.3895279157461649
  (4, 852)	0.5294243460432808
  (4, 1205)	0.5657558480490441
  (5, 1016)	0.45159844554308565
  (5, 333)	0.4770813062326693
  (5, 1497)	0.38439817982521884
  (5, 33)	0.546476954759213
  :	:
  (997, 703)	0.5274480716124954
  (997, 964)	0.5373607577988255
  (997, 83)	0.35731793309667187
  (997, 588)	0.3542827182320424
  (997, 1545)	0.42408678118961624
  (998, 916)	0.43090862952756726
  (998, 1523)	0.406694672259

In [21]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size = 0.2, random_state = 2)
X_train, X_test, Y_train, Y_test = split_parts

## Model Training

### Logistic Regression

In [22]:
# Logistic regression classifier created with scikit-learn's default settings.
model = LogisticRegression()

In [23]:
# Fit the classifier on the training split.
model.fit(X_train, Y_train)

## Model Evaluation

In [24]:
# Predict labels for the held-out test split.
Y_pred = model.predict(X_test)

In [25]:
# Tabulate the true test labels against the predicted labels.
conf_matrix = metrics.confusion_matrix(y_true = Y_test, y_pred = Y_pred)

In [31]:
# Show the confusion matrix computed for the test split.
print(conf_matrix)

[[91 17]
 [18 74]]


In [30]:
# Per-class precision, recall and F1 for the test split.
report_text = classification_report(Y_test, Y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.83      0.84      0.84       108
           1       0.81      0.80      0.81        92

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200



In [33]:
# Save the trained model into a pkl file. The context manager closes the file
# as soon as the dump finishes instead of leaving the handle to be reclaimed later.
with open("model.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

# The classifier was trained on TF-IDF features, so raw text can only be scored
# with the same fitted vectorizer; persist it next to the model.
with open("vectorizer.pkl", 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)