## Imports

In [14]:
import nltk
import numpy as np
import pandas as pd

In [16]:
# Make sure the NLTK stop-word list used by the preprocessing cells is
# installed. Only that one corpus is fetched, and only when it is missing, so
# a fresh kernel can run top to bottom without downloading every NLTK dataset.
try:
  nltk.data.find('corpora/stopwords')
except LookupError:
  nltk.download('stopwords')

In [20]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [29]:
import re

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

## Read data

In [17]:
# Location of the tab-separated restaurant-review file (columns: Review, Liked).
REVIEWS_URL = 'https://raw.githubusercontent.com/futurexskill/ml-model-deployment/main/Restaurant_Reviews.tsv.txt'

# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as
# ordinary text instead of being treated as field delimiters.
dataset = pd.read_csv(
    REVIEWS_URL,
    sep="\t",
    quoting=3,
)

In [18]:
# Preview the first rows to confirm the file parsed into Review / Liked columns.
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [22]:
# Check row count, dtypes and missing values before preprocessing.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


## Text preprocessing

In [21]:
# Single Porter stemmer instance shared by the corpus-cleaning loop and the
# ad-hoc sample checks further down.
ps = PorterStemmer()

In [35]:
# Build the cleaned corpus, one string per review:
#   1. lower-case and replace every non-letter with a space,
#   2. split into tokens,
#   3. drop English stop words and Porter-stem what is left,
#   4. join the stems back into a single space-separated string.
#
# The stop-word set is built once up front; looking it up per token would
# re-read and re-hash the whole list for every word of every review.
# NOTE(review): the stop list also removes negations (the "didn't" in the
# spot-check example disappears), which may hurt sentiment accuracy - confirm
# whether negations should be kept.
stop_words = set(stopwords.words("english"))

corpus = []
# Iterate over the column itself so the loop follows the real number of rows
# instead of assuming exactly 1000.
for review in dataset['Review']:
  tokens = re.sub('[^a-zA-Z]', ' ', review.lower()).split()
  stems = [ps.stem(token) for token in tokens if token not in stop_words]
  corpus.append(' '.join(stems))

In [34]:
# Illustration of the first cleaning steps on review 0: non-letters become
# spaces, text is lower-cased, then split on whitespace.
re.sub('[^a-zA-Z]', ' ', dataset['Review'][0].lower()).split()

['wow', 'loved', 'this', 'place']

In [46]:
# Spot check: show one lower-cased raw review next to its cleaned, stemmed form.
i=12
dataset['Review'][i].lower() , corpus[i]

("honeslty it didn't taste that fresh.)", 'honeslti tast fresh')

In [48]:
# TF-IDF features over the cleaned corpus:
#   max_features=1500 - keep at most the 1500 most frequent terms,
#   min_df=3          - ignore terms that occur in fewer than 3 reviews,
#   max_df=0.6        - ignore terms that occur in more than 60% of reviews.
vectorizer = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.6)

In [49]:
# Learn the vocabulary / IDF weights and turn the corpus into a dense matrix
# with one row per review.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so vocabulary and IDF statistics include the held-out
# reviews. Fitting on the training rows only would give a cleaner evaluation.
X = vectorizer.fit_transform(corpus).toarray()

In [54]:
# Target labels from the 'Liked' column. Selecting by name rather than by
# position keeps this correct if the column layout of the file ever changes.
y = dataset['Liked'].values

## Modelling

In [56]:
# Hold out 20% of the reviews for evaluation; random_state=0 fixes the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [59]:
# K-nearest-neighbours classifier: vote among the 5 closest training reviews.
# The Minkowski metric with p=2 is the ordinary Euclidean distance.
cl_knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
cl_knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [60]:
# Predicted labels for the held-out reviews.
y_pred_knn = cl_knn.predict(X_test)

In [65]:
# Confusion matrix: rows are the true labels (0, 1), columns the predicted ones.
# In the recorded output the second row shows 74 of the 103 liked reviews
# predicted as not liked, i.e. the model misses most positive reviews.
cmknn = confusion_matrix(y_test, y_pred_knn)
cmknn

array([[87, 10],
       [74, 29]])

In [67]:
# Share of held-out reviews whose predicted label matches the true label.
print(f'KNN accuracy: {accuracy_score(y_test, y_pred_knn)}')

KNN accuracy: 0.58


## Check model on samples

In [94]:
# Vectorize an ad-hoc sentence using the same cleaning steps as the training
# corpus: non-letters replaced by spaces, lower-casing, stop-word removal and
# Porter stemming. Without the non-letter step, a token with punctuation
# attached (e.g. "loved.") would not be stemmed the way the training data was.
sample_text = "Good batting by England"

# Build the stop-word set once rather than once per token.
stop_words = set(stopwords.words("english"))

sample_tokens = re.sub('[^a-zA-Z]', ' ', sample_text.lower()).split()
sample_clean = " ".join(ps.stem(word) for word in sample_tokens if word not in stop_words)
print(sample_clean)

# `sample` holds the TF-IDF row that the following prediction cell reads.
sample = vectorizer.transform([sample_clean]).toarray()
sample

good bat england


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [95]:
# Predicted 'Liked' label for the vectorized sample.
sentiment = cl_knn.predict(sample)
sentiment

array([1])

In [97]:
# Vectorize a second ad-hoc sentence using the same cleaning steps as the
# training corpus: non-letters replaced by spaces, lower-casing, stop-word
# removal and Porter stemming, so its tokens match the fitted vocabulary.
sample2 = 'bad perfomance by India in the match'

# Build the stop-word set once rather than once per token.
stop_words = set(stopwords.words("english"))

sample2_tokens = re.sub('[^a-zA-Z]', ' ', sample2.lower()).split()
sample2_clean = " ".join(ps.stem(word) for word in sample2_tokens if word not in stop_words)
print(sample2_clean)

# `sample` holds the TF-IDF row that the following prediction cell reads.
sample = vectorizer.transform([sample2_clean]).toarray()
sample

bad perfom india match


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [98]:
# Predicted 'Liked' label for the second vectorized sample.
sentiment = cl_knn.predict(sample)
sentiment

array([0])

## Save model

In [99]:
import pickle

In [100]:
# Persist the fitted KNN classifier so it can be loaded without retraining.
# Pickle files should only ever be loaded from a trusted source.
with open('classifier.pickle', 'wb') as classifier_file:
  pickle.dump(cl_knn, classifier_file)

In [101]:
# Persist the fitted TF-IDF vectorizer; new text must be transformed with this
# same object before it is passed to the saved classifier.
with open('tfidfmodel.pickle', 'wb') as vectorizer_file:
  pickle.dump(vectorizer, vectorizer_file)