In [5]:
############ Importing Required Libraries #############
import pandas as pd
import numpy as np 
import nltk
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score


# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are read as ordinary characters.
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t", quoting=3)

# Stemmer applied to every kept word during cleaning.
pe = PorterStemmer()

# English stop words, with 'not' removed from the list so that negation
# survives the stop-word filter.
all_stopword = stopwords.words('english')
all_stopword.remove('not')

# Cleaned review strings; filled by the cleaning loop that follows.
corpus = []

### Clean Data
# Built once, outside the loop: the original rebuilt set(all_stopword) for
# every single word and re-looked-up the regex for every review.
non_letter_pattern = re.compile(r'[^a-zA-Z]')
stopword_lookup = set(all_stopword)

# Iterate the column values directly instead of indexing by position, so the
# loop does not depend on the frame having a 0..n-1 index.
for raw_review in dataset["Review"]:
    # Replace every non-letter character with a space, lowercase, and split
    # on whitespace into individual words.
    words = non_letter_pattern.sub(" ", raw_review).lower().split()

    # Drop stop words; stem each word that is kept.
    stemmed_words = [pe.stem(word) for word in words if word not in stopword_lookup]

    # Store the cleaned review as one space-separated string.
    review = " ".join(stemmed_words)
    corpus.append(review)

#### Vectorization
# Bag-of-words counts over the cleaned corpus, capped at 1500 features (columns).
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()      # convert the fit_transform result to an array
y = dataset["Liked"]           # target column

# Hold out 20% of the rows for testing; random_state=0 keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Shape check; as a non-final expression it has no visible effect.
X_train.shape, y_test.shape

#### Preparing Model
# Two Naive Bayes variants are fitted on the same training split.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

cls = MultinomialNB()
cls.fit(X_train, y_train)

### Generating Predictions
# Predictions come from the multinomial model only.
y_pred = cls.predict(X_test)

### Generating Confusion Matrix
# Both metrics compare the held-out labels with the predictions above.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
score = accuracy_score(y_true=y_test, y_pred=y_pred)

### Saving Models For Future Use
# Context managers make sure each file is flushed and closed as soon as the
# dump finishes, rather than leaving the open() handle to garbage collection.
with open('cv.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)            ## fitted CountVectorizer
with open("review.pkl", "wb") as model_file:
    pickle.dump(cls, model_file)        ## fitted MultinomialNB model

### Loading Model
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source (here, the file written just above).
with open("review.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.predict(X_test)

[nltk_data] Downloading package stopwords to /Users/kevin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1])