In [17]:
#Importing Libraries
import numpy as np
import pandas as pd
from matplotlib import style

#Importing necessary libraries for Natural Language Processing on the dataset
import nltk  #Natural Language Toolkit
import re  #For regular expressions
nltk.download('stopwords')
from nltk.corpus import stopwords  #stopwords are those which are meant to be ignored
from nltk.stem.porter import PorterStemmer  #Data Mining

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Restaurant_Reviews.tsv',delimiter='\t',quoting=3)

In [30]:
data['Liked'].value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

In [32]:
#Now we'll clean the reviews
corpus=[]
for i in range(1000):

  #Cleaning special characters
  review = re.sub(pattern='[^a-zA-Z]',repl=' ',string=data['Review'][i])

  #Converting in lower case
  review = review.lower()

  #Converting the review into set of tokens
  review_words = review.split()

  #Removing the stop words
  review_words = [word for word in review_words if not word in set(stopwords.words('english'))]

  #Stemming the words
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review_words]

  #Joining the stemmed words
  review = ' '.join(review)

  #Creating a corpus for our use
  corpus.append(review)

In [107]:
#Creating a bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 400)
X = cv.fit_transform(corpus).toarray()
y = data.iloc[:,1].values


In [108]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.30)

In [109]:
from sklearn.svm import LinearSVC

In [110]:
SVCmodel = LinearSVC()
SVCmodel.fit(X_test,Y_test)



In [111]:
svc_pred = SVCmodel.predict(X_test)
svc_acc = accuracy_score(svc_pred,Y_test)
print("Test Accuracy: {}%".format(round(svc_acc*100,2)))

Test Accuracy: 95.67%


In [112]:
score1 = accuracy_score(Y_test,svc_pred)
score2 = precision_score(Y_test,svc_pred)
score3 = recall_score(Y_test,svc_pred)

print("-----SCORES-----")
print("Accuracy score is: {}%".format(round(score1*100,2)))
print("Precision score is: {}%".format(round(score2*100,2)))
print("Recall score is: {}%".format(round(score3*100,2)))

-----SCORES-----
Accuracy score is: 95.67%
Precision score is: 97.22%
Recall score is: 93.96%


In [113]:
print(confusion_matrix(Y_test,svc_pred))
print("\n")
print(classification_report(Y_test,svc_pred))

[[147   4]
 [  9 140]]


              precision    recall  f1-score   support

           0       0.94      0.97      0.96       151
           1       0.97      0.94      0.96       149

    accuracy                           0.96       300
   macro avg       0.96      0.96      0.96       300
weighted avg       0.96      0.96      0.96       300

