In [1]:
import pandas as pd
import numpy as np

In [7]:
# Tab-separated review file; quoting=3 is csv.QUOTE_NONE, so quote characters
# inside a review are kept as literal text.
# NOTE(review): this is a machine-specific absolute path -- the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
reviews_path = r'D:\PYTHON_OVERALL\jupyter files\Restaurant_Reviews.txt'
data = pd.read_csv(reviews_path, sep='\t', quoting=3)
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [8]:
# Missing-value count per column (isna is the canonical name for isnull).
data.isna().sum()

Review    0
Liked     0
dtype: int64

In [9]:
# Class balance of the target label.
data.Liked.value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [1]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# One shared Porter stemmer instance; the cleaning function calls ps.stem on every token.
ps = PorterStemmer()

In [31]:
# NLTK's English stop-word list, used by the cleaning function to filter tokens.
# Assumes the NLTK 'stopwords' corpus is already installed locally -- TODO confirm.
sp = stopwords.words('english')
len(sp)  # number of stop words, shown as the cell output

179

In [41]:
def sub_split_clean(line):
    """Normalise one review into a space-joined string of stemmed tokens.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, tokens present in the module-level
    stop-word list `sp` are dropped, and the survivors are stemmed with the
    module-level stemmer `ps`.

    NOTE(review): negations are removed together with the other stop words
    ("Crust is not good." becomes "crust good"), which may blur the sentiment
    signal -- consider keeping them.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', line).lower()
    stems = (
        ps.stem(token)
        for token in letters_only.split()
        if token not in sp
    )
    return ' '.join(stems)

In [42]:
# Preview the cleaned text without storing it yet.
data['Review'].map(sub_split_clean)

0                                         wow love place
1                                             crust good
2                                     tasti textur nasti
3      stop late may bank holiday rick steve recommen...
4                                select menu great price
                             ...                        
995                        think food flavor textur lack
996                               appetit instantli gone
997                         overal impress would go back
998    whole experi underwhelm think go ninja sushi n...
999    wast enough life pour salt wound draw time too...
Name: Review, Length: 1000, dtype: object

In [43]:
# Store the cleaned text next to the raw review so both stay inspectable.
data['clean_review'] = [sub_split_clean(review) for review in data['Review']]
data.head()

Unnamed: 0,Review,Liked,clean_review
0,Wow... Loved this place.,1,wow love place
1,Crust is not good.,0,crust good
2,Not tasty and the texture was just nasty.,0,tasti textur nasti
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...
4,The selection on the menu was great and so wer...,1,select menu great price


In [12]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer

In [44]:
# Bag-of-words counts, later re-weighted by TF-IDF.
# NOTE(review): clean_review has already had NLTK stop words removed and been
# stemmed, so stop_words='english' applies a second stop list to stemmed
# tokens -- confirm this double filtering is intended.
cv = CountVectorizer(stop_words = 'english')
tfidf = TfidfTransformer()

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
# Features are the cleaned review text; the target is the Liked label.
x, y = data['clean_review'], data['Liked']

In [59]:
# Hold out 20% of the reviews for evaluation; the fixed seed makes the split repeatable.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [60]:
# Fit the vectorizer and the TF-IDF transformer on the training split only,
# so nothing from the test split influences the features.
cv_train = cv.fit_transform(X_Train)
tfidf_train = tfidf.fit_transform(cv_train)

In [61]:
# Transform only (no refit): the held-out reviews go through the vectorizer
# and TF-IDF transformer that were fitted on the training split.
cv_test = cv.transform(X_Test)
tfidf_test = tfidf.transform(cv_test)

In [86]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB , GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [88]:
# Candidate classifiers, all with library defaults except the explicit k for KNN.
svc = SVC()
lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=5)
mnb = MultinomialNB()
gnb = GaussianNB()

In [89]:
# (training reviews, vectorizer features) of the training TF-IDF matrix.
tfidf_train.shape

(800, 1297)

In [90]:
# Sanity check: labels and texts of the training split must have the same length.
Y_Train.shape, X_Train.shape

((800,), (800,))

In [99]:
# (label, estimator) pairs to compare on identical features.
models = [('svc', svc), ('lr', lr), ('knn', knn), ('mnb', mnb), ('gnb', gnb)]

# Densify the TF-IDF matrices once instead of on every loop iteration.
# Dense input is used for all models so the comparison is like-for-like
# (GaussianNB does not accept sparse matrices).
train_features = tfidf_train.toarray()
test_features = tfidf_test.toarray()

# label -> {'accuracy': float, 'confusion_matrix': 2x2 array}
results = dict()

# Unpack in the loop header rather than rebinding the loop variable to its own element.
for name, model in models:
    model.fit(train_features, Y_Train)
    y_pred = model.predict(test_features)
    acc = accuracy_score(Y_Test, y_pred)
    cm = confusion_matrix(Y_Test, y_pred)
    results[name] = {'accuracy': acc, 'confusion_matrix': cm}

results

{'svc': {'accuracy': 0.77,
  'confusion_matrix': array([[82, 14],
         [32, 72]], dtype=int64)},
 'lr': {'accuracy': 0.72,
  'confusion_matrix': array([[77, 19],
         [37, 67]], dtype=int64)},
 'knn': {'accuracy': 0.525,
  'confusion_matrix': array([[95,  1],
         [94, 10]], dtype=int64)},
 'mnb': {'accuracy': 0.745,
  'confusion_matrix': array([[69, 27],
         [24, 80]], dtype=int64)},
 'gnb': {'accuracy': 0.71,
  'confusion_matrix': array([[57, 39],
         [19, 85]], dtype=int64)}}

In [78]:
# Labels of the models stored by the evaluation loop.
results.keys()

dict_keys(['svc', 'lr', 'knn', 'mnb', 'gnb'])

In [101]:
# SVC: accuracy on stdout, confusion matrix as the cell output.
svc_result = results['svc']
print(svc_result['accuracy'])
svc_result['confusion_matrix']

0.77


array([[82, 14],
       [32, 72]], dtype=int64)

In [102]:
# Logistic regression: accuracy on stdout, confusion matrix as the cell output.
lr_result = results['lr']
print(lr_result['accuracy'])
lr_result['confusion_matrix']

0.72


array([[77, 19],
       [37, 67]], dtype=int64)

In [103]:
# k-nearest neighbours: accuracy on stdout, confusion matrix as the cell output.
knn_result = results['knn']
print(knn_result['accuracy'])
knn_result['confusion_matrix']

0.525


array([[95,  1],
       [94, 10]], dtype=int64)

In [104]:
# Multinomial naive Bayes: accuracy on stdout, confusion matrix as the cell output.
mnb_result = results['mnb']
print(mnb_result['accuracy'])
mnb_result['confusion_matrix']

0.745


array([[69, 27],
       [24, 80]], dtype=int64)

In [105]:
# Gaussian naive Bayes: accuracy on stdout, confusion matrix as the cell output.
gnb_result = results['gnb']
print(gnb_result['accuracy'])
gnb_result['confusion_matrix']

0.71


array([[57, 39],
       [19, 85]], dtype=int64)