# Natural Language Processing

## Importing the libraries

In [526]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [527]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

## Cleaning the texts

In [528]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
negation_words = {"not", "no", "never", "nor", "none", "nothing",
                  "nobody", "neither", "nowhere", "hardly",
                  "scarcely", "barely", "without", "cannot"}
corpus = []
for i in range (0, 1000):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = set(stopwords.words('english'))
  all_stopwords = list(all_stopwords - negation_words)
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [529]:
print(corpus)

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier no care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock no sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid cou

##Implementing TF-IDF

In [530]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5250, ngram_range=(1, 3))
TfIdf = vectorizer.fit_transform(corpus)

##Display IDF Values

In [531]:
print('\nidf values:')
for ele1, ele2 in zip(vectorizer.get_feature_names_out(), vectorizer.idf_):
    print(ele1, ':', ele2)


idf values:
absolut : 5.6061696863211745
absolut amaz : 6.810142490647111
absolut flat : 7.215607598755275
absolut flat line : 7.215607598755275
absolut must : 7.215607598755275
absolut must visit : 7.215607598755275
absolut no : 6.810142490647111
absolut no clue : 7.215607598755275
absolut no flavor : 7.215607598755275
absolut star : 7.215607598755275
absolutley : 7.215607598755275
absolutley fantast : 7.215607598755275
accid : 7.215607598755275
accid could : 7.215607598755275
accid could not : 7.215607598755275
accommod : 7.215607598755275
accommod vegetarian : 7.215607598755275
acknowledg : 6.810142490647111
actual : 6.52246041819533
ad : 6.810142490647111
ago : 6.52246041819533
almost : 6.810142490647111
also : 4.576550269140016
also serv : 6.810142490647111
also tast : 6.810142490647111
although : 6.52246041819533
alway : 5.13616605707544
alway great : 6.810142490647111
alway hit : 6.810142490647111
amaz : 4.689878954447019
ambianc : 5.829313237635384
ambienc : 6.52246041819533
a

##Display TF-IDF Values

In [532]:
print('\nWord indexes:')
print(vectorizer.vocabulary_)
print('\ntf-idf value:')
print(TfIdf)
print('\ntf-idf values in matrix form:')
print(TfIdf.toarray())


Word indexes:
{'wow': np.int64(5225), 'love': np.int64(4134), 'place': np.int64(4893), 'love place': np.int64(4170), 'crust': np.int64(1240), 'not': np.int64(4819), 'good': np.int64(2917), 'crust not': np.int64(1241), 'not good': np.int64(4827), 'crust not good': np.int64(1242), 'tasti': np.int64(5111), 'textur': np.int64(5118), 'nasti': np.int64(4675), 'not tasti': np.int64(4840), 'stop': np.int64(5079), 'late': np.int64(3840), 'may': np.int64(4348), 'bank': np.int64(116), 'holiday': np.int64(3410), 'recommend': np.int64(4961), 'late may': np.int64(3841), 'may bank': np.int64(4349), 'bank holiday': np.int64(117), 'holiday rick': np.int64(3411), 'late may bank': np.int64(3842), 'may bank holiday': np.int64(4350), 'bank holiday rick': np.int64(118), 'holiday rick steve': np.int64(3412), 'select': np.int64(5006), 'menu': np.int64(4442), 'great': np.int64(3080), 'price': np.int64(4925), 'menu great': np.int64(4453), 'great price': np.int64(3133), 'menu great price': np.int64(4454), 'get'

##Building the dataset using TF-IDF

In [533]:
X_One = vectorizer.fit_transform(corpus).toarray()
y_One = dataset.iloc[:, -1].values

In [534]:
print(len(X_One[0]))

5250


## Creating the Bag of Words model

In [535]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1200)
X_Two = cv.fit_transform(corpus).toarray()
y_Two = dataset.iloc[:, -1].values

In [536]:
len(X_Two[0])

1200

## Splitting the dataset into the Training set and Test set

In [537]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_One, y_One, test_size=0.2, random_state=0)

## Training the Naive Bayes model on the Training set

In [538]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

## Predicting the Test set results using Naive Bayes

In [539]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix Using Naive Bayes

In [540]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[60 37]
 [18 85]]


0.725

In [541]:
print(precision_score(y_test, y_pred))

0.6967213114754098


In [542]:
print(recall_score(y_test, y_pred))

0.8252427184466019


In [543]:
print(f1_score(y_test, y_pred))

0.7555555555555555


In [544]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.62      0.69        97
           1       0.70      0.83      0.76       103

    accuracy                           0.72       200
   macro avg       0.73      0.72      0.72       200
weighted avg       0.73      0.72      0.72       200



In [545]:
#!pip freeze > requirements.txt

##Prediction Using Logistic Regression


In [546]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

##Prediciting the result Using Logistic Regression

In [547]:
y_pred = classifier.predict(X_test)

##Making the confusion matrix using Logistic Regression

In [548]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[86 11]
 [26 77]]


0.815

In [549]:
print(precision_score(y_test, y_pred))

0.875


In [550]:
print(recall_score(y_test, y_pred))

0.7475728155339806


In [551]:
print(f1_score(y_test, y_pred))

0.806282722513089


In [552]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.89      0.82        97
           1       0.88      0.75      0.81       103

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.81       200



##Training TF-IDF on SVM

In [553]:
from sklearn.svm import SVC
svm = SVC(kernel='linear', random_state=0)
svm.fit(X_train, y_train)

##Predicting the new result

In [554]:
y_pred = classifier.predict(X_test)

In [555]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[86 11]
 [26 77]]


0.815

In [556]:
print(precision_score(y_test, y_pred))

0.875


In [557]:
print(recall_score(y_test, y_pred))

0.7475728155339806


In [558]:
print(f1_score(y_test, y_pred))

0.806282722513089


In [559]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.89      0.82        97
           1       0.88      0.75      0.81       103

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.81       200

