# Restaurant Reviews (Natural Language Processing)

## Data Preprocessing

### Importing the Libraries

In [None]:
import numpy as np #type: ignore
import pandas as pd #type: ignore

### Importing the Dataset

In [None]:
# Load the tab-separated reviews file. quoting=3 corresponds to csv.QUOTE_NONE,
# so quote characters inside a review are kept as ordinary text.
dataset = pd.read_csv(
    'data/data.tsv',
    sep='\t',
    quoting=3,
)

### Cleaning the Texts

In [None]:
import re
import nltk #type: ignore
nltk.download('stopwords')
from nltk.corpus import stopwords #type: ignore
from nltk.stem.porter import PorterStemmer #type: ignore

# Everything below is loop-invariant, so it is built once instead of once per
# review (or, for the stop-word set, once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is taken out of the stop-word list so it survives the filtering below
# (presumably because negation matters for the review's sentiment).
all_stopwords.remove('not')
stopword_set = set(all_stopwords)  # set gives O(1) membership tests
non_letters = re.compile(r'[^a-zA-Z]')

# Build one cleaned string per review: letters only, lower-cased, stop words
# dropped, remaining words stemmed and re-joined with single spaces.
# Iterating the column directly (rather than a fixed range of 1000 indices)
# keeps `corpus` the same length as the dataset whatever its row count.
corpus = []
for text in dataset['Review']:
    words = non_letters.sub(' ', text).lower().split()
    review = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
    corpus.append(review)

### Building the Bag-of-Words Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer #type: ignore

# Vectorizer limited to at most 1500 features.
cv = CountVectorizer(max_features=1500)

# Fit on the cleaned corpus, then expand the result into a dense array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray() #type: ignore

# Target vector: the last column of the dataset.
y = dataset.iloc[:, -1].values

### Splitting the Dataset into the Training Set and Test Set

In [None]:
from sklearn.model_selection import train_test_split #type: ignore

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
splits = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = splits

## Training the Naive Bayes Model on the Training Set

In [None]:
from sklearn.naive_bayes import GaussianNB #type: ignore

# Gaussian Naive Bayes classifier fitted on the training split only.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

## Model Evaluation

### Predicting the Test Set Results

In [None]:
# Predict a label for every test sample.
y_pred = classifier.predict(X_test)

# Show predictions and true labels side by side: column 0 is the prediction,
# column 1 is the actual label.
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

### Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score #type: ignore

# Confusion matrix of true test labels against the predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Print the accuracy explicitly: a bare expression is only echoed when it is
# the last statement of an interactive notebook cell and is silently discarded
# when this code runs as a plain script.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)