# Natural Language Processing

### Importing the dataset

In [31]:
import pandas as pd

# remove all quotes
dataset = pd.read_csv("./filez/Restaurant_Reviews.tsv", sep='\t', quoting=3)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Cleaning the texts

In [32]:
import re
import nltk

# download stopwords (just once)
# nltk.download('stopwords')

from nltk.corpus import stopwords

# Stemming: removing suffixes from words to reduce them to their base or root form
from nltk.stem.porter import PorterStemmer

In [33]:
corpus = []
ps = PorterStemmer()

# careful!! some stop words are important, such as 'not'
all_stop_words = stopwords.words("english")
all_stop_words.remove("not")

for i in range(0, len(dataset)):
    # 1) remove punctuations (anything which is not a letter) by space
    # -> not all letters from a to z and A to Z
    # output: Wow    Loved this place
    review = re.sub("[^a-zA-Z]", " ", dataset["Review"][i])

    # 2) only lower-case letters
    # output: wow    loved this place
    review = review.lower()

    # 3) Split elements in words (to apply stemming afterwords)
    # output: ['wow', 'loved', 'this', 'place']
    review = review.split()

    # 4) Steeming on each word of a review
    # output: ['wow', 'love', 'place']
    review = [ps.stem(word) for word in review if not word in all_stop_words]

    # 5) join words to have a string sentence
    review = " ".join(review)
    # output: wow love place

    corpus.append(review)

print(corpus[:3])

['wow love place', 'crust not good', 'not tasti textur nasti']


### Creating the Bag of Words model
tokenisation to create the sparse matrix

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

X = cv.fit_transform(corpus).toarray()
y = dataset["Liked"]

"""
The result, X, is a numerical representation of the text corpus, where each review is now
represented by a vector indicating the presence and frequency of words from the corpus.
This numerical representation can then be used as input for various machine learning models,
especially for natural language processing tasks.
"""

'\nThe result, X, is a numerical representation of the text corpus, where each review is now\nrepresented by a vector indicating the presence and frequency of words from the corpus.\nThis numerical representation can then be used as input for various machine learning models,\nespecially for natural language processing tasks.\n'

In [35]:
print(len(X[0]))  # 1,566 possible distinct words

cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

"""
The reason to limit the number of features (words) is often to improve computational
efficiency and possibly the performance of subsequent machine learning models. By focusing
on the most frequent words, you might capture the most relevant aspects of the text while
discarding rare words which might be less informative or just noise. 
"""

1566


'\nThe reason to limit the number of features (words) is often to improve computational\nefficiency and possibly the performance of subsequent machine learning models. By focusing\non the most frequent words, you might capture the most relevant aspects of the text while\ndiscarding rare words which might be less informative or just noise. \n'

### Splitting the dataset into the Train/Test sets

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Training the Naive Bayes model on the Training test

In [37]:
# from sklearn.naive_bayes import GaussianNB
# classifier = GaussianNB()

# Random Forest gives better results than GaussianNB, which assumes normal dist.
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()

classifier.fit(X_train, y_train)
classifier.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Predicting the Test set results

In [38]:
def make_prediction(review: str):
    prediction = classifier.predict(cv.transform([review]).toarray())
    print(f"Review for '{review}' -> {prediction[0]}")

make_prediction("very bad service")
make_prediction("awesome meal")
make_prediction("A great touch")
make_prediction("horrible service") 

Review for 'very bad service' -> 0
Review for 'awesome meal' -> 0
Review for 'A great touch' -> 1
Review for 'horrible service' -> 0


### Making the Confusion Matrix

In [39]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = classifier.predict(X_test)

print(f"1) classification_report:\n\n", classification_report(y_test, y_pred))
print(f"2) confusion_matrix:\n\n", confusion_matrix(y_test, y_pred), "\n")
print(f"3) accuracy_score:\n\n", accuracy_score(y_test, y_pred))

1) classification_report:

               precision    recall  f1-score   support

           0       0.70      0.92      0.79        97
           1       0.89      0.62      0.73       103

    accuracy                           0.77       200
   macro avg       0.79      0.77      0.76       200
weighted avg       0.80      0.77      0.76       200

2) confusion_matrix:

 [[89  8]
 [39 64]] 

3) accuracy_score:

 0.765


    ☝🏻 quite some improvement can still be done.. 