## NLP - Decision Tree

Importing the libraries

In [34]:
import numpy as np
import pandas as pd

Importing the dataset

In [35]:
# Load the tab-separated reviews file. quoting=3 is the numeric value of
# csv.QUOTE_NONE, so quote characters inside a review are kept as plain text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

Cleaning the text

In [36]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [37]:
import re
import nltk
# Uncomment on the first run to fetch the stop-word corpus:
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word collection once, outside the loop:
# they do not depend on the individual review.
ps = PorterStemmer()
# The NLTK stop-word file id is lowercase; 'English' only resolves on
# case-insensitive filesystems and raises an error elsewhere.
all_stopwords = stopwords.words('english')
# list.remove() works in place and returns None, so it cannot be chained onto
# the call above. 'not' is kept in the text because it carries the negation.
all_stopwords.remove('not')
# A set gives constant-time membership tests; `all_stopwords` itself stays a
# list because a later cell still refers to it.
stopword_set = set(all_stopwords)

corpus = []
for raw_review in dataset['Review']:
    # Replace every non-letter character with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    # Drop stop words and reduce the remaining words to their stems.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    # 'join' concatenates the list elements back into a single string.
    review = ' '.join(review)
    corpus.append(review)
#corpus

Building the Bag Of Words model

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer limited to at most 1500 features.
cv = CountVectorizer(max_features=1500)

# Fit the vocabulary on the cleaned corpus, then convert the result to a
# dense array.
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()

# The target values come from the last column of the dataset.
y = dataset.iloc[:, -1].values

print(X)
X.shape

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


(1000, 1500)

Splitting the dataset into train set and test set

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Training the classification model - Decision Tree

In [40]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that splits on the entropy criterion; random_state is fixed
# so repeated runs build the same tree.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
# Fit on the training split (the fitted estimator is the cell's output).
classifier.fit(X_train, y_train)

Predicting the results on the test set

In [41]:
# Predict a label for every sample of the held-out test split.
y_pred = classifier.predict(X_test)
# Debug aid (disabled): prints predicted and true labels side by side as two columns.
#print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

Confusion Matrix, Accuracy, Precision, Recall and F1 Score

In [42]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Evaluate the test-set predictions: compute every metric first...
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ...then report them, scores as percentages with two decimals.
print('Confusion Matrix is: \n', cm)

print('Accuracy is: {:.2f} %'.format(accuracy*100))

print('Precision is: {:.2f} %'.format(precision*100))

print('Recall is: {:.2f} %'.format(recall*100))

print('F1 is: {:.2f} %'.format(f1*100))

Confusion Matrix is: 
 [[78 19]
 [31 72]]
Accuracy is: 75.00 %
Precision is: 79.12 %
Recall is: 69.90 %
F1 is: 74.23 %


Semantic analysis of a specific review

In [43]:
# Run one hand-written review through the same cleaning steps used for the
# corpus, printing the intermediate result after each step.
sp_review = 'I do not hate the food'

# Step 1: replace every non-letter character with a space.
review = re.sub('[^a-zA-Z]', ' ', sp_review)
print('1 ', review)

# Step 2: lowercase.
review = review.lower()
print('2 ', review)

# Step 3: split into a list of words.
review = review.split()
print('3 ', review)

# Step 4: drop stop words and stem what is left (reuses `ps` and
# `all_stopwords` created in the text-cleaning cell).
review = [
    ps.stem(word)
    for word in review
    if word not in set(all_stopwords)
]
print('4 ', review)

# Step 5: join the words back into one string.
review = ' '.join(review)
print('5 ', review)

# Encode the cleaned text with the already-fitted vectorizer and classify it.
sp_features = cv.transform([review]).toarray()
sp_y_pred = classifier.predict(sp_features)
print(sp_y_pred)

1  I do not hate the food
2  i do not hate the food
3  ['i', 'do', 'not', 'hate', 'the', 'food']
4  ['not', 'hate', 'food']
5  not hate food
[0]
