In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Load the restaurant reviews.
#   delimiter='\t' -> the columns are separated by tabs
#   quoting=3      -> double quotes in the text are ignored (not treated as quoting)
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    delimiter='\t',
    quoting=3,
)

In [4]:
# Text-cleaning libraries.
import re
import nltk
# stopwords: words that carry no useful signal (a, the, and, ...)
from nltk.corpus import stopwords
# stemming reduces a word to its root (loved --> love)
from nltk.stem.porter import PorterStemmer

# Fetch the stopword list; a VPN may be needed so that the download succeeds.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Build `corpus`: one cleaned, stemmed, space-joined string per review.

# The stemmer and the stopword collection do not change between reviews,
# so they are created once here instead of once per iteration.
ps = PorterStemmer()
# NLTK's English stopword list also contains 'not'; it is taken out of the
# list (i.e. kept in the reviews) because it is a clearly negative indicator.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# A set makes the per-word membership test cheap; it is built a single time.
stopword_set = set(all_stopwords)

corpus = []
# Iterate over every review in the column rather than a hard-coded row count,
# so the corpus always has exactly as many entries as the dataset has rows.
for raw_review in dataset['Review']:
    # '[^a-zA-Z]' matches anything that is not a letter; each match is
    # replaced by a space, then the text is lower-cased.
    review = re.sub('[^a-zA-Z]', ' ', raw_review).lower()
    # Split into words, drop the stopwords and stem what is left.
    review = [ps.stem(word) for word in review.split() if word not in stopword_set]
    # Join the stemmed words back into one space-separated string.
    corpus.append(' '.join(review))

In [6]:
# Bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer

# Dependent vector y: the values of the last column of the dataset.
y = dataset.iloc[:, -1].values

# Tokenize the cleaned corpus into a matrix:
#   rows    -> one per review
#   columns -> one per cleaned word
#   cells   -> how many times the word occurs in the review (0 if absent)
# max_features caps the number of words (columns) that are kept.
cv = CountVectorizer(max_features=1500)
# fit_transform() yields a sparse matrix; toarray() converts it to a 2D array.
X = cv.fit_transform(corpus).toarray()


In [7]:
# Hold out 20% of the samples as the test set; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Random forest classifier: 30 trees, entropy split criterion, fixed seed.
from sklearn.ensemble import RandomForestClassifier

class_rf = RandomForestClassifier(
    n_estimators=30,
    criterion='entropy',
    random_state=0,
)
# fit() is the last expression, so the fitted estimator is shown as cell output
class_rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=30, random_state=0)

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict the labels of the held-out test set.
y_pred = class_rf.predict(X_test)

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Last expression of the cell: the accuracy is displayed as the cell output.
accuracy_score(y_test, y_pred)

[[90  7]
 [40 63]]


0.765

In [15]:
def check_review(new_review):
    """Predict the label of a single free-text review.

    The text goes through the same cleaning steps as the training corpus
    (non-letters replaced by spaces, lower-casing, stopword removal with
    'not' kept, Porter stemming). It is then vectorized with the module-level
    ``cv`` and classified by the module-level ``class_rf``.

    Parameters
    ----------
    new_review : str
        Raw review text.

    Returns
    -------
    The result of ``class_rf.predict`` for this one review. The prediction is
    also printed, as before, so existing calls keep showing the label; earlier
    versions returned ``None`` because they returned the result of ``print``.
    """
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    # 'not' is removed from the stopword list so it survives the cleaning
    all_stopwords.remove('not')
    # build the lookup set once instead of once per word
    stopword_set = set(all_stopwords)

    # letters only -> lower case -> list of words
    words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
    # drop stopwords, stem the rest, and re-join with single spaces
    cleaned_review = ' '.join(
        ps.stem(word) for word in words if word not in stopword_set
    )

    # the vectorizer expects a collection of documents, hence the one-item list
    new_X_test = cv.transform([cleaned_review]).toarray()
    new_y_pred = class_rf.predict(new_X_test)
    print(new_y_pred)
    return new_y_pred

In [27]:
# Try the trained classifier on a hand-written review.
check_review('We loved it')

[1]
