# Import Required Libraries

In [1]:
import re
import nltk
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Import Data

The dataset contains restaurant reviews written by customers.

In [4]:
df = pd.read_csv('Restaurant_Reviews.tsv',sep= '\t')

In [5]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
df.shape

(1000, 2)

In [7]:
df.columns

Index(['Review', 'Liked'], dtype='object')

In [13]:
df['Review']

0                               Wow... Loved this place.
1                                     Crust is not good.
2              Not tasty and the texture was just nasty.
3      Stopped by during the late May bank holiday of...
4      The selection on the menu was great and so wer...
                             ...                        
995    I think food should have flavor and texture an...
996                             Appetite instantly gone.
997    Overall I was not impressed and would not go b...
998    The whole experience was underwhelming, and I ...
999    Then, as if I hadn't wasted enough of my life ...
Name: Review, Length: 1000, dtype: object

In [14]:
df.isna().sum()

Review    0
Liked     0
dtype: int64

# Preprocessing

1. Tokenization

In [37]:
# Demo sentence for tokenization.
msg  = "Think and wonder, wonder and think."
# The pattern \w+ matches runs of word characters, so punctuation is left out of the tokens.
tokenizer = nltk.RegexpTokenizer(r"\w+")
new_words = tokenizer.tokenize(msg)

In [38]:
new_words

['Think', 'and', 'wonder', 'wonder', 'and', 'think']

2. Lower case all words

In [15]:
msg = 'Wow... Loved this place.'

In [16]:
msg = msg.lower()
msg

'wow... loved this place.'

3. Remove unwanted characters
(keep alphabetic characters only)

In [17]:
# Use regular expression library to remove unwanted characters
# Keep only a to z characters
import re  

In [18]:
msg = re.sub('[^a-z]',' ',msg)

In [19]:
msg

'wow    loved this place '

4. Remove Stopwords

In [20]:
# Use nltk library to Remove Stopwords
import nltk

In [21]:
nltk.corpus.stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
msg

'wow    loved this place '

5. Split Sentence

In [23]:
# Split the sentence into words
list_of_words = msg.split()

In [24]:
list_of_words

['wow', 'loved', 'this', 'place']

In [25]:
# Prepare a separate list of the words that are not stopwords.
# The stopword list is looked up once and stored as a set, instead of calling
# nltk.corpus.stopwords.words('english') again for every word in the loop.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
list_without_stopwords = []
for word in list_of_words:
    if word in english_stopwords:
        print(word, 'is stopword')
    else:
        list_without_stopwords.append(word)

this is stopword


In [26]:
# Same filtering as a list comprehension; the stopword set is built once rather than once per word.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
list_without_stopwords = [word for word in list_of_words if word not in english_stopwords]

In [27]:
list_without_stopwords

['wow', 'loved', 'place']

6. Stemming

In [28]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [29]:
ps.stem('loving')

'love'

In [30]:
# Prepare a list of stemmed words
list_of_stemmed_words = [ps.stem(word) for word in list_without_stopwords]

In [31]:
list_of_stemmed_words

['wow', 'love', 'place']

In [32]:
# Join all stemmed words
msg = ' '.join(list_of_stemmed_words)
msg

'wow love place'

Prepare get_processed function to do all preprocessing

In [44]:
def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only on the first call."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def get_processed(msg):
    """Clean one review and return it as a space-joined string of stemmed words.

    Steps: replace every non-letter character with a space, lower-case the
    text, split it on whitespace, drop English stopwords, stem each remaining
    word with the notebook-level PorterStemmer ``ps``, and join the stems with
    single spaces.

    Parameters
    ----------
    msg : str
        Raw review text.

    Returns
    -------
    str
        The processed text, e.g. 'Wow... Loved this place.' -> 'wow love place'.

    Notes
    -----
    Stopword removal also drops negations such as 'not', so
    'Crust is not good.' becomes 'crust good'.
    """
    msg = re.sub('[^a-zA-Z]', ' ', msg)
    msg = msg.lower()
    # Fetch the stopwords once per call (cached across calls) instead of
    # rebuilding and linearly scanning the list for every single word.
    stop_words = _english_stopwords()
    stems = [ps.stem(word) for word in msg.split() if word not in stop_words]
    return ' '.join(stems)

In [45]:
get_processed('Wow... Loved this place.')

'wow love place'

Preprocess the 'Review' column using the get_processed function

In [46]:
df['Review'].apply(get_processed)

0                                         wow love place
1                                             crust good
2                                     tasti textur nasti
3      stop late may bank holiday rick steve recommen...
4                                select menu great price
                             ...                        
995                        think food flavor textur lack
996                               appetit instantli gone
997                         overal impress would go back
998    whole experi underwhelm think go ninja sushi n...
999    wast enough life pour salt wound draw time too...
Name: Review, Length: 1000, dtype: object

# Bag of Words

In [47]:
# Build the Bag-of-Words (BoW) model.
# BoW is a method to extract features from text documents so they can be used to train
# machine learning algorithms. It builds a vocabulary of all the unique words in the
# documents it is fitted on and represents each document by its counts of those words.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
BoW = CountVectorizer()
# NOTE(review): despite its name this is an unfitted TfidfVectorizer, not a table;
# it is not used in any of the cells shown here.
TFIDF_table = TfidfVectorizer()
# Fit the vocabulary on the preprocessed reviews and encode them as a sparse count matrix.
BoW_table = BoW.fit_transform(df['Review'].apply(get_processed))


In [48]:
BoW_table

<1000x1565 sparse matrix of type '<class 'numpy.int64'>'
	with 5372 stored elements in Compressed Sparse Row format>

# Create X and y

In [49]:
X = BoW_table.toarray()

In [50]:
X.shape

(1000, 1565)

In [51]:
sum(X[0])

3

In [52]:
y = df['Liked']

# RandomForestClassifier

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
# A fixed random_state makes the forest reproducible from run to run.
model = RandomForestClassifier(random_state=56)
# Train on every row of X and y; no data is held out for evaluation.
model.fit(X,y)

RandomForestClassifier(random_state=56)

# Model Score

In [55]:
# NOTE(review): X and y are the same data the model was fitted on, so this is
# training accuracy and does not measure performance on unseen reviews.
model.score(X,y)

0.997

# Trial Prediction

In [65]:
new_review = 'Very nearer and good service'

In [66]:
temp = get_processed(new_review)

In [67]:
a1 = BoW.transform([temp])
a1 = a1.toarray()
a1

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [68]:
model.predict(a1)[0]

1

In [60]:
def get_sentiment(review):
    """Predict the 'Liked' label for a single raw review string.

    The review is cleaned with ``get_processed``, encoded with the fitted
    ``BoW`` vectorizer, and passed to the trained ``model``; the label
    predicted for that one review is returned.
    """
    cleaned = get_processed(review)
    # The vectorizer expects an iterable of documents, hence the one-element list.
    features = BoW.transform([cleaned]).toarray()
    predictions = model.predict(features)
    return predictions[0]

In [64]:
get_sentiment('service was good')

1