## Import all libraries

In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd

### Load the dataset

In [3]:
# Location of the tab-separated reviews file. Set the RESTAURANT_REVIEWS_TSV
# environment variable to run this notebook on another machine; when it is
# unset, the original local path is used.
DATA_PATH = os.environ.get(
    "RESTAURANT_REVIEWS_TSV",
    r'C:\Users\JIBSON JOY\Artificial_intelligence\computer_vision\Project\nlp\resturant\Restaurant_Reviews.tsv',
)
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as plain text.
df=pd.read_csv(DATA_PATH,delimiter="\t",quoting=3)

In [4]:
# Preview the first rows: the review text and its Liked label.
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Remove Punctuation

In [5]:
# English stopword list and word tokenizer shared by the cleaning helpers below.
# NOTE(review): presumably both need the matching NLTK data packages to be
# downloaded beforehand — confirm on a fresh environment.
stopwords_list = nltk.corpus.stopwords.words("english") # English stopwords from NLTK
tokener = nltk.word_tokenize # callable that splits a string into word tokens

def remove_punch(text):
    """Lowercase ``text`` and blank out everything except ASCII letters and digits.

    Each character outside ``a-z``, ``A-Z`` and ``0-9`` is replaced by one
    space (not deleted), so a run of punctuation leaves a run of spaces.

    Args:
        text: The raw review string.

    Returns:
        The lowercased string with non-alphanumeric characters turned into spaces.
    """
    lowered = text.lower()
    non_alnum = re.compile(r"[^a-zA-Z0-9]")
    return non_alnum.sub(" ", lowered)

# Overwrite the Review column with its lowercased, punctuation-free version.
df.Review = df.Review.apply(remove_punch)

In [6]:
# Inspect the reviews after lowercasing and punctuation removal.
df.Review

0                               wow    loved this place 
1                                     crust is not good 
2              not tasty and the texture was just nasty 
3      stopped by during the late may bank holiday of...
4      the selection on the menu was great and so wer...
                             ...                        
995    i think food should have flavor and texture an...
996                             appetite instantly gone 
997    overall i was not impressed and would not go b...
998    the whole experience was underwhelming  and i ...
999    then  as if i hadn t wasted enough of my life ...
Name: Review, Length: 1000, dtype: object

### Collapse repeated spaces into one

In [7]:
def remove_whitespace(text):
    """Collapse every run of space characters in ``text`` into a single space.

    Only the space character is matched. A leading or trailing run is reduced
    to one space rather than stripped.

    Args:
        text: The string to normalise.

    Returns:
        The string with consecutive spaces merged.
    """
    space_run = re.compile(' +')
    return space_run.sub(" ", text)
# Overwrite the Review column with repeated spaces collapsed.
df.Review = df.Review.apply(remove_whitespace)

In [8]:
# Inspect the reviews after collapsing repeated spaces.
df.Review

0                                  wow loved this place 
1                                     crust is not good 
2              not tasty and the texture was just nasty 
3      stopped by during the late may bank holiday of...
4      the selection on the menu was great and so wer...
                             ...                        
995    i think food should have flavor and texture an...
996                             appetite instantly gone 
997    overall i was not impressed and would not go b...
998    the whole experience was underwhelming and i t...
999    then as if i hadn t wasted enough of my life t...
Name: Review, Length: 1000, dtype: object

### Remove stopwords

In [9]:
def remove_stopwords(text):
    """Drop every token of ``text`` that appears in ``stopwords_list``.

    The text is split with ``tokener``; the surviving tokens are joined back
    together with single spaces.

    NOTE(review): the recorded outputs show "crust is not good" reduced to
    "crust good", i.e. negations are dropped too, which can hide the sentiment
    of a review.

    Args:
        text: A cleaned review string.

    Returns:
        The review with stopword tokens removed.
    """
    kept_tokens = [token for token in tokener(text) if token not in stopwords_list]
    return ' '.join(kept_tokens)

# Overwrite the Review column with stopwords removed.
df.Review = df.Review.apply(remove_stopwords)

In [10]:
# Inspect the reviews after stopword removal.
df.Review

0                                        wow loved place
1                                             crust good
2                                    tasty texture nasty
3      stopped late may bank holiday rick steve recom...
4                            selection menu great prices
                             ...                        
995                    think food flavor texture lacking
996                              appetite instantly gone
997                      overall impressed would go back
998    whole experience underwhelming think go ninja ...
999    wasted enough life poured salt wound drawing t...
Name: Review, Length: 1000, dtype: object

### Remove digits

In [13]:
def remove_digits(text):
    """Lowercase ``text`` and blank out every character that is not an ASCII letter.

    Digits (and any other non-letter, including existing spaces) are each
    replaced by one space, so the result may contain runs of spaces.

    Args:
        text: The string to clean.

    Returns:
        The lowercased string with non-letter characters turned into spaces.
    """
    lowered = text.lower()
    non_letter = re.compile(r"[^a-zA-Z]")
    return non_letter.sub(" ", lowered)

# Blank out digits, then collapse the spaces that the replacement leaves behind.
df.Review = df.Review.apply(remove_digits)
df.Review = df.Review.apply(remove_whitespace)

In [14]:
# Inspect the reviews after digit removal.
df.Review

0                                        wow loved place
1                                             crust good
2                                    tasty texture nasty
3      stopped late may bank holiday rick steve recom...
4                            selection menu great prices
                             ...                        
995                    think food flavor texture lacking
996                              appetite instantly gone
997                      overall impressed would go back
998    whole experience underwhelming think go ninja ...
999    wasted enough life poured salt wound drawing t...
Name: Review, Length: 1000, dtype: object

## Lemmatization


In [15]:
# Single WordNet lemmatizer instance reused for every review.
lm=nltk.WordNetLemmatizer()

def lemma(text):
    """Lemmatize each token of ``text`` and return the tokens re-joined by spaces.

    The text is split with ``tokener`` and every token is passed through
    ``lm.lemmatize`` with its default arguments. In the recorded outputs this
    turns "prices" into "price" while leaving words such as "loved" unchanged.

    Args:
        text: A cleaned review string.

    Returns:
        The space-joined lemmatized tokens.
    """
    return " ".join(lm.lemmatize(token) for token in tokener(text))

# Overwrite the Review column with its lemmatized version.
df.Review = df.Review.apply(lemma)

In [16]:
# Inspect the reviews after lemmatization.
df.Review

0                                        wow loved place
1                                             crust good
2                                    tasty texture nasty
3      stopped late may bank holiday rick steve recom...
4                             selection menu great price
                             ...                        
995                    think food flavor texture lacking
996                              appetite instantly gone
997                      overall impressed would go back
998    whole experience underwhelming think go ninja ...
999    wasted enough life poured salt wound drawing t...
Name: Review, Length: 1000, dtype: object

## Text Processing with TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams, bigrams and trigrams, capped at 2500 features.
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split further down, so held-out rows influence the vocabulary and IDF
# weights — consider fitting on the training split only.
tf_idf = TfidfVectorizer(ngram_range=(1,3),max_features=2500)
scaled_X_tf =  tf_idf.fit_transform(df.Review)


## Convert to dataframe

In [18]:
# Column labels for the dense TF-IDF frame. `get_feature_names()` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out()`; prefer the new
# method and fall back to the old one on releases that predate it.
feature_names = (
    tf_idf.get_feature_names_out()
    if hasattr(tf_idf, "get_feature_names_out")
    else tf_idf.get_feature_names()
)
# Densify the sparse matrix so each n-gram becomes a named column.
df_tfidf_sklearn = pd.DataFrame(scaled_X_tf.toarray(),columns=feature_names)
df_tfidf_sklearn.head()



Unnamed: 0,absolutely,absolutely amazing,acknowledged,actually,added,ago,almost,also,also taste,although,...,wow,wrap,wrong,year,year ago,yet,yum,yummy,zero,zero star
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.572716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train Test Split

In [19]:
# Train/test split: hold out 30% of the rows; random_state=42 fixes the shuffle.

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df_tfidf_sklearn,df['Liked'],test_size=0.3,random_state=42)

In [20]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the fitted model, and therefore the scores and predictions
# below, are the same on every Restart & Run All. The seed matches the one
# used for the train/test split.
clf_tfidf = RandomForestClassifier(random_state=42)
clf_tfidf.fit(X_train, y_train)

In [21]:
# Mean accuracy on the training split.
clf_tfidf.score(X_train,y_train)

0.9957142857142857

In [22]:
# Mean accuracy on the held-out split.
# NOTE(review): the recorded run shows roughly 0.75 here against roughly 0.996
# on the training split — a gap that points to overfitting.
clf_tfidf.score(X_test,y_test)

0.7466666666666667

In [31]:
def predict_statement(sample_review):
    """Classify one raw review string with the fitted TF-IDF + forest pipeline.

    The review goes through the same cleaning steps, in the same order, as the
    training column. It is then vectorized with the already-fitted ``tf_idf``
    and scored by ``clf_tfidf``.

    Args:
        sample_review: The raw review text.

    Returns:
        The first (only) element of the classifier's prediction array.
    """
    cleaning_steps = (
        remove_punch,
        remove_whitespace,
        remove_stopwords,
        remove_digits,
        remove_whitespace,
        lemma,
    )
    cleaned = sample_review
    for step in cleaning_steps:
        cleaned = step(cleaned)
    features = tf_idf.transform([cleaned]).toarray()
    return clf_tfidf.predict(features)[0]


In [42]:
sample_review = 'nice car and nice colour'

# A truthy prediction is reported as positive, a falsy one as negative.
print(
    "This is a POSITIVE Review..."
    if predict_statement(sample_review)
    else "This is a NEGATIVE Review..."
)

This is a POSITIVE Review...


