NLP Model for Sentiment Classification


Imports libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import string
import nltk
import spacy
import os
import sys

from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from spacy.lang.en.stop_words import STOP_WORDS

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
import en_core_web_md
text_to_nlp = en_core_web_md.load()

Imports data

In [2]:
import gdown
gdown.download("https://drive.google.com/uc?id=1u0tnEF2Q1a7H_gUEH-ZB3ATx02w8dF4p","yelp_data.csv",quiet = True)
data_file = "yelp_data.csv"

Explore Data

In [3]:
yelp_full_data = pd.read_csv(data_file)
# print(yelp_full_data.head())

# print("------------------------------------------------------------------------------------------------------------------------")
#Only need stars and text data
want_columns = ["stars","text"]
yelp_data = yelp_full_data[want_columns].copy()
print(yelp_data.head())

print("------------------------------------------------------------------------------------------------------------------------")

#Look at data for different stars
star_num = 3
for s in yelp_data[yelp_data["stars"] == star_num]["text"].head(20).values:
    print(s)

   stars                                               text
0      5  My wife took me here on my birthday for breakf...
1      5  I have no idea why some people give bad review...
2      5  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
3      5  General Manager Scott Petello is a good egg!!!...
4      5  Drop what you're doing and drive here. After I...
------------------------------------------------------------------------------------------------------------------------
We went here on a Saturday afternoon and this place was incredibly empty.  They had brunch specials going on, including $2 bloody mary's and mimosas, but we were more in the mood for lunch.  Except for the bloody mary, I had to try one.  It came out in a high-ball-sized glass.  Boo!  But it was really tasty. Yay!  The hubby remembered a sign outside the restaurant a few weeks back that said they had Arrogant Bastard, and he got a 22 oz bottle for $4.75.  Hey, that's not fair!!

Next up: the wings.  We were a bit h

4-5 Stars are good reviews while 3-5 stars are bad

Binary Classification

In [4]:
def is_good(num_stars):
    if num_stars > 3:
        return True
    else:
        return False

yelp_data.loc[:,"is_good_review"] = yelp_data["stars"].apply(is_good)
print(yelp_data.head())


   stars                                               text  is_good_review
0      5  My wife took me here on my birthday for breakf...            True
1      5  I have no idea why some people give bad review...            True
2      5  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...            True
3      5  General Manager Scott Petello is a good egg!!!...            True
4      5  Drop what you're doing and drive here. After I...            True


Preprocessing

Represents data numerically


Tokenization Example


In [5]:
example_txt = "Everyone there was quite kind, and I had a good time."
tokens = word_tokenize(example_txt)
print(tokens)

['Everyone', 'there', 'was', 'quite', 'kind', ',', 'and', 'I', 'had', 'a', 'good', 'time', '.']


Stopwords Example

In [6]:
example_stops = []
for word in tokens:
    if(word.lower() in STOP_WORDS):
        example_stops.append(word)
print(example_stops)

['Everyone', 'there', 'was', 'quite', 'and', 'I', 'had', 'a']


Vectorization

Bag of Words

    Count the number of times each word from our vocabulary was used

Text Cleaning

In [10]:
X_text = yelp_data["text"]
y = yelp_data["is_good_review"]

def tokenize(txt):
    clean_tokens = []
    for token in text_to_nlp(txt):
        if (not token.is_stop) and (token.lemma_ != "-PRON-") and (not token.is_punct): # PRON is the lemma for personal pronouns
            clean_tokens.append(token.lemma_)
    return clean_tokens

bow_transform = CountVectorizer(analyzer=tokenize,max_features=2000).fit(X_text)

print(bow_transform.vocabulary_)

print(len(bow_transform.vocabulary_))



{'wife': 1955, 'take': 1780, 'birthday': 403, 'breakfast': 437, 'excellent': 769, ' ': 6, 'weather': 1941, 'perfect': 1351, 'sit': 1648, 'outside': 1304, 'ground': 943, 'absolute': 251, 'pleasure': 1389, 'waitress': 1924, 'food': 860, 'arrive': 326, 'quickly': 1462, 'busy': 462, 'Saturday': 194, 'morning': 1229, 'look': 1139, 'like': 1120, 'place': 1377, 'fill': 828, 'pretty': 1424, 'early': 726, 'well': 1949, '\n\n': 1, 'favor': 811, 'phenomenal': 1362, 'simply': 1644, 'good': 924, 'sure': 1766, 'use': 1890, 'ingredient': 1052, 'garden': 896, 'blend': 411, 'fresh': 881, 'order': 1297, 'amazing': 294, 'menu': 1199, 'white': 1952, 'truffle': 1856, 'egg': 737, 'vegetable': 1902, 'tasty': 1788, 'delicious': 658, 'come': 553, '2': 22, 'piece': 1371, 'bread': 435, 'absolutely': 252, 'meal': 1185, 'complete': 561, 'toast': 1822, 'wait': 1922, 'idea': 1034, 'people': 1348, 'bad': 357, 'review': 1533, 'go': 919, 'probably': 1434, 'fault': 810, 'case': 485, 'friend': 884, 'pm': 1392, 'past': 13

Transform each review into BoW

In [13]:
X = bow_transform.transform(X_text)
print(pd.DataFrame(X.toarray()))
print(X_text.size)

     0     1     2     3     4     5     6     7     8     9     ...  1990  \
0       0     3     0     0     0     0     8     0     0     0  ...     0   
1       0     2     0     0     0     0     1     0     0     0  ...     0   
2       0     2     0     0     0     0     1     0     0     0  ...     0   
3       0     1     0     0     0     0     0     0     0     0  ...     0   
4       0     3     0     0     0     0    18     0     0     0  ...     0   
..    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
995     0     0     0     0     0     0     2     0     1     0  ...     0   
996     0     0     0     0     0     0     1     0     0     0  ...     0   
997     0    11     0     0     0     0     0     0     0     0  ...     0   
998     0     0     0     0     0     0     0     0     0     0  ...     0   
999     3     0     0     0     1     0     3     0     0     0  ...     0   

     1991  1992  1993  1994  1995  1996  1997  1998  1999  
0  

USING LOGISTIC REGRESSION

    -Train and Test

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 101)

    Fitting


In [23]:
logistic_model = LogisticRegression()

logistic_model.fit(X_train,y_train)

    Testing
    

In [24]:
y_pred = logistic_model.predict(X_test)
acc_score = accuracy_score(y_test,y_pred)
print(acc_score)

0.75


Trying reviews

In [36]:
example_review = "The food was quite disappointing. I expected better from such a renowned restaurant."
pred = logistic_model.predict(bow_transform.transform([example_review]))

if(pred):
    print("GOOD")
else:
    print("BAD")

BAD
