# Sentiment Analysis Yelp

In [5]:
import pandas as pd

pd.set_option('display.max_colwidth', 200)

In [6]:
IMDB_DATA_FILE ='imdb_labelled.txt'
YELP_DATA_FILE = 'yelp_labelled.txt'
AMAZON_DATA_FILE = 'amazon_cells_labelled.txt'

COLUMN_NAMES = ['Review', 'Sentiment']

### Yelp

In [7]:
yelp_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)


In [8]:
review_data = pd.concat([amazon_reviews, imdb_reviews, yelp_reviews])

In [9]:
review_data.sample(10)

Unnamed: 0,Review,Sentiment
661,"Very convenient, since we were staying at the MGM!",1
211,"If you hate earbugs, avoid this phone by all means.",0
178,"Not a weekly haunt, but definitely a place to come back to every once in a while.",1
259,very tough and very short on flavor!,0
401,"Overall, I was very disappointed with the quality of food at Bouchon.",0
169,Bacon is hella salty.,1
382,Same problem as others have mentioned.,0
893,Worst Thai ever.,0
787,"I don't have very many words to say about this place, but it does everything pretty well.",1
134,"Delicious NYC bagels, good selections of cream cheese, real Lox with capers even.",1


In [10]:
review_data.Sentiment.value_counts()

1    1500
0    1500
Name: Sentiment, dtype: int64

In [11]:
import re

def clean(text):
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not')
    return text

In [12]:
review_model_data = review_data.copy()
review_model_data.Review = review_model_data.Review.apply(clean)

In [13]:
review_model_data.sample(10)

Unnamed: 0,Review,Sentiment
344,the selection was probably the worst i ve seen in vegas there was none,0
735,provides good protection and looks classy too,1
168,doesn t last long,0
603,good value great food great service,1
46,it s too bad the food is so damn generic,0
863,i love their fries and their beans,1
951,very much disappointed with this company,0
40,the shrimp tender and moist,1
613,sorry i will not be getting food from here anytime soon,0
723,think it over when you plan to own this one this sure is the last moto phone for me,0


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [16]:
tfidf = TfidfVectorizer(strip_accents=None,
                        preprocessor=None,
                        lowercase=False)
log_reg = LogisticRegression(random_state=0, solver='lbfgs')
log_tfidf = Pipeline([('vect', tfidf),
                       ('clf', log_reg)])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(review_model_data.Review, 
                                                    review_model_data.Sentiment, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [18]:
log_tfidf.fit(X_train.values, y_train.values)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [19]:
test_accuracy = log_tfidf.score(X_test.values, y_test.values)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

'The model has a test accuracy of 89%'

In [20]:
log_tfidf.predict(['I loved this place', 'I hated this place'])

array([1, 0])

In [25]:
log_tfidf.predict(['I dont hate the hotel', 'it was not bad place' ])

array([0, 0])