In [97]:
import pandas as pd
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords 
import string
from sklearn.feature_extraction.text import CountVectorizer

In [142]:
df=pd.read_csv("yelp.csv")
df.drop(columns=['business_id','date','review_id','user_id','type'],inplace=True)
df['review']=df['stars'].apply(lambda x:"positive" if x>3 else "negative")
df['review_length']=df['text'].apply(lambda x:len(x))

#### Review length and stars are negatively correlated

In [143]:
stars=df.groupby(['stars']).mean()
stars.corr()

Unnamed: 0,cool,useful,funny,review_length
cool,1.0,-0.743329,-0.944939,-0.857664
useful,-0.743329,1.0,0.894506,0.699881
funny,-0.944939,0.894506,1.0,0.843461
review_length,-0.857664,0.699881,0.843461,1.0


In [68]:
X=df[['text']]
Y=df['stars']

In [99]:
def text_analyzer(message):
    
    #remove punctuations
    message=[char for char in message if char not in string.punctuation]
    
    #rejoin to make string
    message=''.join(message)
    
    #tokenize to words
    mess=word_tokenize(message)
    
    #convert to lower case
    mess=[x.lower() for x in mess]
    
    #remove stopwords
    stop_words = set(stopwords.words('english')) 
    mess=[x for x in mess if x not in stop_words]
    return mess

In [100]:
X['text'].head(5).apply(text_analyzer)

0    [wife, took, birthday, breakfast, excellent, w...
1    [idea, people, give, bad, reviews, place, goes...
2    [love, gyro, plate, rice, good, also, dig, can...
3    [rosie, dakota, love, chaparral, dog, park, co...
4    [general, manager, scott, petello, good, egg, ...
Name: text, dtype: object

### Create the feature vector

In [101]:
vectorizer=CountVectorizer(analyzer=text_analyzer)
vectorizer.fit(X['text'])
print(vectorizer.get_feature_names())



### Transform the text into a matrix format

In [115]:
message=X['text'][3]
X_transformed=vectorizer.transform(X['text'])
print (X_transformed)

  (0, 409)	1
  (0, 1267)	1
  (0, 1268)	1
  (0, 2046)	2
  (0, 2357)	1
  (0, 2661)	1
  (0, 3185)	1
  (0, 3949)	2
  (0, 3977)	1
  (0, 4148)	1
  (0, 4295)	1
  (0, 4353)	1
  (0, 4795)	1
  (0, 4820)	1
  (0, 5593)	1
  (0, 5679)	1
  (0, 7493)	1
  (0, 9253)	1
  (0, 10726)	1
  (0, 10916)	1
  (0, 11521)	2
  (0, 11548)	1
  (0, 11612)	3
  (0, 12152)	1
  (0, 12393)	1
  :	:
  (9999, 14404)	1
  (9999, 16716)	1
  (9999, 17190)	1
  (9999, 18203)	1
  (9999, 18515)	1
  (9999, 18859)	2
  (9999, 18969)	1
  (9999, 19479)	1
  (9999, 20507)	1
  (9999, 20585)	1
  (9999, 22651)	1
  (9999, 23409)	1
  (9999, 24248)	2
  (9999, 25989)	1
  (9999, 27775)	1
  (9999, 28333)	1
  (9999, 28459)	1
  (9999, 30087)	1
  (9999, 30123)	1
  (9999, 30448)	1
  (9999, 31435)	1
  (9999, 32356)	2
  (9999, 32851)	1
  (9999, 33353)	1
  (9999, 34710)	1


In [117]:
# Get the name of the feature that occured 4 times
vectorizer.get_feature_names()[32356]

'think'

### Tfidf transformer

In [125]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(X_transformed)
tfidf_transformed=tfidf_transformer.transform(X_transformed)
print (tfidf_transformed.shape)

(10000, 36267)


In [122]:
message3=vectorizer.transform([X['text'][3]])
tfidf_transformer3=tfidf_transformer.transform(message3)
print (tfidf_transformer3)

  (0, 35860)	0.19868683869573364
  (0, 35625)	0.08735885731133755
  (0, 33132)	0.1401627603787869
  (0, 31320)	0.14959858352881644
  (0, 29558)	0.19022218609404493
  (0, 28556)	0.16128134769413638
  (0, 28047)	0.08228108357896975
  (0, 27390)	0.09731251471140746
  (0, 27271)	0.19022218609404493
  (0, 26041)	0.17575176689409064
  (0, 24593)	0.19868683869573364
  (0, 24381)	0.10883344549251575
  (0, 23496)	0.33017513162112455
  (0, 23350)	0.4131648872351035
  (0, 20685)	0.19022218609404493
  (0, 19170)	0.05749331496275254
  (0, 19126)	0.0723901709743486
  (0, 18959)	0.1020319616181375
  (0, 18618)	0.08642209653182101
  (0, 18248)	0.1468109284941821
  (0, 17824)	0.13107488531691633
  (0, 17524)	0.09429261677550092
  (0, 16103)	0.07862698311529571
  (0, 12421)	0.07094000301503434
  (0, 12330)	0.1528166950924477
  (0, 12261)	0.17575176689409064
  (0, 10595)	0.16128134769413638
  (0, 10135)	0.1150281400953042
  (0, 10121)	0.10498026132249165
  (0, 9428)	0.10606343000436337
  (0, 9392)	0.1612

### Run MultinomialNB algorithm

In [131]:
from sklearn.naive_bayes import MultinomialNB
rating_detect_model = MultinomialNB().fit(tfidf_transformed,df['review'])
all_predictions=rating_detect_model.predict(tfidf_transformed)
print (all_predictions)

['positive' 'positive' 'positive' ... 'positive' 'positive' 'positive']


In [132]:
print ('Predicted: ',rating_detect_model.predict(tfidf_transformer3)[0] )
print ('Expected: ',df['review'][3])

Predicted:  positive
Expected:  positive


In [133]:
from sklearn.metrics import classification_report
print (classification_report(df['review'], all_predictions))

              precision    recall  f1-score   support

    negative       1.00      0.00      0.01      1676
    positive       0.83      1.00      0.91      8324

   micro avg       0.83      0.83      0.83     10000
   macro avg       0.92      0.50      0.46     10000
weighted avg       0.86      0.83      0.76     10000



### Divide into train and test

In [144]:
from sklearn.model_selection import train_test_split

msg_train, msg_test, label_train, label_test = train_test_split(X['text'], df['review'], test_size=0.2)

print (len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

8000 2000 10000


In [145]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('bow',CountVectorizer(analyzer =text_analyzer)),
                    ('tfidf',TfidfTransformer()),
                    ('classifier',MultinomialNB())])
pipeline.fit(msg_train,label_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_analyzer at 0x000002CA356B3AE8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocess...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [146]:
predictions = pipeline.predict(msg_test)
print (classification_report(predictions,label_test))

              precision    recall  f1-score   support

    negative       0.01      0.90      0.03        10
    positive       1.00      0.69      0.82      1990

   micro avg       0.69      0.69      0.69      2000
   macro avg       0.51      0.79      0.42      2000
weighted avg       0.99      0.69      0.81      2000

