In [187]:
# import libiaries
import numpy
import pandas as pd
import json
import multiprocessing
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import re
import joblib, pickle

[nltk_data] Downloading package stopwords to /Users/zazhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Summary Statistics

In [72]:
def process_yelp_line(line):
    # conver the text line to a json object
    json_object = json.loads(line)
    
    # read and tokenize the text
    text = json_object['text']
    
    # read the label and convert to an integer
    label = int(json_object['stars'])
    
    # return the text and the label
    if text:
        return text, label
    else:
        return None

In [5]:
# read the first 500,000 yelp reviews
lines = open('yelp_academic_dataset_review.json', encoding="utf8").readlines()[:500000]

In [73]:
# distribute the processing across the machine cpus
pool = multiprocessing.Pool(multiprocessing.cpu_count())
result = pool.map(process_yelp_line, lines)
result = list(filter(None, result))

In [74]:
# "unzip" the (tokens, label) tuples to a list of lists of tokens, and a list of labels
texts, labels = zip(*result)

In [222]:
# data cleaning 

stop_words = set(stopwords.words('english')) 

# convert to dataframe
data = pd.DataFrame({'text': texts, 'label': labels})

In [223]:
# keep only numbers, letters and space
data['text'] = data.apply(lambda t: re.sub(r'[^0-9A-Za-z ]', '', str(t['text'])), axis=1)

In [224]:
# remove stopwords and convert to lower case
data['text'] = data.apply(lambda r: ' '.join(w.lower() for w in r['text'].split() if w.lower() not in stop_words),axis=1)

# discard NA reviews
data = data.dropna()

In [225]:
# assign binary labels - 1 for rating>=3: good rating
data['binary_label'] = data.apply(lambda r: 0 if r['label'] < 4 else 1, axis=1)
data.to_csv('review_cleaned.csv')

In [237]:
data = pd.read_csv("review_cleaned.csv").drop('Unnamed: 0', axis=1)

In [238]:
data

Unnamed: 0,text,label,binary_label
0,someone worked many museums eager visit galler...,2,0
1,actually horrified place still business 3 year...,1,0
2,love deagans really atmosphere cozy festive sh...,5,1
3,dismal lukewarm defrostedtasting texmex glopmu...,1,0
4,oh happy day finally canes near casa yes other...,4,1
...,...,...,...
499995,kung fu tea havent amazing boba long time firs...,5,1
499996,wish one westside case boba shoplet start taro...,5,1
499997,new favorite spot dylan waitress absolutely fa...,5,1
499998,35 starsenvironment decor poor floor sometimes...,3,0


In [241]:
print("Number of documents is %s"%len(data))

Number of documents is 500000


In [242]:
print("Number of labels is %s"%len(set(labels)))

Number of labels is 5


In [243]:
print("Label distribution:\n", data.groupby('label').count().reset_index().rename(columns={'text':'# of reviews'}))

Label distribution:
    label  # of reviews  binary_label
0      1         70459         70468
1      2         40577         40577
2      3         55773         55778
3      4        112795        112802
4      5        220354        220375


In [244]:
print("Average word length of reviews is %s"%(np.mean([len(text.split(' ')) for text in texts])))

Average word length of reviews is 108.456586


## Logistic Regression

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score

In [106]:
# tfidf
# unigram

# fit tfidf
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=500, norm='l2',
                    max_features=500, encoding='UTF-8',
                     ngram_range=(1,1), stop_words='english')


# get tfidf features in a sparse matrix
fe1 = tfidf.fit_transform(data['text'].tolist())

# turn into a dataframe of features
fe_df1 = pd.DataFrame.sparse.from_spmatrix(fe1)

In [107]:
print(fe1.shape)

(500000, 500)


In [108]:
# 1gram+2gram

# fit tfidf
tfidf2 = TfidfVectorizer(sublinear_tf=True, min_df=500, norm='l2',
                     max_features=500, encoding='UTF-8',
                     ngram_range=(1,2), stop_words='english')

# get tfidf features in a sparse matrix
fe2 = tfidf2.fit_transform(data['text'].tolist())

# turn into a dataframe of features
fe_df2 = pd.DataFrame.sparse.from_spmatrix(fe2)

In [109]:
print(fe2.shape)

(500000, 500)


In [245]:
# train test split
X_train1, X_test1, y_train1, y_test1 = train_test_split(fe_df1, data['binary_label'], test_size=0.3, random_state=66)
X_train2, X_test2, y_train2, y_test2 = train_test_split(fe_df2, data['binary_label'], test_size=0.3, random_state=66)

In [112]:
# unigram - default

# train a logistic regression model on tfidf features - default
lr_unigram = LogisticRegression(random_state=66)
lr_unigram.fit(X_train1, y_train1)

# make predictions on test set
y_pred_unigram = lr_unigram.predict(X_test1)

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test1, y_pred_unigram))
print("Precision: %0.4f"%precision_score(y_test1, y_pred_unigram))
print("Recall: %0.4f"%recall_score(y_test1, y_pred_unigram))
print("F1 score: %0.4f"%f1_score(y_test1, y_pred_unigram))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test1, y_pred_unigram, average='micro'))

Accuracy: 0.8589
Precision: 0.8792
Recall: 0.9138
F1 score: 0.8962
Micro-averaged F1 score: 0.8589


In [116]:
# unigram - regularization and stopping criterion

# train a logistic regression model with regularization and stopping criteria on tfidf features
lr_unigram2 = LogisticRegression(C=0.6, random_state=66)
lr_unigram2.fit(X_train1, y_train1)

# make predictions on test set
y_pred_unigram2 = lr_unigram2.predict(X_test1)

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test1, y_pred_unigram2))
print("Precision: %0.4f"%precision_score(y_test1, y_pred_unigram2))
print("Recall: %0.4f"%recall_score(y_test1, y_pred_unigram2))
print("F1 score: %0.4f"%f1_score(y_test1, y_pred_unigram2))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test1, y_pred_unigram2, average='micro'))

Accuracy: 0.8590
Precision: 0.8791
Recall: 0.9142
F1 score: 0.8963
Micro-averaged F1 score: 0.8590


In [118]:
# 1gram+2gram - default

# train logistic model
lr_2gram = LogisticRegression(random_state=66)
lr_2gram.fit(X_train2, y_train2)

# make predictions on test set
y_pred_2gram = lr_2gram.predict(X_test2)

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test2, y_pred_2gram))
print("Precision: %0.4f"%precision_score(y_test2, y_pred_2gram))
print("Recall: %0.4f"%recall_score(y_test2, y_pred_2gram))
print("F1 score: %0.4f"%f1_score(y_test2, y_pred_2gram))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test2, y_pred_2gram, average='micro'))

Accuracy: 0.8588
Precision: 0.8794
Recall: 0.9135
F1 score: 0.8961
Micro-averaged F1 score: 0.8588


In [252]:
# 1gram+2gram - regularization and stopping criterion

# train logistic model 
lr_2gram2 = LogisticRegression(C=0.6, random_state=66)
lr_2gram2.fit(X_train2, y_train2)

# make predictions on test set
y_pred_2gram2 = lr_2gram2.predict(X_test2)

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test2, y_pred_2gram2))
print("Precision: %0.4f"%precision_score(y_test2, y_pred_2gram2))
print("Recall: %0.4f"%recall_score(y_test2, y_pred_2gram2))
print("F1 score: %0.4f"%f1_score(y_test2, y_pred_2gram2))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test2, y_pred_2gram2, average='micro'))

Accuracy: 0.8589
Precision: 0.8792
Recall: 0.9138
F1 score: 0.8962
Micro-averaged F1 score: 0.8589


# SVM

In [123]:
from sklearn.svm import LinearSVC

In [125]:
# unigram - default

# train SVM model on tfidf features - default
svm_unigram = LinearSVC(random_state=66)
svm_unigram.fit(X_train1, y_train1)

# make predictions on test set
y_pred_unigram = svm_unigram.predict(X_test1)

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test1, y_pred_unigram))
print("Precision: %0.4f"%precision_score(y_test1, y_pred_unigram))
print("Recall: %0.4f"%recall_score(y_test1, y_pred_unigram))
print("F1 score: %0.4f"%f1_score(y_test1, y_pred_unigram))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test1, y_pred_unigram, average='micro'))

Accuracy: 0.8588
Precision: 0.8780
Recall: 0.9154
F1 score: 0.8963
Micro-averaged F1 score: 0.8588


In [135]:
# unigram - penalty

# train SVM model 
svm_unigram2 = LinearSVC(random_state=66, loss='hinge', C=10)
svm_unigram2.fit(X_train1, y_train1)

# make predictions on test set
y_pred_unigram2 = svm_unigram2.predict(X_test1)

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test1, y_pred_unigram2))
print("Precision: %0.4f"%precision_score(y_test1, y_pred_unigram2))
print("Recall: %0.4f"%recall_score(y_test1, y_pred_unigram2))
print("F1 score: %0.4f"%f1_score(y_test1, y_pred_unigram2))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test1, y_pred_unigram2, average='micro'))

Accuracy: 0.8595
Precision: 0.8781
Recall: 0.9164
F1 score: 0.8969
Micro-averaged F1 score: 0.8595


In [131]:
# 1gram+2gram - default 

# train SVM with penalty
svm_2gram1 = LinearSVC(random_state=66)
svm_2gram1.fit(X_train2, y_train2)

# make predictions on test set
y_pred_2gram1 = svm_2gram1.predict(X_test2)

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test2, y_pred_2gram1))
print("Precision: %0.4f"%precision_score(y_test2, y_pred_2gram1))
print("Recall: %0.4f"%recall_score(y_test2, y_pred_2gram1))
print("F1 score: %0.4f"%f1_score(y_test2, y_pred_2gram1))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test2, y_pred_2gram1, average='micro'))

Accuracy: 0.8592
Precision: 0.8787
Recall: 0.9150
F1 score: 0.8965
Micro-averaged F1 score: 0.8592


In [140]:
# 1gram+2gram - regularization and stopping criterion

# train SVM with penalty
svm_2gram2 = LinearSVC(random_state=66, loss='hinge', C=10)
svm_2gram2.fit(X_train2, y_train2)

# make predictions on test set
y_pred_2gram2 = svm_2gram2.predict(X_test2)

# evaluation metrics
print("Accuracy: %0.4f"%accuracy_score(y_test2, y_pred_2gram2))
print("Precision: %0.4f"%precision_score(y_test2, y_pred_2gram2))
print("Recall: %0.4f"%recall_score(y_test2, y_pred_2gram2))
print("F1 score: %0.4f"%f1_score(y_test2, y_pred_2gram2))
print("Micro-averaged F1 score: %0.4f"%f1_score(y_test2, y_pred_2gram2, average='micro'))

Accuracy: 0.8595
Precision: 0.8786
Recall: 0.9156
F1 score: 0.8968
Micro-averaged F1 score: 0.8595


In [145]:
# save best performing svm model
joblib.dump(svm_unigram2, 'svm_best.pkl')

# save the tfidf vectorizer
joblib.dump(tfidf, 'tfidf.pkl')

['tfidf.pkl']

In [147]:
with open('tfidf_vec.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

pickle.dump(svm_unigram2, open('yelp_svm.sav', 'wb'))

# fasttext

In [149]:
import pandas as pd
import numpy as np
!pip install fasttext
import fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 3.9 MB/s eta 0:00:01
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-macosx_10_13_x86_64.whl size=324173 sha256=61e6d17bc7af2c71643949bdcf78291ae81b81204321f9955d8d8efc5300e859
  Stored in directory: /Users/zazhu/Library/Caches/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2
You should consider upgrading via the '/Users/zazhu/.venv/py37/bin/python3 -m pip install --upgrade pip' command.[0m


In [151]:
# modeling
# fasttext requires data to be in the format of: __label__1 text
data['new_label'] = data.apply(lambda t: '__label__' + str(t['binary_label']) + ' ' + str(t['text']),
                           axis=1)
# train test split
X = data['new_label']
y = data['binary_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=66)

In [154]:
# save train and test data
X_train.to_csv('fasttext_train.txt',index=False, header=False)
X_test.to_csv('fasttext_test.txt',index=False, header=False)

In [155]:
# fasttext model - default
ft_model = fasttext.train_supervised('fasttext_train.txt')

# calculate evaluation metrics
result = ft_model.test('fasttext_test.txt')
precision = result[1]
recall = result[2]
print("Precision: %0.4f"%precision)
print("Recall: %0.4f"%recall)
print("F1 score: %0.4f"%(2*precision*recall/(precision+recall)))

Precision: 0.9015
Recall: 0.9015
F1 score: 0.9015


In [157]:
# fasttext model - setting 1
ft_model = fasttext.train_supervised('fasttext_train.txt',wordNgrams=2)

# calculate evaluation metrics
result = ft_model.test('fasttext_test.txt')
precision = result[1]
recall = result[2]
print("Precision: %0.4f"%precision)
print("Recall: %0.4f"%recall)
print("F1 score: %0.4f"%(2*precision*recall/(precision+recall)))

Precision: 0.9116
Recall: 0.9116
F1 score: 0.9116


In [257]:
# fasttext model - setting 2
ft_model = fasttext.train_supervised('fasttext_train.txt',lr=0.8, wordNgrams=2)

# calculate evaluation metrics
result = ft_model.test('fasttext_test.txt')
precision = result[1]
recall = result[2]
print("Precision: %0.4f"%precision)
print("Recall: %0.4f"%recall)
print("F1 score: %0.4f"%(2*precision*recall/(precision+recall)))

Precision: 0.9066
Recall: 0.9066
F1 score: 0.9066


In [256]:
# fasttext model - setting 3
ft_model = fasttext.train_supervised('fasttext_train.txt',lr=0.05, epoch=10, wordNgrams=2)

# calculate evaluation metrics
result = ft_model.test('fasttext_test.txt')
precision = result[1]
recall = result[2]
print("Precision: %0.4f"%precision)
print("Recall: %0.4f"%recall)
print("F1 score: %0.4f"%(2*precision*recall/(precision+recall)))

Precision: 0.9110
Recall: 0.9110
F1 score: 0.9110


# Use SVM model to predict 

In [163]:
# -*- coding: utf-8 -*-
"""predict_svm.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/1C_Wz45F6ilIY7nV4pbBi_dnpeT0gFaAr
"""

import pickle
import numpy as np
import json
import spacy

In [166]:
model_svm = pickle.load(open('yelp_svm.sav', 'rb'))

with open('tfidf_vec.pkl', 'rb') as f:
    feature_transformer = pickle.load(f)

review = ['I love this restaurant! It is sooooo good!',
         'The delivery never came. I had to call them to cancel the order.',
         'The place is nice with large space and nice decoration.',
         'The food is okay but too pricy',
         'It is easy to park nearby, furnished recently, but too many people in the gym.']

label = [1, 0, 1, 0, 0]

In [190]:
# tokenize and normalize the documents
stop_words = set(stopwords.words('english')) 

# convert to dataframe
data = pd.DataFrame({'text': review, 'label': label})

# keep only numbers, letters and space
data['text'] = data.apply(lambda t: re.sub(r'[^0-9A-Za-z ]', '', str(t['text'])), axis=1)

# remove stopwords and convert to lower case
data['text'] = data.apply(lambda r: ' '.join(w.lower() for w in r['text'].split() if w.lower() not in stop_words),axis=1)

# discard NA reviews
data = data.dropna()

In [168]:
data

Unnamed: 0,text,label
0,love restaurant sooooo good,1
1,delivery never came call cancel order,0
2,place nice large space nice decoration,1
3,food okay pricy,0
4,easy park nearby furnished recently many peopl...,0


In [177]:
fe = feature_transformer.transform(data['text'].tolist())

confidence_score = model_svm.decision_function(fe)
y_pred = model_svm.predict(fe)

result = {}
for i in range(len(data)):
    result[str(i) + '__label: ' + str(label[i])] = {'predicted label': int(y_pred[i]),
                             'confidence score': confidence_score[i]}

In [178]:
print(json.dumps(result, indent=2))

{
  "0__label: 1": {
    "predicted label": 1,
    "confidence score": 2.696785887134676
  },
  "1__label: 0": {
    "predicted label": 0,
    "confidence score": -0.1881717078405898
  },
  "2__label: 1": {
    "predicted label": 1,
    "confidence score": 1.1902715709202243
  },
  "3__label: 0": {
    "predicted label": 0,
    "confidence score": -3.8353964250034984
  },
  "4__label: 0": {
    "predicted label": 1,
    "confidence score": 1.0917192647405367
  }
}


In [176]:
with open('svm_predcition.json', 'w') as f:
    json.dump(result, f)