In [1]:
# package
from pathlib import Path
import json
import pickle
import os
import numpy as np
import pandas as pd
from textblob import TextBlob
from scipy import sparse
from time import time

# sklearn for feature extraction & modeling
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib
import lightgbm as lgb

import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from the ML libraries — confirm that is intended.
warnings.filterwarnings('ignore')
# Seed NumPy's global random number generator.
np.random.seed(2021)

In [2]:
# Directory holding the raw dataset and derived artifacts, relative to the notebook.
data_dir = Path('..') / 'data'

## Load Data

In [3]:
# clean data
# Read the newline-delimited review dump (one JSON object per line).
data = pd.read_json(data_dir / 'yelp_academic_dataset_review.json', lines=True)
# Vectorized year extraction instead of a per-row Python lambda; like the
# previous `x.year` access, this relies on `date` holding datetime values.
year = data.date.dt.year
# Drop the identifier columns and the full timestamp; only the year is kept.
data.drop(['user_id', 'business_id', 'date'], axis=1, inplace=True)
data['year'] = year

In [4]:
# train / test split
# Chronological hold-out: everything before 2019 trains, 2019 itself tests.
# Rows from any later year end up in neither frame.
train = data.loc[data.year < 2019]
test = data.loc[data.year == 2019]

In [5]:
# save data
# Build the paths from `data_dir` so the output location is defined in one place
# and cannot drift from where the raw data was read.
train.to_csv(data_dir / 'train.csv')
test.to_csv(data_dir / 'test.csv')

In [6]:
# Reload the persisted splits; the first CSV column is the saved index.
# Paths come from `data_dir` so they stay in sync with the rest of the notebook.
train = pd.read_csv(data_dir / 'train.csv', index_col=0)
test = pd.read_csv(data_dir / 'test.csv', index_col=0)

## Create Yelp review document-term matrix

In [7]:
def create_doc_matrix(train, test=None, path='data'):
    """Build bag-of-words document-term matrices and save them as ``.npz`` files.

    A ``CountVectorizer`` (English stop words removed, unigrams and bigrams,
    vocabulary capped at 10,000 terms) is fitted on ``train.text``. The same
    fitted vocabulary is then applied to ``test.text`` when ``test`` is given.

    Parameters
    ----------
    train : object exposing a ``text`` attribute with the training documents.
    test : optional object exposing a ``text`` attribute with the test documents.
    path : str or pathlib.Path
        Output directory; ``train_dtm`` (and ``test_dtm``) are written there.
        The directory is created if it does not exist.

    Returns
    -------
    ``train_dtm`` alone when ``test`` is None, otherwise ``(train_dtm, test_dtm)``.
    """
    # The default value is a plain string, and `str / str` raises TypeError,
    # so normalise to a Path before composing file names.
    path = Path(path)
    path.mkdir(parents=True, exist_ok=True)

    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10000)
    train_dtm = vectorizer.fit_transform(train.text)
    sparse.save_npz(path / 'train_dtm', train_dtm)
    if test is None:
        return train_dtm

    # Transform only: the test set must reuse the vocabulary learned on train.
    test_dtm = vectorizer.transform(test.text)
    sparse.save_npz(path / 'test_dtm', test_dtm)
    return train_dtm, test_dtm

In [8]:
# save
# Vectorize both splits and write the resulting sparse matrices under data_dir.
train_dtm, test_dtm = create_doc_matrix(train, test=test, path=data_dir)

In [9]:
# reload
# Read the persisted sparse matrices back from data_dir.
train_dtm, test_dtm = (
    sparse.load_npz(data_dir / fname)
    for fname in ('train_dtm.npz', 'test_dtm.npz')
)

## Benchmark Accuracy

In [10]:
# Notebook-wide collectors, filled in per model by later cells.
accuracy = {}
runtime = {}
# Start from the true test labels; model predictions are added as extra columns.
predictions = test[['stars']].copy()

In [11]:
# Baseline: predict the most frequent training rating for every test review.
majority_star = train.stars.mode().iloc[0]
naive_prediction = np.full_like(predictions.stars, fill_value=majority_star)
naive_benchmark = accuracy_score(predictions.stars, naive_prediction)
naive_benchmark

0.5117779042568241

## Model Evaluation Helper

In [12]:
def evaluate_model(model, X_train, X_test, name, store=False):
    """Fit ``model`` and record its fit time, test predictions and test accuracy.

    Labels are read from the notebook-level ``train.stars`` / ``test.stars``.
    Results are written into the notebook-level ``runtime``, ``predictions``
    and ``accuracy`` containers under the key ``name``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : feature matrices for the train and test splits.
    name : str
        Key used in the result containers and in the stored file name.
    store : bool, default False
        When True, the fitted model is dumped to ``../results/<name>.joblib``.

    Returns
    -------
    None
    """
    if store:
        # Create the output directory before fitting, so a missing folder does
        # not surface only after a long training run has finished.
        Path('../results').mkdir(parents=True, exist_ok=True)

    start = time()
    model.fit(X_train, train.stars)
    runtime[name] = time() - start  # wall-clock fit time in seconds

    predictions[name] = model.predict(X_test)
    accuracy[name] = accuracy_score(test.stars, predictions[name])
    if store:
        joblib.dump(model, f'../results/{name}.joblib')

## Multiclass Naive Bayes

In [13]:
# Search the smoothing strength alpha over nine log-spaced values (1e-4 .. 1e4)
# using GridSearchCV's default cross-validation on the training matrix.
nb = MultinomialNB()
param_nb = {'alpha': np.logspace(-4, 4, 9)}
clf_nb = GridSearchCV(nb, param_nb)
clf_nb.fit(train_dtm, train.stars)
print(f"The best alpha is {clf_nb.best_params_['alpha']}.")

The best alpha is 100.0.


In [14]:
# Refit Naive Bayes with the selected alpha, score it on the test split,
# and store the fitted model on disk.
result = 'nb_text'
nb = MultinomialNB(alpha=clf_nb.best_params_['alpha'])
evaluate_model(nb, train_dtm, test_dtm, name=result, store=True)
print("Multiclass Naive Bayes: ", accuracy[result])

Multiclass Naive Bayes:  0.6557891031356202


## Multinomial Logistic Regression

In [15]:
# Candidate values of C: nine log-spaced values from 1e-4 to 1e4.
Cs = np.logspace(-4, 4, 9)
# Per-C test accuracy (keyed by C) and per-fit wall-clock time (in sweep order).
log_reg_text_accuracy = {}
log_reg_text_runtime = []

In [16]:
# Reset the collectors so that re-running this cell on its own does not append
# to, or report timings from, a previous sweep.
log_reg_text_accuracy.clear()
log_reg_text_runtime.clear()

# NOTE(review): every C is scored directly on the test split, so the best
# accuracy picked afterwards is selected on test data — consider a validation split.
for C in Cs:
    start = time()
    model = LogisticRegression(C=C,
                               multi_class='multinomial',
                               solver='lbfgs')

    model.fit(train_dtm, train.stars)
    elapsed = time() - start
    log_reg_text_runtime.append(elapsed)

    test_accuracy = accuracy_score(test.stars, model.predict(test_dtm))
    log_reg_text_accuracy[C] = test_accuracy

    # Report this iteration's own values rather than indexing the shared list.
    print(f'{C:12.5f}: {elapsed:.2f}s | {test_accuracy:.2%}', flush=True)

     0.00010: 91.59s | 70.90%
     0.00100: 141.64s | 73.96%
     0.01000: 139.28s | 74.81%
     0.10000: 140.92s | 74.78%
     1.00000: 137.63s | 74.76%
    10.00000: 140.12s | 74.74%
   100.00000: 138.22s | 74.78%
  1000.00000: 141.09s | 74.74%
 10000.00000: 141.77s | 74.76%


In [17]:
# Summarise the sweep: the C with the highest test accuracy, that accuracy,
# and the mean fit time across all values of C.
lr_best = max(log_reg_text_accuracy, key=log_reg_text_accuracy.get)
accuracy['lr_text'] = pd.Series(log_reg_text_accuracy).max()
runtime['lr_text'] = np.mean(log_reg_text_runtime)

### SGD Classifier

In [18]:
# online learning
# SGD-trained linear classifier with log loss, fitted on the count-based matrix.
clf = SGDClassifier(loss='log', random_state=2021)
# Not consumed by fit(); presumably kept for a later partial_fit call — confirm.
classes = np.array(range(1,6))
clf.fit(train_dtm, train.stars)
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaving an open handle behind.
with open('../results/classifier.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file, protocol=4)

SGDClassifier(loss='log', random_state=2021)

## Gradient Boosting

In [21]:
# Training dataset: CSR float32 copy of the document-term matrix, with labels
# shifted down by one (presumably stars 1..5 -> classes 0..4 — confirm).
# Every column index is declared as a categorical feature.
# NOTE(review): the columns are n-gram counts; confirm that treating them as
# categorical rather than numeric is intended.
lgb_train = lgb.Dataset(data=train_dtm.tocsr().astype(np.float32), 
                        label=train.stars.sub(1), 
                        categorical_feature=list(range(train_dtm.shape[1])))

# Test dataset, built with the training dataset as its reference.
lgb_test = lgb.Dataset(data=test_dtm.tocsr().astype(np.float32), 
                       label=test.stars.sub(1), 
                       reference=lgb_train)

In [22]:
# Five-class boosting objective, evaluated with the multi-class error rate.
param = {'objective': 'multiclass',
         'metrics': ['multi_error'],
         'num_class': 5}
# Train for at most 2000 rounds, logging every 25 rounds, with a 25-round
# early-stopping patience.
# NOTE(review): lgb_test is passed as a validation set, so early stopping appears
# to be driven by the test split — the reported test error may be optimistic.
# NOTE(review): newer LightGBM releases replace `early_stopping_rounds` /
# `verbose_eval` with callbacks — confirm against the installed version.
booster = lgb.train(params=param,
                    train_set=lgb_train,
                    num_boost_round=2000,
                    early_stopping_rounds=25,
                    valid_sets=[lgb_train, lgb_test],
                    verbose_eval=25)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43360
[LightGBM] [Info] Number of data points in the train set: 1701322, number of used features: 10000
[LightGBM] [Info] Start training from score -1.871485
[LightGBM] [Info] Start training from score -2.510575
[LightGBM] [Info] Start training from score -2.208518
[LightGBM] [Info] Start training from score -1.515108
[LightGBM] [Info] Start training from score -0.831868
Training until validation scores don't improve for 25 rounds
[25]	training's multi_error: 0.401738	valid_1's multi_error: 0.320236
[50]	training's multi_error: 0.371502	valid_1's multi_error: 0.298015
[75]	training's multi_error: 0.357112	valid_1's multi_error: 0.287366
[100]	training's multi_error: 0.34787	valid_1's multi_error: 0.28078
[125]	training's multi_error: 0.341596	valid_1's multi_error: 0.276105
[150]	training's multi_error: 0.336934	valid_1's multi_error: 0.272693
[175]	training's multi_error: 0.332912	valid_1's multi_er

## SQLite

In [23]:
import sqlite3

In [24]:
db_path = '../results/reviews.sqlite'
document = "The food is good!"
y = 5
# Start from an empty database file so CREATE TABLE cannot collide with a
# table left over from an earlier run.
if os.path.exists(db_path):
    os.remove(db_path)
conn = sqlite3.connect(db_path)
try:
    c = conn.cursor()
    c.execute(
        "CREATE TABLE review_db "\
        "(review TEXT, star INTEGER, date TEXT)"
    )
    # Parameterized insert; the timestamp is generated by SQLite itself.
    c.execute(
        "INSERT INTO review_db (review, star, date) "\
        "VALUES (?, ?, DATETIME('now'))", (document, y))
    conn.commit()
finally:
    # Always release the connection, even if a statement above raises.
    conn.close()

In [25]:
# Read every stored review back to confirm the insert worked.
conn = sqlite3.connect(db_path)
try:
    c = conn.cursor()
    c.execute(
        "SELECT * FROM review_db"
    )
    results = c.fetchall()
finally:
    # Always release the connection, even if the query raises.
    conn.close()
print(results)

[('The food is good!', 5, '2021-03-16 13:39:08')]


## Tokenizer & HashingVectorizer

In [21]:
import re
from nltk.corpus import stopwords
# English stop-word list from NLTK; the tokenizer filters these words out.
# NOTE(review): presumably requires the NLTK `stopwords` corpus to be
# downloaded beforehand — confirm for fresh environments.
stop = stopwords.words('english')

In [38]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) +        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [39]:
import os
import pickle
# Persist the stop-word list next to the models. The context manager closes
# the file deterministically instead of leaving an open handle behind.
with open('../results/stopwords.pkl', 'wb') as stopwords_file:
    pickle.dump(stop, stopwords_file, protocol=4)

In [40]:
# Hashing vectorizer with 2**21 output features, using the custom tokenizer
# defined in this notebook.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

In [41]:
# Hash both splits into sparse feature matrices.
# NOTE(review): this rebinds train_dtm / test_dtm, replacing the count-based
# matrices created earlier in the notebook.
train_dtm, test_dtm = (vect.transform(split.text) for split in (train, test))

In [42]:
# SGD-trained linear classifier with log loss, this time on the hashed features.
clf = SGDClassifier(random_state=2021, loss='log')
# The five star ratings 1..5; not consumed by fit() below.
classes = np.arange(1, 6)
clf.fit(train_dtm, train.stars)

SGDClassifier(loss='log', random_state=2021)

In [43]:
# Persist the classifier trained on hashed features. The context manager closes
# the file deterministically instead of leaving an open handle behind.
with open('../results/classifier_hash.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file, protocol=4)