# Summary Stats

In [1]:
import json
import pandas as pd
import numpy as np
import multiprocessing
from nltk.tokenize import word_tokenize
from collections import Counter
from statistics import mean

# from sklearn.feature_extraction.text import CountVectorizer
# # from sklearn.feature_extraction.text import TfidTransformer
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import LinearSVC

In [2]:
# The summary statistics should include
# the number of documents, number of labels, label distribution, average / mean word length of documents.

In [3]:
def pre_process(reviews):
    star_ls = []
    text_ls = []
    doc_len_ls = []
    for review in reviews:
        dat = json.loads(review)
        star = dat['stars']
        text = dat['text']
        star_ls.append(star)
        text_ls.append(text)
        doc_len_ls.append(len(word_tokenize(text)))
    return star_ls, text_ls, doc_len_ls  

In [4]:
# main
lines = open('yelp_academic_dataset_review.json', encoding="utf8").readlines()[:500000]

In [5]:
star_ls, text_ls, doc_len_ls = pre_process(lines)

In [6]:
# Number of documents
num_doc = len(lines)
# Number of labels
num_labels = len(set(star_ls))
# Label distribution
distribution = Counter(star_ls)
# average / mean word length of documents
avg_len = mean(doc_len_ls)

In [7]:
print("Number of documents: ", num_doc)
print("Number of labels: ", num_labels)
print("Label distribution: ", distribution)
print("Average word length of documents: ", avg_len)

Number of documents:  500000
Number of labels:  5
Label distribution:  Counter({5.0: 220375, 4.0: 112802, 1.0: 70468, 3.0: 55778, 2.0: 40577})
Average word length of documents:  123.175352


In [8]:
# Clean data
from nltk.corpus import stopwords

def tokenization(text, stopwords):
    # Apply nltk word tokenization, convert to lower cases, remove non-alphabetic chars and stopwords
    review_tokenized = [i.lower() for i in word_tokenize(text) if i.isalpha() and i not in stopwords]
    return review_tokenized

def cleaning(stars, texts):
    # List of stopwords that do not add much meaning to a sentence
    stop_words = set(stopwords.words('english'))
    texts_tokenized = [tokenization(i, stop_words) for i in texts]
    cleaned_df = pd.DataFrame({'star': stars, 'text': texts_tokenized})
    # Create binary lables: 1 for ratings > 3, 0 for ratings <= 3. This variable is created to apply logistic regression on multi-label class.
    cleaned_df['binary_label'] = cleaned_df.apply(lambda x: 1 if x['star'] > 3 else 0, axis=1)
    # Remove NAs
    cleaned_df = cleaned_df.dropna()
    return cleaned_df

cleaned_df = cleaning(star_ls, text_ls)

      

In [9]:
print("The first 10 rows of cleaned dataset: \n", cleaned_df.head(10))

The first 10 rows of cleaned dataset: 
    star                                               text  binary_label
0   2.0  [as, someone, worked, many, museums, i, eager,...             0
1   1.0  [i, actually, horrified, place, still, busines...             0
2   5.0  [i, love, deagan, i, i, really, the, atmospher...             1
3   1.0  [dismal, lukewarm, texmex, glop, mumbly, uneng...             0
4   4.0  [oh, happy, day, finally, canes, near, casa, y...             1
5   5.0  [this, definitely, favorite, fast, food, sub, ...             1
6   5.0  [really, good, place, simple, decor, amazing, ...             1
7   5.0  [awesome, office, staff, professional, friendl...             1
8   5.0  [most, delicious, authentic, italian, i, us, y...             1
9   4.0  [i, twice, very, nice, laid, back, i, tried, w...             1


In [10]:
cleaned_df.to_csv('cleaned_reviews.csv')
print("Cleaned dataframe successfully saved into CSV as 'cleaned_reviews.csv'")

Cleaned dataframe successfully saved into CSV as 'cleaned_reviews.csv'


# Logistic Regression

In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [12]:
# Run 3 or more sets of experiments,
# varying the bag-of-word representation (1gram vs 1gram+2gram),
# and 2 or more hyperparameter settings.

# Write a several-sentence rationale for your choice of parameters. 
# Produce a table summarizing your experiments: parameter values, BOW representation, results 
# in terms of Precision, Recall, F1-score, micro-averaged F1-score, or Accuracy.

In [13]:
def split_data(data):
    X_train, X_test, y_train, y_test = train_test_split(data['text'], data['binary_label'], test_size=0.33, random_state=42)
    return X_train, X_test, y_train, y_test

In [14]:
def tfidf_transform(vectorizer, train, test):
    features_train = vectorizer.fit_transform(train)
    features_test = vectorizer.fit_transform(test)
    return features_train, features_test

In [15]:
def score_metrics(test, pred):
    print("Precision:", precision_score(test, pred))
    print("Recall: ", recall_score(test, pred))
    print("F1 score: ", f1_score(test, pred))
    print("Accuracy: ", accuracy_score(test, pred))
    print("Micro-averaged F1-score: ", f1_score(test, pred, average = 'micro'))

In [16]:
# main

# Read data in from CSV
data = pd.read_csv("cleaned_reviews.csv")
data = data.drop('Unnamed: 0', axis = 1)

# Split data into test and train sets
X_train, X_test, y_train, y_test = split_data(data)

In [17]:
# 1gram, C=0.5, max_iter = 400
tfidf_1gram = TfidfVectorizer(min_df = 100, max_features = 300, ngram_range=(1,1), sublinear_tf = True, stop_words = 'english')
train_x_1gram, test_x_1gram = tfidf_transform(tfidf_1gram, X_train, X_test)

print("Results for 1gram with C=0.5 and max_iter = 400")
lr1 = LogisticRegression(C=0.5, random_state=42, max_iter=400)
model1 = lr1.fit(train_x_1gram, y_train)
pred1 = model1.predict(test_x_1gram)
score_metrics(y_test, pred1)

Results for 1gram with C=0.5 and max_iter = 400
Precision: 0.7611896806278654
Recall:  0.7122045204620432
F1 score:  0.7358828086484963
Accuracy:  0.6590727272727273
Micro-averaged F1-score:  0.6590727272727273


In [18]:
# 1gram, C=5, max_iter = 400
print("Results for 1gram with C=5 and max_iter = 400")
lr2 = LogisticRegression(C=5, random_state=42, max_iter=400)
model2 = lr2.fit(train_x_1gram, y_train)
pred2 = model2.predict(test_x_1gram)
score_metrics(y_test, pred2)

Results for 1gram with C=5 and max_iter = 400
Precision: 0.7616544879059044
Recall:  0.7085692474075959
F1 score:  0.7341534955766795
Accuracy:  0.6577878787878788
Micro-averaged F1-score:  0.6577878787878788


In [19]:
# 1gram+2gram, C=0.5, max_iter = 400
tfidf_1gram2gram = TfidfVectorizer(min_df = 100, max_features = 100, ngram_range=(1,2), sublinear_tf = True, stop_words = 'english')
train_x_1gram2gram, test_x_1gram2gram = tfidf_transform(tfidf_1gram2gram, X_train, X_test)

print("Results for 1gram2gram with C=0.5 and max_iter = 400")
lr3 = LogisticRegression(C=0.5, random_state=42, max_iter=400)
model3 = lr3.fit(train_x_1gram2gram, y_train)
pred3 = model3.predict(test_x_1gram2gram)
score_metrics(y_test, pred3)

Results for 1gram2gram with C=0.5 and max_iter = 400
Precision: 0.8275998520391924
Recall:  0.8743376986903929
F1 score:  0.8503270284603147
Accuracy:  0.794739393939394
Micro-averaged F1-score:  0.794739393939394


In [20]:
# 1gram+2gram, C=5, max_iter = 100
print("Results for 1gram2gram with C=5 and max_iter = 100")
lr4 = LogisticRegression(C=5, random_state=42, max_iter=100)
model4 = lr4.fit(train_x_1gram2gram, y_train)
pred4 = model4.predict(test_x_1gram2gram)
score_metrics(y_test, pred4)

Results for 1gram2gram with C=5 and max_iter = 100
Precision: 0.8278435896552911
Recall:  0.8739105541064953
F1 score:  0.8502535490231621
Accuracy:  0.7947212121212122
Micro-averaged F1-score:  0.7947212121212122


# SVM

In [21]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.svm import LinearSVC
import pickle

In [22]:
def split_data(data):
    X_train, X_test, y_train, y_test = train_test_split(data['text'], data['binary_label'], test_size=0.33, random_state=42)
    return X_train, X_test, y_train, y_test

In [23]:
def tfidf_transform(vectorizer, train, test):
    features_train = vectorizer.fit_transform(train)
    features_test = vectorizer.fit_transform(test)
    return features_train, features_test

In [24]:
def score_metrics(test, pred):
    print("Precision:", precision_score(test, pred))
    print("Recall: ", recall_score(test, pred))
    print("F1 score: ", f1_score(test, pred))
    print("Accuracy: ", accuracy_score(test, pred))
    print("Micro-averaged F1-score: ", f1_score(test, pred, average = 'micro'))

In [25]:
# main

# Read data in from CSV
data = pd.read_csv("cleaned_reviews.csv")
data = data.drop('Unnamed: 0', axis = 1)

# Split data into test and train sets
X_train, X_test, y_train, y_test = split_data(data)

In [26]:
# 1gram, C=0.5, loss=hinge
tfidf_1gram = TfidfVectorizer(min_df = 100, max_features = 300, ngram_range=(1,1), sublinear_tf = True, stop_words = 'english')
train_x_1gram, test_x_1gram = tfidf_transform(tfidf_1gram, X_train, X_test)

print("Results for 1gram with C=0.5 and loss=hinge")
svm1 = LinearSVC(C=0.5, loss = 'hinge', random_state=42)
model1 = svm1.fit(train_x_1gram, y_train)
pred1 = model1.predict(test_x_1gram)
score_metrics(y_test, pred1)

Results for 1gram with C=0.5 and loss=hinge




Precision: 0.7600279552322549
Recall:  0.7017167576999628
F1 score:  0.7297092957320531
Accuracy:  0.6533333333333333
Micro-averaged F1-score:  0.6533333333333333


In [27]:
# 1gram, C=0.5, loss=squared_hinge

print("Results for 1gram with C=0.5 and loss=squared_hinge")
svm2 = LinearSVC(C=0.5, loss = 'squared_hinge', random_state=42)
model2 = svm2.fit(train_x_1gram, y_train)
pred2 = model2.predict(test_x_1gram)
score_metrics(y_test, pred2)

Results for 1gram with C=0.5 and loss=squared_hinge
Precision: 0.7617140959784371
Recall:  0.7088600692519517
F1 score:  0.7343372671596895
Accuracy:  0.657969696969697
Micro-averaged F1-score:  0.657969696969697


In [28]:
# 1gram+2gram, C=0.5, loss=hinge
tfidf_1gram2gram = TfidfVectorizer(min_df = 100, max_features = 100, ngram_range=(1,2), sublinear_tf = True, stop_words = 'english')
train_x_1gram2gram, test_x_1gram2gram = tfidf_transform(tfidf_1gram2gram, X_train, X_test)

print("Results for 1gram2gram with C=0.5 and loss=hinge")
svm3 = LinearSVC(C=0.5, loss = 'hinge', random_state=42)
model3 = svm3.fit(train_x_1gram2gram, y_train)
pred3 = model3.predict(test_x_1gram2gram)
score_metrics(y_test, pred3)

Results for 1gram2gram with C=0.5 and loss=hinge




Precision: 0.8282997088365522
Recall:  0.8738651131933147
F1 score:  0.8504725387959438
Accuracy:  0.7950848484848485
Micro-averaged F1-score:  0.7950848484848485


In [29]:
# 1gram+2gram, C=5, loss=squared_hinge
print("Results for 1gram2gram with C=5 and loss=squared_hinge")
svm4 = LinearSVC(C=5, loss = 'squared_hinge', random_state=42)
model4 = svm4.fit(train_x_1gram2gram, y_train)
pred4 = model4.predict(test_x_1gram2gram)
score_metrics(y_test, pred4)

Results for 1gram2gram with C=5 and loss=squared_hinge
Precision: 0.826364657367478
Recall:  0.8759917479301664
F1 score:  0.8504548382258221
Accuracy:  0.7945575757575758
Micro-averaged F1-score:  0.7945575757575758


In [31]:
# Save the best SVM model for Problem 4
with open('best_model_svm.pkl', 'wb') as f:
    pickle.dump(model3, f)
    
with open('tfidf_1gram2gram.pkl', 'wb') as f:
    pickle.dump(tfidf_1gram2gram, f)

### fasttext - need to fix problem with crashing

In [32]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# import fasttext

In [33]:
# def split_data(data):
#     X_train, X_test, y_train, y_test = train_test_split(data['transformed_label'], data['binary_label'], test_size=0.33, random_state=42)
#     return X_train, X_test, y_train, y_test

In [34]:
# def score_metrics(test, pred):
#     print("Precision:", precision_score(test, pred))
#     print("Recall: ", recall_score(test, pred))
#     print("F1 score: ", f1_score(test, pred))
#     print("Accuracy: ", accuracy_score(test, pred))
#     print("Micro-averaged F1-score: ", f1_score(test, pred, average = 'micro'))

In [35]:
# # Read data in from CSV
# data = pd.read_csv("cleaned_reviews.csv")
# data = data.drop('Unnamed: 0', axis = 1)

In [36]:
# # Reference: https://medium.com/@ravindraprasad/build-your-own-text-classification-in-less-than-25-lines-of-code-using-fasttext-dae7229f80f9
    
# # Each line of the text file contains a list of labels, followed by the corresponding document. 
# # All the labels start by the __label__ prefix, which is how fastText recognize what is a label or what is a word. 
# # e.g. __label__1 document_x
# data['transformed_label'] = data.apply(lambda i: '__label__' + str(i['binary_label']) + ' ' + str(i['text']), axis = 1)

# # Split data into test and train sets
# X_train, X_test, y_train, y_test = split_data(data)

In [37]:
# model = fasttext.train_supervised(X_train.tolist())

# Best Prediction

In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.svm import LinearSVC
from nltk import word_tokenize
import json
from nltk.corpus import stopwords

In [39]:
# Extract stars, text, and word length of documents after applying tokenization
def pre_process(reviews):
    star_ls = []
    text_ls = []
    doc_len_ls = []
    for review in reviews:
        dat = json.loads(review)
        star = dat['stars']
        text = dat['text']
        star_ls.append(star)
        text_ls.append(text)
        # apply nltk word tokenization
        doc_len_ls.append(len(word_tokenize(text)))
    return star_ls, text_ls, doc_len_ls

# Apply nltk word tokenization, convert to lower cases, remove non-alphabetic chars and stopwords
def tokenization(text, stopwords):
    review_tokenized = [i.lower() for i in word_tokenize(text) if i.isalpha() and i not in stopwords]
    return review_tokenized

# Clean reviews and save into CSV file
def cleaning(stars, texts):
    # List of stopwords that do not add much meaning to a sentence
    stop_words = set(stopwords.words('english'))
    texts_tokenized = [tokenization(i, stop_words) for i in texts]
    cleaned_df = pd.DataFrame({'star': stars, 'text': texts_tokenized})
    # Create binary lables: 1 for ratings > 3, 0 for ratings <= 3. This variable is created to apply logistic regression on multi-label class.
    cleaned_df['binary_label'] = cleaned_df.apply(lambda x: 1 if x['star'] > 3 else 0, axis=1)
    # Remove NAs
    cleaned_df = cleaned_df.dropna()
    return cleaned_df


In [41]:
# main

with open('tfidf_1gram2gram.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

with open('best_model_svm.pkl', 'rb') as f:
    best_svm_model = pickle.load(f)


In [42]:
# Read in next 50,000 lines of Yelp reviews for prediction
lines = open('yelp_academic_dataset_review.json', encoding="utf8").readlines()[600000:650000]
star_ls, text_ls, doc_len_ls = pre_process(lines)
cleaned_df = cleaning(star_ls, text_ls)

In [45]:
features = tfidf_vectorizer.fit_transform(text_ls)

In [48]:
pred = best_svm_model.predict(features)

In [49]:
conf = best_svm_model.decision_function(features)

In [50]:
output = {
        'review': text_ls,
        'actual_label': cleaned_df['binary_label'].tolist(),
        'predicted_label': pred.tolist(),
        'confidence': conf.tolist()
}

In [51]:
with open('predcition_results.json', 'w') as f:
    json.dump(output, f)
print("Prediction results saved into json file.")

Prediction results saved into json file.
