In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

import collections
import re, string
import sys
import time
import os


In [2]:
import json
import csv

def init_dataset(json) -> tuple[dict, list]:
    ds: dict = {}
    keys = json.keys()
    for k in keys:
        ds[k] = []
    return ds, keys

def read_json(file) -> pd.DataFrame:
    dataset = {}
    keys = []
    with open(file) as file_lines:
        for count, line in enumerate(file_lines):
            json_line = json.loads(line.strip())
            if count == 0:
                dataset, keys = init_dataset(json_line)
            for k in keys:
                dataset[k].append(json_line[k])
        return pd.DataFrame(dataset)

def read_csv(file) -> pd.DataFrame:
    dataset = {}
    with open(file, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        keys = reader.fieldnames
        for k in keys:
            dataset[k] = []
        for row in reader:
            for k in keys:
                dataset[k].append(row[k])
    return pd.DataFrame(dataset)


In [3]:
#yelp_review = read_json('data/yelp_academic_dataset_review.json')
yelp_review = read_csv('data/yelp_academic_dataset_review.csv')

In [4]:
#yelp_business = read_json('data/yelp_academic_dataset_business.json')
yelp_business = read_csv('data/yelp_academic_dataset_business.csv')

In [5]:
# Sample Data: Restaurants reviewed by karen, the user with the most reviews
# Businesses that are categorized as restaurants
business_restaurant = yelp_business.loc[yelp_business['categories'].str.contains('Restaurant', na=False)]
# Reviews of Restaurant businesses
review_restaurant = yelp_review[yelp_review['business_id'].isin(business_restaurant['business_id'])]
# User with most restaurant reviews
karen = review_restaurant['user_id'].value_counts().index[0]
# Reviews Karen has made of restaurant businesses
review_restaurant_karen = review_restaurant.loc[review_restaurant['user_id'] == karen]
# Restaurant businesses that Karen has reviewed
business_restaurant_karen = business_restaurant[business_restaurant['business_id'].isin(review_restaurant_karen['business_id'])]

In [6]:
## Clean Data: remove missing rows and irrelevant columns
df = business_restaurant_karen.set_index('business_id')

# Remove columns with greater than 20% missing fields
mask = df.applymap(lambda x: x =='' or x == 'None').sum()
features = ((mask/len(df)) * 100).map(lambda x: x < 20)


# Remove non-attribute columns (except business_id)
features.loc[~features.index.str.contains('attributes.')] = False
#features.loc['business_id'] = True
dataset = df.loc[:, features]

# Remove rows with missing data
mask = dataset.applymap(lambda x: x == '' or x == 'None')
dataset = dataset.loc[~mask.any(axis=1)]

# Remove all non-boolean columns
mask = dataset.applymap(lambda x : x == 'True' or x == 'False').sum() != 0
#mask.loc['business_id'] = True
#dataset = dataset.set_index('business_id')
dataset = dataset.loc[:, mask].applymap(lambda x: x == 'True')

In [16]:
# Transform Data: add targets
df = review_restaurant_karen.set_index('business_id')
df = df.loc[df.index.intersection(dataset.index)]
df = df.astype({'stars':'float'})
dataset['target'] = df.groupby(df.index)['stars'].mean().map(lambda x: x > 3)

In [100]:
import pandas as pd
import numpy as np

# Principal Component Analysis

# Convert feature set to numpy array
lamb = lambda x: 1. if x == True else 0.
labels = dataset['target'].map(lamb).to_numpy()
feature_set = dataset.drop(['target'], axis=1).applymap(lamb).to_numpy()

# Calculate eigenvectors of covariance matrix
C = np.cov(feature_set.T)
evals, evecs = np.linalg.eig(C)
pcts = 100 * evals / np.sum(evals)

# Select Feature Matrix
sortidx = np.argsort(pcts)
sorted_evecs = evecs[sortidx[::-1]]
feature_vec = sorted_evecs[:6]

X_pca = feature_vec @ feature_set.T
feature_vec.shape, np.sum(pcts[sortidx[::-1]][:6]), X_pca.shape
tmp_df = pd.DataFrame(X_pca.T)
tmp_df['target'] = labels
tmp_df

Unnamed: 0,0,1,2,3,4,5,target
0,1.210467,0.262052,-0.367110,1.045168,-0.385329,-0.685348,0.0
1,1.418842,0.518833,-1.023205,1.438368,0.205112,-0.539704,0.0
2,0.623524,0.588470,-0.410550,1.551780,0.218564,-0.442690,1.0
3,1.337595,0.894654,-1.002048,0.903907,0.279202,-0.834362,1.0
4,1.294439,0.635720,-1.114369,1.191932,-0.393770,-0.703637,0.0
...,...,...,...,...,...,...,...
681,1.334870,0.145165,-0.275946,1.291604,0.213553,-0.521416,0.0
682,0.593154,-0.358526,0.218769,2.040615,0.433516,-0.190422,1.0
683,1.294439,0.635720,-1.114369,1.191932,-0.393770,-0.703637,1.0
684,1.298512,0.183781,-1.269311,1.519797,-0.476358,0.360044,0.0


In [90]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

lamb = lambda x: 1. if x == True else 0.
labels = dataset['target'].map(lamb).to_numpy()
feature_set = dataset.drop(['target'], axis=1).applymap(lamb).to_numpy()

# Standardize the data
scaler = StandardScaler()
#X_scaled = scaler.fit_transform(feature_set)
X_scaled = feature_set

# Select the top n principal components
n_components = 6
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

tmp_df = pd.DataFrame(X_pca)
tmp_df['target'] = labels
pca.explained_variance_ratio_, X_pca.shape

(array([0.19474786, 0.17837407, 0.12973571, 0.12037835, 0.11731966,
        0.09319146]),
 (686, 6))

In [102]:
# Delineate between traning and testing set
train_df, test_df = train_test_split(tmp_df, test_size=0.2, random_state=42)
y_train = train_df['target'].to_numpy()
x_train = train_df.drop(['target'], axis=1).to_numpy()

y_test = test_df['target'].to_numpy()
x_test = test_df.drop(['target'], axis=1).to_numpy()

### Logistic Regression Classifier using Newton's Method

In [10]:
# hypothesis function... sigmoid function
def g(theta, x):
    return 1 / (1 + np.exp(-x @ theta))

# matrix derivative
def dJ(theta, x, y):
    m, _ = x.shape
    return 1/m* x.T @ (g(theta, x) - y)

# hessian matrix
def HJ(theta, x):
    m, _ = x.shape
    Z = g(theta, x)
    Z = Z*(1-Z)
    return 1/m * Z * x.T @ x

# distance between two vectors
def dist(x, y):
    return np.sum(np.abs(x-y))

In [11]:
class LogisticRegression(object):
    def __init__(self, step_size=0.2, max_iter=100, eps=1e-5,
                theta_0=None, verbose=True):
        self.theta = theta_0
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose

    def fit(self, x, y):
        m, n = x.shape
        if self.theta is None:
            self.theta=np.zeros(n)
        for i in range(self.max_iter):
            theta_new = self.theta - np.linalg.inv(HJ(self.theta, x)) @ dJ(self.theta, x, y)
            if dist(theta_new, self.theta) < self.eps:
                self.theta = theta_new
                break
            else:
                self.theta = theta_new

    def predict(self, x):
        return x @ self.theta >= 0

In [103]:
lg = LogisticRegression()
lg.fit(x_train, y_train)
print("Theta: ", lg.theta)
print("Training accuracy: ", np.mean(lg.predict(x_train) == y_train))
print("Testing accuracy:  ", np.mean(lg.predict(x_test) == y_test))

Theta:  [-0.53279372  0.48695175  0.17197836  0.72416349  0.17035942  0.07462808]
Training accuracy:  0.5894160583941606
Testing accuracy:   0.5869565217391305


### Naive Bayes Model

In [13]:
'''
Naive Bayes Classifier
'''
class NaiveBayes:
    '''
    Naive Bayes Classifier (Bernoulli event model)

    During training, the classifier learns probabilities by counting the
    occurences of feature/label combinations that it finds in the
    training data. During prediction, it uses these counts to
    compute probabilities.
    '''

    def __init__(self, use_laplace_add_one):
        self.label_counts = {}
        self.feature_counts = {}
        self.use_laplace_add_one = use_laplace_add_one # True for Laplace add-one smoothing

    def fit(self, train_features, train_labels):
        '''Training stage - learn from data'''

        self.label_counts[0] = 0
        self.label_counts[1] = 0

        ### YOUR CODE HERE (~5-10 Lines)
        self.label_counts[0] = np.count_nonzero(train_labels == 0)
        self.label_counts[1] = np.count_nonzero(train_labels == 1)

        for row, sample in enumerate(train_features):
            label = train_labels[row]
            for feature, feature_value in enumerate(sample):
                key = (feature, feature_value, label)
                self.feature_counts[key] = self.feature_counts.get(key, 0) + 1
        ### END YOUR CODE

    def predict(self, test_features):
        '''Testing stage - classify new data'''

        preds = np.zeros(test_features.shape[0], dtype=np.uint8)

        tot = self.label_counts[0] + self.label_counts[1]
        ### YOUR CODE HERE (~10-30 Lines)
        p_y0 = self.label_counts[0] / tot
        p_y1 = self.label_counts[1] / tot
        for row, sample in enumerate(test_features):
            p_y0_mid_x = p_y0
            p_y1_mid_x = p_y1
            for feature, feature_value in enumerate(sample):
                #calc prob sample 0
                xi = (feature, feature_value, 0)
                c_xi_and_y0 = self.feature_counts.get(xi, 0)

                #calc prob sample 1 
                xi = (feature, feature_value, 1)
                c_xi_and_y1 = self.feature_counts.get(xi, 0)

                if (self.use_laplace_add_one):
                    p_y0_mid_x *= (c_xi_and_y0 + 1) / (self.label_counts[0] + 2)
                    p_y1_mid_x *= (c_xi_and_y1 + 1) / (self.label_counts[1] + 2)
                else:
                    p_y0_mid_x *= (c_xi_and_y0 / self.label_counts[0])
                    p_y1_mid_x *= (c_xi_and_y1 / self.label_counts[1])

            #calculate argmax
            if (p_y0_mid_x > p_y1_mid_x):
                preds[row] = 0
            else:
                preds[row] = 1 
        ### END YOUR CODE

        return preds


In [14]:
nb = NaiveBayes(True)
nb.fit(x_train, y_train)
print("Training accuracy: ", np.mean(nb.predict(x_train) == y_train))
print("Testing accuracy:  ", np.mean(nb.predict(x_test) == y_test))

Training accuracy:  0.7208029197080292
Testing accuracy:   0.5362318840579711


In [104]:
nb = NaiveBayes(True)
nb.fit(x_train, y_train)
print("Training accuracy: ", np.mean(nb.predict(x_train) == y_train))
print("Testing accuracy:  ", np.mean(nb.predict(x_test) == y_test))

Training accuracy:  0.7135036496350365
Testing accuracy:   0.5434782608695652


### scikit-learn models

In [105]:
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()
tree_clf.fit(x_train, y_train)
print("")
print("Training accuracy: ", np.mean(tree_clf.predict(x_train) == y_train))
print("Testing accuracy:  ", np.mean(tree_clf.predict(x_test) == y_test))

Training accuracy:  0.7171532846715328
Testing accuracy:   0.5652173913043478


In [None]:
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()
tree_clf.fit(x_train, y_train)
print("Training accuracy: ", np.mean(tree_clf.predict(x_train) == y_train))
print("Testing accuracy:  ", np.mean(tree_clf.predict(x_test) == y_test))