In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

import collections
import re, string
import sys
import time
import os


In [2]:
import json
import csv

def init_dataset(json) -> tuple[dict, list]:
    """Create an empty column store shaped after one decoded JSON record.

    Args:
        json: A mapping (one decoded JSON object). Note that this parameter
            name shadows the ``json`` module inside this function.

    Returns:
        A ``(columns, keys)`` tuple: ``columns`` maps every key of the record
        to a fresh empty list, and ``keys`` is a list of those keys in the
        record's own order.
    """
    # Materialise the keys so the result matches the annotated `list` type and
    # is a snapshot rather than a live view onto the caller's dictionary.
    keys = list(json.keys())
    ds: dict = {k: [] for k in keys}
    return ds, keys

def read_json(file) -> pd.DataFrame:
    """Load a JSON-lines file (one JSON object per line) into a DataFrame.

    The keys of the first record define the columns. Later records contribute
    only those keys; a key missing from a later record is stored as ``None``
    instead of raising ``KeyError``. Blank lines are skipped.

    Args:
        file: Path of the JSON-lines file to read.

    Returns:
        A DataFrame with one row per record. It is empty when the file holds
        no records.
    """
    columns: dict = {}
    keys = None
    # JSON text is UTF-8 by specification, so do not rely on the platform default.
    with open(file, encoding='utf-8') as file_lines:
        for line in file_lines:
            line = line.strip()
            if not line:
                # Tolerate blank lines such as a trailing newline at end of file.
                continue
            record = json.loads(line)
            if keys is None:
                # The first record fixes the column set and its order.
                keys = list(record)
                columns = {k: [] for k in keys}
            for k in keys:
                columns[k].append(record.get(k))
    return pd.DataFrame(columns)

def read_csv(file) -> pd.DataFrame:
    """Load a CSV file with a header row into a DataFrame of raw strings.

    Values are kept exactly as ``csv.DictReader`` yields them, so no type
    inference or missing-value conversion is applied.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A DataFrame with one column per header field. It is empty (no rows,
        no columns) when the file has no header line.
    """
    with open(file, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        # `fieldnames` is None for a file without a header line; treat that as
        # "no columns" instead of failing when iterating over None.
        keys = reader.fieldnames or []
        dataset = {k: [] for k in keys}
        for row in reader:
            for k in keys:
                dataset[k].append(row[k])
    return pd.DataFrame(dataset)


In [3]:
# Load the Yelp review table from its CSV export. read_csv keeps every value
# as the raw string produced by csv.DictReader.
# Alternative loader for the JSON-lines version of the same table:
#yelp_review = read_json('data/yelp_academic_dataset_review.json')
yelp_review = read_csv('data/yelp_academic_dataset_review.csv')

In [4]:
# Load the Yelp business table from its CSV export. read_csv keeps every value
# as the raw string produced by csv.DictReader.
# Alternative loader for the JSON-lines version of the same table:
#yelp_business = read_json('data/yelp_academic_dataset_business.json')
yelp_business = read_csv('data/yelp_academic_dataset_business.csv')

In [5]:
# Sample data: the restaurants reviewed by "karen", the user who wrote the
# largest number of restaurant reviews.

# Businesses whose category string mentions 'Restaurant'; rows with a missing
# category count as non-matching.
business_restaurant = yelp_business[
    yelp_business['categories'].str.contains('Restaurant', na=False)
]

# Reviews that refer to one of those restaurant businesses.
review_restaurant = yelp_review.loc[
    yelp_review['business_id'].isin(business_restaurant['business_id'])
]

# value_counts() orders users by review count, so the first label is the top reviewer.
karen = review_restaurant['user_id'].value_counts().index[0]

# Every restaurant review written by Karen.
review_restaurant_karen = review_restaurant[review_restaurant['user_id'] == karen]

# The restaurant businesses those reviews are about.
business_restaurant_karen = business_restaurant.loc[
    business_restaurant['business_id'].isin(review_restaurant_karen['business_id'])
]

In [6]:
## Clean Data: remove missing rows and irrelevant columns
# The CSV loader keeps raw strings, so a missing field shows up as '' or 'None'.
# Vectorised `isin` / `==` are used instead of element-wise `applymap`, which
# pandas deprecated in 2.1.
df = business_restaurant_karen.set_index('business_id')

# Keep only columns with fewer than 20% missing fields
mask = df.isin(['', 'None']).sum()
features = (mask / len(df)) * 100 < 20

# Drop non-attribute columns. The pattern is matched literally (regex=False)
# so that the '.' is not interpreted as "any character".
features.loc[~features.index.str.contains('attributes.', regex=False)] = False
dataset = df.loc[:, features]

# Remove rows with missing data
mask = dataset.isin(['', 'None'])
dataset = dataset.loc[~mask.any(axis=1)]

# Keep the columns that contain at least one literal 'True'/'False' and turn
# them into real booleans. Any other string in a kept column becomes False.
mask = dataset.isin(['True', 'False']).any()
dataset = dataset.loc[:, mask] == 'True'

In [7]:
# Transform Data: add targets
# Index Karen's reviews by business, keep only the businesses that survived
# cleaning, and make the star rating numeric (the loader produced strings).
df = (
    review_restaurant_karen
    .set_index('business_id')
    .pipe(lambda reviews: reviews.loc[reviews.index.intersection(dataset.index)])
    .astype({'stars': 'float'})
)
# A business is a positive example when its mean star rating is above 3.
dataset['target'] = df.groupby(df.index)['stars'].mean() > 3

In [8]:
# Delineate between training and testing set: 20% of the rows are held out
# for testing, and random_state pins the shuffle so the split is repeatable.
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)

### Logistic Regression Classifier using Newton's Method

In [9]:
import util

# Encode truthy entries as 1.0 and everything else as 0.0.
lamb = lambda x: 1. if x == True else 0.

# Labels go through Series.map. The feature frames use the vectorised
# equivalent `eq(True).astype(float)` instead of DataFrame.applymap, which
# pandas deprecated in 2.1.
y_train = train_df['target'].map(lamb).to_numpy()
x_train = train_df.drop(['target'], axis=1).eq(True).astype(float).to_numpy()

y_test = test_df['target'].map(lamb).to_numpy()
x_test = test_df.drop(['target'], axis=1).eq(True).astype(float).to_numpy()

In [10]:
# hypothesis function: logistic sigmoid of the linear scores
def g(theta, x):
    """Return sigmoid(x @ theta), one value per row of ``x``."""
    scores = x @ theta
    return 1 / (1 + np.exp(-scores))

# gradient of the cost with respect to theta
def dJ(theta, x, y):
    """Return (1/m) * x.T @ (g(theta, x) - y), where m is the number of rows of ``x``."""
    n_rows, _ = x.shape
    residual = g(theta, x) - y
    scaled_xt = 1/n_rows * x.T
    return scaled_xt @ residual

# Hessian of the cost with respect to theta
def HJ(theta, x):
    """Return (1/m) * sum_i p_i * (1 - p_i) * x_i x_i^T, with p = g(theta, x)."""
    n_rows, _ = x.shape
    prob = g(theta, x)
    curvature = prob * (1 - prob)
    # `curvature` broadcasts along the columns of x.T, i.e. one weight per sample.
    weighted_xt = 1/n_rows * curvature * x.T
    return weighted_xt @ x

# L1 (sum of absolute differences) distance between two vectors
def dist(x, y):
    """Return the sum of the absolute element-wise differences of ``x`` and ``y``."""
    gap = x - y
    return np.abs(gap).sum()

In [11]:
class LogisticRegression(object):
    """Logistic regression fitted with Newton's method (no intercept term).

    Relies on the module-level helpers ``dJ`` (gradient), ``HJ`` (Hessian)
    and ``dist`` (L1 distance).
    """

    def __init__(self, step_size=0.2, max_iter=100, eps=1e-5,
                theta_0=None, verbose=True):
        """
        Args:
            step_size: Stored on the instance; not used by ``fit``, which
                always takes the full Newton step.
            max_iter: Maximum number of Newton iterations.
            eps: Stop once the L1 change in theta between two iterations is
                below this threshold.
            theta_0: Optional initial parameter vector; zeros when None.
            verbose: Stored on the instance; not used by this class.
        """
        self.theta = theta_0
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose

    def fit(self, x, y):
        """Run Newton updates on ``self.theta`` until convergence or ``max_iter``.

        Args:
            x: 2-D array of training examples, one row per example.
            y: 1-D array of labels, one per row of ``x``.
        """
        _, n = x.shape
        if self.theta is None:
            self.theta = np.zeros(n)
        for _ in range(self.max_iter):
            hessian = HJ(self.theta, x)
            gradient = dJ(self.theta, x, y)
            try:
                # Solving H @ step = grad is cheaper and more accurate than
                # forming the explicit inverse.
                step = np.linalg.solve(hessian, gradient)
            except np.linalg.LinAlgError:
                # A singular Hessian (e.g. an all-zero feature column) has no
                # inverse; fall back to the minimum-norm least-squares step.
                step = np.linalg.lstsq(hessian, gradient, rcond=None)[0]
            theta_new = self.theta - step
            converged = dist(theta_new, self.theta) < self.eps
            self.theta = theta_new
            if converged:
                break

    def predict(self, x):
        """Return a boolean array: True where ``x @ theta >= 0``."""
        return x @ self.theta >= 0

In [12]:
# Fit the Newton's-method classifier with its default settings and report the
# learned parameters plus accuracy on both splits.
lg = LogisticRegression()
lg.fit(x_train, y_train)
print("Theta: ", lg.theta)
# predict() yields booleans; comparing them with the 0./1. labels gives a
# per-sample hit mask whose mean is the accuracy.
print("Training accuracy: ", np.mean(lg.predict(x_train) == y_train))
print("Testing accuracy:  ", np.mean(lg.predict(x_test) == y_test))

Theta:  [ 0.59714209  0.49508892 -0.042109   -0.08595731  0.51446855 -0.48017102
  0.13016552 -0.64661731 -0.19983955  0.52538337]
Training accuracy:  0.6076642335766423
Testing accuracy:   0.6014492753623188


### Naive Bayes Model

In [13]:
'''
Naive Bayes Classifier
'''
class NaiveBayes:
    '''
    Naive Bayes Classifier (Bernoulli event model)

    During training, the classifier learns probabilities by counting the
    occurrences of feature/label combinations that it finds in the
    training data. During prediction, it uses these counts to
    compute probabilities.

    Attributes:
        label_counts: maps each label (0 or 1) to its number of training samples.
        feature_counts: maps (feature index, feature value, label) to the number
            of training samples showing that combination.
    '''

    def __init__(self, use_laplace_add_one):
        self.label_counts = {}
        self.feature_counts = {}
        self.use_laplace_add_one = use_laplace_add_one # True for Laplace add-one smoothing

    @staticmethod
    def _log_ratio(numerator, denominator):
        '''Return log(numerator / denominator), or -inf when either count is zero.'''
        if numerator == 0 or denominator == 0:
            return -np.inf
        return np.log(numerator / denominator)

    def fit(self, train_features, train_labels):
        '''Training stage - learn from data

        Counts are rebuilt from scratch on every call, so fitting the same
        instance again replaces the previous model instead of adding to it.

        Args:
            train_features: 2-D array, one row per sample.
            train_labels: 1-D sequence of 0/1 labels, one per row.
        '''
        train_labels = np.asarray(train_labels)

        self.label_counts = {
            0: int(np.count_nonzero(train_labels == 0)),
            1: int(np.count_nonzero(train_labels == 1)),
        }
        # Start from an empty table: counts left over from an earlier fit
        # would otherwise be added to the new ones.
        self.feature_counts = {}

        for sample, label in zip(train_features, train_labels):
            for feature, feature_value in enumerate(sample):
                key = (feature, feature_value, label)
                self.feature_counts[key] = self.feature_counts.get(key, 0) + 1

    def predict(self, test_features):
        '''Testing stage - classify new data

        Class scores are accumulated as sums of log-probabilities. A product
        of many probabilities underflows to 0.0 for wide inputs, which would
        make both classes tie and every sample come out as label 1.

        Args:
            test_features: 2-D array, one row per sample.

        Returns:
            uint8 array of predicted labels; ties go to label 1.

        Raises:
            ValueError: if the model was fitted on zero samples.
        '''
        preds = np.zeros(test_features.shape[0], dtype=np.uint8)

        tot = self.label_counts[0] + self.label_counts[1]
        if tot == 0:
            raise ValueError("NaiveBayes.predict requires a model fitted on at least one sample")

        # Add-one smoothing over the two possible values of a binary feature.
        pseudo = 1 if self.use_laplace_add_one else 0

        for row, sample in enumerate(test_features):
            scores = []
            for label in (0, 1):
                class_count = self.label_counts[label]
                score = self._log_ratio(class_count, tot)
                denominator = class_count + 2 * pseudo
                for feature, feature_value in enumerate(sample):
                    seen = self.feature_counts.get((feature, feature_value, label), 0)
                    score += self._log_ratio(seen + pseudo, denominator)
                scores.append(score)

            # argmax over the two classes
            preds[row] = 0 if scores[0] > scores[1] else 1

        return preds


In [17]:
# Fit Naive Bayes with Laplace add-one smoothing enabled and report accuracy
# on both splits.
nb = NaiveBayes(True)
nb.fit(x_train, y_train)
# predict() yields 0/1 labels; the mean of the equality mask is the accuracy.
print("Training accuracy: ", np.mean(nb.predict(x_train) == y_train))
print("Testing accuracy:  ", np.mean(nb.predict(x_test) == y_test))

Training accuracy:  0.6113138686131386
Testing accuracy:   0.6014492753623188


### Logistic Regression using gradient ascent

In [19]:
'''
Logistic Regression Classifier
'''
class LogisticRegression2:
    '''
    Logistic Regression Classifier

    During training, Logistic Regression learns weights for each
    feature using gradient ascent. During prediction, it applies
    the learned weights to the test data, obtaining a probability
    for each example that is then thresholded at 0.5.

    Relies on the module-level `sigmoid` function.
    '''

    def __init__(self, learning_rate, max_steps):
        '''
        Args:
            learning_rate: step size applied to the summed (not averaged)
                gradient of the log-likelihood.
            max_steps: number of gradient-ascent iterations to run.
        '''
        self.learning_rate = learning_rate
        self.max_steps = max_steps
        self.weights = None

    def fit(self, train_features, train_labels):
        '''Training stage - learn from data

        Args:
            train_features: 2-D array of shape (n, d), one row per sample.
            train_labels: 1-D sequence of n labels in {0, 1}.
        '''

        # This line inserts a column of ones before the first column of train_features,
        # resulting in an `n x (d + 1)` size matrix. This is so we
        # don't need to have a special case for the bias weight.
        train_features = np.insert(train_features, 0, 1, axis=1)

        # This makes the matrix immutable
        train_features.setflags(write=False)

        train_labels = np.asarray(train_labels)

        # This is the theta gradient ascent is performed on. It has
        # shape (d + 1).
        theta = np.zeros(train_features.shape[1])

        for _ in range(self.max_steps):
            # Batch gradient of the log-likelihood, summed over all samples:
            # X^T (y - sigmoid(X theta)). One matrix product replaces the
            # per-sample Python loop.
            residuals = train_labels - sigmoid(train_features @ theta)
            theta += self.learning_rate * (train_features.T @ residuals)

        self.weights = theta

    def predict(self, test_features):
        '''Testing stage - classify new data

        Args:
            test_features: 2-D array of shape (n, d), one row per sample.

        Returns:
            Integer array of n labels: 1 where the predicted probability
            exceeds 0.5, otherwise 0.
        '''

        test_features = np.insert(test_features, 0, 1, axis=1) # add bias term
        test_features.setflags(write=False) # make immutable

        return np.where(sigmoid(test_features @ self.weights) > 0.5, 1, 0)

def sigmoid(vec):
    '''Numerically stable implementation of the sigmoid function

    Accepts a Python scalar, a sequence or a numpy array. The exponential is
    only ever taken of a non-positive number, so it cannot overflow:
        x >= 0:  1 / (1 + exp(-x))
        x <  0:  exp(x) / (1 + exp(x))
    NaN inputs yield NaN.

    Returns:
        float64 values with the same shape as the input.
    '''
    vec = np.asarray(vec, dtype=np.float64)
    # exp(-|x|) equals exp(-x) for x >= 0 and exp(x) for x < 0.
    decay = np.exp(-np.abs(vec))
    top = np.where(vec < 0, decay, 1.0)
    return top / (1.0 + decay)



In [22]:
# Fit the gradient-ascent classifier (learning rate 0.2, 100 steps). This
# rebinds `lg`, which previously held the Newton's-method model.
# NOTE(review): the accuracies recorded below are lower than those of the two
# other models. LogisticRegression2.fit sums the gradient over all samples
# before applying the learning rate, so 0.2 may be too large a step for this
# training set - confirm by trying a smaller rate.
lg = LogisticRegression2(0.2, 100)
lg.fit(x_train, y_train)
# LogisticRegression2 keeps its parameters in `weights`; it has no `theta`.
#print("Theta: ", lg.theta)
print("Training accuracy: ", np.mean(lg.predict(x_train) == y_train))
print("Testing accuracy:  ", np.mean(lg.predict(x_test) == y_test))

Training accuracy:  0.4397810218978102
Testing accuracy:   0.43478260869565216


In [None]:
# Export the cleaned boolean dataset (features plus target) as 0/1 integers.
os.makedirs('data_host/SAMPLES', exist_ok=True)
# review_restaurant_karen.to_csv('data_host/SAMPLES/review_restaurant_karen.csv')
# business_restaurant_karen.to_csv('data_host/SAMPLES/business_restaurant_karen.csv')
# Vectorised replacement for the element-wise `applymap` (deprecated in
# pandas 2.1): truthy entries become 1, everything else 0.
dataset = dataset.eq(True).astype(int)
dataset.to_csv('data_host/SAMPLES/dataset.csv')