In [81]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
import numpy
import random
import gzip
import math

In [82]:
import warnings
warnings.filterwarnings("ignore")

In [83]:
def assertFloat(x): # Checks that an answer is a float
    """Check that ``x`` can be converted with ``float()``.

    The conversion itself raises (``TypeError`` / ``ValueError``) for values
    that cannot be turned into a float; anything that converts passes.
    """
    as_float = float(x)
    assert type(as_float) is float

def assertFloatList(items, N):
    """Check that ``items`` has exactly ``N`` entries, each convertible by ``float()``."""
    assert len(items) == N
    # float() raises for anything it cannot convert, so reaching the
    # comparison means the element was accepted.
    assert all(type(float(entry)) is float for entry in items)

In [36]:
# Read the gzipped JSON-lines file: one review object per line.
# The context manager closes the file handle once all lines are parsed
# (the original left the handle open for the rest of the session).
with gzip.open("young_adult_10000.json.gz") as f:
    dataset = [json.loads(line) for line in f]

In [None]:
# Display how many records were loaded.
len(dataset)

In [38]:
# Put your answers to each question in this dictionary
answers = dict()

In [None]:
# Display the first record to inspect the available fields.
dataset[0]

In [40]:
### Question 1

In [41]:
def feature(datum):
    """Return ``[1, n]`` where ``n`` is the number of '!' in ``datum['review_text']``.

    The leading 1 is the constant (offset) term of the linear model.
    """
    exclamations = datum['review_text'].count('!')
    return [1, exclamations]

In [42]:
# Design matrix: one feature vector per review.
X = list(map(feature, dataset))
# Targets: the star rating of each review, in the same order as X.
Y = [review['rating'] for review in dataset]

In [43]:
# Least-squares fit of Y against the columns of X.
theta, residual, rank, s = numpy.linalg.lstsq(X, Y)

# Predictions on the same data the model was fitted on.
y_pred = numpy.matmul(X, theta)
# theta has one coefficient per feature column (offset, '!' count).
theta0, theta1 = theta
mse = numpy.mean(numpy.square(Y - y_pred))

In [44]:
# Q1 answer layout: [theta0, theta1, mse]
answers.update(Q1=[theta0, theta1, mse])
assertFloatList(answers['Q1'], 3) # Check the format of your answer (three floats)

In [45]:
### Question 2

In [46]:
def feature(datum):
    """Return ``[1, '!' count, character length]`` for ``datum['review_text']``."""
    text = datum['review_text']
    return [1, text.count('!'), len(text)]

In [47]:
# Rebuild the design matrix with the current `feature` definition.
X = list(map(feature, dataset))
# Targets: the star rating of each review, in the same order as X.
Y = [review['rating'] for review in dataset]

In [48]:
# Least-squares fit of Y against the columns of X.
theta, residual, rank, s = numpy.linalg.lstsq(X, Y)

# Predictions on the same data the model was fitted on.
y_pred = numpy.matmul(X, theta)
# theta has one coefficient per feature column (offset, '!' count, length).
theta0, theta1, theta2 = theta
mse = numpy.mean(numpy.square(Y - y_pred))

In [49]:
# Q2 answer layout: [theta0, theta1, theta2, mse]
answers.update(Q2=[theta0, theta1, theta2, mse])
assertFloatList(answers['Q2'], 4)


In [50]:
### Question 3

In [51]:
def feature(datum, deg):
    """Polynomial features of the '!' count in ``datum['review_text']``.

    Returns ``[1, c, c**2, ..., c**deg]`` where ``c`` is the number of
    exclamation marks; for ``deg < 1`` only the constant term is returned.
    """
    bangs = datum['review_text'].count('!')
    return [1] + [bangs ** power for power in range(1, deg + 1)]

In [52]:
# The ratings do not depend on the polynomial degree, so build them once.
Y = [review['rating'] for review in dataset]

# Fit one model per polynomial degree 1..5 and record its MSE on the
# same data it was fitted on.
mses = []
for degree in range(1, 6):
    X = [feature(review, degree) for review in dataset]
    theta = numpy.linalg.lstsq(X, Y)[0]
    y_pred = numpy.matmul(X, theta)
    mses.append(numpy.mean(numpy.square(Y - y_pred)))

In [53]:
# Q3 answer: the five MSEs, ordered by polynomial degree 1..5.
answers.update(Q3=mses)
assertFloatList(answers['Q3'], 5)

In [54]:
### Question 4

In [55]:
def feature(datum, deg):
    """Build ``[1, c, c**2, ..., c**deg]`` from the '!' count ``c`` of the review text.

    Powers are accumulated by repeated multiplication; for ``deg < 1`` the
    result is just the constant term ``[1]``.
    """
    count = datum['review_text'].count('!')
    powers = [1]
    term = 1
    for _ in range(deg):
        term *= count
        powers.append(term)
    return powers

In [56]:
# Targets and the split point are the same for every degree.
Y = [review['rating'] for review in dataset]
half = len(dataset) // 2
Y_train, Y_test = Y[:half], Y[half:]  # first half trains, second half tests

# For each polynomial degree 1..5: fit on the first half of the data and
# record the MSE on the second half.
mses = []
for degree in range(1, 6):
    X = [feature(review, degree) for review in dataset]
    X_train, X_test = X[:half], X[half:]

    theta = numpy.linalg.lstsq(X_train, Y_train)[0]
    y_pred = numpy.matmul(X_test, theta)
    mses.append(numpy.mean(numpy.square(Y_test - y_pred)))

In [None]:
# Q4 answer: the five test-set MSEs, ordered by polynomial degree 1..5.
answers.update(Q4=mses)
assertFloatList(answers['Q4'], 5)
# Last expression of the cell: display the values.
mses

In [58]:
### Question 5

In [67]:
# Targets: the star rating of each review.
Y = [d['rating'] for d in dataset]
# Constant-only feature, one entry per sample. Sized from the data instead
# of a hard-coded 10000 so it stays aligned with Y if the dataset changes.
X = [1] * len(Y)
# The trivial predictor used here is the median of the ratings.
theta = numpy.median(Y)

4.0

In [63]:
# Mean absolute error of predicting the constant `theta` for every sample.
mae = numpy.mean(numpy.abs(numpy.asarray(Y) - theta))

In [68]:
# Q5 answer: the single MAE value.
answers.update(Q5=mae)
assertFloat(answers['Q5'])

0.8923

In [None]:
### Question 6

In [69]:
# Load the beer reviews, keeping only lines that mention 'user/gender'.
# The context manager closes the file once reading is done (the original
# left the handle open).
with open("beer_50000.json") as f:
    # SECURITY NOTE: eval() executes whatever expression each line contains.
    # Only run this on a trusted copy of the data file; if the lines are plain
    # Python literals, ast.literal_eval is a safer drop-in to consider.
    dataset = [eval(line) for line in f if 'user/gender' in line]

In [71]:
# NOTE: only the last expression of a cell is displayed, so this length is
# computed but never shown.
len(dataset)
# Display the first record to inspect the available fields.
dataset[0]

{'review/appearance': 4.0,
 'beer/style': 'American Double / Imperial IPA',
 'review/palate': 4.0,
 'review/taste': 4.5,
 'beer/name': 'Cauldron DIPA',
 'review/timeUnix': 1293735206,
 'user/gender': 'Male',
 'user/birthdayRaw': 'Jun 16, 1901',
 'beer/ABV': 7.7,
 'beer/beerId': '64883',
 'user/birthdayUnix': -2163081600,
 'beer/brewerId': '1075',
 'review/timeStruct': {'isdst': 0,
  'mday': 30,
  'hour': 18,
  'min': 53,
  'sec': 26,
  'mon': 12,
  'year': 2010,
  'yday': 364,
  'wday': 3},
 'user/ageInSeconds': 3581417047,
 'review/overall': 4.0,
 'review/text': "According to the website, the style for the Caldera Cauldron changes every year. The current release is a DIPA, which frankly is the only cauldron I'm familiar with (it was an IPA/DIPA the last time I ordered a cauldron at the horsebrass several years back). In any event... at the Horse Brass yesterday.\t\tThe beer pours an orange copper color with good head retention and lacing. The nose is all hoppy IPA goodness, showcasing

In [72]:
def feature(datum):
    """Return ``[1, n]`` where ``n`` is the number of '!' in ``datum['review/text']``."""
    exclamations = datum['review/text'].count('!')
    return [1, exclamations]

In [75]:
# Keep only reviews whose gender is one of the two labelled values, so that
# X and y are built from exactly the same records.
known_gender = [d for d in dataset if d['user/gender'] in ('Male', 'Female')]
X = [feature(d) for d in known_gender]
# Label: 1 for 'Female', 0 for 'Male'.
y = [int(d['user/gender'] == 'Female') for d in known_gender]


In [77]:
# Default logistic regression; fit() returns the fitted estimator itself.
model = linear_model.LogisticRegression().fit(X, y)

# Predicted labels for the same samples the model was fitted on.
y_pred = model.predict(X)


In [85]:
# Rows of the confusion matrix are true labels and columns are predictions,
# so for labels 0/1 the layout is [[TN, FP], [FN, TP]].
(TN, FP), (FN, TP) = confusion_matrix(y, y_pred)

# Calculate BER: the mean of the error rates on each true class.
total_male = TN + FP      # samples whose true label is 0
total_female = TP + FN    # samples whose true label is 1
false_positive_rate = FP / total_male
false_negative_rate = FN / total_female
BER = 0.5 * (false_positive_rate + false_negative_rate)

In [86]:
# Q6 answer layout: [TP, TN, FP, FN, BER]
answers.update(Q6=[TP, TN, FP, FN, BER])
assertFloatList(answers['Q6'], 5)

In [None]:
### Question 7

In [87]:
# Same classifier as before, but with class_weight='balanced'.
balanced_model = linear_model.LogisticRegression(class_weight='balanced').fit(X, y)

# Predicted labels for the same samples the model was fitted on.
y_pred = balanced_model.predict(X)


In [88]:
# Flattened confusion matrix for labels 0/1: TN, FP, FN, TP.
TN, FP, FN, TP = confusion_matrix(y, y_pred).ravel()
# BER: mean of the error rate on true-0 samples and on true-1 samples.
error_rate_negative = FP / (TN + FP)
error_rate_positive = FN / (TP + FN)
BER = 0.5 * (error_rate_negative + error_rate_positive)

In [89]:
# Q7 answer layout: [TP, TN, FP, FN, BER]
answers.update(Q7=[TP, TN, FP, FN, BER])
assertFloatList(answers['Q7'], 5)

In [None]:
### Question 8

In [90]:
def precision_at_k(y_true, y_scores, k):
    """Fraction of the ``k`` highest-scored items whose true label is positive.

    Items are ranked by ``y_scores`` in descending order (ties keep their
    original order), the labels of the first ``k`` are summed, and the sum is
    divided by ``k``. The divisor is always ``k``, even when fewer than ``k``
    items exist.
    """
    ranked = sorted(enumerate(y_scores), key=lambda pair: pair[1], reverse=True)
    hits = 0
    for index, _score in ranked[:k]:
        hits += y_true[index]
    return hits / k

# Confidence scores from the balanced classifier; higher means more
# confidently the positive class.
y_scores = balanced_model.decision_function(X)

# Calculate Precision@K for K ∈ [1, 10, 100, 1000, 10000]
k_values = [1, 10, 100, 1000, 10000]
n_samples = len(y)
precisionList = []
for k in k_values:
    # A cut-off larger than the number of samples is reported as 0.0.
    precisionList.append(0.0 if k > n_samples else precision_at_k(y, y_scores, k))

In [91]:
# Q8 answer: Precision@K for each of the five K values, in order.
answers.update(Q8=precisionList)
assertFloatList(answers['Q8'], 5) #List of five floats

In [92]:
def _to_builtin(value):
    """Recursively replace NumPy scalars with the equivalent built-in Python number.

    Lists and tuples are rebuilt with the same container type; every other
    value is returned unchanged.
    """
    if isinstance(value, numpy.generic):
        return value.item()
    if isinstance(value, (list, tuple)):
        return type(value)(_to_builtin(entry) for entry in value)
    return value

# str(dict) uses repr() of each value, and NumPy >= 2.0 renders scalars as
# e.g. "np.float64(1.5)". Converting first keeps the file to plain numbers
# regardless of the installed NumPy version.
plain_answers = {question: _to_builtin(value) for question, value in answers.items()}

# The context manager closes the file even if the write raises.
with open("answers_hw1.txt", 'w') as f: # Write your answers to a file
    f.write(str(plain_answers) + '\n')