In [None]:

import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import math

In [None]:
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this presumably also hides warnings raised by numpy / scikit-learn
# in later cells — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
def assertFloat(x):
    """Check that an answer can be converted to a float.

    Any exception raised by ``float(x)`` propagates to the caller.
    """
    converted = float(x)
    assert isinstance(converted, float)

def assertFloatList(items, N):
    """Check that `items` holds exactly N entries, each convertible to a float."""
    assert len(items) == N
    assert all(isinstance(float(entry), float) for entry in items)

In [None]:
# Read the gzipped file, parsing each line as one JSON record.
# The context manager closes the file even if a line fails to parse.
dataset = []
with gzip.open("young_adult_10000.json.gz") as f:
    for l in f:
        dataset.append(json.loads(l))

In [None]:
# Number of records loaded
len(dataset)

10000

In [None]:
# Put your answers to each question in this dictionary, keyed by question name ('Q1', 'Q2', ...)
answers = {}

In [None]:
# Show the first record to see which fields are available
dataset[0]

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '2767052',
 'review_id': '248c011811e945eca861b5c31a549291',
 'rating': 5,
 'review_text': "I cracked and finally picked this up. Very enjoyable quick read - couldn't put it down - it was like crack. \n I'm a bit bothered by the lack of backstory of how Panem and the Hunger Games come about. It is just kind of explained away in a few paragraphs and we are left to accept this very strange world where teenagers are pitted into an arena each year to kill each other? I was expecting it because I've seen Battle Royale, but I would have appreciated knowing more of the backstory of how the world could have come into such a odd state. \n I suppose what makes a book like this interesting is thinking about the strategy of it all. The players are going to be statistically encouraged to band together because they will last longer that way, but by definition of course any partnership will be broken, and the drama of how that unfolds is alw

# Question 1

In [None]:
# Feature vector for Question 1: an offset term plus the number of "!" characters.
def feature(datum):
  """Return [1, count of '!' in datum['review_text']]."""
  exclamations = datum['review_text'].count('!')
  return [1, exclamations]

In [None]:
# Feature matrix: one row per review, as returned by feature()
X = numpy.array([feature(review) for review in dataset])

# Target vector: the rating of each review
Y = numpy.array([review['rating'] for review in dataset])

$$
\theta = (X^T X)^{-1} X^T y
$$



In [None]:
# Solve the normal equations theta = (X^T X)^{-1} X^T Y by explicitly inverting X^T X.
# theta has one entry per column of X, so shape (2,) here: [offset, coefficient of the "!" count].
theta = numpy.linalg.inv(X.T @ X) @ X.T @ Y

$$\text{MSE} = \frac{1}{N} \sum_{i=1}^{N} \left( X_i \cdot \theta - y_i \right)^2, \qquad \text{where the model is } X\theta \approx y$$



In [None]:
# Mean squared error of the fitted model over all rows of X
residuals = Y - X @ theta
mse = (residuals ** 2).mean()

In [None]:
# Store plain Python floats (as the Question 6 cell does) so that str(answers), which is
# what gets written to the answers file, does not depend on how NumPy prints its scalars.
answers['Q1'] = [float(theta[0]), float(theta[1]), float(mse)]
print(answers)

{'Q1': [3.6885330408320183, 0.07109019019954144, 1.5231747404538287]}


In [None]:
# Check the format of the answer: a list of three values convertible to float
assertFloatList(answers['Q1'], 3)

# Question 2

In [None]:
# Feature vector for Question 2: offset, review length, and number of "!" characters.
def feature(datum):
  """Return [1, len(review_text), count of '!' in review_text] for one review."""
  review = datum['review_text']
  return [1, len(review), review.count('!')]

In [None]:
# Feature matrix: one row per review, as returned by feature()
X = numpy.array([feature(review) for review in dataset])

# Target vector: the rating of each review
Y = numpy.array([review['rating'] for review in dataset])

In [None]:
# Solve the normal equations theta = (X^T X)^{-1} X^T Y by explicitly inverting X^T X.
# theta has one entry per column of X, so shape (3,) here: [offset, length coefficient, "!" coefficient].
theta = numpy.linalg.inv(X.T @ X) @ X.T @ Y

In [None]:
# Mean squared error of the fitted model over all rows of X
residuals = Y - X @ theta
mse = (residuals ** 2).mean()

In [None]:
# Store plain Python floats (as the Question 6 cell does) so that str(answers), which is
# what gets written to the answers file, does not depend on how NumPy prints its scalars.
answers['Q2'] = [float(theta[0]), float(theta[1]), float(theta[2]), float(mse)]

In [None]:
# Check the format of the answer: a list of four values convertible to float
assertFloatList(answers['Q2'], 4)

# Question 3

In [None]:
# Polynomial feature vector for Question 3, built from the number of "!" characters.
def feature(datum, deg):
  """Return [1, c, c**2, ..., c**deg], where c is the count of '!' in datum['review_text']."""
  c = datum['review_text'].count('!')
  return [1] + [c ** power for power in range(1, deg + 1)]

In [None]:
# One MSE per polynomial degree; the loop in the next cell appends to this list
mses = []

In [None]:
# Fit polynomials of degree one to five and record the MSE of each fit on the full dataset.
# The list is reset here so that re-running this cell does not keep appending to old results.
mses = []
degrees = 5

# The targets do not depend on the degree, so build them once outside the loop
Y = numpy.array([data['rating'] for data in dataset])

for degree in range(1, degrees+1):
  # Feature matrix with one row per review, as returned by feature() for this degree
  X = numpy.array([feature(datum = data,deg = degree) for data in dataset])

  # Normal-equation solution; theta has shape (degree+1,)
  theta = numpy.linalg.inv(X.T @ X) @ X.T @ Y

  # Mean squared error of this fit
  mse = ((Y - X @ theta) ** 2).mean()

  # Store a plain Python float so str(answers) does not depend on NumPy's scalar repr
  mses.append(float(mse))

In [None]:
# Store plain Python floats (as the Question 6 cell does) so that str(answers), which is
# what gets written to the answers file, does not depend on how NumPy prints its scalars.
answers['Q3'] = [float(value) for value in mses]

In [None]:
# Check the format of the answer: a list of five values convertible to float
assertFloatList(answers['Q3'], 5)

# Question 4

This question reuses the `feature(datum, deg)` function defined in Question 3.

In [None]:
# Split the dataset into training and test halves (1:1 ratio).
# A copy is shuffled with a fixed-seed generator, so the split is identical on every run
# and `dataset` itself keeps its original order when this cell is re-run.
shuffled = list(dataset)
random.Random(0).shuffle(shuffled)
half = len(shuffled) // 2
dataset_train = shuffled[:half]
dataset_test = shuffled[half:]

In [None]:
# One test-set MSE per polynomial degree; the loop in the next cell appends to this list
mses_test = []

In [None]:
# Fit polynomials of degree one to five on dataset_train and record each MSE on dataset_test.
# The list is reset here so that re-running this cell does not keep appending to old results.
mses_test = []
degrees = 5

# The targets do not depend on the degree, so build them once outside the loop
Y_train = numpy.array([data['rating'] for data in dataset_train])
Y_test = numpy.array([data['rating'] for data in dataset_test])

for degree in range(1, degrees+1):
  # Feature matrices with one row per review, as returned by feature() for this degree
  X_train = numpy.array([feature(datum = data,deg = degree) for data in dataset_train])
  X_test = numpy.array([feature(datum = data,deg = degree) for data in dataset_test])

  # Fit on the training half only (normal equations); theta has shape (degree+1,)
  theta = numpy.linalg.inv(X_train.T @ X_train) @ X_train.T @ Y_train

  # Evaluate on the held-out half
  mse_test = ((Y_test - X_test @ theta) ** 2).mean()

  # Store a plain Python float so str(answers) does not depend on NumPy's scalar repr
  mses_test.append(float(mse_test))

In [None]:
# Store plain Python floats (as the Question 6 cell does) so that str(answers), which is
# what gets written to the answers file, does not depend on how NumPy prints its scalars.
answers['Q4'] = [float(value) for value in mses_test]
answers['Q4']

[1.4938618031861384,
 1.4811373987988723,
 1.4766152128575782,
 1.4683987737840865,
 1.4673281360474502]

In [None]:
# Check the format of the answer: a list of five values convertible to float
assertFloatList(answers['Q4'], 5)

# Question 5

In [None]:
# Feature vector for the trivial (constant) predictor: only the offset term.
def feature_trivial():
  """Return a new list holding just the constant 1."""
  return [1]

In [None]:
# Feature matrix: a single all-ones column, one row per review
X = numpy.array([feature_trivial() for _ in dataset])

# Target vector: the rating of each review
Y = numpy.array([review['rating'] for review in dataset])

In [None]:
# Normal-equation fit with only an offset column; theta has shape (1,).
# With a single all-ones column this least-squares solution is the mean of Y.
# NOTE(review): the constant that minimises the mean absolute error is the median,
# not the mean — confirm which one Question 5 asks for.
theta = numpy.linalg.inv(X.T @ X) @ X.T @ Y

In [None]:
# Mean absolute error of the constant predictor over the whole dataset
absolute_errors = numpy.abs(Y - X @ theta)
mae = absolute_errors.mean()

In [None]:
# Store a plain Python float (as the Question 6 cell does) so that str(answers), which is
# what gets written to the answers file, does not depend on how NumPy prints its scalars.
answers['Q5'] = float(mae)

In [None]:
# Check the format of the answer: a single value convertible to float
assertFloat(answers['Q5'])

# Question 6

In [None]:
# Load the beer reviews, keeping only the lines that contain 'user/gender'.
# The context manager closes the file once it has been read, even if parsing fails.
dataset = []
with open("beer_50000.json") as f:
    for l in f:
        if 'user/gender' in l:
            # NOTE(review): eval executes whatever code the line contains, so only run
            # this on a trusted copy of the data. If every line is a plain literal,
            # ast.literal_eval would be a safer parser — confirm before switching.
            dataset.append(eval(l))

In [None]:
# Number of reviews kept after filtering on 'user/gender'
len(dataset)

20403

In [None]:
# Show the first record to see which fields are available
dataset[0]

{'review/appearance': 4.0,
 'beer/style': 'American Double / Imperial IPA',
 'review/palate': 4.0,
 'review/taste': 4.5,
 'beer/name': 'Cauldron DIPA',
 'review/timeUnix': 1293735206,
 'user/gender': 'Male',
 'user/birthdayRaw': 'Jun 16, 1901',
 'beer/ABV': 7.7,
 'beer/beerId': '64883',
 'user/birthdayUnix': -2163081600,
 'beer/brewerId': '1075',
 'review/timeStruct': {'isdst': 0,
  'mday': 30,
  'hour': 18,
  'min': 53,
  'sec': 26,
  'mon': 12,
  'year': 2010,
  'yday': 364,
  'wday': 3},
 'user/ageInSeconds': 3581417047,
 'review/overall': 4.0,
 'review/text': "According to the website, the style for the Caldera Cauldron changes every year. The current release is a DIPA, which frankly is the only cauldron I'm familiar with (it was an IPA/DIPA the last time I ordered a cauldron at the horsebrass several years back). In any event... at the Horse Brass yesterday.\t\tThe beer pours an orange copper color with good head retention and lacing. The nose is all hoppy IPA goodness, showcasing

In [None]:
# Feature vector for the beer reviews: an offset term plus the number of "!" characters.
def feature(datum):
  """Return [1, count of '!' in datum['review/text']]."""
  exclamations = datum['review/text'].count('!')
  return [1, exclamations]

In [None]:
# Feature matrix: one row per review, as returned by feature()
X = numpy.array([feature(review) for review in dataset])

# Binary labels: 1 when 'user/gender' is 'Female', 0 for any other value
Y = numpy.array([int(review['user/gender'] == 'Female') for review in dataset])

In [None]:
# Fit a logistic regressor on X and Y, constructed with no arguments
# (i.e. linear_model.LogisticRegression() from sklearn with its default parameters)
model = linear_model.LogisticRegression()
model.fit(X,Y)

$$\text{Balanced Error Rate (BER)} = \frac{1}{2} (\text{FPR} + \text{FNR})
$$

In [None]:
# Predict once and reuse the result for all four confusion-matrix counts
predictions = model.predict(X)

# True positives / true negatives: label and prediction agree
TP = ((Y == 1) & (predictions == 1)).sum()
TN = ((Y == 0) & (predictions == 0)).sum()

# False positives / false negatives: label and prediction disagree
FP = ((Y == 0) & (predictions == 1)).sum()
FN = ((Y == 1) & (predictions == 0)).sum()

# False positive rate and false negative rate
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)

# Balanced error rate: the average of the two error rates
BER = 0.5 * (FPR + FNR)

# Convert these values to standard Python types
TP = float(TP)
TN = float(TN)
FP = float(FP)
FN = float(FN)
BER = float(BER)

In [None]:
# Confusion-matrix counts followed by the balanced error rate
answers['Q6'] = [TP, TN, FP, FN, BER]

In [None]:
# Check the format of the answer: a list of five values convertible to float
assertFloatList(answers['Q6'], 5)

In [None]:
# Show every answer collected so far
print(answers)

{'Q1': [3.6885330408320183, 0.07109019019954144, 1.5231747404538287], 'Q2': [3.7175128077972004, -4.121506529487903e-05, 0.07527591733232622, 1.5214029246165832], 'Q3': [1.5231747404538285, 1.5046686106250917, 1.4966845515179226, 1.490447730223069, 1.4896106953961645], 'Q4': [1.4938618031861384, 1.4811373987988723, 1.4766152128575782, 1.4683987737840865, 1.4673281360474502], 'Q5': 0.9709800999999961, 'Q6': [0.0, 20095.0, 0.0, 308.0, 0.5]}


# Question 7

In [None]:
# Fit a logistic regressor on the same X and Y, this time with class_weight='balanced'
# instead of the default class weighting
model = linear_model.LogisticRegression(class_weight = 'balanced')
model.fit(X,Y)

In [None]:
# Predict once and reuse the result for all four confusion-matrix counts
predictions = model.predict(X)

# True positives / true negatives: label and prediction agree
TP = ((Y == 1) & (predictions == 1)).sum()
TN = ((Y == 0) & (predictions == 0)).sum()

# False positives / false negatives: label and prediction disagree
FP = ((Y == 0) & (predictions == 1)).sum()
FN = ((Y == 1) & (predictions == 0)).sum()

# False positive rate and false negative rate
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)

# Balanced error rate: the average of the two error rates
BER = 0.5 * (FPR + FNR)

# Convert these values to standard Python types
TP = float(TP)
TN = float(TN)
FP = float(FP)
FN = float(FN)
BER = float(BER)

In [None]:
# Confusion-matrix counts followed by the balanced error rate
answers["Q7"] = [TP, TN, FP, FN, BER]

In [None]:
# Check the format of the answer: a list of five values convertible to float
assertFloatList(answers['Q7'], 5)

# Question 8

In [None]:
# Confidence score of every row of X under the fitted model (decision_function output),
# followed by the number of scores as a sanity check
scores = model.decision_function(X)
len(scores)

20403

In [None]:
# Pair each score with the label at the same position: a list of (score, label) tuples
scores_Y = list(zip(scores, Y))

Note: the highest score corresponds to the most confident positive prediction.

In [None]:
# Sort the (score, label) pairs by score in descending order, then show the first ten
sorted_scores_Y = sorted(scores_Y, key = lambda x: x[0], reverse = True)
sorted_scores_Y[:10]

[(4.038066673916104, 0),
 (2.7298688337326604, 0),
 (2.7298688337326604, 0),
 (2.542983427992169, 0),
 (2.542983427992169, 0),
 (2.3560980222516767, 0),
 (2.1692126165111847, 0),
 (2.1692126165111847, 0),
 (2.1692126165111847, 0),
 (1.982327210770693, 0)]

In [None]:
# One precision value per k; the loop in the next cell appends to this list
precisionList = []

In [None]:
# Precision@k: the fraction of the k highest-scoring reviews whose label is 1.
# Every one of the top k counts as retrieved, whatever the sign of its score, so the
# score is used only for ranking and is not tested again here.
# The list is reset so that re-running this cell does not append duplicate results.
ks = [1, 10, 100, 1000, 10000]
precisionList = []

for k in ks:
  # Slicing (rather than indexing) also avoids an IndexError if k exceeds the number of reviews
  TP = sum(1 for _, label in sorted_scores_Y[:k] if label == 1)
  precision = TP / k
  precisionList.append(precision)

In [None]:
# Precision values in the same order as ks
answers['Q8'] = precisionList

In [None]:
# Check the format of the answer: a list of five values convertible to float
assertFloatList(answers['Q8'], 5)

# Write Answer to File


In [None]:
# Write the answers dictionary to a file; the context manager closes the file
# even if the write fails
with open("answers_hw1.txt", 'w') as f:
    f.write(str(answers) + '\n')