In [26]:
import random
import numpy as np
from sklearn import linear_model
from matplotlib import pyplot as plt
from collections import defaultdict
import gzip

In [3]:
def assertFloat(x):
    """Check that ``x`` can be converted with ``float()``.

    ``float(x)`` itself raises (``ValueError`` / ``TypeError``) for values it
    cannot convert; the assertion then confirms the converted value is a float.
    """
    converted = float(x)
    assert isinstance(converted, float)

def assertFloatList(items, N):
    """Check that ``items`` holds exactly ``N`` values, each convertible with ``float()``.

    Raises ``AssertionError`` when the length differs from ``N``; a value that
    ``float()`` rejects propagates that call's own exception.
    """
    assert len(items) == N
    # The length is already known to be N, so checking every element is enough.
    assert all(isinstance(float(item), float) for item in items)

In [4]:
# Collected results, keyed by question label ('Q1', 'Q2', ...).
answers = dict()

In [5]:
def parseData(fname):
    """Lazily yield one parsed record per non-blank line of ``fname``.

    Each line is expected to hold a single Python literal expression
    (for example a dict), which is evaluated and yielded.

    Parameters
    ----------
    fname : str or path-like
        File to read; opened in text mode with the platform default encoding.

    Yields
    ------
    object
        The value each line evaluates to.

    Notes
    -----
    SECURITY: ``eval`` executes arbitrary code found in the file. Only use this
    on trusted data; if every line is a plain literal, ``ast.literal_eval`` is a
    safer drop-in to consider.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or closed early) instead of leaving it to garbage collection.
    with open(fname) as handle:
        for l in handle:
            # A blank or trailing empty line would make eval raise SyntaxError.
            if not l.strip():
                continue
            yield eval(l)

In [6]:
# Materialise every parsed record in memory so the list can be shuffled and sliced.
data = [record for record in parseData("beer_50000.json")]

In [7]:
# Seed the global RNG so the shuffle below produces the same order on every fresh run.
# The shuffle is in place: re-running this cell without reloading `data` permutes an
# already shuffled list and therefore yields a different order.
random.seed(0)
random.shuffle(data)

In [8]:
# Consecutive slices of `data`: the first 25000 records for training, the next
# 12500 for validation, and everything after index 37500 for testing.
dataTrain, dataValid, dataTest = data[:25000], data[25000:37500], data[37500:]

In [9]:
# Binary target for each split: True when the record's 'beer/ABV' value exceeds 7.
yTrain, yValid, yTest = (
    [d['beer/ABV'] > 7 for d in split]
    for split in (dataTrain, dataValid, dataTest)
)

In [10]:
# Count how many records carry each 'beer/style' value. The tally runs over all of
# `data`, not just the training slice.
categoryCounts = defaultdict(int)  # unseen styles start from 0
for d in data:
    categoryCounts[d['beer/style']] += 1

In [12]:
# Keep only the styles that occur in more than 1000 records.
categories = [style for style, count in categoryCounts.items() if count > 1000]

In [13]:
# Map each retained style to its position in `categories` (0, 1, 2, ...).
catID = {style: index for index, style in enumerate(categories)}

In [19]:
# Keys of the first record whose name contains "review" and whose value is a float.
# Only data[0] is inspected, so a key missing from (or not float in) that record is skipped.
review_column_names = [
    key
    for key, value in data[0].items()
    if "review" in key and isinstance(value, float)
]

In [24]:
# Longest 'review/text' (in characters) across all of `data`.
max_text_len = max(map(len, (datum['review/text'] for datum in data)))

In [32]:
def feat(datum, includeCat = True, includeReview = True, includeLength = True, ret_np_array = False):
    """Build the feature vector for one record.

    Feature groups are concatenated in this order, each only when its flag is set:

    * ``includeCat``    - one slot per entry of ``catID``; the slot of the record's
      'beer/style' is 1.0 and the rest 0.0 (all 0.0 when the style is not in ``catID``).
    * ``includeReview`` - ``datum[name] / 5.0`` for every name in ``review_column_names``.
    * ``includeLength`` - length of 'review/text' divided by ``max_text_len``.

    Reads the module-level ``catID``, ``review_column_names`` and ``max_text_len``.

    Returns a list of floats, or a float ``numpy`` array when ``ret_np_array`` is true.
    Raises ``AssertionError`` when every group is disabled and the vector would be empty.
    """
    features = []

    if includeCat:
        one_hot = [0.0 for _ in catID]
        slot = catID.get(datum['beer/style'])
        if slot is not None:
            one_hot[slot] = 1.0
        features += one_hot

    if includeReview:
        features += [datum[name] / 5.0 for name in review_column_names]

    if includeLength:
        features.append(len(datum['review/text']) / max_text_len)

    assert len(features)>0,f"the feat function returns no feature for datum {datum}"

    return np.array(features, dtype=float) if ret_np_array else features


In [47]:
# lin_reg = linear_model.LinearRegression()
# lin_reg.fit()
# lin_reg.predict()

def get_performance_info(y_actual,y_predict):
    """Compute confusion-matrix counts and the derived rates for binary labels.

    Parameters
    ----------
    y_actual, y_predict : array-like
        True and predicted labels. Both are converted to arrays and flattened to
        1-D. Entries are compared against 1 (positive) and 0 (negative), so
        boolean labels work as well; an entry equal to neither value is not
        counted in any cell.

    Returns
    -------
    tuple
        ``(TP, FP, TN, FN, TPR, FPR, TNR, FNR, BER)`` where
        ``BER = 1 - 0.5 * (TPR + TNR)``. A rate whose denominator is zero (no
        actual positives, or no actual negatives) is NaN, and so is BER then.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    # Coerce both inputs; previously only y_actual was, so a plain list of
    # predictions failed on .reshape.
    y_actual = np.asarray(y_actual).reshape((-1,))
    y_predict = np.asarray(y_predict).reshape((-1,))
    if y_actual.shape != y_predict.shape:
        # Without this check a length-1 input would silently broadcast.
        raise ValueError(
            f"y_actual and y_predict differ in length: {y_actual.shape[0]} vs {y_predict.shape[0]}"
        )

    def rate(count, total):
        # 0/0 would otherwise give NaN together with a NumPy RuntimeWarning.
        return count / total if total else np.float64("nan")

    TP = np.sum((y_actual == 1) & (y_predict == 1))
    FP = np.sum((y_actual == 0) & (y_predict == 1))
    TN = np.sum((y_actual == 0) & (y_predict == 0))
    FN = np.sum((y_actual == 1) & (y_predict == 0))
    TPR = rate(TP, TP + FN)
    FPR = rate(FP, FP + TN)
    TNR = rate(TN, TN + FP)
    FNR = rate(FN, TP + FN)
    BER = 1 - (0.5 * (TPR + TNR))
    return TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER

def pipeline(reg, includeCat = True, includeReview = True, includeLength = True, max_iter = 100):
    """Fit a class-balanced, L2-penalised logistic regression and report its balanced error rates.

    Features come from ``feat`` applied to the module-level ``dataTrain``,
    ``dataValid`` and ``dataTest``; targets are ``yTrain``, ``yValid`` and ``yTest``.

    Parameters
    ----------
    reg : float
        Passed to scikit-learn as ``C``, which is the *inverse* regularisation
        strength: smaller values regularise more strongly.
    includeCat, includeReview, includeLength : bool
        Feature-group switches forwarded unchanged to ``feat``.
    max_iter : int, optional
        Iteration cap for the solver. The default of 100 matches what
        scikit-learn uses when the argument is omitted, so existing calls give
        the same results as before. Runs that include the review features have
        been seen to stop at this cap with a ConvergenceWarning; pass a larger
        value to let the solver converge (the reported numbers may then shift).

    Returns
    -------
    tuple
        ``(fitted model, validation BER, test BER)``.
    """
    def build_matrix(rows):
        # One feature row per record, stacked into a 2-D float array.
        return np.array(
            [
                feat(row, includeCat=includeCat, includeReview=includeReview, includeLength=includeLength)
                for row in rows
            ],
            dtype=float,
        )

    x_train = build_matrix(dataTrain)
    x_valid = build_matrix(dataValid)
    x_test = build_matrix(dataTest)

    logisticRegModel = linear_model.LogisticRegression(
        class_weight="balanced", penalty="l2", C=reg, max_iter=max_iter
    )
    logisticRegModel.fit(x_train,yTrain)

    # predict() already returns hard class labels (the boolean values the model
    # was fitted on), so thresholding them at 0.5 again was a no-op.
    y_pred_valid = logisticRegModel.predict(x_valid)
    y_pred_test = logisticRegModel.predict(x_test)

    # get_performance_info returns BER as its last element.
    ber_valid = get_performance_info(yValid,y_pred_valid)[-1]
    ber_test = get_performance_info(yTest,y_pred_test)[-1]
    return logisticRegModel,ber_valid,ber_test




In [48]:
# Category one-hot features only, with C=10.
mod, validBER, testBER = pipeline(10, includeCat=True, includeReview=False, includeLength=False)

In [53]:
# Record Q1 as [validation BER, test BER], using whatever `validBER` / `testBER`
# currently hold in the kernel.
answers['Q1'] = [validBER, testBER]

In [55]:
# All three feature groups (category, review scores, text length), with C=10.
mod2, validBER2, testBER2 = pipeline(10, includeCat=True, includeReview=True, includeLength=True)
answers['Q2'] = [validBER2, testBER2]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
# Try each C with the full feature set and keep the one with the lowest
# validation BER; its test BER is reported alongside.
best_c, best_model = 0, None
min_ber_valid = ber_test = 1.0
for c in (0.001, 0.01, 0.1, 1, 10):
    model, ber_valid, b_t_this = pipeline(c, includeCat=True, includeReview=True, includeLength=True)
    # Strict "<": on a tie the earlier candidate is kept. Written as "not (a < b)"
    # so that a NaN validation BER is never selected.
    if not (ber_valid < min_ber_valid):
        continue
    best_c, best_model = c, model
    min_ber_valid, ber_test = ber_valid, b_t_this

answers['Q3'] = [best_c, min_ber_valid, ber_test]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
# Ablation at C=1.0: drop one feature group at a time and keep only the test BER.
# NOTE(review): `mod` and `validBER` are rebound by each call below, so after this
# cell they no longer hold the values from the earlier `pipeline(10, True, False, False)` run.
mod, validBER, testBER_noCat = pipeline(1.0, includeCat=False, includeReview=True, includeLength=True)
mod, validBER, testBER_noReview = pipeline(1.0, includeCat=True, includeReview=False, includeLength=True)
mod, validBER, testBER_noLength = pipeline(1.0, includeCat=True, includeReview=True, includeLength=False)
answers['Q4'] = [testBER_noCat, testBER_noReview, testBER_noLength]

In [60]:
# Bare expression: the notebook renders the collected answers as this cell's output.
answers

{'Q1': [0.16130237168160533, 0.1607838024608832],
 'Q2': [0.14190530394736312, 0.1430222366785745],
 'Q3': [1, 0.14142076181125796, 0.14220884792124866],
 'Q4': [0.3122273694930058, 0.16109632033831978, 0.1461270153739065]}