In [92]:
import json
import gzip
import math
from collections import defaultdict
import numpy
from sklearn import linear_model
import random
import statistics

TQDM_ON = True
if TQDM_ON:
    from tqdm import tqdm

z = gzip.open("train.json.gz")

dataset = []
for l in z:
    d = eval(l)
    dataset.append(d)

z.close()

In [53]:
answers = {}

In [54]:
dataset[1]

{'userID': 'u70666506',
 'early_access': False,
 'hours': 63.5,
 'hours_transformed': 6.011227255423254,
 'found_funny': 1,
 'text': 'If you want to sit in queue for 10-20min and have 140 ping then this game is perfect for you :)',
 'gameID': 'g49368897',
 'user_id': '76561198030408772',
 'date': '2017-05-20'}

In [55]:
# def MSE(y, ypred):
#     if isinstance(y,numpy.ndarray):
#         y = y.reshape((-1,))
#     if isinstance(ypred,numpy.ndarray):
#         ypred = ypred.reshape((-1,))
#     if len(y)!=len(ypred):
#         raise ValueError("len(y) don't equal len(ypred)")
#     sq_err_sum = 0.0
#     for yl,yp in zip(y,ypred):
#         sq_err_sum+=(yl-yp)**2
#     return sq_err_sum/len(y)

# def MAE(y, ypred):
#     if isinstance(y,numpy.ndarray):
#         y = y.reshape((-1,))
#     if isinstance(ypred,numpy.ndarray):
#         ypred = ypred.reshape((-1,))
#     if len(y)!=len(ypred):
#         raise ValueError("len(y) don't equal len(ypred)")
#     abs_err_sum = 0.0
#     for yl,yp in zip(y,ypred):
#         abs_err_sum+=abs(yl-yp)
#     return abs_err_sum/len(y)

def MSE(y, ypred):
    assert isinstance(y,numpy.ndarray)
    assert isinstance(ypred,numpy.ndarray)
    assert y.shape==ypred.shape
    sq_err = (y-ypred)**2
    return numpy.mean(sq_err)

def MAE(y, ypred):
    assert isinstance(y,numpy.ndarray)
    assert isinstance(ypred,numpy.ndarray)
    assert y.shape==ypred.shape
    abs_err = numpy.abs(y-ypred)
    return numpy.mean(abs_err)

In [56]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataset:
    u,i = d['userID'],d['gameID']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)
    
for u in reviewsPerUser:
    reviewsPerUser[u].sort(key=lambda x: x['date'])
    
for i in reviewsPerItem:
    reviewsPerItem[i].sort(key=lambda x: x['date'])

In [57]:
def feat1(d):
    return [1.0,float(d['hours'])]

X = list(feat1(d) for d in dataset)
X = numpy.array(X)
y = list(float(len(d['text'])) for d in dataset)
y = numpy.array(y)
y = y.reshape((-1,1))
mod = linear_model.LinearRegression()
mod.fit(X,y)

In [58]:
theta_1 = mod.coef_[0][1]
ypred = mod.predict(X)
mse_q1 = MSE(y,ypred)
answers['Q1'] = [theta_1, mse_q1]

In [59]:
answers

{'Q1': [0.007857269704336025, 570936.2842458971]}

In [60]:
def lin_reg_pipeline(get_xy_func):
    Xs = list()
    ys = list()
    for d in dataset:
        x1,y1 = get_xy_func(d)
        Xs.append(x1)
        ys.append(y1)
    Xs = numpy.array(Xs,dtype=float)
    if len(Xs.shape)==1:
        Xs = Xs.reshape((-1,1))
    ys = numpy.array(ys,dtype=float)
    print("average of y:"+str(numpy.average(ys)))
    print("variance of y:"+str(numpy.var(ys,ddof=1)))
    if len(ys.shape)==1:
        ys = ys.reshape((-1,1))
    model = linear_model.LinearRegression(fit_intercept=False)
    model.fit(Xs,ys)
    ypred = model.predict(Xs)
    return model,MSE(y,ypred),MAE(y,ypred)

In [61]:
def t_transform(t):
    return math.log2(t+1)

In [62]:
all_hours = list(d['hours'] for d in dataset)

def calculate_median(l):
    sorted_l = sorted(l)
    list_length = len(sorted_l)   
    if list_length == 0:
        return None
    if list_length % 2 == 1:
        median = sorted_l[list_length // 2]
    else:
        m1 = sorted_l[list_length // 2 - 1]
        m2 = sorted_l[list_length // 2]
        median = (m1 + m2) / 2
    return median

median_play_time = calculate_median(all_hours)

def get_xy_2(d):
    l = float(len(d['text']))
    t = float(d['hours'])
    return [1.0,t,t_transform(t),math.sqrt(t),int(t>median_play_time)],l

mod2,mse2,mae2 = lin_reg_pipeline(get_xy_2)
answers['Q2'] = mse2

average of y:390.9600857142857
variance of y:570944.2224938355


In [63]:
def get_xy_3(d):
    l = float(len(d['text']))
    t = float(d['hours'])
    x1 = list()
    x1.append(1.0)
    for t_ref in [1,5,10,100,1000]:
        x1.append(int(t>t_ref))
    return x1,l

mod3,mse3,mae3 = lin_reg_pipeline(get_xy_3)
answers['Q3'] = mse3

average of y:390.9600857142857
variance of y:570944.2224938355


In [64]:
# def get_xy_4(d):
#     l = float(len(d['text']))
#     t = float(d['hours'])
#     return [1.0,l],t

# mod4,mse4,mae4 = lin_reg_pipeline(get_xy_4)
# answers['Q4'] = [mse4, mae4, "mae is better, because review_len and time_played are not so relevant, the mse of the model is extremly big"]

In [65]:
def feat4(d):
    return [1.0,float(len(d['text']))]

X = [feat4(d) for d in dataset]
X = numpy.array(X)
y = [[float(d['hours'])] for d in dataset]
y = numpy.array(y)

mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

mse = MSE(y,predictions)
mae = MAE(y,predictions)
answers['Q4'] = [mse, mae, "mae is better, because review_len and time_played are not so relevant, the mse of the model is extremly big"]

In [66]:
y_trans = numpy.vectorize(t_transform)(y)

In [67]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)
mod5 = mod

In [68]:
mse_trans = MSE(y_trans,predictions_trans)

In [69]:
mse_untrans =  MSE(y,2**predictions_trans-1)

In [70]:
answers['Q5'] = [mse_trans, mse_untrans]

In [71]:
def get_1hot(l):
    res = list()
    for _ in range(l):
        res.append(0)
    return res
        

def feat6(d):
    h = float(d['hours'])
    int_h = int(h)
    if int_h>=99:
        int_h=99
    res = get_1hot(100)
    res[int_h]=1.0
    return res
    
X = [feat6(d) for d in dataset]
X = numpy.array(X)
y = [len(d['text']) for d in dataset]
y = numpy.array(y)
y=y.reshape((-1,1))
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [72]:
models = {}
mses = {}
bestC = None

for c in [1, 10, 100, 1000, 10000]:
    model = linear_model.Ridge(alpha=float(c))
    model.fit(Xtrain,ytrain)
    models[c] = model
    mse_valid = MSE(yvalid,model.predict(Xvalid))
    mses[c] = mse_valid
    if bestC==None:
        bestC = c
    else:
        if mse_valid<mses[bestC]:
            bestC = c

In [73]:
mse_valid = mses[bestC]
mse_test = MSE(ytest,model.predict(Xtest))
answers['Q6'] = [bestC, mse_valid, mse_test]

In [74]:
for d in dataset:
    d['hours_transformed'] = t_transform(d['hours'])
times = [d['hours_transformed'] for d in dataset]
median = statistics.median(times)

In [75]:
less_than_1h_cnt = 0
for d in dataset:
    if d['hours']<1.0:
        less_than_1h_cnt+=1

In [76]:
answers['Q7'] = [median, less_than_1h_cnt]

In [77]:
def feat8(d):
    return [1.0,float(len(d['text']))]
X = [feat8(d) for d in dataset]
y = [d['hours_transformed'] > median for d in dataset]

In [78]:
mod = linear_model.LogisticRegression(class_weight='balanced')
mod.fit(X,y)
predictions = mod.predict(X) # Binary vector of predictions

In [79]:
def get_performance_info(y,predictions):
    y = numpy.array(y,dtype=int)
    y = y.reshape((-1,))
    predictions = numpy.array(predictions,dtype=int)
    predictions = predictions.reshape((-1,))
    # print(y_actual)
    # print(y_predict)
    TP = numpy.sum((y == 1) & (predictions == 1))
    FP = numpy.sum((y == 0) & (predictions == 1))
    TN = numpy.sum((y == 0) & (predictions == 0))
    FN = numpy.sum((y == 1) & (predictions == 0))
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)
    TNR = TN / (TN + FP)
    FNR = FN / (TP + FN)
    BER = 1 - (0.5 * (TPR + TNR))
    return TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER

In [80]:
TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER = get_performance_info(y,predictions)
answers['Q8'] = [TP, TN, FP, FN, BER]

In [81]:
answers

{'Q1': [0.007857269704336025, 570936.2842458971],
 'Q2': 565419.5340402178,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'mae is better, because review_len and time_played are not so relevant, the mse of the model is extremly big'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [1000, 581432.8208480754, 562824.2275086499],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679]}

In [82]:
# TODO : Q9 Q10

In [83]:
scores = mod.decision_function(X)
score_labels = list(zip(scores,y))
score_labels.sort(reverse=True)
sorted_labels = [tup[1] for tup in score_labels]

In [84]:
ans_q9 = list()
for k in [5,10,100,1000]:
    ths = score_labels[k-1][0]
    l,r=k-1,len(score_labels)-1
    while l<r:
        m=(l+r+1)//2
        confidence_m = score_labels[m][0]
        if confidence_m<ths:
            r=m-1
        else:
            l=m
    k_actual=l+1
    ans_q9.append(sum(sorted_labels[:k_actual])/k_actual)

In [85]:
answers["Q9"]=ans_q9

In [95]:
mod5_regression_y = mod5.predict(X)

In [90]:
def search_max(l,r,iter_times,each_split,cal_performance_func):
    iter_range = range(iter_times)
    if TQDM_ON:
        iter_range = tqdm(iter_range)
    for i in iter_range:
        thsld_with_performance = list()
        split_i_range = range(1,each_split)
        each_split_size = (r-l)/each_split
        if TQDM_ON:
            split_i_range = tqdm(split_i_range)
        for s_i in split_i_range:
            x_s_i = l+each_split_size*s_i
            # my_pred_play_model2(x_s_i)
            # accu_this = calculate_pred_play_accu()
            performance_this = cal_performance_func(x_s_i)
            thsld_with_performance.append((x_s_i,performance_this))
            thsld_with_performance.sort(key=lambda tup:tup[1],reverse=True)
            ths_max_performance = thsld_with_performance[0][0]
            performance = thsld_with_performance[0][1]
            l,r = ths_max_performance-each_split_size,ths_max_performance+each_split_size
    return (ths_max_performance,performance)

In [89]:
def calculate_performance(ths):
    ypred = mod5_regression_y>ths
    return -get_performance_info(y,ypred)[8]

In [96]:
ths_best,ber_best_neg=search_max(0.0,10.0,6,6,calculate_performance)

100%|██████████| 5/5 [00:00<00:00, 88.56it/s]
100%|██████████| 5/5 [00:00<00:00, 104.05it/s]
100%|██████████| 5/5 [00:00<00:00, 119.64it/s]
100%|██████████| 5/5 [00:00<00:00, 130.48it/s]
100%|██████████| 5/5 [00:00<00:00, 120.16it/s]
100%|██████████| 5/5 [00:00<00:00, 119.00it/s]
100%|██████████| 6/6 [00:00<00:00, 20.12it/s]


In [99]:
answers["Q10"] = [ths_best,-ber_best_neg]

In [None]:
# Q11 code
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]

In [None]:
userMedian = defaultdict(list)
itemMedian = defaultdict(list)

dataTrain[0]

for d in dataTrain:
    uid,item_id,h = d["userID"],d["gameID"],d["hours"]
    userMedian[uid].append(h)
    itemMedian[item_id].append(h)



In [None]:
for u in userMedian:
    userMedian[u] = statistics.median(userMedian[u])

for i in itemMedian:
    itemMedian[i] = statistics.median(itemMedian[i])

In [None]:
answers['Q11'] = [itemMedian['g35322304'], userMedian['u55351001']]

In [None]:
all_times_train = list(d['hours'] for d in dataTrain)
global_median = statistics.median(all_times_train)

def f12(u,i):
    if i in itemMedian:
        return int(itemMedian[i]>global_median)
    if u in userMedian:
        return int(userMedian[u]>global_median)
    return 0

preds = [f12(d['userID'], d['gameID']) for d in dataTest]
y = [int(d['hours']>global_median) for d in dataTest]
correct_cnt = 0
for yl,yp in zip(y,preds):
    if yl==yp:
        correct_cnt+=1
accuracy = correct_cnt/len(y)
answers['Q12'] = accuracy

In [None]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
rating_dict = {}

for d in dataset:
    user,item,tt = d['userID'], d['gameID'],d['hours_transformed']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    rating_dict[(user,item)]=tt

In [None]:
def mostSimilar(i, func, N):
    item_with_sim = list()
    for j in usersPerItem:
        if j==i:
            continue
        sim_j=func(i,j)
        item_with_sim.append((sim_j,j))
    item_with_sim.sort(key=lambda tup:tup[0],reverse=True)
    return item_with_sim[:N]

In [None]:
def jaccard(i,j):
    si = usersPerItem[i]
    sj = usersPerItem[j]
    return len(si.intersection(sj))/len(si.union(sj))



def cos_sim_14(i,j):
    si = usersPerItem[i]
    sj = usersPerItem[j]
    i_norm = math.sqrt(len(si))
    j_norm = math.sqrt(len(sj))
    numerator = 0
    for shared_u in si.intersection(sj):
        numerator+=(1 if rating_dict[(shared_u,i)]>global_median else -1)*(1 if rating_dict[(shared_u,j)]>global_median else -1)
    return numerator/(i_norm*j_norm)

def cos_sim(i,j):
    si = usersPerItem[i]
    sj = usersPerItem[j]
    i_norm = math.sqrt(sum(rating_dict[(u,i)]**2 for u in si))
    j_norm = math.sqrt(sum(rating_dict[(u,j)]**2 for u in sj))
    numerator = 0.0
    for shared_u in si.intersection(sj):
        numerator+=rating_dict[(shared_u,i)]*rating_dict[(shared_u,j)]
    return numerator/(i_norm*j_norm)

ms = mostSimilar(dataset[0]['gameID'], jaccard, 10)
answers['Q13'] = [ms[0][0], ms[-1][0]]

rating_dict = {}
for d in dataset:
    user,item,h = d['userID'], d['gameID'],d['hours']
    rating_dict[(user,item)] = 1 if h>global_median else -1

ms = mostSimilar(dataset[0]['gameID'], cos_sim, 10)
answers['Q14'] = [ms[0][0], ms[-1][0]]

rating_dict = {}
for d in dataset:
    user,item,h = d['userID'], d['gameID'],d['hours_transformed']
    rating_dict[(user,item)] = h

ms = mostSimilar(dataset[0]['gameID'], cos_sim, 10)
answers['Q15'] = [ms[0][0], ms[-1][0]]

In [None]:
answers

{'Q1': [0.007857269704336025, 570936.2842458971],
 'Q2': 565419.5340402178,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'mae is better, because review_len and time_played are not so relevant, the mse of the model is extremly big'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [1000, 581432.8208480754, 562824.2275086499],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.4725063905614679],
 'Q11': [0.5, 3.9],
 'Q12': 0.7410857142857142,
 'Q13': [0.07988165680473373, 0.04390243902439024],
 'Q14': [0.10251693271055495, 0.061667331307041336],
 'Q15': [0.3301567230633554, 0.12290154232706592]}

In [None]:
f = open("answers_midterm.txt", 'w+')
f.write(str(answers) + '\n')
f.close()