In [2]:
import time
from collections import defaultdict
import random
import numpy as np
#import tensorflow as tf
import gzip
import csv
import copy
import pickle
from sklearn import linear_model

In [3]:
with open("user_game_10k_genres.pkl", "rb") as f:
    train_rawdata = pickle.load(f)

In [4]:
with open("user_game_other1k_genres.pkl", "rb") as f:
    valid_rawdata = pickle.load(f)

In [5]:
def parseDataFromFile(fname):
    for l in gzip.open(fname,'rt'):
        yield eval(l)

In [6]:
gamedata =list(parseDataFromFile('./steamData/steam_games.json.gz'))

In [7]:
users = [x[0] for x in train_rawdata]
valid_rawdata = [x for x in valid_rawdata if x[0] in users]

In [8]:
all_data = train_rawdata+valid_rawdata
gameList = set([d[1] for d in all_data])

In [9]:
valid_user_game = [[d[0],d[1]] for d in valid_rawdata]
train_user_game = [[d[0],d[1]] for d in train_rawdata]

In [10]:
userPerGame = defaultdict(set)
gamePerUser = defaultdict(set)
genrePerUser = defaultdict(set)
genrePerGame = defaultdict(set)
for data in all_data:
    u = data[0]
    g = data[1]
    gs= data[2]
    userPerGame[g].add(u)
    gamePerUser[u].add(g)
    for genre in gs:
        genrePerUser[u].add(genre)
        genrePerGame[g].add(genre)

In [11]:
def create_balanced(d):
    balanced = []
    for user,game in d:
        playedGame = gamePerUser[user]
        gameNotPlayed = [k for k in gameList if k not in playedGame]
        SampledGame = gameNotPlayed[random.randint(0,len(gameNotPlayed)-1)]
        if gameNotPlayed == []:
            print(gameNotPlayed)
        while([user,SampledGame] in d):
            SampledGame = gameNotPlayed[random.randint(0,len(gameNotPlayed)-1)]
        balanced.append([user,SampledGame,0])
        balanced.append([user,game,1])
    return balanced

In [12]:
balanced_valid = create_balanced(valid_user_game)

In [13]:
validY = [d[2] for d in balanced_valid ]

In [14]:
balanced_train = create_balanced(valid_user_game)

In [15]:
trainY = [d[2] for d in balanced_train ]

In [16]:
def userGenre(data):
    k = []
    for user,game,_ in data:
        k.append([user,genrePerGame[game]])
    return k

In [17]:
valid = userGenre(balanced_valid)

In [18]:
train = userGenre(balanced_train)

In [28]:
def getGenre(data):
    genre2id = {}
    id2genre = []
    for d in data:
        if 'genres' not in d or 'id' not in d: continue
        genres = d["genres"]
        for genre in genres:
            if genre not in id2genre:
                genre2id[genre] = len(id2genre)
                id2genre.append(genre)
    return genre2id,id2genre

In [31]:
def feature(x):
    feat = [0]*22
    for genre in x[1]:
        feat[genre2id[genre]]=1
    return feat

In [32]:
train_X = [feature(x) for x in train]
valid_X = [feature(x) for x in valid]

In [29]:
genre2id,id2genre = getGenre(gamedata)

In [34]:
accus = []
for c in [0.01,0.1,1,10]:
    model = linear_model.LogisticRegression(C=c,class_weight='balanced',max_iter = 10000000)
    model.fit(train_X,trainY)
    pred = model.predict(valid_X)
    TP_ = np.logical_and(pred, validY)
    FP_ = np.logical_and(pred, np.logical_not(validY))
    TN_ = np.logical_and(np.logical_not(pred), np.logical_not(validY))
    FN_ = np.logical_and(np.logical_not(pred), validY)
    TP = sum(TP_)
    FP = sum(FP_)
    TN = sum(TN_)
    FN = sum(FN_)
    TPR = TP/(TP+FN)
    FPR = FP/(FP+TN)
    TNR = TN/(TN+FP)
    FNR = FN/(TP+FN)
    Balanced_Error = 0.5*(FPR+FNR)
    accus.append((TP+TN)/(TP+TN+FP+FN))
    print("True Positive Rate is "+str(TPR)+\
      "\nFalse Positive Rate is "+str(FPR)+\
      "\nTrue Negative Rate is "+str(TNR)+\
     "\nFalse Negative Rate is "+str(FNR)+\
    "\nAccuracy is "+str((TP+TN)/(TP+TN+FP+FN))+
      "\nThe Balanced Error is "+str(Balanced_Error))

True Positive Rate is 0.7248503937007874
False Positive Rate is 0.3746771653543307
True Negative Rate is 0.6253228346456693
False Negative Rate is 0.2751496062992126
Accuracy is 0.6750866141732283
The Balanced Error is 0.32491338582677165
True Positive Rate is 0.7227716535433071
False Positive Rate is 0.36976377952755907
True Negative Rate is 0.630236220472441
False Negative Rate is 0.2772283464566929
Accuracy is 0.676503937007874
The Balanced Error is 0.32349606299212597
True Positive Rate is 0.7148346456692913
False Positive Rate is 0.3568503937007874
True Negative Rate is 0.6431496062992126
False Negative Rate is 0.28516535433070866
Accuracy is 0.678992125984252
The Balanced Error is 0.32100787401574804
True Positive Rate is 0.7148346456692913
False Positive Rate is 0.3568503937007874
True Negative Rate is 0.6431496062992126
False Negative Rate is 0.28516535433070866
Accuracy is 0.678992125984252
The Balanced Error is 0.32100787401574804


In [1]:
def feature(x):
    feat = [0]*22
    for genre in x:
        feat[genre2id[genre]]=1
    return feat

In [141]:
users = list(set([x[0] for x in all_data]))
games = list(set([x[1] for x in valid_rawdata]))

def mAP_mAR_k(k):
    APs = []
    ARs = []
    for user in users:
        preds = [model.predict_proba(np.array(feature(list(genrePerGame[game]))).reshape(1,-1)) for game in games]
        # because your linear model has no user input
        # so this is actually equavalent to [predict(game) for game in games]
        predict = []
        for i,game in enumerate(games):
            predict.append([preds[i][0][0],preds[i][0][1],game])
        predict.sort()
        topk = [x[2] for x in predict[:k]]
        topk = set(topk)
        
        ground_truth = gamePerUser[user]
        ground_truth = set(ground_truth)
        
        intersection = topk.intersection(ground_truth)
        APs.append(len(intersection) / k)
        ARs.append(len(intersection) / len(ground_truth))
    
    AP = sum(APs) / len(APs)
    AR = sum(ARs) / len(ARs)
    return AP, AR

In [143]:
AP3,AR3 = mAP_mAR_k(10)

In [144]:
AP3,AR3

(0.015793132211872472, 0.01008826348106289)