In [1]:
import gzip
from collections import defaultdict
import random
from tqdm import tqdm
import numpy as np

In [2]:
def readJSON(path):
    """Yield ``(userID, gameID, record)`` for every line of a gzipped data file.

    Each line is evaluated as a Python expression and is expected to produce a
    dict-like record containing at least a ``'userID'`` key.

    Args:
        path: Location of the gzip-compressed, UTF-8 encoded text file.

    Yields:
        A tuple ``(u, g, d)`` where ``u`` is ``d['userID']``, ``g`` is
        ``d['gameID']`` (or ``None`` when the record has no such key) and
        ``d`` is the full record.

    Raises:
        KeyError: If a record has no ``'userID'`` key.
    """
    # The context manager guarantees the file is closed as soon as the
    # generator is exhausted (or closed early), instead of leaking the handle.
    with gzip.open(path, 'rt', encoding="utf8") as f:
        for l in f:
            # NOTE(security): eval() executes arbitrary code found in the file.
            # Only use this on trusted data; consider ast.literal_eval if the
            # records are plain literals.
            d = eval(l)
            yield d['userID'], d.get('gameID'), d

def parseData(path):
    """Yield one evaluated record per line of a gzipped data file.

    Args:
        path: Location of the gzip-compressed, UTF-8 encoded text file.

    Yields:
        The result of evaluating each line as a Python expression.
    """
    # Use a context manager so the handle is closed once iteration finishes
    # (or the generator is closed), rather than being left open.
    with gzip.open(path, 'rt', encoding="utf8") as f:
        for l in f:
            # NOTE(security): eval() executes arbitrary code found in the file.
            # Only use this on trusted data; consider ast.literal_eval if the
            # records are plain literals.
            yield eval(l)

# Materialise every training record in memory so it can be scanned repeatedly.
dataset = [record for record in parseData("train.json.gz")]

In [3]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Args:
        s1: A set (anything offering ``intersection`` and ``union``).
        s2: The collection to compare ``s1`` against.

    Returns:
        The ratio of shared to combined elements, or ``0`` when the union is
        empty (which avoids dividing by zero).
    """
    shared = s1.intersection(s2)
    combined = s1.union(s2)
    if len(combined) == 0:
        return 0
    return len(shared) / len(combined)

In [6]:
# Index the whole dataset in both directions:
#   gamesPerUser[userID] -> set of gameIDs appearing with that user
#   usersPerGame[gameID] -> set of userIDs appearing with that game
usersPerGame, gamesPerUser = {}, {}
for record in dataset:
    player, title = record['userID'], record['gameID']
    # setdefault creates the empty set the first time a key is seen.
    gamesPerUser.setdefault(player, set()).add(title)
    usersPerGame.setdefault(title, set()).add(player)

In [7]:
# Jac[u][g]: for each (user, game) pair in the test file, the highest Jaccard
# similarity between g and any game the user already has in the training data.
Jac = {}
# Cache: game -> union of the game sets of every user who played that game.
gamesPerUserPerGame = {}


def _coPlayedGames(game):
    """Return (and cache) the union of games played by everyone who played `game`.

    A game with no entry in ``usersPerGame`` has no known players, so it maps to
    an empty set instead of raising ``KeyError``.
    """
    if game not in gamesPerUserPerGame:
        # Build the union in a single call rather than re-copying the set per user.
        gamesPerUserPerGame[game] = set().union(
            *(gamesPerUser[user] for user in usersPerGame.get(game, ()))
        )
    return gamesPerUserPerGame[game]


# `with` ensures the pairs file is closed even if the loop raises.
with open("pairs_Played.txt") as pairsFile:
    for l in tqdm(pairsFile):
        if l.startswith("userID"):
            # header
            continue
        u, g = l.strip().split('-')

        # Shared reference to the cached set (not a copy); it is only read below.
        games = _coPlayedGames(g)

        # Users are the keys of gamesPerUser; usersPerGame is keyed by game, so
        # testing membership there would send every user to the "unknown" branch.
        if u in gamesPerUser:
            # Compare the candidate game against every game the user already has
            # and keep the best match. gamesPerUser[u] is never empty for a known
            # user, so np.max always has at least one value.
            jacs = [Jaccard(games, _coPlayedGames(game)) for game in gamesPerUser[u]]
            score = np.max(jacs)
        else:
            # Unknown user: nothing to compare against.
            score = 0

        Jac.setdefault(u, {})[g] = score

20001it [00:06, 2953.73it/s]


In [8]:
import operator

# preds[user][game] -> 1 (predict "played") or 0 (predict "not played").
preds = {}
for user, scores in Jac.items():
    # Rank this user's candidate games from highest to lowest Jaccard value.
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    # The first floor(n / 2) ranked games are labelled 1, the remainder 0.
    cutoff = len(ranked) // 2
    preds[user] = {
        game: int(rank < cutoff)
        for rank, (game, _score) in enumerate(ranked)
    }

In [9]:
# Write one "user-game,prediction" line per pair, in the same order as the
# pairs file. Both files are managed by `with`, so they are closed (and the
# output flushed) even if a lookup in `preds` fails part-way through. The pairs
# file is opened first so a missing input does not truncate existing output.
with open("pairs_Played.txt") as pairs, \
        open("predictions_Played_A1.txt", 'w') as predictions:
    for l in pairs:
        if l.startswith("userID"):
            # header: copy it through unchanged
            predictions.write(l)
            continue
        u, g = l.strip().split('-')
        predictions.write(u + '-' + g + "," + str(preds[u][g]) + "\n")