In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
from sklearn import linear_model
import random
import gzip
import numpy as np
from collections import Counter
import os
from utils import auc, jaccard_similarity

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt', encoding = 'utf8'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [4]:
dataset_folder_path = os.path.join(os.getcwd(),"dataset")

user_recommend = {}
for  d in readGz(os.path.join(dataset_folder_path,"australian_user_reviews.json.gz")):
    user_recommend[d['user_id']] = [review["item_id"] for review in d['reviews'] if review["recommend"]==True]


In [5]:

# collect all the genres, tags, specs info for each game

steam_games_genres = {}

missing_genres_count = 0
missing_tags_count = 0
missing_specs_count = 0
missing_id_count = 0
rep_items = 0

for d in readGz(os.path.join(dataset_folder_path,"steam_games.json.gz")):

    if 'id' not in d.keys():
        print(f"This game has no id info!")
        print(d)
        missing_id_count += 1
        continue

    if d['id'] in steam_games_genres.keys():
        print(f"This game id ({d['id']}) already exists!")
        continue

    if 'genres' not in d.keys():
        print(f"This game id ({d['id']}) has no genre info!")
        genre_info = []
        missing_genres_count += 1

    else:
        genre_info = d['genres']

    if 'tags' not in d.keys():
        print(f"This game id ({d['id']}) has no tag info!")
        tag_info = []
        missing_tags_count += 1

    else:
        tag_info = d['tags']

    if 'specs' not in d.keys():
        print(f"This game id ({d['id']}) has no specs info!")
        specs_info = []
        missing_specs_count += 1

    else:
        specs_info = d['specs']

    steam_games_genres[d['id']] = {'genres': genre_info,
                                   'tags': tag_info, 'specs': specs_info}

This game id (773570) has no genre info!
This game id (724910) has no genre info!
This game id (772590) has no genre info!
This game id (640250) has no genre info!
This game id (711440) has no genre info!
This game id (777910) has no genre info!
This game id (777910) has no specs info!
This game id (594200) has no genre info!
This game id (680970) has no genre info!
This game id (541930) has no genre info!
This game id (769350) has no genre info!
This game id (374970) has no genre info!
This game has no id info!
{'url': 'http://store.steampowered.com/', 'price': 19.99, 'discount_price': 14.99, 'early_access': False}
This game id (12580) has no genre info!
This game id (12570) has no genre info!
This game id (22340) has no genre info!
This game id (900883) has no genre info!
This game id (31990) has no tag info!
This game id (38440) has no genre info!
This game id (35050) has no tag info!
This game id (39392) has no tag info!
This game id (39391) has no tag info!
This game id (42200) ha

In [6]:
# collect the genre, tags, specs info of each game in each bundle
bundle_games_genres = {}

for d in readGz(os.path.join(dataset_folder_path,"bundle_data.json.gz")):

    if "bundle_id" not in d.keys() or "items" not in d.keys():
        print(f"This bundle data is missing id or items info!")
        print(d)
        continue

    games_ids = [g["item_id"] for g in d["items"]]

    bundle_games_genres[d["bundle_id"]] = {'genres': [], 'tags': [], 'specs': []}
    for item in games_ids:
        if item in steam_games_genres.keys():
            bundle_games_genres[d["bundle_id"]]['genres'].extend(steam_games_genres[item]['genres'])
            bundle_games_genres[d["bundle_id"]]['tags'].extend(steam_games_genres[item]['tags'])
            bundle_games_genres[d["bundle_id"]]['specs'].extend(steam_games_genres[item]['specs'])

    bundle_games_genres[d["bundle_id"]]['genres'] = list(set(bundle_games_genres[d["bundle_id"]]['genres']))
    bundle_games_genres[d["bundle_id"]]['tags'] = list(set(bundle_games_genres[d["bundle_id"]]['tags']))
    bundle_games_genres[d["bundle_id"]]['specs'] = list(set(bundle_games_genres[d["bundle_id"]]['specs']))


In [7]:
bundle_games_genres

{'450': {'genres': ['Adventure', 'Indie', 'Casual', 'Action'],
  'tags': ['Singleplayer',
   'Mature',
   'Casual',
   'Episodic',
   'Nudity',
   "Shoot 'Em Up",
   'Visual Novel',
   'Sexual Content',
   'Romance',
   'Indie',
   'Anime',
   'Dating Sim',
   'Action',
   'Adventure'],
  'specs': ['Single-player',
   'Downloadable Content',
   'Steam Trading Cards',
   'Steam Cloud',
   'Steam Achievements']},
 '1473': {'genres': [], 'tags': [], 'specs': []},
 '1474': {'genres': [], 'tags': [], 'specs': []},
 '1437': {'genres': ['Casual',
   'Strategy',
   'Indie',
   'Sports',
   'Free to Play',
   'Simulation'],
  'tags': ['Casual',
   'Strategy',
   'Indie',
   'Sports',
   'Free to Play',
   'Simulation'],
  'specs': ['Single-player',
   'Full controller support',
   'Downloadable Content',
   'Stats',
   'Steam Leaderboards',
   'In-App Purchases',
   'Steam Trading Cards',
   'Steam Cloud',
   'Steam Achievements']},
 '1466': {'genres': [], 'tags': [], 'specs': []},
 '1478': {'g

In [8]:
# for each user, collect all the genres, tags, specs info from the games they recommended

user_recommend_genres = {}
for user in user_recommend.keys():
    user_recommend_genres[user] = {'genres': [], 'tags': [], 'specs': []}
    for item in user_recommend[user]:
        if item in steam_games_genres.keys():
            user_recommend_genres[user]['genres'].extend(steam_games_genres[item]['genres'])
            user_recommend_genres[user]['tags'].extend(steam_games_genres[item]['tags'])
            user_recommend_genres[user]['specs'].extend(steam_games_genres[item]['specs'])

In [9]:
user_recommend_genres

{'76561197970982479': {'genres': ['Action', 'Action', 'Indie'],
  'tags': ['FPS',
   'Zombies',
   'Co-op',
   'Survival',
   'Action',
   'Multiplayer',
   'Horror',
   'Online Co-Op',
   'Shooter',
   'Gore',
   'Team-Based',
   'First-Person',
   'Moddable',
   'Survival Horror',
   'Great Soundtrack',
   'Singleplayer',
   'Class-Based',
   'Difficult',
   'Comedy',
   'Adventure',
   'Action',
   'Indie',
   'Surreal',
   "Beat 'em up",
   'FPS',
   'Fighting',
   'First-Person',
   'Short',
   'Singleplayer',
   'Adventure',
   'Fantasy',
   'Atmospheric',
   'Story Rich'],
  'specs': ['Single-player',
   'Multi-player',
   'Co-op',
   'Cross-Platform Multiplayer',
   'Steam Achievements',
   'Steam Trading Cards',
   'Steam Workshop',
   'Valve Anti-Cheat enabled',
   'Stats',
   'Includes level editor',
   'Single-player',
   'Steam Achievements',
   'Steam Trading Cards',
   'Steam Cloud']},
 'js41637': {'genres': ['Indie', 'Simulation', 'Adventure', 'Indie'],
  'tags': ['Simu

In [10]:
# find jaccard similarity between games in a bundle and user's preferred genres/tags/specs
# let teh threshold be 0.5 for now

total_len = len(user_recommend_genres.keys())
i = 0


jaccard_similarity_threshold = 0.8
user_bundle_match = defaultdict(list)
for user in user_recommend_genres.keys():

    print(f"Processing user {user} ({list(user_recommend_genres.keys()).index(user)+1}/{total_len})")
    user_genres_set = set(user_recommend_genres[user]['genres'])
    user_tags_set = set(user_recommend_genres[user]['tags'])
    user_specs_set = set(user_recommend_genres[user]['specs'])

    for bundle in bundle_games_genres.keys():
        bundle_genres_set = set(bundle_games_genres[bundle]['genres'])
        bundle_tags_set = set(bundle_games_genres[bundle]['tags'])
        bundle_specs_set = set(bundle_games_genres[bundle]['specs'])

        genre_jaccard = jaccard_similarity(user_genres_set, bundle_genres_set)
        tag_jaccard = jaccard_similarity(user_tags_set, bundle_tags_set)
        specs_jaccard = jaccard_similarity(user_specs_set, bundle_specs_set)

        if (np.mean([genre_jaccard, tag_jaccard, specs_jaccard])
            >= jaccard_similarity_threshold):
            user_bundle_match[user].append(bundle)


Processing user 76561197970982479 (1/25485)
Processing user js41637 (2/25485)
Processing user evcentric (3/25485)
Processing user doctr (4/25485)
Processing user maplemage (5/25485)
Processing user Wackky (6/25485)
Processing user 76561198079601835 (7/25485)
Processing user MeaTCompany (8/25485)
Processing user 76561198089393905 (9/25485)
Processing user 76561198156664158 (10/25485)
Processing user 76561198077246154 (11/25485)
Processing user WeiEDKrSat (12/25485)
Processing user thequeenpanda (13/25485)
Processing user death-hunter (14/25485)
Processing user DJKamBer (15/25485)
Processing user Rainbow-Dashie (16/25485)
Processing user 76561198043472122 (17/25485)
Processing user MarbleShrine (18/25485)
Processing user PPanther (19/25485)
Processing user devvonst (20/25485)
Processing user Fr0stedLine (21/25485)
Processing user starkillershadow553 (22/25485)
Processing user 76561198058373434 (23/25485)
Processing user 76561198048353577 (24/25485)
Processing user 76561198066046412 (25/2

In [None]:
from utils import evaluation_function

tuple_list = [('76561198079601835','236', '891'),
              ('76561198079601835','237', '892')]

evaluation_function(tuple_list, user_bundle_match)

Evaluating test case 1/2
Evaluating test case 2/2


np.float64(0.5)

In [14]:
from utils import evaluation_function_naive

evaluation_function_naive(tuple_list)

This game id (773570) has no genre info!
This game id (724910) has no genre info!
This game id (772590) has no genre info!
This game id (640250) has no genre info!
This game id (711440) has no genre info!
This game id (777910) has no genre info!
This game id (777910) has no specs info!
This game id (594200) has no genre info!
This game id (680970) has no genre info!
This game id (541930) has no genre info!
This game id (769350) has no genre info!
This game id (374970) has no genre info!
This game has no id info!
{'url': 'http://store.steampowered.com/', 'price': 19.99, 'discount_price': 14.99, 'early_access': False}
This game id (12580) has no genre info!
This game id (12570) has no genre info!
This game id (22340) has no genre info!
This game id (900883) has no genre info!
This game id (31990) has no tag info!
This game id (38440) has no genre info!
This game id (35050) has no tag info!
This game id (39392) has no tag info!
This game id (39391) has no tag info!
This game id (42200) ha

np.float64(0.5)