In [1]:
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import operator
import math
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stevenlouie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Process data files

In [2]:
def eval_file(file):
    f = open(file, "r", encoding="utf-8")
    lst = []
    for line in f:
        obj = eval(line)
        lst.append(obj)
    json.dumps(lst)
    return lst

In [3]:
bundle_data = eval_file("../data/bundle_data.json")
users_items = eval_file("../data/australian_users_items.json")
steam_games = eval_file("../data/steam_games.json")

In [4]:
bundle_df = json_normalize(bundle_data, "items", ["bundle_final_price", "bundle_url", "bundle_price", "bundle_name", "bundle_id"])
steam_games_df = pd.DataFrame(data=steam_games)

# Data Preprocessing

In [5]:
def clean_bundle_data(df):    
    clean_bundle_df = df[df["genre"]!=""]
    return clean_bundle_df

In [6]:
def clean_games_data():
    steam_games_df.drop(labels=["app_name", "tags", "metascore"], axis=1, inplace=True)
    steam_games_df.dropna(subset=["title", "genres"], inplace=True)

In [7]:
def remove_idle_users():
    for i in users_items:
        if i["items_count"]==0:
            users_items.remove(i)

In [8]:
def perform_data_preprocessing():
    clean_games_data()
    remove_idle_users()

In [9]:
def filter_by_playtime(user_id, playtime):
    user_data = {}
    
    for user in users_items:
        if user["user_id"]==user_id:
            user_data = user
            break
    
    filtered_df = json_normalize(user_data, "items", ["user_id", "steam_id", "user_url"])
    filtered_df = filtered_df[filtered_df["playtime_forever"] > playtime]
    
    return filtered_df

In [10]:
def extract_genres_data_from_bundles(bundle_df): 
    
    clean_bundle_df = clean_bundle_data(bundle_df)
    
    filter_genre_from_bundle = {}
    for id_ in clean_bundle_df["bundle_id"].unique():
        genres_list = []
        for genres in clean_bundle_df[clean_bundle_df["bundle_id"]==id_]["genre"]:
            for genre in genres.split(", "):
                if genre not in genres_list:
                    genres_list.append(genre)
        filter_genre_from_bundle[id_] = genres_list
        
    return filter_genre_from_bundle

In [11]:
perform_data_preprocessing()

In [12]:
steam_games_df.head()

Unnamed: 0,publisher,genres,title,url,release_date,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,Mostly Positive
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,Mostly Positive
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域,
5,Trickjump Games Ltd,"[Action, Adventure, Simulation]",Battle Royale Trainer,http://store.steampowered.com/app/772540/Battl...,2018-01-04,,http://steamcommunity.com/app/772540/reviews/?...,"[Single-player, Steam Achievements]",3.99,False,772540,Trickjump Games Ltd,Mixed


In [13]:
all_bundles_genres_data = extract_genres_data_from_bundles(bundle_df)

In [14]:
filtered_playtime_df = filter_by_playtime('76561197970982479', 30)

# Recommend bundle to user based on most played genres

In [46]:
def cosine_sim(vector1, vector2):
    cos = 0
    lst1 = []
    lst2 = []
    vector = list(set().union(vector1, vector2))
    for i in vector:
        if i in vector1:
            lst1.append(1)
        else:
            lst1.append(0)    
        if i in vector2:
            lst2.append(1)
        else:
            lst2.append(0)
        cos += lst1[-1]*lst2[-1]
    
    magnitude = float((sum(lst1)*sum(lst2))**0.5)
    
    if magnitude != 0:
        cos = cos / magnitude
    
    return cos, magnitude

In [47]:
def get_all_genres_by_user(user):
    genre_count = {}
    for index, row in user.iterrows():
        if steam_games_df[steam_games_df["id"]==row["item_id"]].index.any()==True:
            game_genres = steam_games_df[steam_games_df["id"]==row["item_id"]]["genres"].values[0]
            for genre in game_genres:
                if genre not in genre_count:
                    genre_count[genre] = 1
                else:
                    genre_count[genre] = genre_count[genre] + 1

    return sorted(genre_count.items(), key=operator.itemgetter(1), reverse=True)

In [48]:
def parse_genres(genres):
    lst = []
    for genre in genres[:math.ceil(len(genres)*.75)]:
        lst.append(genre[0])
    return lst

In [49]:
def get_bundle_similarity_scores(user_genres, all_bundles_genres_data):
    sim = {}
    
    for id_ in list(all_bundles_genres_data.keys()):
        bundle_genres = all_bundles_genres_data[id_]

        cos,_ = cosine_sim(user_genres, bundle_genres)
        
        sim[id_] = cos
    
    return sorted(sim.items(), key=operator.itemgetter(1), reverse=True)

In [50]:
all_user_genres = get_all_genres_by_user(filtered_playtime_df)

In [51]:
top_rated_genres = parse_genres(all_user_genres)

In [52]:
bundle_sim = get_bundle_similarity_scores(top_rated_genres, all_bundles_genres_data)

In [53]:
bundle_sim

[('803', 0.9428090415820635),
 ('813', 0.9428090415820635),
 ('580', 0.8888888888888888),
 ('948', 0.8888888888888888),
 ('396', 0.8819171036881969),
 ('505', 0.8819171036881969),
 ('364', 0.8819171036881969),
 ('399', 0.8819171036881969),
 ('482', 0.8819171036881969),
 ('957', 0.8819171036881969),
 ('802', 0.8819171036881969),
 ('804', 0.8819171036881969),
 ('403', 0.8819171036881969),
 ('395', 0.8819171036881969),
 ('542', 0.8819171036881969),
 ('356', 0.8819171036881969),
 ('268', 0.8432740427115678),
 ('808', 0.8249579113843055),
 ('833', 0.8249579113843055),
 ('425', 0.8249579113843055),
 ('663', 0.8249579113843055),
 ('225', 0.8249579113843055),
 ('565', 0.8249579113843055),
 ('1437', 0.816496580927726),
 ('1441', 0.816496580927726),
 ('501', 0.816496580927726),
 ('801', 0.816496580927726),
 ('1206', 0.816496580927726),
 ('1143', 0.816496580927726),
 ('398', 0.816496580927726),
 ('824', 0.816496580927726),
 ('569', 0.816496580927726),
 ('646', 0.816496580927726),
 ('746', 0.81649

In [54]:
def get_bundle_recommendations(user_id, num_recommendations, sim, bundle_data=bundle_data):
    
    recommendations = []
    for id_ in sim[:num_recommendations]:
        for bundle in bundle_data:
            if bundle["bundle_id"]==id_[0]:
                recommendations.append(bundle)
                break
    
    return recommendations

In [55]:
recommendations = get_bundle_recommendations('76561197970982479', 3, sim)

In [56]:
recommendations

[{'bundle_final_price': '$4.87',
  'bundle_url': 'http://store.steampowered.com/bundle/803/?utm_source=SteamDB&utm_medium=SteamDB&utm_campaign=SteamDB%20Bundles%20Page',
  'bundle_price': '$32.41',
  'bundle_name': 'Platformer Bundle',
  'bundle_id': '803',
  'items': [{'genre': 'Action, Adventure, Indie, Simulation, Strategy',
    'item_id': '104200',
    'discounted_price': '$0.49',
    'item_url': 'http://store.steampowered.com/app/104200',
    'item_name': 'BEEP'},
   {'genre': 'Action, Adventure, Indie, Simulation, Strategy',
    'item_id': '263980',
    'discounted_price': '$1.99',
    'item_url': 'http://store.steampowered.com/app/263980',
    'item_name': 'Out There Somewhere'},
   {'genre': 'Action, Adventure, Indie',
    'item_id': '317250',
    'discounted_price': '$9.99',
    'item_url': 'http://store.steampowered.com/app/317250',
    'item_name': 'Airscape - The Fall of Gravity'},
   {'genre': 'Action, Adventure, Casual, Indie',
    'item_id': '341500',
    'discounted_pri

# Group Generation

In [65]:
def group_generation(user_id, num_of_users):
    user_data = {}
    
    for user in users_items:
        if user["user_id"]==user_id:
            user_data = user
            break
            
    filtered_df = json_normalize(user_data, "items", ["user_id", "steam_id", "user_url"])
    filtered_df = filtered_df[filtered_df["playtime_forever"] > 100]
    
    genres = get_all_genres_by_user(filtered_df)
    top_rated_genres = parse_genres(genres)
    sim = {}
    counter = 0
    
    np.random.shuffle(users_items)
    for user in users_items:
        if user["user_id"]!=user_id:
            user_df = json_normalize(user, "items", ["user_id", "steam_id", "user_url"])
            g = get_all_genres_by_user(user_df)
            parsed_genres = parse_genres(g)
            
            cos, magnitude = cosine_sim(top_rated_genres, parsed_genres)
#             lst1 = []
#             lst2 = []
#             cos = 0
#             vector = list(set().union(top_rated_genres, parsed_genres))
#             for i in vector:
#                 if i in top_rated_genres:
#                     lst1.append(1)
#                 else:
#                     lst1.append(0)

#                 if i in parsed_genres:
#                     lst2.append(1)
#                 else:
#                     lst2.append(0)
#                 cos += lst1[-1]*lst2[-1]
            
            if magnitude==0:
                continue
                
            if cos >= 0.85:
                counter += 1
                sim[user["user_id"]] = cos
                
            if counter == num_of_users:
                break
                    
                
    return sorted(sim, key=operator.itemgetter(1), reverse=True), top_rated_genres

In [66]:
user_group, top_genres = group_generation('76561197970982479', 100)

In [68]:
user_group

['turtle_69',
 'DuckPencil',
 'Kurisuchan27',
 'pumpkinpulp',
 'FrostBladeGaming',
 'Trandunz',
 'drilljose',
 'Dragonzarefriendly',
 'brink31',
 'CptLG',
 '5p1ca',
 'pp0085',
 'Tokusentai',
 'jonaspwnes',
 'HoChingMongoloid',
 'Moist_Cookie',
 'lordofgeckos',
 'inourmumsbed',
 'SnottiestDuck',
 'ClaxtonBRAH',
 'tkjohn',
 'nightwingrocks',
 'Hidden_Hood',
 'kidfable',
 'Nickz_',
 'LickmyrideOnmowa',
 'sionisindustries',
 'mistermooth',
 'theamazingflute',
 'thatonestatuething',
 'thisishung',
 'Deus_VuIt',
 'dekruz',
 'TehStoneMan',
 'neurotoxn',
 'derpderpderpherp',
 'jedjezzah',
 'RenegadeFoxx',
 'lcs_666',
 'uber_gamer92',
 'Kasplodger',
 'dayfoe',
 'TandGames',
 'bassline101',
 'gabennnnnnnnnn',
 '_aquasock',
 'Rabwick',
 'BatatasSamGostozas',
 'darkedge119',
 'GatoGlant',
 'HappyGamerX',
 'cato_',
 'a_nice_lad',
 'DONTQQ2MUCH',
 'ILIKEYOURPANTS',
 '76561198088353235',
 '76561198095238263',
 '76561198060891034',
 '76561198055609345',
 '76561198072471193',
 '76561198072973126',
 '76

# Bundle Generation for Recommendation

In [71]:
def get_multiplayer_games(steam_games):
    games = steam_games.copy()
    
    for i, row in games.iterrows():
        flag = False
        if type(row["specs"]) != list:
            games.drop([i], inplace=True)
            continue
            
        for spec in row["specs"]:
            if spec=="Online Multi-Player":
                flag = True
                break
                
        if flag==False:
            games.drop([i], inplace=True)
            
    games.reset_index(inplace=True)
    games["discount_price"].fillna(0, inplace=True)
    games["price"].fillna(0, inplace=True)
            
    return games

In [72]:
multiplayer_games = get_multiplayer_games(steam_games_df)

In [76]:
def get_most_similar_games(games, user_group, top_genres, num_of_games):
    sim = {}
    for i, row in games.iterrows():
        game_genres = row["genres"]
        
        cos, magnitude = cosine_sim(top_genres, game_genres)
        
        if magnitude==0:
            continue
                
        sim[row["id"]] = cos
    
    sim = sorted(sim.items(), key=operator.itemgetter(1), reverse=True)
    
    games_list = []
    for i in sim[:num_of_games]:
        games_list.append(i[0])
        
    return games_list

In [77]:
list_of_games = get_most_similar_games(multiplayer_games, user_group, top_genres, 10)

In [78]:
list_of_games

['413120',
 '666600',
 '515710',
 '638180',
 '462440',
 '254200',
 '434380',
 '366690',
 '562700',
 '582660']

In [79]:
def bundle_generation(games):
    bundle = {}
    bundle["bundle_name"] = "Made just for you"
    bundle["bundle_id"] = "10000000"
    bundle["items"] = []
    discounted_total = 0
    total_price = 0
    for id_ in games:
        game = multiplayer_games[multiplayer_games["id"]==id_]
        
        item_info = {}
        item_info["item_id"] = id_
        item_info["genre"] = ', '.join(map(str, game["genres"].values[0]))
        item_info["name"] = game["title"].values[0]
        
        if type(game["price"]) != str:
            total_price += game["price"].values[0]
        
        discounted_total += game["discount_price"].values[0]
        
        bundle["items"].append(item_info)
    
    bundle["bundle_final_price"] = total_price - discounted_total
    
    return bundle

In [80]:
bundle = bundle_generation(list_of_games)

In [81]:
bundle

{'bundle_name': 'Made just for you',
 'bundle_id': '10000000',
 'items': [{'item_id': '413120',
   'genre': 'Action, Casual, Indie, Massively Multiplayer, Simulation, Sports, Strategy',
   'name': 'Tactics Forever'},
  {'item_id': '666600',
   'genre': 'Action, Adventure, Casual, Indie, RPG, Simulation, Strategy',
   'name': 'Zombie Town'},
  {'item_id': '515710',
   'genre': 'Action, Adventure, Casual, Free to Play, Indie, Massively Multiplayer, RPG, Simulation',
   'name': 'Creativerse - Pro'},
  {'item_id': '638180',
   'genre': 'Action, Adventure, Casual, Free to Play, Indie, Massively Multiplayer, RPG, Simulation',
   'name': 'Creativerse - Welcome Bundle'},
  {'item_id': '462440',
   'genre': 'Action, Adventure, Indie, Massively Multiplayer, RPG, Simulation, Strategy, Early Access',
   'name': 'ROKH'},
  {'item_id': '254200',
   'genre': 'Adventure, Casual, Indie, RPG, Simulation, Strategy',
   'name': 'FortressCraft Evolved!'},
  {'item_id': '434380',
   'genre': 'Action, Advent

In [82]:
genres_from_generated_bundle = extract_genres_data_from_bundles(json_normalize(bundle, "items", ["bundle_final_price", "bundle_name", "bundle_id"]))

In [86]:
def evaluate_generated_bundle(user_group, all_bundles_genres_data, bundle_sim):
    sim = {}
    for user in user_group:
        user_data = {}
        
        for u in users_items:
            if u["user_id"]==user:
                user_data = u
                break

        filtered_df = json_normalize(user_data, "items", ["user_id", "steam_id", "user_url"])
        filtered_df = filtered_df[filtered_df["playtime_forever"] > 100]

        genres = get_all_genres_by_user(filtered_df)
        top_rated_genres = parse_genres(genres)
        
        for id_ in list(all_bundles_genres_data.keys()):
            if id_ not in sim:
                sim[id_] = 0
                
            bundle_genres = all_bundles_genres_data[id_]

            cos,_ = cosine_sim(top_rated_genres, bundle_genres)

            sim[id_] += cos
            
        for id_ in list(bundle_sim.keys()):
            if id_ not in sim:
                sim[id_] = 0
                
            bundle_genres = bundle_sim[id_]

            cos,_ = cosine_sim(top_rated_genres, bundle_genres)

            sim[id_] += cos
            
    return sorted(sim.items(), key=operator.itemgetter(1), reverse=True)

In [87]:
bundle_similarities = evaluate_generated_bundle(user_group, all_bundles_genres_data, genres_from_generated_bundle)

In [88]:
bundle_similarities

[('808', 85.16230636787046),
 ('10000000', 84.3526047071822),
 ('948', 83.61771094210545),
 ('396', 83.00540812945802),
 ('505', 83.00540812945802),
 ('364', 83.00540812945802),
 ('399', 83.00540812945802),
 ('482', 83.00540812945802),
 ('957', 83.00540812945802),
 ('804', 83.00540812945802),
 ('403', 83.00540812945802),
 ('542', 83.00540812945802),
 ('356', 83.00540812945802),
 ('803', 80.34881384050736),
 ('813', 80.34881384050736),
 ('225', 79.9370207819205),
 ('663', 79.40253101482001),
 ('646', 79.28959222820168),
 ('339', 79.28959222820168),
 ('833', 78.79181918760112),
 ('565', 78.79181918760112),
 ('580', 77.91504499156412),
 ('824', 77.77311022662288),
 ('623', 77.77311022662288),
 ('1206', 77.60950555554861),
 ('398', 77.60950555554861),
 ('1441', 76.698135397841),
 ('501', 76.698135397841),
 ('801', 76.698135397841),
 ('432', 76.698135397841),
 ('436', 76.698135397841),
 ('332', 75.53421550363305),
 ('746', 75.25379517284769),
 ('457', 75.25379517284769),
 ('268', 74.9429403