In [None]:
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import operator
import math
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize 

# Process data files

In [None]:
def eval_file(file):
    """Load a file holding one Python-literal record per line.

    Parameters
    ----------
    file : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    list
        One evaluated object per non-blank line, in file order.
    """
    records = []
    # The context manager closes the handle even if a line fails to evaluate.
    with open(file, "r", encoding="utf-8") as handle:
        for line in handle:
            # A blank line (e.g. a trailing newline) is not a record.
            if not line.strip():
                continue
            # SECURITY: eval() executes arbitrary code found in the file. Only
            # use this on trusted data; consider ast.literal_eval for files
            # that contain plain literals.
            records.append(eval(line))
    return records

In [None]:
# Read the three raw datasets, in this order, into lists of records.
bundle_data, users_items, steam_games = map(
    eval_file,
    (
        "../data/bundle_data.json",
        "../data/australian_users_items.json",
        "../data/steam_games.json",
    ),
)

In [None]:
# One row per bundled item, with the bundle-level fields repeated on each row.
bundle_df = json_normalize(
    bundle_data,
    record_path="items",
    meta=["bundle_final_price", "bundle_url", "bundle_price", "bundle_name", "bundle_id"],
)
# One row per record of the games dataset.
steam_games_df = pd.DataFrame(steam_games)

# Data Preprocessing

In [None]:
def clean_bundle_data():
    """Return the rows of the global ``bundle_df`` whose "genre" is not the empty string."""
    has_genre = bundle_df["genre"] != ""
    return bundle_df[has_genre]

In [None]:
def clean_games_data():
    """Clean the global ``steam_games_df`` in place.

    Drops the "app_name", "tags" and "metascore" columns, then drops every row
    that has a missing "title" or "genres" value. Returns nothing; the global
    frame itself is modified.
    """
    # errors="ignore" keeps this re-runnable: on a second call the columns are
    # already gone and a plain drop would raise KeyError.
    steam_games_df.drop(
        labels=["app_name", "tags", "metascore"], axis=1, inplace=True, errors="ignore"
    )
    steam_games_df.dropna(subset=["title", "genres"], inplace=True)

In [None]:
def remove_idle_users():
    """Remove, in place, every entry of the global ``users_items`` whose
    "items_count" is 0.
    """
    # Rebuild the contents with a slice assignment instead of calling
    # list.remove() while iterating: removing during iteration shifts the
    # remaining elements and skips the entry after each removed one, so
    # consecutive idle users were left behind. The slice assignment keeps the
    # same list object, so other references to users_items see the result.
    users_items[:] = [user for user in users_items if user["items_count"] != 0]

In [None]:
def perform_data_preprocessing():
    """Run the preprocessing steps in order: clean the games table, then drop idle users."""
    for step in (clean_games_data, remove_idle_users):
        step()

In [None]:
def filter_by_playtime(user_id, playtime):
    """Return one user's items whose "playtime_forever" exceeds ``playtime``.

    Parameters
    ----------
    user_id : str
        Value compared against each record's "user_id" in the global
        ``users_items``; the first matching record is used.
    playtime : number
        Exclusive lower bound on "playtime_forever".

    Returns
    -------
    pandas.DataFrame
        One row per retained item, with the user's "user_id", "steam_id" and
        "user_url" repeated on every row. If the user has no items the
        (empty) normalized frame is returned as is.

    Raises
    ------
    KeyError
        If no record in ``users_items`` has the given ``user_id``.
    """
    user_data = next((user for user in users_items if user["user_id"] == user_id), None)
    if user_data is None:
        # Fail with a clear message instead of an obscure error raised from
        # inside json_normalize when it is handed an empty dict.
        raise KeyError("user_id {!r} not found in users_items".format(user_id))

    items_df = json_normalize(user_data, "items", ["user_id", "steam_id", "user_url"])
    # A user with an empty "items" list yields a frame without item columns.
    if "playtime_forever" not in items_df.columns:
        return items_df

    return items_df[items_df["playtime_forever"] > playtime]

In [None]:
def extract_genres_data_from_bundles():
    """Build a mapping of bundle id to the distinct genres of its items.

    Reads the frame returned by ``clean_bundle_data()``. Each "genre" cell is
    treated as a comma-separated list of genre names.

    Returns
    -------
    dict
        ``{bundle_id: "genre1 genre2 ..."}`` with genres in order of first
        appearance, separated by single spaces, and bundles in order of first
        appearance.
    """
    clean_bundle_df = clean_bundle_data()

    filter_genre_from_bundle = {}
    # Group once instead of re-filtering the whole frame for every bundle id;
    # sort=False keeps bundles in order of first appearance.
    for id_, genre_cells in clean_bundle_df.groupby("bundle_id", sort=False)["genre"]:
        seen = []
        for genres in genre_cells:
            for genre in genres.split(","):
                # Strip the space that follows each comma; otherwise " Indie"
                # and "Indie" are treated as different genres.
                genre = genre.strip()
                # Compare whole genre names. A substring test against the
                # accumulated text would drop any genre whose name happens to
                # appear inside an earlier one.
                if genre and genre not in seen:
                    seen.append(genre)
        filter_genre_from_bundle[id_] = " ".join(seen)

    return filter_genre_from_bundle

In [None]:
def group_generation(user_id, num_of_users, min_playtime=100, min_similarity=0.7):
    """Find users whose top genres resemble those of ``user_id``.

    The target user's genres are taken from items with "playtime_forever"
    greater than ``min_playtime``. Every other user's genres are taken from
    all of their items.
    NOTE(review): the asymmetric playtime filter is inherited from the
    original code — confirm it is intended.

    Similarity is the cosine of the two binary genre-token vectors, i.e.
    ``|A & B| / sqrt(|A| * |B|)`` over the sets of tokens.

    Parameters
    ----------
    user_id : str
        Id of the target user in the global ``users_items``.
    num_of_users : int
        Stop scanning once this many similar users have been collected.
        Values <= 0 yield an empty result.
    min_playtime : number, optional
        Exclusive playtime threshold applied to the target user's items.
    min_similarity : float, optional
        Inclusive similarity threshold for keeping a user.

    Returns
    -------
    list of (user_id, similarity)
        Sorted by similarity, highest first.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``users_items``.
    """
    meta = ["user_id", "steam_id", "user_url"]

    user_data = next((user for user in users_items if user["user_id"] == user_id), None)
    if user_data is None:
        raise KeyError("user_id {!r} not found in users_items".format(user_id))

    target_df = json_normalize(user_data, "items", meta)
    # A user with an empty "items" list yields a frame without item columns.
    if "playtime_forever" in target_df.columns:
        target_df = target_df[target_df["playtime_forever"] > min_playtime]

    target_tokens = set(word_tokenize(parse_genres(get_all_genres_by_user(target_df))))
    if not target_tokens:
        # With no genres the similarity is undefined for every user.
        return []

    sim = {}
    for user in users_items:
        # len(sim) counts distinct users, so duplicate records of the same
        # user cannot end the scan early.
        if len(sim) >= num_of_users:
            break
        if user["user_id"] == user_id:
            continue

        user_df = json_normalize(user, "items", meta)
        tokens = set(word_tokenize(parse_genres(get_all_genres_by_user(user_df))))
        if not tokens:
            # Zero-length vector: cosine undefined, skip this user.
            continue

        cos = len(target_tokens & tokens) / float((len(target_tokens) * len(tokens)) ** 0.5)
        if cos >= min_similarity:
            sim[user["user_id"]] = cos

    return sorted(sim.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
# NOTE(review): on a fresh kernel this cell runs before the cells that define
# get_all_genres_by_user and parse_genres (both called by group_generation) and
# before perform_data_preprocessing(); run those cells first.
user_group = group_generation('76561197970982479', 100)

In [None]:
# Display the (user_id, similarity) pairs computed for the target user.
user_group

In [None]:
# Clean the games table and drop users with no items; both globals
# (steam_games_df, users_items) are modified in place.
perform_data_preprocessing()

In [None]:
# Preview the first rows of the games table after preprocessing.
steam_games_df.head()

In [None]:
# Map every bundle id to a space-separated string of its items' genres.
all_bundles_genres_data = extract_genres_data_from_bundles()

In [None]:
# Items of the example user with playtime_forever greater than 30.
filtered_playtime_df = filter_by_playtime('76561197970982479', 30)

# Recommend bundle to user based on most played genres

In [None]:
def get_all_genres_by_user(user):
    """Count how often each genre occurs among a user's items.

    Parameters
    ----------
    user : pandas.DataFrame
        Frame with an "item_id" column, one row per item. Each id is looked
        up in the "id" column of the global ``steam_games_df``; items with no
        matching game, or whose game has no genres, are skipped.

    Returns
    -------
    list of (genre, count)
        Sorted by count, highest first; ties keep first-seen order.
    """
    genre_count = {}
    for _, row in user.iterrows():
        # Build the mask once per item rather than twice.
        matches = steam_games_df[steam_games_df["id"] == row["item_id"]]
        # Test for emptiness directly: ``matches.index.any()`` is False when
        # the only match sits at index label 0, which silently dropped that game.
        if matches.empty:
            continue

        game_genres = matches["genres"].values[0]
        # Missing genres show up as None/NaN (a float) and cannot be iterated.
        if game_genres is None or isinstance(game_genres, float):
            continue

        for genre in game_genres:
            genre_count[genre] = genre_count.get(genre, 0) + 1

    return sorted(genre_count.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
def parse_genres(genres):
    """Join the names of the leading 75% (rounded up) of ``genres`` with spaces.

    ``genres`` is a sequence of entries whose first element is the genre name;
    an empty sequence gives an empty string.
    """
    keep = math.ceil(len(genres) * .75)
    return " ".join(entry[0] for entry in genres[:keep])

In [None]:
def get_bundle_similarity_scores(user_genres, all_bundles_genres_data):
    """Score every bundle by genre similarity to the user.

    Similarity is the cosine of the two binary genre-token vectors, i.e.
    ``|A & B| / sqrt(|A| * |B|)`` over the sets of tokens.

    Parameters
    ----------
    user_genres : str
        Space-separated genre text for the user.
    all_bundles_genres_data : dict
        ``{bundle_id: genre_text}``.

    Returns
    -------
    list of (bundle_id, similarity)
        One entry per bundle, sorted by similarity, highest first; ties keep
        the mapping's order.
    """
    user_tokens = set(word_tokenize(user_genres))

    sim = {}
    for id_, genre_text in all_bundles_genres_data.items():
        bundle_tokens = set(word_tokenize(genre_text))
        norm = float((len(user_tokens) * len(bundle_tokens)) ** 0.5)
        # If either side has no tokens the cosine is undefined; score it 0.0
        # instead of raising ZeroDivisionError.
        sim[id_] = len(user_tokens & bundle_tokens) / norm if norm else 0.0

    return sorted(sim.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
# (genre, count) pairs for the playtime-filtered items, most frequent first.
all_user_genres = get_all_genres_by_user(filtered_playtime_df)

In [None]:
# Space-separated names of the leading 75% of the user's genres.
top_rated_genres = parse_genres(all_user_genres)

In [None]:
# (bundle_id, similarity) pairs for every bundle, most similar first.
sim = get_bundle_similarity_scores(top_rated_genres, all_bundles_genres_data)

In [None]:
# Display the full list of bundle similarity scores.
sim

In [None]:
def get_bundle_recommendations(user_id, num_recommendations, sim, bundle_data=bundle_data):
    """Return the bundle records for the leading ``num_recommendations`` entries of ``sim``.

    ``sim`` is a sequence of entries whose first element is a bundle id. For
    each of the leading entries, the first record in ``bundle_data`` with that
    "bundle_id" is collected; ids with no matching record are skipped.
    ``user_id`` is accepted but not used.
    """
    picks = []
    for entry in sim[:num_recommendations]:
        wanted = entry[0]
        match = next((b for b in bundle_data if b["bundle_id"] == wanted), None)
        if match is not None:
            picks.append(match)
    return picks

In [None]:
# Bundle records for the three highest-scoring bundles.
recommendations = get_bundle_recommendations('76561197970982479', 3, sim)

In [None]:
# Display the recommended bundle records.
recommendations