In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [9]:
rec_data = pd.read_csv("../data/recommendations.csv")
users_data = pd.read_csv("../data/users.csv")
games_data = pd.read_csv("../data/games.csv")
metadata = pd.read_json("../data/games_metadata.json", lines = True)

Convert the recommendation data into both an explicit rating and an implicit rating

In [10]:
# Playtime buckets, each a half-open [start, end) range of hours.
hours_intervals = [
    (0.0, 2.0),
    (2.0, 6.0),
    (6.0, 14.1),
    (14.1, 39.7),
    (39.7, float('inf')),
]
# Base score of each playtime bucket, in the same order as hours_intervals.
ratings = [2, 2.5, 3, 3.5, 4]


def assign_rating(hours, is_recommended):
    """Combine playtime and the recommend flag into one explicit rating.

    The base score is the entry of ``ratings`` whose bucket in
    ``hours_intervals`` contains ``hours``; it is then shifted by +1 when
    ``is_recommended`` is truthy and by -1 otherwise.

    Parameters
    ----------
    hours : float
        Playtime to classify.
    is_recommended : bool
        Whether the review recommends the game.

    Returns
    -------
    int or float
        Base score plus ``2 * is_recommended - 1``.

    Notes
    -----
    A value that lands in no bucket (negative, NaN, or ``inf``) falls back
    to the last base score, ``ratings[-1]``.
    """
    base_score = next(
        (
            score
            for (low, high), score in zip(hours_intervals, ratings)
            if low <= hours < high
        ),
        ratings[-1],  # fallback when no bucket matches
    )
    return base_score + 2 * is_recommended - 1

# Score every review. Iterating over just the two columns that assign_rating
# reads avoids DataFrame.apply(axis=1), which materialises a full Series for
# each row; the resulting values are the same and are assigned positionally.
rec_data['explicit_rating'] = [
    assign_rating(hours, is_recommended)
    for hours, is_recommended in zip(rec_data['hours'], rec_data['is_recommended'])
]
# The raw recommend flag is exported unchanged as the implicit rating.
renamed_data = rec_data.rename(columns={'is_recommended': 'implicit_rating'})
# .copy() detaches the slim ratings table from rec_data before it is saved.
ratings_data = renamed_data[['user_id', 'app_id', 'explicit_rating', 'implicit_rating']].copy()
ratings_data.to_csv("../preprocessed_data/ratings.csv", index=False)

Create games_features and users_features using one-hot encoding (one boolean column per category or bucket)

Games features: app_id, genre, price, rating

In [11]:
# create genre columns for games data
# Only games that appear in the ratings table are kept.
metadata = metadata[metadata.app_id.isin(ratings_data.app_id)]
# features[0] is the key column; every later entry is a tag that becomes a flag column.
features = ['app_id','Atmospheric', 'RPG', 'Strategy', '2D', 'Simulation', 'Casual', 'Adventure', 'Action', 'Singleplayer', 'Indie']
# Seed the frame with the key so it inherits metadata's index, then add one
# boolean column per tag. 'app_id' is skipped in the loop because it is a
# key, not a tag, so testing it against the tag lists would be wasted work.
games_features = pd.DataFrame({'app_id': metadata['app_id']})
for tag in features[1:]:
    games_features[tag] = metadata['tags'].apply(lambda game_tags: tag in game_tags)

In [12]:
# create price columns for games data
# Only games that appear in the ratings table are kept.
games_data = games_data[games_data.app_id.isin(ratings_data.app_id)]
# Half-open [low, high) price buckets.
price_intervals = [(0.0, 0.99),(0.99,3.99) ,(3.99, 6.99), (6.99, 9.99),(9.99,12.814),(12.814,14.99),(14.99,17.34),(17.34,19.99),(19.99,23.26),(23.26,float('inf'))]
# Look prices up by app_id rather than by row label: games_features takes its
# index from the metadata frame, so a plain column assignment from games_data
# is only correct when both sources happen to list games in the same row order.
price_by_app = games_data.drop_duplicates('app_id').set_index('app_id')['price_final']
game_prices = games_features['app_id'].map(price_by_app)
for low, high in price_intervals:
    # A missing price compares False on both sides, so it lands in no bucket.
    games_features[str(low) + '-' + str(high)] = (game_prices >= low) & (game_prices < high)

In [13]:
# create ratings columns for games data
# Review-summary labels, one flag column each.
games_rating =  ['Overwhelmingly Positive','Very Positive','Positive','Mostly Positive','Mixed','Mostly Negative','Negative','Very Negative','Overwhelmingly Negative']
# Look labels up by app_id rather than by row label: games_features takes its
# index from the metadata frame, which is not guaranteed to share row labels
# with games_data.
rating_by_app = games_data.drop_duplicates('app_id').set_index('app_id')['rating']
game_rating_labels = games_features['app_id'].map(rating_by_app)
for rating in games_rating:
    # A game without a label compares unequal to every entry and gets all False.
    games_features[rating] = game_rating_labels == rating

In [14]:
# Show the assembled game feature table: app_id, tag flags, price buckets, rating flags.
games_features

Unnamed: 0,app_id,Atmospheric,RPG,Strategy,2D,Simulation,Casual,Adventure,Action,Singleplayer,...,23.26-inf,Overwhelmingly Positive,Very Positive,Positive,Mostly Positive,Mixed,Mostly Negative,Negative,Very Negative,Overwhelmingly Negative
0,13500,True,False,False,False,False,False,True,True,True,...,False,False,True,False,False,False,False,False,False,False
2,113020,False,False,True,True,False,True,True,True,True,...,False,False,True,False,False,False,False,False,False,False
3,226560,True,False,False,False,False,False,True,True,True,...,False,False,False,False,False,True,False,False,False,False
4,249050,False,True,True,True,False,False,True,False,True,...,False,False,True,False,False,False,False,False,False,False
5,250180,False,False,False,True,False,False,False,True,True,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50783,632470,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,False,False,False,False,False
50786,1599660,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,False,False,False,False,False
50787,250900,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
50788,920210,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,False,False,False,False,False


In [15]:
# Save the game features; index=False keeps the row labels out of the CSV.
games_features.to_csv("../preprocessed_data/games_features.csv",index = False)

Users features: user_id, products bucket, average price bucket

In [17]:
# First entry names the key column; the rest are half-open [low, high)
# buckets over the 'products' column.
products_intervals = [
    'user_id',
    (3.0, 197.0),
    (197.0, 324.0),
    (324.0, 513.0),
    (513.0, 936.0),
    (936.0, float('inf')),
]

# Start from the key column so the frame shares users_data's index.
users_features = pd.DataFrame({'user_id': users_data['user_id']})
product_counts = users_data['products']
# Skip the leading 'user_id' label and add one boolean column per bucket.
for low, high in products_intervals[1:]:
    bucket_name = str(low) + '-' + str(high)
    users_features[bucket_name] = product_counts.apply(lambda count: low <= count < high)



In [18]:
# Attach each game's catalogue row to its reviews; the left join keeps every review.
merged_data = rec_data.merge(games_data, how='left', on='app_id')
# Mean final price over the games each user reviewed.
average_price_per_user = (
    merged_data
    .groupby('user_id')['price_final']
    .mean()
    .reset_index()
)
# Left join from users_data so users with no reviews keep a row (NaN average).
users_price = (
    users_data
    .merge(average_price_per_user, how='left', on='user_id')
    .rename(columns={"price_final": "price_average"})
)

In [19]:
# Look the average price up by user_id rather than by row label: users_price
# comes out of a merge with a fresh 0..n-1 index, while users_features keeps
# the index of users_data, so the two only line up while users_data is unfiltered.
price_by_user = users_price.drop_duplicates('user_id').set_index('user_id')['price_average']
user_average_price = users_features['user_id'].map(price_by_user)
for low, high in price_intervals:
    # A NaN average (user with no priced reviews) compares False and lands in no bucket.
    users_features[str(low) + '-' + str(high)] = (user_average_price >= low) & (user_average_price < high)

In [20]:
# Show the assembled user feature table: user_id, product buckets, average-price buckets.
users_features

Unnamed: 0,user_id,3.0-197.0,197.0-324.0,324.0-513.0,513.0-936.0,936.0-inf,0.0-0.99,0.99-3.99,3.99-6.99,6.99-9.99,9.99-12.814,12.814-14.99,14.99-17.34,17.34-19.99,19.99-23.26,23.26-inf
0,11316351,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False
1,4363012,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False
2,4893896,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
3,6366584,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False
4,11140739,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57968,2019266,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False
57969,2077327,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
57970,2820915,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False
57971,3954523,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False


In [21]:
# Save the user features; index=False keeps the row labels out of the CSV.
users_features.to_csv("../preprocessed_data/users_features.csv", index = False)