In [6]:
# Comment for Mumu
# This block reads the user-item interaction data (stored as CSV).
# Attention: at this point the IDs are still in raw format, i.e. they are still Steam-format IDs.
# Our goal is to map every User/Item/Bundle ID into a compact sequential range.
# For example, a table like
# | UserId | ItemId |
# | 1121241414124 | 10 |
# | 1121241414124 | 20 |
# | 1121241414123 | 10 |
# | 1121241414122 | 30 |
# | 1121241414122 | 20 | 

# will become (illustrative only: the mapping cell further down enumerates the sorted raw IDs starting from 0)
# | UserId | ItemId |
# | 1 | 1 |
# | 1 | 2 |
# | 2 | 1 |
# | 3 | 3 |
# | 3 | 2 | 


import pandas as pd
import json
# Load the raw user-item interactions.
# The first CSV column is read as the index and immediately turned back into
# an ordinary column by reset_index(). Only the two ID columns are kept, both
# are cast to str, and exact duplicate (steam_id, item_id) pairs are removed.
# The cast is applied with .astype() on the selected frame instead of
# assigning columns into a slice, which avoids pandas' SettingWithCopyWarning.
aus_user_item = (
    pd.read_csv("aus_user_item.csv", index_col=[0])
    .reset_index()
    .loc[:, ["steam_id", "item_id"]]
    .astype({"steam_id": str, "item_id": str})
    .drop_duplicates()
)

In [7]:
# Flatten the bundle catalogue into one row per (bundle, item entry):
# explode the per-bundle "items" list, renumber the rows, then pull the
# "item_id" field out of every exploded entry.
raw_bundle_item = (
    pd.read_json("../data/bundle_data.json")
    .explode("items")
    .reset_index()
)
raw_bundle_item["item_id"] = [entry["item_id"] for entry in raw_bundle_item["items"]]
# Only the raw bundle ID and raw item ID are needed from here on.
raw_bundle_item = raw_bundle_item.loc[:, ["bundle_id", "item_id"]]

In [27]:
# Some rows carry several item IDs packed into one comma-separated string.
# Split each such row into one row per item ID.
has_multiple_ids = raw_bundle_item["item_id"].str.contains(",")
complex_item_data = raw_bundle_item[has_multiple_ids].copy()
print(len(complex_item_data))

# Vectorised split: turn the packed string into a list, then explode it.
# explode() repeats the source row's index label and bundle_id for every
# resulting item, which is what the previous row-by-row copy loop produced.
new_item_data_df = complex_item_data.assign(
    item_id=complex_item_data["item_id"].str.split(",")
).explode("item_id")

# Single-ID rows first, expanded rows appended after them (same order as
# before), then drop exact duplicate (bundle_id, item_id) pairs.
item_data_df = pd.concat(
    [raw_bundle_item[~has_multiple_ids], new_item_data_df]
).drop_duplicates()

14


In [163]:
# Build the bundle -> item table.
# Attention: at this point both bundle_id and item_id are still the raw IDs
# (e.g. item 10, 20, 30). They are replaced by sequence IDs in a later cell,
# using the same item mapping as the user-item table so that item IDs agree
# across both tables.
# .astype(str) on the selection returns a fresh frame, so no columns are
# assigned into a slice of item_data_df (avoids SettingWithCopyWarning).
bundle_item = item_data_df[["bundle_id", "item_id"]].astype(str)
item_data_df["item_id"] = item_data_df["item_id"].astype(str)


In [164]:

# Build the user-bundle interaction table.
# Attention! There is no direct user-bundle interaction data, so it is derived by merging
# the user-item and bundle-item tables on item_id.
# Question!
# When the same (user, item) pair matches several bundles, should only one user-bundle record be kept?
# E.G.
# user | item_id | bundle_id
# 76561197970982480 | 30 | 232
# 76561197970982480 | 30 | 235 # considered as duplicated
# 76561197970982480 | 300 | 232
# 76561197970982480 | 300 | 240 # considered as duplicated

# We can drop the two lines marked as duplicated.
# The remaining rows will look like
# user | item_id | bundle_id
# 76561197970982480 | 30 | 232
# 76561197970982480 | 300 | 232
# Derive user-bundle interactions by joining users to bundles through item_id.
user_bundle = (
    # Left merge keeps every user-item row; rows whose item is in no bundle
    # get NaN in bundle_id and are removed by dropna() below.
    pd.merge(aus_user_item, bundle_item, how="left", on=["item_id"])
    # Keep only the first bundle matched for each (user, item) pair.
    .drop_duplicates(["steam_id", "item_id"])
    .dropna()
    .loc[:, ["steam_id", "bundle_id"]]
    # Several items of one user can point to the same bundle.
    .drop_duplicates()
    .astype({"steam_id": str, "bundle_id": str})
)
# Strip a float-style ".0" suffix from bundle IDs (presumably left over from a
# float dtype - TODO confirm it can still occur). The pattern is anchored to
# the end of the string so a ".0" elsewhere in an ID is never removed.
user_bundle["bundle_id"] = user_bundle["bundle_id"].str.replace(r"\.0$", "", regex=True)


In [165]:
# Build the raw-ID -> sequence-ID lookup tables for users, bundles and items.
# Sequence IDs are the 0-based positions of the raw IDs in sorted order,
# stored as strings.

# Retained although this cell no longer uses it: other cells may still
# reference the name.
from collections import defaultdict


def _build_id_mapping(*id_columns):
    """Return (sorted_ids, mapping) for the union of IDs in the given columns.

    Parameters
    ----------
    *id_columns : pandas.Series
        One or more columns holding raw IDs.

    Returns
    -------
    tuple[list, dict]
        The sorted list of distinct raw IDs, and a plain dict mapping each raw
        ID to its position in that list, as a string.
    """
    distinct_ids = set()
    for column in id_columns:
        distinct_ids.update(column.unique())
    ordered_ids = sorted(distinct_ids)
    return ordered_ids, {raw_id: str(position) for position, raw_id in enumerate(ordered_ids)}


# Plain dicts are used on purpose: looking up an ID that was never indexed
# raises KeyError, whereas defaultdict(str) would silently return "" and also
# insert the unknown key into the mapping.
user_id_list, user_mapping = _build_id_mapping(aus_user_item.steam_id, user_bundle.steam_id)
bundle_id_list, bundle_mapping = _build_id_mapping(user_bundle.bundle_id, bundle_item.bundle_id)
item_id_list, item_mapping = _build_id_mapping(aus_user_item.item_id, bundle_item.item_id)

In [166]:
# Persist the three raw-ID -> sequence-ID mappings as JSON files so they can
# be reused later when calculating the score.
user_mapping_json = json.dumps(user_mapping)
bundle_mapping_json = json.dumps(bundle_mapping)
item_mapping_json = json.dumps(item_mapping)

for target_path, serialized in (
    ("user_mapping_json.json", user_mapping_json),
    ("bundle_mapping_json.json", bundle_mapping_json),
    ("item_mapping_json.json", item_mapping_json),
):
    with open(target_path, 'w') as handle:
        handle.write(serialized)

In [167]:
# Replace every raw ID column with its sequence ID, in place.
# Caution: this overwrites the raw IDs, so the cell must be run only once per
# kernel session - a second run would look up already-mapped IDs.
for frame, column, lookup in (
    (user_bundle, 'steam_id', user_mapping),
    (user_bundle, 'bundle_id', bundle_mapping),
    (bundle_item, 'bundle_id', bundle_mapping),
    (bundle_item, 'item_id', item_mapping),
    (aus_user_item, 'steam_id', user_mapping),
    (aus_user_item, 'item_id', item_mapping),
):
    # lookup.__getitem__ is the same subscript access as lookup[x].
    frame[column] = frame[column].map(lookup.__getitem__)

In [170]:
# Write the three ID-mapped interaction tables as headerless, index-free,
# tab-separated text files.
tsv_options = dict(sep='\t', header=False, index=False)
for table, destination in (
    (user_bundle, "user_bundle.txt"),
    (bundle_item, "bundle_item.txt"),
    (aus_user_item, "user_item.txt"),
):
    table.to_csv(destination, **tsv_options)

In [171]:
# Report how many distinct users, bundles and items were indexed (in that order).
for id_collection in (user_id_list, bundle_id_list, item_id_list):
    print(len(id_collection))

70912
615
12386


In [173]:
# Declared proportions of the user-bundle interactions for each split.
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Draw the training rows per user instead of over the whole table (a global
# user_bundle.sample(frac=train_ratio) is the alternative); per-user sampling
# is intended to keep every user represented in the training set.
# - frac now comes from train_ratio instead of a second hard-coded 0.75.
# - The columns are selected explicitly after groupby so that apply() keeps
#   steam_id in each group without relying on the deprecated behaviour of
#   operating on the grouping column (pandas >= 2.2 warns about it).
# - random_state=1 is reused for every group, so the split is reproducible.
user_bundle_train = (
    user_bundle
    .groupby('steam_id')[list(user_bundle.columns)]
    .apply(lambda user_rows: user_rows.sample(frac=train_ratio, random_state=1))
)
user_bundle_train.to_csv("user_bundle_train.txt", sep='\t', header=False, index=False)

In [174]:
# Rows not selected for training. Stacking the training rows on top of the
# full table and dropping every row that then occurs more than once
# (keep=False) leaves exactly the rows that are absent from the training set.
user_bundle_remain = pd.concat([user_bundle_train, user_bundle]).drop_duplicates(keep=False)

# Split the remainder into test and tuning sets. The sample is now seeded so
# these two files are reproducible, like the training split.
# NOTE(review): 60% of the remainder goes to the test set, which does not
# match the declared validation_ratio / test_ratio (0.15 / 0.10 would put 40%
# of the remainder in test) - confirm which proportion is intended.
user_bundle_test = user_bundle_remain.sample(frac=0.6, random_state=1)
# Same anti-join trick: whatever is left after removing the test rows.
user_bundle_tuning = pd.concat([user_bundle_remain, user_bundle_test]).drop_duplicates(keep=False)

user_bundle_test.to_csv("user_bundle_test.txt", sep='\t', header=False, index=False)
user_bundle_tuning.to_csv("user_bundle_tune.txt", sep='\t', header=False, index=False)

In [180]:
# Move the current row labels into a regular column and renumber the rows
# from 0. Not idempotent: every extra run adds another index column.
user_bundle = user_bundle.reset_index(drop=False)

Unnamed: 0,0,201
0,0,510
1,0,172
2,0,261
3,0,439
4,0,28
...,...,...
458608,9998,495
458609,9999,194
458610,9999,598
458611,9999,495
