In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Seed constant; presumably meant for the notebook's random sampling -- confirm where it is applied.
SEED = 2302

# Dataset root and the sub-directory holding preprocessed artifacts (relative to the notebook's working directory).
DATA_DIR = Path("../data/steam")
PREPRO_WORK_DIR = DATA_DIR.joinpath("preprocessed")

# Pickled pivot DataFrame read by this notebook; the train/test pickles are written next to it.
PIVOT_USERS_GAMES_RECSCORE_DF_PATH = PREPRO_WORK_DIR.joinpath("pivot_users_games_recscore_df.pkl")

In [3]:
# Load the pickled pivot DataFrame stored at PIVOT_USERS_GAMES_RECSCORE_DF_PATH.
# NOTE: unpickling can execute arbitrary code -- only load pickles produced by this project.
df = pd.read_pickle(PIVOT_USERS_GAMES_RECSCORE_DF_PATH)
# Bare last expression: renders the frame (per the saved output, rows are `steam_id` and columns are `item_id`).
df

item_id,10,100,10000,1002,100400,100410,10080,10090,100970,10100,...,9970,99700,9980,99810,99830,99890,9990,99900,99910,99920
steam_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76561197960304530,165023.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,161700.0,...,-1.0,161700.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197960493731,40514.0,-1.0,40500.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,40523.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197960540939,110300.0,110300.0,110300.0,-1.0,-1.0,-1.0,-1.0,110300.0,-1.0,110300.0,...,-1.0,110300.0,-1.0,110300.0,110300.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197960889906,59036.0,59000.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,59019.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197961040696,41206.0,41200.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76561198280059944,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561198295803313,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1101.0,-1.0,-1.0
76561198297906261,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561198299095634,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [4]:
import numpy as np
from tqdm.notebook import tqdm
import warnings

# Ignore every FutureWarning from this point on (process-wide filter, not limited to one call).
# NOTE(review): this also hides deprecation notices from pandas/numpy in later cells; consider
# scoping it with `warnings.catch_warnings()` around the specific call that warns.
warnings.simplefilter(action='ignore', category=FutureWarning)


def zero_n_random_game_scores(
    df: pd.DataFrame,
    frac: float = 0.1,
    rng: "np.random.Generator | None" = None,
) -> pd.DataFrame:
    """Return a copy of ``df`` with a random share of each user's owned games hidden.

    A game counts as "owned" by a user when its score in that user's row is
    ``>= 0``. For every row, ``int(n_owned * frac)`` owned games are drawn
    without replacement and their score is overwritten with ``-1`` (despite the
    function's name, the hidden marker is ``-1``, not ``0``). Users for whom
    that count is zero (no owned games, or too few for ``frac``) are left
    untouched.

    Args:
        df: Score matrix whose index holds the users' ``steam_id`` and whose
            columns are game ids. It is not modified.
        frac: Fraction of each user's owned games to hide; the per-user count
            is truncated towards zero.
        rng: Optional ``numpy.random.Generator`` used for sampling. When
            omitted, NumPy's global RNG (``np.random``) is used, so
            ``np.random.seed`` still controls the outcome.

    Returns:
        A new DataFrame with the same shape and labels as ``df``.
    """
    new_df = df.copy()
    # Both `np.random` and a Generator expose `.choice(a, size=..., replace=...)`.
    sampler = np.random if rng is None else rng
    # df.index must be a user's `steam_id`
    for user_steam_id in tqdm(new_df.index.to_numpy()):
        owned_games = new_df.loc[user_steam_id, :] >= 0
        owned_games_ids = owned_games.index[owned_games.to_numpy()].to_numpy()
        # Decide on the *count* of owned games, never on the truthiness of the ids
        # themselves (an id of 0 is a valid game).
        n_games = int(owned_games_ids.shape[0] * frac)
        if n_games == 0:
            continue
        hidden_games_ids = sampler.choice(owned_games_ids, size=n_games, replace=False)
        new_df.loc[user_steam_id, hidden_games_ids] = -1
    return new_df

In [5]:
# Fraction of each user's owned games hidden (set to -1) in `test_df`.
TEST_REMOVAL_FRAC = 0.1

# Seed NumPy's global RNG so a fresh Restart & Run All reproduces the same split.
np.random.seed(SEED)

# `train_df` keeps every score; `test_df` is the same matrix with a share of each user's owned games hidden.
# NOTE(review): `train_df` therefore still contains the scores hidden in `test_df` -- confirm this
# orientation is what the downstream evaluation expects.
# `zero_n_random_game_scores` copies its input, so `df` itself is not modified.
train_df = df.copy()
test_df = zero_n_random_game_scores(df, frac=TEST_REMOVAL_FRAC)
train_df.shape, test_df.shape

  0%|          | 0/2957 [00:00<?, ?it/s]

((2957, 9750), (2957, 9750))

In [10]:
# Check the hold-out for every user rather than one random sample: a sampled user with
# fewer than 1 / TEST_REMOVAL_FRAC owned games loses nothing, so a rounded-ratio check on a
# random user fails spuriously. "Owned" uses the same `>= 0` rule as `zero_n_random_game_scores`.
owned_games_in_train_set = (train_df >= 0).sum(axis=1)
owned_games_in_test_set = (test_df >= 0).sum(axis=1)
# Mirrors the per-user count used when hiding games: truncate(n_owned * frac).
expected_removed_games = (owned_games_in_train_set * TEST_REMOVAL_FRAC).astype(int)

assert ((owned_games_in_train_set - owned_games_in_test_set) == expected_removed_games).all(), (
    "every user must lose exactly int(n_owned_games * TEST_REMOVAL_FRAC) games in test_df"
)

In [None]:
# Write next to the source pickle as "<stem>_train.pkl". Only the file name is rewritten, so a
# ".pkl" appearing elsewhere in the path cannot be altered (unlike a whole-string replace).
train_df.to_pickle(
    PIVOT_USERS_GAMES_RECSCORE_DF_PATH.with_name(f"{PIVOT_USERS_GAMES_RECSCORE_DF_PATH.stem}_train.pkl")
)
train_df

In [None]:
# Write next to the source pickle as "<stem>_test.pkl". Only the file name is rewritten, so a
# ".pkl" appearing elsewhere in the path cannot be altered (unlike a whole-string replace).
test_df.to_pickle(
    PIVOT_USERS_GAMES_RECSCORE_DF_PATH.with_name(f"{PIVOT_USERS_GAMES_RECSCORE_DF_PATH.stem}_test.pkl")
)
test_df