In [1]:
import pandas as pd
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:

DATA_DIR = Path("../data/steam")
PREPRO_WORK_DIR = DATA_DIR / "preprocessed"

PIVOT_USERS_GAMES_RECSCORE_DF_PATH = PREPRO_WORK_DIR / "pivot_users_games_recscore_df.pkl"
GAMES_DATA_PATH = DATA_DIR / "steam_games.df.pkl"

In [3]:
pivot_users_games_df = pd.read_pickle(PIVOT_USERS_GAMES_RECSCORE_DF_PATH)
pivot_users_games_df

item_id,10,100,10000,1002,100400,100410,10080,10090,100970,10100,...,9970,99700,9980,99810,99830,99890,9990,99900,99910,99920
steam_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76561197960304530,165023.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,161700.0,...,-1.0,161700.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197960493731,40514.0,-1.0,40500.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,40523.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197960540939,110300.0,110300.0,110300.0,-1.0,-1.0,-1.0,-1.0,110300.0,-1.0,110300.0,...,-1.0,110300.0,-1.0,110300.0,110300.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197960889906,59036.0,59000.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,59019.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197961040696,41206.0,41200.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76561198280059944,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561198295803313,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1101.0,-1.0,-1.0
76561198297906261,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561198299095634,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [4]:
unknown_user_per_game_scores = -np.ones_like(pivot_users_games_df.values[0])
unknown_user_per_game_scores

array([-1., -1., -1., ..., -1., -1., -1.])

In [5]:
pivot_users_games_df.loc["unknown_user"] = unknown_user_per_game_scores
pivot_users_games_df

item_id,10,100,10000,1002,100400,100410,10080,10090,100970,10100,...,9970,99700,9980,99810,99830,99890,9990,99900,99910,99920
steam_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76561197960304530,165023.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,161700.0,...,-1.0,161700.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197960493731,40514.0,-1.0,40500.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,40523.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197960540939,110300.0,110300.0,110300.0,-1.0,-1.0,-1.0,-1.0,110300.0,-1.0,110300.0,...,-1.0,110300.0,-1.0,110300.0,110300.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197960889906,59036.0,59000.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,59019.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561197961040696,41206.0,41200.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76561198295803313,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1101.0,-1.0,-1.0
76561198297906261,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561198299095634,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
76561198308665434,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [6]:
# convert dataframe to scipy sparse matrix
mat_users_games = csr_matrix(pivot_users_games_df.values)
mat_users_games

<2958x9750 sparse matrix of type '<class 'numpy.float64'>'
	with 28840500 stored elements in Compressed Sparse Row format>

In [7]:
def find_n_neighbours(target_user_index, mat_users_games, n_neighbors):
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors, n_jobs=-1)
    model_knn.fit(mat_users_games)

    distances, indices = model_knn.kneighbors(
        mat_users_games[target_user_index],
        n_neighbors=n_neighbors+1)

    raw_recommends = \
        sorted(
            list(
                zip(
                    indices.squeeze().tolist(),
                    distances.squeeze().tolist()
                )
            ),
            key=lambda x: x[1]
        )[:0:-1]

    sorted_indices = [x[0] for x in raw_recommends]

    return raw_recommends, sorted_indices


def get_user_steam_id(user_index, pivot_users_games_df):
    return pivot_users_games_df.iloc[user_index].name

def get_users_steam_ids(user_indices, pivot_users_games_df):
    return pivot_users_games_df.iloc[user_indices].index.to_numpy()

In [8]:
raw_recommends, nneigbours_indices = find_n_neighbours(0, mat_users_games, 10)
print(f"{raw_recommends=}\n{nneigbours_indices=}")

raw_recommends=[(1217, 0.513711863970491), (16, 0.5118186483272142), (298, 0.5092433945689179), (2836, 0.5011008343752633), (1157, 0.49836643896378785), (70, 0.4887692173905904), (489, 0.48483456969201755), (236, 0.47415916395174174), (89, 0.467306774799659), (37, 0.4300162169348989)]
nneigbours_indices=[1217, 16, 298, 2836, 1157, 70, 489, 236, 89, 37]


In [9]:
neigbours_steam_ids = get_users_steam_ids(nneigbours_indices, pivot_users_games_df)
neigbours_steam_ids

array(['76561198058715603', '76561197966364980', '76561198018217670',
       '76561198130905342', '76561198057106658', '76561197981203305',
       '76561198034182115', '76561198010776633', '76561197990408423',
       '76561197971666535'], dtype=object)

In [10]:
user_per_game_scores = pivot_users_games_df.iloc[1217].to_numpy()
user_per_game_scores

array([-1., -1., -1., ..., -1., -1., -1.])

In [11]:
import numpy as np

def find_recomendation_indices(user_per_game_scores, n=20):
    arr = np.array(user_per_game_scores)
    indices = np.argpartition(arr, -n)[-n:]
    largest_numbers = arr[indices]
    rec_game_score_pairs = [(num, idx) for num, idx in zip(largest_numbers, indices)]
    rec_game_score_pairs.sort(key=lambda x: x[0], reverse=True)
    return rec_game_score_pairs

games_recomendation = find_recomendation_indices(user_per_game_scores, 10)
games_recomendation

[(104276.0, 9478),
 (91422.0, 694),
 (88319.0, 9540),
 (85991.0, 464),
 (84999.0, 54),
 (84486.0, 5975),
 (83830.0, 764),
 (82792.0, 8843),
 (82748.0, 1257),
 (82583.0, 1838)]

In [12]:
pivot_users_games_df.columns[0]

'10'

In [13]:
games_df = pd.read_pickle(GAMES_DATA_PATH)
games_df.head(3)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,Mostly Positive,


In [14]:
def get_game_names_by_item_ids(game_item_ids, games_df):
    return games_df[games_df["id"].isin(game_item_ids)]["app_name"].to_numpy()

def get_game_item_id_by_index(game_index, pivot_users_games_df):
    return pivot_users_games_df.columns[game_index]

game_item_id = get_game_item_id_by_index(0, pivot_users_games_df)
game_names = get_game_names_by_item_ids([game_item_id], games_df)

game_item_id, game_names

('10', array(['Counter-Strike'], dtype=object))

In [15]:
def recommend_games(target_user_steam_id, n_neighbors, n_games):
    """Collaborative filtering"""

    # TODO  add a check to see if the user already has a recommended game

    global pivot_users_games_df, games_df

    if target_user_steam_id not in pivot_users_games_df.index:
        unknown_user_per_game_scores = -np.ones_like(pivot_users_games_df.values[0])
        pivot_users_games_df.loc["unknown_user"] = unknown_user_per_game_scores
        target_user_index = pivot_users_games_df.index.get_loc("unknown_user")
    else:
        target_user_index = pivot_users_games_df.index.get_loc(target_user_steam_id)

    mat_users_games = csr_matrix(pivot_users_games_df.values)

    _, neighbours_indices = find_n_neighbours(target_user_index, mat_users_games, n_neighbors)

    all_neighbours_best_recomendations = []
    for n in range(n_neighbors):
        neighbor_game_list = pivot_users_games_df.iloc[neighbours_indices[n]].to_list()
        neighbor_best_recomendations = find_recomendation_indices(neighbor_game_list, n=n_neighbors)
        all_neighbours_best_recomendations.extend(neighbor_best_recomendations)

    # sort by score, top games first, descending
    all_neighbours_best_recomendations = sorted(all_neighbours_best_recomendations, key=lambda x: x[0], reverse=True)

    recommended_games_item_ids = [
        get_game_item_id_by_index(game_index, pivot_users_games_df)
        for _, game_index in all_neighbours_best_recomendations[:n_games+1]
    ]
    recommend_games_names = get_game_names_by_item_ids(recommended_games_item_ids, games_df)
    return recommend_games_names

recommended_games_names = recommend_games("76561197960304530", n_neighbors=50, n_games=10)
print(
    f"{recommended_games_names.tolist()=}\n"
    f"{recommended_games_names.shape=}\n"
    f"{np.unique(recommended_games_names).shape=}"
)

recommended_games_names.tolist()=['Fallout: New Vegas', "Deus Ex: Human Revolution - Director's Cut", 'Far Cry® 4', 'Street Fighter V', 'Rise of the Tomb Raider™', 'Steel Rain', 'Fallout 4', 'Grand Theft Auto V', 'Borderlands: The Pre-Sequel', 'Wolfenstein: The New Order']
recommended_games_names.shape=(10,)
np.unique(recommended_games_names).shape=(10,)


In [16]:
user_index = np.random.randint(0, 2957)
target_user_steam_id = get_user_steam_id(user_index, pivot_users_games_df)
recommended_games_names = recommend_games(target_user_steam_id, n_neighbors=50, n_games=10)
print(
    f" Recommended games for {target_user_steam_id=} ".center(50, "=") + "\n\n"
    f"{recommended_games_names.tolist()=}\n"
    f"{recommended_games_names.shape=}\n"
    f"{np.unique(recommended_games_names).shape=}"
)

assert np.unique(recommended_games_names).shape == recommended_games_names.shape

 Recommended games for target_user_steam_id='76561198070879876' 

recommended_games_names.tolist()=["Garry's Mod", 'Counter-Strike: Global Offensive', 'Arma 3', 'Warframe']
recommended_games_names.shape=(4,)
np.unique(recommended_games_names).shape=(4,)


In [17]:
user_index = 2243
target_user_steam_id = get_user_steam_id(user_index, pivot_users_games_df)
recommended_games_names = recommend_games(target_user_steam_id, n_neighbors=100, n_games=10)
print(
    f" Recommended games for {target_user_steam_id=} ".center(50, "=") + "\n\n"
    f"{recommended_games_names.tolist()=}\n"
    f"{recommended_games_names.shape=}\n"
    f"unique_games={pd.Series.nunique(pd.Series(recommended_games_names))}"
)

 Recommended games for target_user_steam_id='76561198083522714' 

recommended_games_names.tolist()=["Garry's Mod", 'Counter-Strike: Global Offensive', 'Elsword']
recommended_games_names.shape=(3,)
unique_games=3


In [18]:
user_index = 2243
target_user_steam_id = get_user_steam_id(user_index, pivot_users_games_df)
recommended_games_names = recommend_games(target_user_steam_id, n_neighbors=5, n_games=15)
print(
    f" Recommended games for {target_user_steam_id=} ".center(50, "=") + "\n\n"
    f"{recommended_games_names.tolist()=}\n"
    f"{recommended_games_names.shape=}\n"
    f"unique_games={pd.Series.nunique(pd.Series(recommended_games_names))}"
)

 Recommended games for target_user_steam_id='76561198083522714' 

recommended_games_names.tolist()=["Garry's Mod", 'The Elder Scrolls IV: Oblivion® Game of the Year Edition', 'Spiral Knights', 'Counter-Strike: Global Offensive', 'MachineCraft', 'Robocraft', 'No More Room in Hell', 'PAYDAY 2', 'Left 4 Dead 2']
recommended_games_names.shape=(9,)
unique_games=9


In [19]:
target_user_steam_id = "daria123"
recommended_games_names = recommend_games(target_user_steam_id, n_neighbors=5, n_games=15)
print(
    f" Recommended games for {target_user_steam_id=} ".center(50, "=") + "\n\n"
    f"{recommended_games_names.tolist()=}\n"
    f"{recommended_games_names.shape=}\n"
    f"unique_games={pd.Series.nunique(pd.Series(recommended_games_names))}"
)

 Recommended games for target_user_steam_id='daria123' 

recommended_games_names.tolist()=["Garry's Mod", 'Counter-Strike: Global Offensive', 'Clicker Heroes', 'Heroes & Generals', 'Unturned', "Five Nights at Freddy's 3", "Five Nights at Freddy's", 'Goat Simulator', 'Scribblenauts Unlimited', 'Terraria', 'Trackmania United Forever Star Edition', 'Half-Life: Opposing Force']
recommended_games_names.shape=(12,)
unique_games=12
