In [136]:

%pip install pandas numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Load libraries and datasets

In [137]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [138]:
players = pd.read_csv('./datasets/players.csv')
purchased_games = pd.read_csv("./datasets/purchased_games.csv")
games = pd.read_csv("./datasets/games.csv")
history = pd.read_csv("./datasets/history.csv")
prices = pd.read_csv("./datasets/prices.csv")

In [None]:
# print(players.isnull().sum())
# print(games.isnull().sum())
# print(purchased_games.isnull().sum())
# print(history.isnull().sum())
# print(prices.isnull().sum())

playerid         0
country     177868
created      47669
dtype: int64
gameid                    0
title                     3
developers             5559
publishers             5941
genres                 5549
supported_languages    5506
release_date              0
dtype: int64
playerid        0
library     55607
dtype: int64
playerid         0
achievementid    0
date_acquired    0
dtype: int64
gameid                 0
usd               907148
eur              1703086
gbp               908184
jpy               919973
rub              1006291
date_acquired          0
dtype: int64


# Clean data

## Remove duplicates

In [140]:
players = players.drop_duplicates()
purchased_games = purchased_games.drop_duplicates()
games = games.drop_duplicates()
history = history.drop_duplicates()
prices = prices.drop_duplicates()

## Standardize date format

In [141]:
players['created'] = pd.to_datetime(players['created'])
games['release_date'] = pd.to_datetime(games['release_date'])
history['date_acquired'] = pd.to_datetime(history['date_acquired'])
prices['date_acquired'] = pd.to_datetime(prices['date_acquired'])

# Handle missing data

## players.csv

In [142]:
players.fillna({'country': 'Unknown'}, inplace=True)

players.fillna({'created': players['created'].median()}, inplace=True)

print(players.isnull().sum())

playerid    0
country     0
created     0
dtype: int64


## games.csv

In [143]:
games.dropna(subset=['title'], inplace=True)

games.fillna({'developers': 'Unknown'}, inplace=True)
games.fillna({'publishers': 'Unknown'}, inplace=True)
games.fillna({'genres': 'Unknown'}, inplace=True)

games.fillna({'supported_languages': 'English'}, inplace=True)

print(games.isnull().sum())

gameid                 0
title                  0
developers             0
publishers             0
genres                 0
supported_languages    0
release_date           0
dtype: int64


## purchased_games.csv

In [144]:
purchased_games.fillna({'library': '[]'}, inplace=True)

print(purchased_games.isnull().sum())

playerid    0
library     0
dtype: int64


## history.csv (no null values)

## prices.csv

In [145]:
latest_prices = prices.sort_values("date_acquired").groupby("gameid").last().reset_index()

In [146]:
exchange_rates = {
    "usd": 1.0,
    "eur": 0.93,
    "gbp": 0.77,
    "jpy": 150.75,
    "rub": 84.51
}

def convert_to_usd(row):
    prices_in_usd = [row[currency] / exchange_rates[currency] for currency in exchange_rates if not pd.isna(row[currency])]
    return sum(prices_in_usd) / len(prices_in_usd) if prices_in_usd else None

latest_prices = prices.sort_values("date_acquired").groupby("gameid").last().reset_index()

# Create a new column with the converted USD price
latest_prices["price_usd"] = latest_prices.apply(convert_to_usd, axis=1)

# Drop old currency columns
latest_prices = latest_prices.drop(columns=["usd", "eur", "gbp", "jpy", "rub"])

In [147]:
# replace gameid with NaN price with 0 (free game or not released yet)
latest_prices.fillna({'price_usd': 0}, inplace=True)
latest_prices.head

<bound method NDFrame.head of         gameid date_acquired  price_usd
0           10    2025-02-24   7.579735
1           20    2025-02-24   4.428207
2           30    2025-02-24   4.428207
3           40    2025-02-24   4.428207
4           50    2025-02-24   4.428207
...        ...           ...        ...
98460  3437800    2025-02-24   0.000000
98461  3437960    2025-02-24   0.000000
98462  3441030    2025-02-24   0.000000
98463  3441170    2025-02-24   0.000000
98464  3441890    2025-02-24   0.000000

[98465 rows x 3 columns]>

In [None]:
# latest_prices.head()

Unnamed: 0,gameid,date_acquired,price_usd
0,10,2025-02-24,7.579735
1,20,2025-02-24,4.428207
2,30,2025-02-24,4.428207
3,40,2025-02-24,4.428207
4,50,2025-02-24,4.428207


In [None]:
# get released_date from games has id=266410
# gameid = 340
# games[games['gameid'] == gameid]
# latest_prices[latest_prices['gameid'] == gameid]

Unnamed: 0,gameid,date_acquired,price_usd
15,340,2025-02-24,0.0


# Feature engineering

## Create User-Game Interaction matrix

In [None]:
purchased_games["library"] = purchased_games["library"].apply(eval)

In [176]:
purchased_games

Unnamed: 0,playerid,library
0,76561198060698936,"[60, 1670, 3830, 1600, 2900, 2910, 2920, 4800,..."
1,76561198287452552,"[10, 80, 100, 240, 2990, 6880, 6910, 6920, 698..."
2,76561198040436563,"[10, 80, 100, 300, 20, 30, 40, 50, 60, 70, 130..."
3,76561198042412488,"[300, 240, 220, 320, 360, 4300, 4800, 4000, 61..."
4,76561198119605821,"[47870, 108600, 550, 271590, 331470, 381210, 2..."
...,...,...
102543,76561199063275634,[]
102544,76561198003275888,"[3920, 2600, 6980, 4540, 4550, 7830, 22330, 22..."
102545,76561198944668572,[]
102546,76561198033563710,[]


In [163]:
games

Unnamed: 0,gameid,title,developers,publishers,genres,supported_languages,release_date
0,3281560,Horror Game To Play With Friends! Playtest,Unknown,Unknown,Unknown,English,2024-10-21
1,3280930,Eternals' Path Playtest,Unknown,Unknown,Unknown,English,2024-10-17
2,3280770,ANGST: A TALE OF SURVIVAL - Singleplayer Playtest,Unknown,Unknown,Unknown,English,2024-10-13
3,3279790,Montabi Playtest,Unknown,Unknown,Unknown,English,2024-10-13
4,3278320,파이팅걸 유리 Playtest,Unknown,Unknown,Unknown,English,2024-10-12
...,...,...,...,...,...,...,...
98243,1499520,Spiritwish,['NEONSTUDIO Corp.'],['SUBETE'],"['Adventure', 'Casual', 'Free To Play', 'Massi...",['Japanese'],2021-03-31
98244,1499540,リアルタイムバトル将棋オンライン,['株式会社シルバースタージャパン'],['株式会社シルバースタージャパン'],"['Action', 'Free To Play', 'Simulation', 'Stra...",['Japanese'],2021-11-02
98245,1499550,VR Luxury Life (Be a Billionaire),['William at Oxford'],['William at Oxford'],"['Casual', 'Simulation']","['English', 'French', 'Italian', 'German', 'Sp...",2020-12-29
98246,1498590,Fat Prisoner Simulator 3,['Kiddy'],['Kiddy'],"['Indie', 'Simulation', 'Sports']",['English'],2021-01-08


In [165]:
history

Unnamed: 0,playerid,achievementid,date_acquired
0,76561198220441373,403640_ACH_1,2019-12-18 15:33:43
1,76561198220441373,403640_ACH_2,2019-12-18 23:49:51
2,76561198220441373,403640_ACH_3,2019-12-19 23:05:07
3,76561198220441373,403640_ACH_4,2019-12-24 05:50:49
4,76561198220441373,403640_ACH_5,2023-04-19 22:39:36
...,...,...,...
10693874,76561198985765745,2567870_THETEMPLE_ACHIEVEMENT,2024-06-29 01:33:43
10693875,76561198985765745,2567870_THEASIANSHRINE_ACHIEVEMENT,2024-06-29 01:47:50
10693876,76561198985765745,2567870_THEDEITIES_ACHIEVEMENT,2024-06-29 02:07:34
10693877,76561198985765745,2567870_THEGARDEN_ACHIEVEMENT,2024-06-29 02:27:05


In [166]:
latest_prices

Unnamed: 0,gameid,date_acquired,price_usd
0,10,2025-02-24,7.579735
1,20,2025-02-24,4.428207
2,30,2025-02-24,4.428207
3,40,2025-02-24,4.428207
4,50,2025-02-24,4.428207
...,...,...,...
98460,3437800,2025-02-24,0.000000
98461,3437960,2025-02-24,0.000000
98462,3441030,2025-02-24,0.000000
98463,3441170,2025-02-24,0.000000


In [None]:
# purchased_games[purchased_games['playerid'] == 76561198287996067]['library']

Series([], Name: library, dtype: object)

In [168]:
achievements = pd.read_csv("./datasets/achievements.csv")

In [169]:
achievements

Unnamed: 0,achievementid,gameid,title,description
0,2621440_ACH_FIRST_KILL,2621440,FIRST KILL,You should kill ONE enemy.
1,2621440_ACH_0_LEVEL_COMPLETED,2621440,TUTORIAL COMPLETED,You should complete tutorial.
2,2621440_ACH_1_LEVEL_COMPLETED,2621440,FIRST LEVEL,You should complete first level
3,2621440_ACH_2_LEVEL_COMPLETED,2621440,SECOND LEVEL,You should complete second level
4,2621440_ACH_3_LEVEL_COMPLETED,2621440,THIRD LEVEL,You should complete third level
...,...,...,...,...
1939022,3399670_CHECKMATE,3399670,Checkmate!,A quick win.
1939023,3399670_SUNK,3399670,They have sunk into oblivion.,Throw 100 dummies into the abyss.
1939024,3399670_SHATTERED,3399670,Shattered!,Break all the cans
1939025,3399670_SMILE,3399670,Smile!,You are being filmed by a hidden camera.


In [170]:
friends = pd.read_csv("./datasets/friends.csv")

In [173]:
friends

Unnamed: 0,playerid,friends
0,76561198060422271,"['76561198018120276', '76561198034545417', '76..."
1,76561198113439786,"['76561198047435192', '76561198059136488', '76..."
2,76561198149851326,"['76561197991555589', '76561198003513187', '76..."
3,76561198296997371,
4,76561198895573082,"['76561197960300358', '76561197961330830', '76..."
...,...,...
424678,76561198136182808,
424679,76561198088853055,"['76561197960399877', '76561197960483676', '76..."
424680,76561197992827217,"['76561197964833893', '76561197968477959', '76..."
424681,76561198072957822,


In [177]:
# convert friends column to list of playerid, NaN to empty list
friends["friends"] = friends["friends"].apply(lambda x: eval(x) if not pd.isna(x) else [])


In [178]:
friends

Unnamed: 0,playerid,friends
0,76561198060422271,"[76561198018120276, 76561198034545417, 7656119..."
1,76561198113439786,"[76561198047435192, 76561198059136488, 7656119..."
2,76561198149851326,"[76561197991555589, 76561198003513187, 7656119..."
3,76561198296997371,[]
4,76561198895573082,"[76561197960300358, 76561197961330830, 7656119..."
...,...,...
424678,76561198136182808,[]
424679,76561198088853055,"[76561197960399877, 76561197960483676, 7656119..."
424680,76561197992827217,"[76561197964833893, 76561197968477959, 7656119..."
424681,76561198072957822,[]


In [None]:
# write dataset back to csv
import os
if not os.path.exists('./clean_datasets'):
    os.makedirs('./clean_datasets')

players.to_csv('./clean_datasets/players.csv', index=False)
games.to_csv('./clean_datasets/games.csv', index=False)
purchased_games.to_csv('./clean_datasets/purchased_games.csv', index=False)
history.to_csv('./clean_datasets/history.csv', index=False)
latest_prices.to_csv('./clean_datasets/prices.csv', index=False)
achievements.to_csv('./clean_datasets/achievements.csv', index=False)
friends.to_csv('./clean_datasets/friends.csv', index=False)

In [None]:
import re
def clean_library_string(library_str):
    library_str = library_str.strip()  # Xóa khoảng trắng hai bên
    if not library_str.startswith("["):
        library_str = "[" + library_str
    # Nếu thiếu dấu ']', thêm vào cuối
    if library_str.count("[") > library_str.count("]"):
        library_str += "]"
    
    # Nếu có dấu phẩy cuối cùng bị thừa (trường hợp "[1,2,3,")
    library_str = re.sub(r",\s*\]", "]", library_str)
    
    try:
        # Chuyển chuỗi thành danh sách số nguyên
        game_list = eval(library_str)
        if isinstance(game_list, list):
            return [int(game) for game in game_list if isinstance(game, (int, float))]  # Đảm bảo tất cả phần tử là số
        else:
            return []  # Nếu không phải list thì trả về danh sách rỗng
    except Exception as e:
        print(f"Lỗi khi xử lý chuỗi {library_str}: {e}")
        return []  # Trả về danh sách rỗng nếu có lỗi

# Dùng thử với một số dữ liệu lỗi
test_cases = [
    "[1, 2, 3]",    
    "[4, 5, 6,",     
    "7, 8, 9]",     
    "[10, 11, 12",   
    "abc",           
    "[13, 'x', 14]"
]

for test in test_cases:
    print(f"Input: {test} → Output: {clean_library_string(test)}")

In [None]:
purchased_game_cleaned = pd.read_csv("./clean_datasets/purchased_games.csv")  # (playerid, library)
purchased_game_cleaned ["library"] = purchased_game_cleaned["library"].apply(clean_library_string)
purchased_game_cleaned.to_csv('./clean_datasets/purchased_games.csv', index=False)