In [None]:
# Import & Load
import pandas as pd

# Mount Google Drive if data is stored there
from google.colab import drive
drive.mount('/content/drive')

# Single place to edit when the datasets live somewhere else
# (previously this directory was repeated in every read_csv call).
DATA_DIR = "/content/drive/MyDrive/datasets"

# Load the five raw CSVs as-is; date columns are still unparsed at this point
# and are converted with pd.to_datetime in the later filtering cell.
players = pd.read_csv(f"{DATA_DIR}/players.csv")
games = pd.read_csv(f"{DATA_DIR}/games.csv")
purchased_games = pd.read_csv(f"{DATA_DIR}/purchased_games.csv")
reviews = pd.read_csv(f"{DATA_DIR}/reviews.csv")
prices = pd.read_csv(f"{DATA_DIR}/prices.csv")


Mounted at /content/drive


# Before and After Filtering

In [None]:
# Peek at the first rows of the raw prices table.
prices.head()

Unnamed: 0,gameid,usd,eur,gbp,jpy,rub,date_acquired
0,3281560,,,,,,2024-11-28
1,3280930,,,,,,2024-11-28
2,3280770,,,,,,2024-11-28
3,3279790,,,,,,2024-11-28
4,3278320,,,,,,2024-11-28


In [None]:
# Restrict every dated table to records on or after 1 Jan 2020.

# Cutoff shared by all of the filters below
cutoff_date = pd.to_datetime("2020-01-01")

# Parse the date columns in place; values that cannot be parsed become NaT
players['created'] = pd.to_datetime(players['created'], errors='coerce')
games['release_date'] = pd.to_datetime(games['release_date'], errors='coerce')
reviews['posted'] = pd.to_datetime(reviews['posted'], errors='coerce')
prices['date_acquired'] = pd.to_datetime(prices['date_acquired'], errors='coerce')

# Keep rows dated on/after the cutoff, as independent copies.
# NaT never satisfies the comparison, so rows whose date is missing or
# unparseable are dropped by these filters as well.
players_filtered = players.loc[players['created'].ge(cutoff_date)].copy()
games_filtered = games.loc[games['release_date'].ge(cutoff_date)].copy()
reviews_filtered = reviews.loc[reviews['posted'].ge(cutoff_date)].copy()
prices_filtered = prices.loc[prices['date_acquired'].ge(cutoff_date)].copy()

# purchased_games has no date column to filter on, so it is carried over whole.
purchased_games_filtered = purchased_games.copy()

In [None]:
# Report each table's row count before and after the cutoff filter
row_count_pairs = (
    ("Players", players, players_filtered),
    ("Games", games, games_filtered),
    ("Reviews", reviews, reviews_filtered),
    ("Prices", prices, prices_filtered),
)
for label, before, after in row_count_pairs:
    print(f"{label} Before:", len(before))
    print(f"{label} After:", len(after))

Players Before: 424683
Players After: 43994
Games Before: 98248
Games After: 67201
Reviews Before: 1204534
Reviews After: 749832
Prices Before: 4414273
Prices After: 4414273


In [None]:
# Peek at the first rows of the games table.
games.head()

Unnamed: 0,gameid,title,developers,publishers,genres,supported_languages,release_date
0,3281560,Horror Game To Play With Friends! Playtest,,,,,2024-10-21
1,3280930,Eternals' Path Playtest,,,,,2024-10-17
2,3280770,ANGST: A TALE OF SURVIVAL - Singleplayer Playtest,,,,,2024-10-13
3,3279790,Montabi Playtest,,,,,2024-10-13
4,3278320,파이팅걸 유리 Playtest,,,,,2024-10-12


# Stats

In [None]:
# Preview the first rows of every raw table.
# display() renders one rich table per frame; print() falls back to plain text,
# which wraps wide frames (e.g. games) across several hard-to-read chunks.
for table_name, table in (
    ("players", players),
    ("games", games),
    ("purchased_games", purchased_games),
    ("reviews", reviews),
    ("prices", prices),
):
    print(f"\n{table_name}:")
    display(table.head())

            playerid     country             created
0  76561198287452552      Brazil 2016-03-02 06:14:20
1  76561198040436563      Israel 2011-04-10 17:10:06
2  76561198049686270         NaN 2011-09-28 21:43:59
3  76561198155814250  Kazakhstan 2014-09-24 19:52:47
4  76561198119605821         NaN 2013-12-26 00:25:50
        gameid                              title             developers  \
98243  1499520                         Spiritwish   ['NEONSTUDIO Corp.']   
98244  1499540                   リアルタイムバトル将棋オンライン    ['株式会社シルバースタージャパン']   
98245  1499550  VR Luxury Life (Be a Billionaire)  ['William at Oxford']   
98246  1498590           Fat Prisoner Simulator 3              ['Kiddy']   
98247  1498600                          Neko Chan          ['Neko Game']   

                  publishers  \
98243             ['SUBETE']   
98244    ['株式会社シルバースタージャパン']   
98245  ['William at Oxford']   
98246              ['Kiddy']   
98247          ['Neko Game']   

                                

In [None]:
# General Statistics
# Function to print dataset stats
def dataset_stats(df, name, max_columns=10):
    """Print a compact structural overview of a DataFrame.

    Args:
        df: The DataFrame to describe.
        name: Label used in the heading ("<name> Dataset").
        max_columns: Maximum number of column names to list. When the frame
            has more columns than this, the list is cut off and followed
            by "...".

    Returns:
        None. The overview is written to stdout.
    """
    n_rows, n_cols = df.shape
    column_names = df.columns.tolist()

    print(f"\n{name} Dataset")
    print("-" * 40)
    print(f"Number of Features (columns): {n_cols}")
    print(f"Number of Rows: {n_rows}")
    if n_cols > max_columns:
        print("Column Names:", column_names[:max_columns], "...")
    else:
        # Omit the ellipsis argument entirely: passing "" to print() would
        # still emit the separator and leave a trailing space on the line.
        print("Column Names:", column_names)

# Print the same structural overview for every raw table
for frame, label in (
    (players, "Players"),
    (games, "Games"),
    (purchased_games, "Purchased Games"),
    (reviews, "Reviews"),
    (prices, "Prices"),
):
    dataset_stats(frame, label)


Players Dataset
----------------------------------------
Number of Features (columns): 3
Number of Rows: 424683
Column Names: ['playerid', 'country', 'created'] 

Games Dataset
----------------------------------------
Number of Features (columns): 7
Number of Rows: 98248
Column Names: ['gameid', 'title', 'developers', 'publishers', 'genres', 'supported_languages', 'release_date'] 

Purchased Games Dataset
----------------------------------------
Number of Features (columns): 2
Number of Rows: 102548
Column Names: ['playerid', 'library'] 

Reviews Dataset
----------------------------------------
Number of Features (columns): 8
Number of Rows: 1204534
Column Names: ['reviewid', 'playerid', 'gameid', 'review', 'helpful', 'funny', 'awards', 'posted'] 

Prices Dataset
----------------------------------------
Number of Features (columns): 7
Number of Rows: 4414273
Column Names: ['gameid', 'usd', 'eur', 'gbp', 'jpy', 'rub', 'date_acquired'] 


In [None]:
# Key Entities
# Headline cardinalities for each table.
# NOTE(review): `developers` values are stringified lists (e.g. "['Kiddy']"),
# so nunique() counts distinct lists rather than distinct studios; `genres` is
# presumably stored the same way -- confirm before quoting these figures.

# players.csv
n_players = players['playerid'].nunique()
n_countries = players['country'].nunique()
print(f"Unique Players (players.csv): {n_players}")
print(f"Unique Countries: {n_countries}")

# games.csv
n_games = games['gameid'].nunique()
n_genres = games['genres'].nunique()
n_developers = games['developers'].nunique()
print(f"Unique Games (games.csv): {n_games}")
print(f"Unique Genres: {n_genres}")
print(f"Unique Developers: {n_developers}")

# purchased_games.csv
print(f"Rows in Purchased Games: {len(purchased_games)}")
print(f"Unique Players (purchases): {purchased_games['playerid'].nunique()}")

# reviews.csv
print(f"Rows in Reviews: {len(reviews)}")
print(f"Unique Reviewers: {reviews['playerid'].nunique()}")
print(f"Unique Games Reviewed: {reviews['gameid'].nunique()}")

# prices.csv
print(f"Rows in Prices: {len(prices)}")
print(f"Unique Games in Prices: {prices['gameid'].nunique()}")

Unique Players (players.csv): 424683
Unique Countries: 249
Unique Games (games.csv): 98248
Unique Genres: 2721
Unique Developers: 57315
Rows in Purchased Games: 102548
Unique Players (purchases): 102548
Rows in Reviews: 1204534
Unique Reviewers: 196698
Unique Games Reviewed: 51910
Rows in Prices: 4414273
Unique Games in Prices: 98465


In [None]:
# Timeframe Analysis
# Make sure every date column is datetime (unparseable values become NaT)
for frame, date_col in (
    (players, 'created'),
    (games, 'release_date'),
    (reviews, 'posted'),
    (prices, 'date_acquired'),
):
    frame[date_col] = pd.to_datetime(frame[date_col], errors='coerce')

# Earliest and latest date found in each table
for label, dates in (
    ("Players created", players['created']),
    ("Games released", games['release_date']),
    ("Reviews posted", reviews['posted']),
    ("Prices acquired", prices['date_acquired']),
):
    print(f"{label}:", dates.min(), "to", dates.max())

# Reviews per year (to decide cutoff); adds a `year` column to `reviews`
reviews['year'] = reviews['posted'].dt.year
print(reviews['year'].value_counts().sort_index())

Players created: 2003-09-11 08:52:38 to 2025-01-07 19:57:59
Games released: 1997-06-30 00:00:00 to 2025-01-10 00:00:00
Reviews posted: 2010-10-15 00:00:00 to 2025-01-09 00:00:00
Prices acquired: 2024-11-28 00:00:00 to 2025-02-24 00:00:00
year
2010       733
2011      3203
2012      3883
2013     14050
2014     40207
2015     51170
2016     69458
2017     83406
2018     80324
2019    108268
2020    147404
2021    140577
2022    133043
2023    144169
2024    183087
2025      1552
Name: count, dtype: int64


In [None]:
# Filter for 2020-2025
# Keep reviews from Jan 1, 2020 onwards. Reuses the `cutoff_date` defined in
# the filtering cell so the threshold lives in one place, and takes an explicit
# copy so later column assignments on reviews_recent do not act on a slice of
# `reviews`.
reviews_recent = reviews.loc[reviews['posted'] >= cutoff_date].copy()

print("Rows in Reviews (2020–2024+2025 partial):", reviews_recent.shape[0])
print("Unique Reviewers:", reviews_recent['playerid'].nunique())
print("Unique Games Reviewed:", reviews_recent['gameid'].nunique())
print("Time Frame Covered:", reviews_recent['posted'].min().date(), "to", reviews_recent['posted'].max().date())

# Count reviews per year (filtered subset); relies on the `year` column added
# to `reviews` in the timeframe-analysis cell.
print(reviews_recent['year'].value_counts().sort_index())

Rows in Reviews (2020–2024+2025 partial): 749832
Unique Reviewers: 139828
Unique Games Reviewed: 47074
Time Frame Covered: 2020-01-01 to 2025-01-09
year
2020    147404
2021    140577
2022    133043
2023    144169
2024    183087
2025      1552
Name: count, dtype: int64


In [None]:
def get_numerical_summary(df):
    """Summarise ``df`` with ``describe()`` and append a null count per column.

    Args:
        df: The input DataFrame.

    Returns:
        The transposed ``describe()`` table (one row per described column)
        with an extra ``missing_data`` column holding that column's number
        of null values.
    """
    null_counts = df.isnull().sum()
    summary = df.describe().T
    # Assignment aligns on the index, so only the described columns get a count.
    summary['missing_data'] = null_counts
    return summary

# Get numerical summaries for each dataset
# (reviews uses the 2020+ subset, the other tables are summarised in full)
players_summary = get_numerical_summary(players)
games_summary = get_numerical_summary(games)
purchased_games_summary = get_numerical_summary(purchased_games)
reviews_summary = get_numerical_summary(reviews_recent)
prices_summary = get_numerical_summary(prices)

# Display the summaries.
# display() keeps each summary as a single table; print() wraps the wide
# describe() output into several column chunks that are hard to read.
for heading, summary in (
    ("Players Summary:", players_summary),
    ("\nGames Summary:", games_summary),
    ("\nPurchased Games Summary:", purchased_games_summary),
    ("\nReviews Summary:", reviews_summary),
    ("\nPrices Summary:", prices_summary),
):
    print(heading)
    display(summary)

Players Summary:
             count                           mean                  min  \
playerid  424683.0            76561198352769616.0  76561197960265856.0   
created     377014  2014-12-27 04:50:35.019872256  2003-09-11 08:52:38   

                                 25%                         50%  \
playerid         76561198067731104.0         76561198188945552.0   
created   2012-10-16 16:54:08.500000  2015-04-25 10:22:53.500000   

                          75%                  max             std  \
playerid  76561198410550512.0  76561199815552640.0  399535892.5282   
created   2017-09-18 19:28:26  2025-01-07 19:57:59             NaN   

          missing_data  
playerid             0  
created          47669  

Games Summary:
                count                           mean                  min  \
gameid        98248.0                  1617185.71634                 10.0   
release_date    98248  2021-02-20 07:41:29.015552256  1997-06-30 00:00:00   

                     

In [None]:
def get_column_info(df):
    """
    Builds a per-column overview: dtype, distinct-value count, and null count.

    Args:
        df: The input DataFrame.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        'Value Type / Unit', 'Distinct Categories' and
        'No. of Missing Data Points'.
    """
    def describe_column(series):
        # nunique() ignores nulls by default; they are reported separately below.
        return {
            'Value Type / Unit': series.dtype,
            'Distinct Categories': series.nunique(),
            'No. of Missing Data Points': series.isnull().sum(),
        }

    per_column = {name: describe_column(df[name]) for name in df.columns}
    # Outer keys become columns, so transpose to get one row per source column.
    return pd.DataFrame(per_column).T

# Build the per-column overview for each dataset
# (reviews uses the 2020+ subset, the other tables are described in full)
players_info = get_column_info(players)
games_info = get_column_info(games)
purchased_games_info = get_column_info(purchased_games)
reviews_info = get_column_info(reviews_recent)
prices_info = get_column_info(prices)

# Show every overview under its own heading
for heading, info in (
    ("Players Column Information:", players_info),
    ("\nGames Column Information:", games_info),
    ("\nPurchased Games Column Information:", purchased_games_info),
    ("\nReviews Column Information:", reviews_info),
    ("\nPrices Column Information:", prices_info),
):
    print(heading)
    display(info)

Players Column Information:


Unnamed: 0,Value Type / Unit,Distinct Categories,No. of Missing Data Points
playerid,int64,424683,0
country,object,249,177868
created,datetime64[ns],376781,47669



Games Column Information:


Unnamed: 0,Value Type / Unit,Distinct Categories,No. of Missing Data Points
gameid,int64,98248,0
title,object,97409,3
developers,object,57315,5559
publishers,object,50753,5941
genres,object,2721,5549
supported_languages,object,14526,5506
release_date,datetime64[ns],4722,0



Purchased Games Column Information:


Unnamed: 0,Value Type / Unit,Distinct Categories,No. of Missing Data Points
playerid,int64,102548,0
library,object,46040,55607



Reviews Column Information:


Unnamed: 0,Value Type / Unit,Distinct Categories,No. of Missing Data Points
reviewid,int64,749832,0
playerid,int64,139828,0
gameid,int64,47074,0
review,object,612658,1661
helpful,int64,807,0
funny,int64,500,0
awards,int64,366,0
posted,datetime64[ns],1836,0
year,int32,6,0



Prices Column Information:


Unnamed: 0,Value Type / Unit,Distinct Categories,No. of Missing Data Points
gameid,int64,98465,0
usd,float64,1109,907148
eur,float64,1651,1703086
gbp,float64,1682,908184
jpy,float64,2615,919973
rub,float64,2377,1006291
date_acquired,datetime64[ns],45,0


In [None]:
# Rows that have a USD price but no price in ANY of the other currencies.
# The null checks must be combined with all(), not OR-ed: a row that is only
# missing EUR still has GBP/JPY/RUB prices and is not "USD only".
other_currencies = ['eur', 'gbp', 'jpy', 'rub']
usd_only_prices = prices[prices['usd'].notnull() &
                         prices[other_currencies].isnull().all(axis=1)]

# Display a few examples (the frame may be empty if no such rows exist)
print("Examples of rows where only USD price is available:")
display(usd_only_prices.head())

Examples of rows where only USD price is available:


Unnamed: 0,gameid,usd,eur,gbp,jpy,rub,date_acquired
17,3266470,3.49,,3.0,406.0,140.0,2024-11-28
22,3263370,0.99,,0.89,120.0,42.0,2024-11-28
33,3260870,1.39,,1.18,164.0,57.0,2024-11-28
34,3260920,1.39,,1.18,164.0,57.0,2024-11-28
51,3255000,1.39,,1.18,164.0,57.0,2024-11-28
