# Gabriel Bertasius & Jaden Ford

# Predicting Game Success: A Regression Analysis on the Steam Games Dataset

In [46]:
import numpy as np
import pandas as pd
# never truncate the column list when a dataframe is displayed
pd.options.display.max_columns = None

In [57]:
# load the data into a dataframe for easy handling
import os
# Where the local copy of the dataset is cached.
DATASET_DIR = './data/'
DATASET_FILENAME = 'steamgames.parquet'
DATASET_PATH = f"{DATASET_DIR}{DATASET_FILENAME}"
# Parquet codec: zstd is very fast and compresses about as well as gzip
DATASET_COMPRESSION = 'zstd'
# NOTE(review): not referenced by any of the cells visible here
download_data = 1

def check_file_exists(path:str) -> bool:
    """Return True when `path` names an existing filesystem entry.

    This is a thin wrapper over os.path.exists, so an existing directory
    also counts as "existing".
    """
    found = os.path.exists(path)
    return found

def check_data_dir_exists()-> bool:
    """Report whether the dataset directory (DATASET_DIR) is present on disk."""
    data_dir = DATASET_DIR
    return os.path.exists(data_dir)

def create_data_dir():
    """Create the dataset directory (DATASET_DIR) and print the outcome.

    os.makedirs is used so that a nested DATASET_DIR (e.g. './cache/data/')
    can be created in one call; os.mkdir fails when a parent is missing.

    This is a best-effort helper: failures are reported on stdout rather
    than raised, and nothing is returned.
    """
    directory_name = DATASET_DIR
    try:
        # Only the call that can fail lives inside the try body.
        os.makedirs(directory_name)
    except FileExistsError:
        print(f"Directory '{directory_name}' already exists.")
    except PermissionError:
        print(f"Permission denied: Unable to create '{directory_name}'.")
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        print(f"Directory '{directory_name}' created successfully.")
        

def download_steamgames_dataset()-> pd.DataFrame:
    """Read the Steam games parquet file straight from its remote `hf://` URL.

    Returns:
        The dataset as a pandas DataFrame.

    NOTE(review): presumably needs network access and an fsspec handler for
    the `hf://` scheme (e.g. huggingface_hub) to be installed - confirm.
    """
    return pd.read_parquet("hf://datasets/FronkonGames/steam-games-dataset/data/train-00000-of-00001-e2ed184370a06932.parquet")


def write_dataset_pqt(df: pd.DataFrame, filename:str=DATASET_FILENAME, overwrite:bool=False) -> bool:
    """Write `df` as a parquet file inside the dataset directory.

    Args:
        df: DataFrame to persist.
        filename: File name (not a full path) placed under DATASET_DIR.
        overwrite: When False, an existing file is left untouched.

    Returns:
        True if the file was written, False if it already existed and
        `overwrite` was not set.
    """
    path = DATASET_DIR + filename
    if not check_data_dir_exists():
        create_data_dir()
    if check_file_exists(path) and not overwrite:
        print("File exists. Pass 'overwrite' to replace.")
        return False
    # Use the module-level codec setting instead of repeating the literal.
    df.to_parquet(path, compression=DATASET_COMPRESSION)
    return True

def read_dataset_pqt(filename:str=DATASET_FILENAME):
    """Load a parquet file from the dataset directory.

    Args:
        filename: File name (not a full path) looked up under DATASET_DIR.

    Returns:
        The file's contents as a DataFrame, or None (after printing a
        notice) when the file does not exist.
    """
    path = DATASET_DIR+filename
    if not check_file_exists(path):
        print("Parquet file not found.")
        return None
    return pd.read_parquet(path)

def download_and_save_dataset(force: bool=False, filename:str=DATASET_FILENAME)-> pd.DataFrame|None:
    """Download the Steam games dataset and cache it as a local parquet file.

    Args:
        force: When True, download again and overwrite an existing local file.
        filename: File name (not a full path) under DATASET_DIR to save to.

    Returns:
        The downloaded DataFrame, or None when the file already exists
        locally and `force` is False (nothing is downloaded in that case;
        the caller is expected to read the cached file instead).
    """
    path = DATASET_DIR + filename
    if check_file_exists(path):
        if not force:
            print(f"❌ Dataset exists locally. Path: {path}")
            print("Use force=True to download and overwrite.")
            return None
        print(f"⚠️ Dataset exists locally. Path: {path}")
        print("Redownloading and Overwriting...")
    else:
        print(f"Downloading and saving dataset to {path} ")
    df = download_steamgames_dataset()
    # Forward `filename` so the file is written to the same path that was
    # checked and reported above, not always to the default file name.
    write_dataset_pqt(df, filename=filename, overwrite=True)
    print("✅ Done.")
    print(f"Saved to: {path}")
    return df


# Download the dataset on first use, otherwise load the cached copy.
# download_and_save_dataset returns None when the parquet file already exists
# locally, so that is the case in which the file has to be read from disk.
df = download_and_save_dataset(force=False)
if df is None:
    df = read_dataset_pqt()


❌ Dataset exists locally. Path: ./data/steamgames.parquet
Use force=True to download and overwrite.


In [48]:
# Count the missing values in each column
df.isna().sum()

AppID                             0
Name                              6
Release date                      0
Estimated owners                  0
Peak CCU                          0
Required age                      0
Price                             0
DLC count                         0
About the game                 3437
Supported languages               0
Full audio languages              0
Reviews                       73844
Header image                      0
Website                       44506
Support url                   42544
Support email                 13319
Windows                           0
Mac                               0
Linux                             0
Metacritic score                  0
Metacritic url                79650
User score                        0
Positive                          0
Negative                          0
Score rank                    83516
Achievements                      0
Recommendations                   0
Notes                       

In [49]:
# If we want to remove rows that have no reviews, we would have 9716 examples
#df = df.dropna(axis=0, subset='Reviews')
#print(df.shape[0])
#df.isnull().sum()

In [50]:
# drop the columns that won't contribute to a game's success rating
cols_to_remove = [
    'About the game',
    'Supported languages',
    'Full audio languages',
    'Header image',
    'Website',
    'Support url',
    'Support email',
    'Metacritic url',
    'Score rank',
    'Screenshots',
    'Movies',
]
# `columns=` already selects the column axis, so no `axis` argument is needed
df = df.drop(columns=cols_to_remove)
df.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,Reviews,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,Achievements,Recommendations,Notes,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,,True,False,False,0,0,6,11,30,0,,0,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling"
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,,True,True,False,0,0,53,5,12,0,,0,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc..."
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,,True,False,False,0,0,0,0,0,0,,0,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,,True,True,True,0,0,3,0,0,0,,0,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz..."
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,,True,True,False,0,0,50,8,17,0,This Game may contain content not appropriate ...,0,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,..."


In [51]:
# function that calculates the number of years since a game's release date
from datetime import datetime
def years_since_release(date_string, now=None):
    """Return how many years have passed since a release date.

    Args:
        date_string: Release date written either as "Oct 21, 2008"
            (month, day, year) or as "Oct 2008" (month and year only, which
            is read as the first day of that month). Surrounding whitespace
            is ignored.
        now: Reference datetime to measure against. Defaults to the current
            local time; pass a fixed value for reproducible results.

    Returns:
        Whole days elapsed divided by 365, as a float.

    Raises:
        ValueError: If `date_string` matches neither supported format.
    """
    text = date_string.strip()
    # Try the full date first, then the month-and-year form, instead of
    # guessing the format from the string length.
    for fmt in ("%b %d, %Y", "%b %Y"):
        try:
            released = datetime.strptime(text, fmt)
        except ValueError:
            continue
        break
    else:
        raise ValueError(f"Unrecognized release date format: {date_string!r}")

    reference = datetime.now() if now is None else now
    return (reference - released).days / 365

def est_owners(num_owners):
    """Return the midpoint of an estimated-owners range such as "0 - 20000".

    The text is split on '-' and the first two pieces are parsed as
    integers; their average is returned as a float.
    """
    bounds = num_owners.split('-')
    lower = int(bounds[0])
    upper = int(bounds[1])
    return (lower + upper) / 2

def min_max_normalize(column):
    """Rescale a numeric column linearly so its minimum is 0 and its maximum is 1.

    Args:
        column: Array-like of numbers (for example a pandas Series).

    Returns:
        A NumPy array with the same shape as the input. A constant column
        (maximum equal to minimum) is returned as all zeros rather than the
        NaN values that 0/0 would produce.
    """
    values = np.asarray(column)
    lowest = np.min(values)
    span = np.max(values) - lowest
    if span == 0:
        # Every entry is identical, so there is no range to scale by.
        return np.zeros(values.shape, dtype=float)
    return (values - lowest) / span


In [52]:
# 

In [53]:
# express each release date as the number of years since release
df['Release date'] = df['Release date'].apply(years_since_release)

# collapse each estimated-owners range to its middle value
df['Estimated owners'] = df['Estimated owners'].apply(est_owners)

# platform support flags: boolean -> integer
for platform in ('Windows', 'Mac', 'Linux'):
    df[platform] = df[platform].astype(int)

# rescale every column with a large value range using min_max_normalize
cols_to_normalize = [
    'Release date',
    'Estimated owners',
    'Peak CCU',
    'Required age',
    'Price',
    'DLC count',
    'Metacritic score',
    'User score',
    'Positive',
    'Negative',
    'Achievements',
    'Recommendations',
    'Average playtime forever',
    'Average playtime two weeks',
    'Median playtime forever',
    'Median playtime two weeks',
]
for col in cols_to_normalize:
    df[col] = min_max_normalize(df[col])

In [54]:
# preview the first ten rows of the transformed dataframe
df.head(10)

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,Reviews,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,Achievements,Recommendations,Notes,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags
0,20200,Galactic Bowling,0.593005,6.7e-05,0.0,0.0,0.01999,0.0,,1,0,0,0.0,0.0,1.040868e-06,1.2e-05,0.003055,0.0,,0.0,0.0,0.0,0.0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling"
1,655370,Train Bandit,0.270049,6.7e-05,0.0,0.0,0.00099,0.0,,1,1,0,0.0,0.0,9.194334e-06,6e-06,0.001222,0.0,,0.0,0.0,0.0,0.0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc..."
2,1732930,Jolt Project,0.122562,6.7e-05,0.0,0.0,0.00499,0.0,,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",
3,1355720,Henosis™,0.170049,6.7e-05,0.0,0.0,0.00599,0.0,,1,1,1,0.0,0.0,5.20434e-07,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz..."
4,1139950,Two Weeks in Painland,0.186897,6.7e-05,0.0,0.0,0.0,0.0,,1,1,0,0.0,0.0,8.6739e-06,9e-06,0.001731,0.0,This Game may contain content not appropriate ...,0.0,0.0,0.0,0.0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,..."
5,1469160,Wartune Reborn,0.148571,0.0005,7.8e-05,0.0,0.0,0.0,,1,0,0,0.0,0.0,1.509259e-05,5.5e-05,0.0,0.0,,0.0,0.0,0.0,0.0,7Road,7Road,"Single-player,Multi-player,MMO,PvP,Online PvP,...","Adventure,Casual,Free to Play,Massively Multip...","Turn-Based Combat,Massively Multiplayer,Multip..."
6,1659180,TD Worlds,0.11734,6.7e-05,3e-06,0.0,0.01099,0.000423,,1,0,0,0.0,0.0,3.643038e-06,8e-06,0.006313,0.0,,0.0,0.0,0.0,0.0,MAKSIM VOLKAU,MAKSIM VOLKAU,"Single-player,Steam Achievements,Steam Cloud","Indie,Strategy","Tower Defense,Rogue-lite,RTS,Replay Value,Perm..."
7,1968760,Legend of Rome - The Wrath of Mars,0.105911,6.7e-05,2e-06,0.0,0.00999,0.0,,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,magnussoft,magnussoft,"Single-player,Steam Cloud",Casual,
8,1178150,MazM: Jekyll and Hyde,0.181084,6.7e-05,1e-06,0.0,0.01499,0.0,,1,0,0,0.0,0.0,1.318433e-05,7e-06,0.002546,0.0,,0.0,0.0,0.0,0.0,Growing Seeds,"CFK Co., Ltd.","Single-player,Steam Achievements,Full controll...","Adventure,RPG,Simulation,Strategy","Adventure,Simulation,RPG,Strategy,Singleplayer..."
9,320150,Deadlings: Rotten Edition,0.375074,0.0005,0.0,0.0,0.00399,0.0,,1,1,1,0.0,0.0,3.903255e-05,5e-05,0.003258,0.0,,0.004824,0.0,0.003751,0.0,ONE MORE LEVEL,ONE MORE LEVEL,"Single-player,Steam Achievements,Steam Trading...","Action,Adventure,Indie","Action,Indie,Adventure,Puzzle-Platformer,Arcad..."


In [55]:
# show the column labels currently present in the dataframe
df.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DLC count', 'Reviews', 'Windows', 'Mac',
       'Linux', 'Metacritic score', 'User score', 'Positive', 'Negative',
       'Achievements', 'Recommendations', 'Notes', 'Average playtime forever',
       'Average playtime two weeks', 'Median playtime forever',
       'Median playtime two weeks', 'Developers', 'Publishers', 'Categories',
       'Genres', 'Tags'],
      dtype='object')