# Gabriel Bertasius & Jaden Ford#

# Predicting Game Success: A Regression Analysis on the Steam Games Dataset #

In [1]:
import numpy as np
import pandas as pd
# show all columns
pd.set_option('display.max_columns', None)

## Downloading and loading data

In [2]:
# load the data into a dataframe for easy handling
import os
DATASET_DIR = './data/'
DATASET_FILENAME = 'steamgames.parquet'
DATASET_PATH = DATASET_DIR+DATASET_FILENAME
DATASET_COMPRESSION = 'zstd'  # Very fast and compresses as well as gzip
download_data = 1


def check_file_exists(path: str) -> bool:
    return os.path.exists(path)


def check_data_dir_exists() -> bool:
    return os.path.exists(DATASET_DIR)


def create_data_dir():
    directory_name = DATASET_DIR
    try:
        os.mkdir(directory_name)
        print(f"Directory '{directory_name}' created successfully.")
    except FileExistsError:
        print(f"Directory '{directory_name}' already exists.")
    except PermissionError:
        print(f"Permission denied: Unable to create '{directory_name}'.")
    except Exception as e:
        print(f"An error occurred: {e}")


def download_steamgames_dataset() -> pd.DataFrame:
    df = pd.read_parquet(
        "hf://datasets/FronkonGames/steam-games-dataset/data/train-00000-of-00001-e2ed184370a06932.parquet")
    return df


def write_dataset_pqt(df: pd.DataFrame, filename: str = DATASET_FILENAME, overwrite: bool = False) -> bool:
    dir = DATASET_DIR
    path = dir+filename
    if (check_data_dir_exists() == False):
        create_data_dir()
    if check_file_exists(path) and overwrite == False:
        print("File exists. Pass 'overwrite' to replace.")
        return False
    else:
        df.to_parquet(path, compression='zstd')
        return True


def read_dataset_pqt(filename: str = DATASET_FILENAME):
    path = DATASET_DIR+filename
    if check_file_exists(path):
        print("Loading dataset from local storage...")
        prq = pd.read_parquet(path)
        print("✅ Local dataset loaded.")
        return prq
    else:
        print("Parquet file not found.")


def download_and_save_dataset(force: bool = False, filename: str = DATASET_FILENAME) -> pd.DataFrame | None:
    dir = DATASET_DIR
    path = dir+filename
    if (check_file_exists(path)):
        print(f"⚠️ Dataset exists locally. Path:{path}")
        if (force == False):
            print("Use force=True to download and overwrite.")
            return None
        else:
            print("Redownloading and Overwriting...")
    else:
        print(f"Downloading and saving dataset to {path} ")
    df = download_steamgames_dataset()
    write_dataset_pqt(df, overwrite=True)
    print("✅ Done.")
    print(f"Saved to: {path}")
    return df


df = download_and_save_dataset(force=False)
if(df is None):
    df = read_dataset_pqt()

⚠️ Dataset exists locally. Path:./data/steamgames.parquet
Use force=True to download and overwrite.
Loading dataset from local storage...
✅ Local dataset loaded.


In [3]:
# Check for any missing values
df.isnull().sum()

AppID                             0
Name                              6
Release date                      0
Estimated owners                  0
Peak CCU                          0
Required age                      0
Price                             0
DLC count                         0
About the game                 3437
Supported languages               0
Full audio languages              0
Reviews                       73844
Header image                      0
Website                       44506
Support url                   42544
Support email                 13319
Windows                           0
Mac                               0
Linux                             0
Metacritic score                  0
Metacritic url                79650
User score                        0
Positive                          0
Negative                          0
Score rank                    83516
Achievements                      0
Recommendations                   0
Notes                       

In [4]:
# remove any columns that won't contribute to a game's success rating
cols_to_remove = ['About the game', 'Supported languages', 'Full audio languages',
                  'Header image', 'Website', 'Support url', 'Support email', 'Metacritic url',
                  'Score rank', 'Screenshots', 'Movies']
df = df.drop(columns=cols_to_remove, axis=1)
df.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,Reviews,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,Achievements,Recommendations,Notes,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,,True,False,False,0,0,6,11,30,0,,0,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling"
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,,True,True,False,0,0,53,5,12,0,,0,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc..."
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,,True,False,False,0,0,0,0,0,0,,0,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,,True,True,True,0,0,3,0,0,0,,0,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz..."
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,,True,True,False,0,0,50,8,17,0,This Game may contain content not appropriate ...,0,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,..."


In [5]:
# function that calculates the number of years since a game's release date
from datetime import datetime
def years_since_release(date_string):
  if len(date_string) == 11 or len(date_string) == 12:
        date = datetime.strptime(date_string, "%b %d, %Y")
  else: # length must be 8 or 9
      date = datetime.strptime(date_string, "%b %Y")

  current_date = datetime.now()
  years = (current_date - date).days / 365
  return years

# function to return the avg number of estimated owners
def est_owners(num_owners):
  numbers = num_owners.split('-')
  return (int(numbers[0]) + int(numbers[1])) / 2

# function to normalize a numerical column between 0-1 based on min and and max values
def min_max_normalize(column):
  column = np.array(column)
  norm_col = ( column - np.min(column) ) / ( np.max(column) - np.min(column) )
  return norm_col

In [6]:
# convert release date to years since release
df['Release date'] = df['Release date'].apply(years_since_release)

# return middle value for each given range of estimated owners
df['Estimated owners'] = df['Estimated owners'].apply(est_owners)

# convert windows, mac, and linux columns from boolean to integer
df['Windows'] = df['Windows'].astype(int)
df['Mac'] = df['Mac'].astype(int)
df['Linux'] = df['Linux'].astype(int)

In [7]:
# Filter out any games that are free, have no peak ccu, and no estimated owners
# This allows us to judge success based on games that competed in certain markets, and have had actual people play them
no_peak_ccu_cols = df[df['Peak CCU'] == 0].index
df = df.drop(no_peak_ccu_cols, axis=0)

no_est_owners_cols = df[df['Estimated owners'] == 0].index
df = df.drop(no_est_owners_cols, axis=0)

no_price_cols = df[df['Price'] == 0].index
df = df.drop(no_price_cols, axis=0)

In [8]:
# normalize any large value ranges
cols_to_normalize = ['Release date', 'Estimated owners', 'Peak CCU', 'Required age', 'Price', 'DLC count',
                     'Metacritic score', 'User score', 'Positive', 'Negative', 'Achievements',
                     'Recommendations', 'Average playtime forever', 'Average playtime two weeks',
                     'Median playtime forever', 'Median playtime two weeks']
for col in cols_to_normalize:
  df[col] = min_max_normalize(df[col])

In [9]:
# If we want to remove rows that have no reviews, we would have 4269 examples
#df = df.dropna(axis=0, subset='Reviews')
#print(df.shape[0])
#df.isnull().sum()

In [None]:
print(df.shape)
df.head(n=20)

(20194, 28)


Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,Reviews,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,Achievements,Recommendations,Notes,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags
6,1659180,TD Worlds,0.071606,0.0,2e-06,0.0,0.03946,0.000423,,1,0,0,0.0,0.0,2.2e-05,5.1e-05,0.006313,0.0,,0.0,0.0,0.0,0.0,MAKSIM VOLKAU,MAKSIM VOLKAU,"Single-player,Steam Achievements,Steam Cloud","Indie,Strategy","Tower Defense,Rogue-lite,RTS,Replay Value,Perm..."
7,1968760,Legend of Rome - The Wrath of Mars,0.059585,0.0,1e-06,0.0,0.035751,0.0,,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,magnussoft,magnussoft,"Single-player,Steam Cloud",Casual,
8,1178150,MazM: Jekyll and Hyde,0.138653,0.0,0.0,0.0,0.054295,0.0,,1,0,0,0.0,0.0,7.9e-05,4.3e-05,0.002546,0.0,,0.0,0.0,0.0,0.0,Growing Seeds,"CFK Co., Ltd.","Single-player,Steam Achievements,Full controll...","Adventure,RPG,Simulation,Strategy","Adventure,Simulation,RPG,Strategy,Singleplayer..."
10,1026420,WARSAW,0.157617,0.000333,5e-06,0.0,0.087672,0.0,“New WW2 Strategy Game Offers A Harrowing Look...,1,0,0,0.639175,0.0,0.00061,0.00153,0.003462,0.000475,,0.00046,0.0,0.000446,0.0,Pixelated Milk,"Pixelated Milk,gaming company","Single-player,Steam Achievements,Steam Trading...","Indie,RPG","Tactical RPG,Turn-Based Strategy,Wargame,Histo..."
14,1454010,Diary of Lucie,0.114093,0.0,2e-06,0.0,0.046877,0.0,,1,0,0,0.0,0.0,0.000104,3.6e-05,0.0,0.0,,0.0,0.0,0.0,0.0,Parange Project,Parange Project,"Single-player,Partial Controller Support,Steam...","Action,Adventure,Indie,RPG,Strategy,Early Access","Action Roguelike,Action,Rogue-lite,RPGMaker,My..."
15,22670,Alien Breed 3: Descent,0.493472,0.004534,2e-06,0.0,0.035751,0.0,,1,0,0,0.659794,0.0,0.000362,0.000967,0.001324,0.000317,,0.000302,0.0,0.000115,0.0,Team17 Digital Ltd,Team17 Digital Ltd,"Single-player,Multi-player,Co-op,Steam Achieve...",Action,"Action,Shooter,Sci-fi,Aliens,Third Person,Isom..."
17,346560,Hero of the Kingdom II,0.332228,0.001867,5e-06,0.0,0.028334,0.0,“Hero of the Kingdom II is a title that casual...,1,1,1,0.0,0.0,0.00212,0.000866,0.004887,0.001795,,0.001921,0.0,0.001453,0.0,Lonely Troops,Lonely Troops,"Single-player,Steam Achievements,Steam Trading...","Adventure,Casual,Indie,RPG","Adventure,Casual,Point & Click,RPG,Indie,Isome..."
18,1097880,Super Naughty Maid 2,0.168394,0.0,3e-06,0.0,0.034268,0.000845,,1,0,0,0.0,0.0,0.0,0.0,0.0,0.001272,This game depicts sexual acts between the play...,0.000233,0.0,0.000144,0.0,KENZsoft,Denpasoft,"Single-player,Steam Trading Cards","Casual,Indie",
21,575760,Project: R.E.B.O.O.T 2,0.258964,0.000333,1e-05,0.0,0.009791,0.0,,1,0,0,0.0,0.0,3e-05,0.000202,0.0,0.0,,0.002313,0.0,0.001386,0.0,Volens Nolens Games,Volens Nolens Games,"Single-player,Steam Trading Cards","Action,Adventure,Casual,Indie","Action,Adventure,Casual,Indie,Platformer,2D Pl..."
22,434030,Aerofly FS 2 Flight Simulator,0.228187,0.001867,2.1e-05,0.0,0.137739,0.009721,,1,1,1,0.0,0.0,0.001544,0.002945,0.0,0.002036,,0.000871,0.0,0.000609,0.0,IPACS,IPACS,"Single-player,Partial Controller Support","Action,Indie,Racing,Simulation","Flight,Simulation,VR,Racing,Physics,Open World..."


### Counting unique words in Categories, Genres, Tags

In [None]:
df.columns
df['Tags']

def count_unique(df, label:str):
    lists:pd.Series= df['Tags'].str.casefold().str.split(',')
    words = set()
    [words.update(x) for x in lists if x is not None]
    print(f"Number of unique {label}: {len(words)}")
    return words

count_unique(df, 'Categories')
count_unique(df, 'Genres')
count_unique(df, 'Tags')
;

Number of unique Categories: 444
Number of unique Genres: 444
Number of unique Tags: 444


{'1980s',
 "1990's",
 '2.5d',
 '2d',
 '2d fighter',
 '2d platformer',
 '360 video',
 '3d',
 '3d fighter',
 '3d platformer',
 '3d vision',
 '4 player local',
 '4x',
 '6dof',
 '8-bit music',
 'abstract',
 'action',
 'action roguelike',
 'action rpg',
 'action rts',
 'action-adventure',
 'addictive',
 'adventure',
 'agriculture',
 'aliens',
 'alternate history',
 'ambient',
 'america',
 'animation & modeling',
 'anime',
 'arcade',
 'archery',
 'arena shooter',
 'artificial intelligence',
 'assassin',
 'asymmetric vr',
 'asynchronous multiplayer',
 'atmospheric',
 'atv',
 'audio production',
 'auto battler',
 'automation',
 'automobile sim',
 'base-building',
 'baseball',
 'based on a novel',
 'basketball',
 'battle royale',
 "beat 'em up",
 'beautiful',
 'benchmark',
 'bikes',
 'birds',
 'blood',
 'bmx',
 'board game',
 'boss rush',
 'bowling',
 'boxing',
 'building',
 'bullet hell',
 'bullet time',
 'capitalism',
 'card battler',
 'card game',
 'cartoon',
 'cartoony',
 'casual',
 'cats',