# Gabriel Bertasius & Jaden Ford#

# Predicting Game Success: A Regression Analysis on the Steam Games Dataset #

In [None]:
import numpy as np
import pandas as pd
# show all columns
pd.set_option('display.max_columns', None)

## Downloading and loading data

In [None]:
# load the data into a dataframe for easy handling
import os
from datetime import datetime
import pickle
import gzip
DATASET_DIR = './data/'
DATASET_FILENAME = 'steamgames.parquet'
DATASET_PATH = DATASET_DIR+DATASET_FILENAME
DATASET_COMPRESSION = 'zstd'  # Very fast and compresses as well as gzip
MODELS_DIR = './models/'
MODELS_FILENAME = 'model-'
download_data = 1


def check_file_exists(path: str) -> bool:
    return os.path.exists(path)


def check_data_dir_exists() -> bool:
    return os.path.exists(DATASET_DIR)

def check_models_dir_exists() -> bool:
    return os.path.exists(MODELS_DIR)

def create_data_dir():
    directory_name = DATASET_DIR
    try:
        os.mkdir(directory_name)
        print(f"Directory '{directory_name}' created successfully.")
    except FileExistsError:
        print(f"Directory '{directory_name}' already exists.")
    except PermissionError:
        print(f"Permission denied: Unable to create '{directory_name}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

def create_models_dir():
    directory_name = MODELS_DIR
    try:
        os.mkdir(directory_name)
        print(f"Directory '{directory_name}' created successfully.")
    except FileExistsError:
        print(f"Directory '{directory_name}' already exists.")
    except PermissionError:
        print(f"Permission denied: Unable to create '{directory_name}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

def download_steamgames_dataset() -> pd.DataFrame:
    df = pd.read_parquet(
        "hf://datasets/FronkonGames/steam-games-dataset/data/train-00000-of-00001-e2ed184370a06932.parquet")
    return df


def write_dataset_pqt(df: pd.DataFrame, filename: str = DATASET_FILENAME, overwrite: bool = False) -> bool:
    dir = DATASET_DIR
    path = dir+filename
    if (check_data_dir_exists() == False):
        create_data_dir()
    if check_file_exists(path) and overwrite == False:
        print("File exists. Pass 'overwrite' to replace.")
        return False
    else:
        df.to_parquet(path, compression='zstd')
        return True


def read_dataset_pqt(filename: str = DATASET_FILENAME):
    path = DATASET_DIR+filename
    if check_file_exists(path):
        print("Loading dataset from local storage...")
        prq = pd.read_parquet(path)
        print("✅ Local dataset loaded.")
        return prq
    else:
        print("Parquet file not found.")

def datestamp():
    """ Get the current datestamp """
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def write_model_log(path:str, config: dict, **kwargs):
    with open(path+".txt", "a") as file:
        file.write(f"[{datestamp()}]\n")
        file.write(f"{config}\n")
        if kwargs:
            for x in kwargs:
                file.write(f"{x}\n")

def pickle_model(filename: str, model, params_dict:dict, param_grid:dict = None,overwrite: bool=False, **extra_data):
    dir = MODELS_DIR
    path = dir+filename
    for s in params_dict.values():
        path += f'-{s}'
    if (check_models_dir_exists() == False):
        create_models_dir()
    if check_file_exists(path) and overwrite == False:
        print("File exists. Pass 'overwrite' to replace.")
        return False
    else:
        if param_grid is not None:
            write_model_log(path, param_grid, **extra_data)
        else:
            write_model_log(path, params_dict)
        level = 7   # Good balance between speed and compression
        with gzip.open(path+".pkl.gz", "wb", compresslevel=level) as file:
            pickle.dump(model, file, protocol=5)
        return True

def unpickle_model(filename):
    path = MODELS_DIR+filename
    with gzip.open(path+".pkl.gz", "rb") as file:
        return pickle.load(file)

def download_and_save_dataset(force: bool = False, filename: str = DATASET_FILENAME) -> pd.DataFrame | None:
    dir = DATASET_DIR
    path = dir+filename
    if (check_file_exists(path)):
        print(f"⚠️ Dataset exists locally. Path:{path}")
        if (force == False):
            print("Use force=True to download and overwrite.")
            return None
        else:
            print("Redownloading and Overwriting...")
    else:
        print(f"Downloading and saving dataset to {path} ")
    df = download_steamgames_dataset()
    write_dataset_pqt(df, overwrite=True)
    print("✅ Done.")
    print(f"Saved to: {path}")
    return df


df = download_and_save_dataset(force=False)
if(df is None):
    df = read_dataset_pqt()

⚠️ Dataset exists locally. Path:./data/steamgames.parquet
Use force=True to download and overwrite.
Loading dataset from local storage...
✅ Local dataset loaded.


In [None]:
# Check for any missing values
sum = df.isnull().sum()
sum[sum != 0]

Unnamed: 0,0
Name,6
About the game,3437
Reviews,73844
Website,44506
Support url,42544
Support email,13319
Metacritic url,79650
Score rank,83516
Notes,70845
Developers,3457


In [None]:
# remove any columns that won't contribute to a game's success rating
cols_to_remove = ['About the game', 'Supported languages', 'Full audio languages',
                  'Header image', 'Website', 'Support url', 'Support email', 'Metacritic url',
                  'Score rank', 'Screenshots', 'Movies']
df = df.drop(columns=cols_to_remove, axis=1)
df.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,Reviews,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,Achievements,Recommendations,Notes,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,,True,False,False,0,0,6,11,30,0,,0,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling"
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,,True,True,False,0,0,53,5,12,0,,0,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc..."
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,,True,False,False,0,0,0,0,0,0,,0,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,,True,True,True,0,0,3,0,0,0,,0,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz..."
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,,True,True,False,0,0,50,8,17,0,This Game may contain content not appropriate ...,0,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,..."


In [None]:
from dataclasses import dataclass, fields, field

@dataclass
class DataMinMax:
    data:dict = field(default_factory=dict)


In [None]:
# function that calculates the number of years since a game's release date
from datetime import datetime
def years_since_release(date_string):
  if len(date_string) == 11 or len(date_string) == 12:
        date = datetime.strptime(date_string, "%b %d, %Y")
  else: # length must be 8 or 9
      date = datetime.strptime(date_string, "%b %Y")

  current_date = datetime.now()
  years = (current_date - date).days / 365
  return years

# function to return the avg number of estimated owners
def est_owners(num_owners):
  numbers = num_owners.split('-')
  return (int(numbers[0]) + int(numbers[1])) / 2

# function to normalize a numerical column between 0-1 based on min and and max values
def min_max_normalize(column):
  column = np.array(column)
  norm_col = ( column - np.min(column) ) / ( np.max(column) - np.min(column) )
  return norm_col

In [None]:
# convert release date to years since release
df['Release date'] = df['Release date'].apply(years_since_release)

# return middle value for each given range of estimated owners
df['Estimated owners'] = df['Estimated owners'].apply(est_owners)

# convert windows, mac, and linux columns from boolean to integer
df['Windows'] = df['Windows'].astype(int)
df['Mac'] = df['Mac'].astype(int)
df['Linux'] = df['Linux'].astype(int)

In [None]:
# Filter out any games that are free, have no peak ccu, and no estimated owners
# This allows us to judge success based on games that competed in certain markets, and have had actual people play them
no_peak_ccu_cols = df[df['Peak CCU'] == 0].index
df = df.drop(no_peak_ccu_cols, axis=0)

no_est_owners_cols = df[df['Estimated owners'] == 0].index
df = df.drop(no_est_owners_cols, axis=0)

no_price_cols = df[df['Price'] == 0].index
df = df.drop(no_price_cols, axis=0)

In [None]:
# keep a copy of pre_normalized values

df_orig = df.copy(deep=True)

In [None]:
# normalize any large value ranges
cols_to_normalize = ['Release date', 'Estimated owners', 'Peak CCU', 'Required age', 'Price', 'DLC count',
                     'Metacritic score', 'User score', 'Positive', 'Negative', 'Achievements',
                     'Recommendations', 'Average playtime forever', 'Average playtime two weeks',
                     'Median playtime forever', 'Median playtime two weeks']
for col in cols_to_normalize:
  df[col] = min_max_normalize(df[col])

In [None]:
# If we want to remove rows that have no reviews, we would have 4269 examples
#df = df.dropna(axis=0, subset='Reviews')
#print(df.shape[0])
#df.isnull().sum()

In [None]:
print(df.shape)
df.head()

(20194, 28)


Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,Reviews,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,Achievements,Recommendations,Notes,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags
6,1659180,TD Worlds,0.071606,0.0,2e-06,0.0,0.03946,0.000423,,1,0,0,0.0,0.0,2.2e-05,5.1e-05,0.006313,0.0,,0.0,0.0,0.0,0.0,MAKSIM VOLKAU,MAKSIM VOLKAU,"Single-player,Steam Achievements,Steam Cloud","Indie,Strategy","Tower Defense,Rogue-lite,RTS,Replay Value,Perm..."
7,1968760,Legend of Rome - The Wrath of Mars,0.059585,0.0,1e-06,0.0,0.035751,0.0,,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,magnussoft,magnussoft,"Single-player,Steam Cloud",Casual,
8,1178150,MazM: Jekyll and Hyde,0.138653,0.0,0.0,0.0,0.054295,0.0,,1,0,0,0.0,0.0,7.9e-05,4.3e-05,0.002546,0.0,,0.0,0.0,0.0,0.0,Growing Seeds,"CFK Co., Ltd.","Single-player,Steam Achievements,Full controll...","Adventure,RPG,Simulation,Strategy","Adventure,Simulation,RPG,Strategy,Singleplayer..."
10,1026420,WARSAW,0.157617,0.000333,5e-06,0.0,0.087672,0.0,“New WW2 Strategy Game Offers A Harrowing Look...,1,0,0,0.639175,0.0,0.00061,0.00153,0.003462,0.000475,,0.00046,0.0,0.000446,0.0,Pixelated Milk,"Pixelated Milk,gaming company","Single-player,Steam Achievements,Steam Trading...","Indie,RPG","Tactical RPG,Turn-Based Strategy,Wargame,Histo..."
14,1454010,Diary of Lucie,0.114093,0.0,2e-06,0.0,0.046877,0.0,,1,0,0,0.0,0.0,0.000104,3.6e-05,0.0,0.0,,0.0,0.0,0.0,0.0,Parange Project,Parange Project,"Single-player,Partial Controller Support,Steam...","Action,Adventure,Indie,RPG,Strategy,Early Access","Action Roguelike,Action,Rogue-lite,RPGMaker,My..."


### Counting unique words in Categories, Genres, Tags

'Dumb counting' as in the tags 'turn-based' and 'turn-based combat' or 'turn-based strategy' are different words. These should be ok for word2vec as they're similar.

In [None]:
df.columns
df['Tags']

def count_unique_words(df, label:str):
    lists:pd.Series= df[label].str.casefold().str.split(',')
    words = set()
    [words.update(x) for x in lists if x is not None]
    print(f"Number of unique {label}: {len(words)}")
    return words

count_unique_words(df, 'Categories') # 39
count_unique_words(df, 'Genres') # 27
count_unique_words(df, 'Tags') # 444
;

Number of unique Categories: 39
Number of unique Genres: 27
Number of unique Tags: 444


''

In [None]:
encoded_categories = df['Categories'].str.get_dummies(sep=',')
encoded_genres = df['Genres'].str.get_dummies(sep=',')

df = pd.concat([df, encoded_categories, encoded_genres], axis=1)
df = df.drop(columns=['Categories', 'Genres'], axis=1)

In [None]:
print(df.shape)
df.head()

(20194, 92)


Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,Reviews,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,Achievements,Recommendations,Notes,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Tags,Captions available,Co-op,Commentary available,Cross-Platform Multiplayer,Full controller support,HDR available,In-App Purchases,Includes Source SDK,Includes level editor,LAN Co-op,LAN PvP,MMO,Multi-player,Online Co-op,Online PvP,Partial Controller Support,PvP,Remote Play Together,Remote Play on Phone,Remote Play on TV,Remote Play on Tablet,Shared/Split Screen,Shared/Split Screen Co-op,Shared/Split Screen PvP,Single-player,Stats,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Turn Notifications,Steam Workshop,SteamVR Collectibles,Tracked Controller Support,Tracked Motion Controller Support,VR Only,VR Support,VR Supported,Valve Anti-Cheat enabled,Accounting,Action,Adventure,Animation & Modeling,Audio Production,Casual,Design & Illustration,Early Access,Education,Free to Play,Game Development,Gore,Indie,Massively Multiplayer,Nudity,Photo Editing,RPG,Racing,Sexual Content,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Violent,Web Publishing
6,1659180,TD Worlds,0.071606,0.0,2e-06,0.0,0.03946,0.000423,,1,0,0,0.0,0.0,2.2e-05,5.1e-05,0.006313,0.0,,0.0,0.0,0.0,0.0,MAKSIM VOLKAU,MAKSIM VOLKAU,"Tower Defense,Rogue-lite,RTS,Replay Value,Perm...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
7,1968760,Legend of Rome - The Wrath of Mars,0.059585,0.0,1e-06,0.0,0.035751,0.0,,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,magnussoft,magnussoft,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1178150,MazM: Jekyll and Hyde,0.138653,0.0,0.0,0.0,0.054295,0.0,,1,0,0,0.0,0.0,7.9e-05,4.3e-05,0.002546,0.0,,0.0,0.0,0.0,0.0,Growing Seeds,"CFK Co., Ltd.","Adventure,Simulation,RPG,Strategy,Singleplayer...",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0
10,1026420,WARSAW,0.157617,0.000333,5e-06,0.0,0.087672,0.0,“New WW2 Strategy Game Offers A Harrowing Look...,1,0,0,0.639175,0.0,0.00061,0.00153,0.003462,0.000475,,0.00046,0.0,0.000446,0.0,Pixelated Milk,"Pixelated Milk,gaming company","Tactical RPG,Turn-Based Strategy,Wargame,Histo...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
14,1454010,Diary of Lucie,0.114093,0.0,2e-06,0.0,0.046877,0.0,,1,0,0,0.0,0.0,0.000104,3.6e-05,0.0,0.0,,0.0,0.0,0.0,0.0,Parange Project,Parange Project,"Action Roguelike,Action,Rogue-lite,RPGMaker,My...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0


### Word2Vec embedding for Tags feature

Currently the embedding for the tags is an average of the tags for a given game. This results in d-dimensional feature embedding where d is the numer of dimensions specified in word2vec training.

todo: process hyphenated and multi-word tags. Treat as one phrase by subbing dashes and spaces with an underline

todo: tuning: what do the parameters do? what can be tweaked? what is desired?

todo: CBOW vs CSkipGram

In [None]:
label = 'Tags'
lists:pd.Series= df[label].str.casefold().str.split(',')
# lists.fillna('none')
lists = lists.apply(lambda x: ['none'] if x is None else x)
sentences = [x for x in lists]

In [None]:
print(sentences[9])

['flight', 'simulation', 'vr', 'racing', 'physics', 'open world', 'realistic', 'education', 'exploration', 'jet', '3d vision', 'relaxing', '3d', 'level editor', 'america', 'singleplayer', 'trackir', 'early access', 'indie', 'multiplayer']


In [None]:
import gensim

model_name = "100features_1minwords_10context"
model = None

num_features = 100  # Word vector dimensionality
min_word_count = 1  # Minimum word count
num_workers = 8  # Number of threads to run in parallel
context = 10  # Context window size
downsampling = 1e-3  # Downsample setting for frequent words

def init_sims(model):
    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    print("get_mean_vector is deprecated. Use get_vector(key, norm=True) instead")
    model.init_sims(replace=True)


if check_file_exists(model_name):
    """Normalize Vectors"""
    print("Loading saved model")
    model = gensim.models.Word2Vec.load(model_name)
    init_sims(model)

else:

    # def process_tags(df: pd.DataFrame):
    # model = gensim.models.Word2Vec

    # Code from:

    # https://www.kaggle.com/competitions/word2vec-nlp-tutorial/overview
    # Set values for various parameters

    print("Training model...")
    model = gensim.models.Word2Vec(
        sentences,
        workers=num_workers,
        vector_size=num_features,
        min_count=min_word_count,
        window=context,
        sample=downsampling,
    )

    # It can be helpful to create a meaningful model name and
    # save the model for later use. You can load it later using Word2Vec.load()
    model.save(model_name)

  model.init_sims(replace=True)


Loading saved model
get_mean_vector is deprecated. Use get_vector(key, norm=True) instead


Vector of the tag 'singleplayer'

In [None]:
print(len(model.wv.index_to_key))
print(model.wv.index_to_key[3])
model.wv['action']

445
action


array([-0.08338251, -0.00371678,  0.03575636, -0.05057885,  0.03645275,
       -0.15554576, -0.03281867,  0.22458294,  0.1551109 ,  0.02699728,
        0.06677747,  0.07160099, -0.12958173, -0.04534213,  0.05806217,
       -0.0412467 ,  0.0037306 ,  0.02466516, -0.05094792, -0.0162768 ,
        0.01172866,  0.09708922, -0.01235992,  0.16073892, -0.10226272,
       -0.02056089,  0.05618764,  0.05804021,  0.06256554,  0.07002434,
       -0.05583004, -0.08311781, -0.02151276,  0.03804636,  0.00596616,
        0.22999442,  0.06253067, -0.20352   , -0.00859065, -0.01577525,
       -0.01758286, -0.09250802,  0.16155018,  0.09991846,  0.08035807,
       -0.06930835, -0.04936717, -0.00643551, -0.00752836,  0.05482354,
        0.22482835, -0.01526127, -0.0151998 , -0.06117424, -0.13056271,
        0.07665824,  0.07565914,  0.12799568,  0.00774388,  0.00359856,
       -0.03507788,  0.05250268,  0.06011106, -0.06072224,  0.22354864,
       -0.17595893, -0.09207407,  0.03027374, -0.0574421 , -0.25

Word relative rank (cosine similarity)

In [None]:
model.wv.most_similar('action', topn=10)
model.wv.similar_by_word('action', topn=10) # same result

[('looter shooter', 0.5400166511535645),
 ('shooter', 0.5364624261856079),
 ('arena shooter', 0.5134661793708801),
 ('third-person shooter', 0.5089223384857178),
 ('blood', 0.4981051981449127),
 ('hero shooter', 0.4896371364593506),
 ('parkour', 0.48606041073799133),
 ('spectacle fighter', 0.4774548411369324),
 ('ninja', 0.4746415317058563),
 ('bullet time', 0.4740895628929138)]

In [None]:
type(model)
words = model.wv.index_to_key
words[0:10]
model.wv.most_similar('none') # this needs fixin

[('simulation', 0.13729743659496307),
 ('singleplayer', 0.12166914343833923),
 ('immersive sim', 0.09767644852399826),
 ('realistic', 0.09087114036083221),
 ('6dof', 0.08622349798679352),
 ('relaxing', 0.08355166018009186),
 ('3d vision', 0.0698079839348793),
 ('sailing', 0.06809101998806),
 ('walking simulator', 0.06718388199806213),
 ('procedural generation', 0.06542225927114487)]

In [None]:
"""
    pre-normalizing will discard sentence length information
    this should ignore differences in numbe of tags specified for each game
    Pre-normalize doesnt matter if init_sims(replace=True) since it will
    precompute normalized vectors.
    Not clear what the point of post_normalize is. May be/not good for training
    the regression model down the line.
"""

tags_vectors = [
    model.wv.get_mean_vector(game, pre_normalize=False, post_normalize=False)
    for game in sentences
]

In [None]:
print('Number of games', len(tags_vectors))
tags_vectors[0]

Number of games 20194


array([-0.04236165, -0.0397155 ,  0.00718108,  0.03886624, -0.00631342,
       -0.07859799, -0.07925461, -0.00865282,  0.02961231,  0.01008357,
       -0.07318258,  0.02655539, -0.07126751,  0.04984082, -0.02577426,
       -0.02454956, -0.01926637, -0.03421877, -0.12328963, -0.01276981,
       -0.02784654,  0.00063332,  0.02324038,  0.04100714,  0.0610127 ,
       -0.01182295, -0.0062778 , -0.06118237,  0.03523558,  0.03787564,
        0.05221526,  0.05073253,  0.01472242,  0.05015956, -0.0454899 ,
        0.04029036,  0.085568  , -0.04404866,  0.02936773, -0.04897401,
       -0.02368658,  0.00813999, -0.07315706, -0.07155637, -0.00245235,
       -0.08552153,  0.02514972, -0.04393927,  0.14457056, -0.03062418,
       -0.02172336, -0.01135841,  0.1066888 ,  0.05915803,  0.04104951,
       -0.01435863, -0.02374155, -0.0715857 , -0.03217505, -0.00993358,
        0.0275089 , -0.02946498,  0.00268843, -0.10485699,  0.008396  ,
        0.07491563, -0.02871854,  0.02030951, -0.03381871,  0.00

In [None]:
w2vdf = pd.DataFrame(tags_vectors)
assert w2vdf.shape[1] == num_features
w2vdf.columns = [f'w2v_embed_{i}' for i in range(num_features)]
w2vdf.head()

Unnamed: 0,w2v_embed_0,w2v_embed_1,w2v_embed_2,w2v_embed_3,w2v_embed_4,w2v_embed_5,w2v_embed_6,w2v_embed_7,w2v_embed_8,w2v_embed_9,w2v_embed_10,w2v_embed_11,w2v_embed_12,w2v_embed_13,w2v_embed_14,w2v_embed_15,w2v_embed_16,w2v_embed_17,w2v_embed_18,w2v_embed_19,w2v_embed_20,w2v_embed_21,w2v_embed_22,w2v_embed_23,w2v_embed_24,w2v_embed_25,w2v_embed_26,w2v_embed_27,w2v_embed_28,w2v_embed_29,w2v_embed_30,w2v_embed_31,w2v_embed_32,w2v_embed_33,w2v_embed_34,w2v_embed_35,w2v_embed_36,w2v_embed_37,w2v_embed_38,w2v_embed_39,w2v_embed_40,w2v_embed_41,w2v_embed_42,w2v_embed_43,w2v_embed_44,w2v_embed_45,w2v_embed_46,w2v_embed_47,w2v_embed_48,w2v_embed_49,w2v_embed_50,w2v_embed_51,w2v_embed_52,w2v_embed_53,w2v_embed_54,w2v_embed_55,w2v_embed_56,w2v_embed_57,w2v_embed_58,w2v_embed_59,w2v_embed_60,w2v_embed_61,w2v_embed_62,w2v_embed_63,w2v_embed_64,w2v_embed_65,w2v_embed_66,w2v_embed_67,w2v_embed_68,w2v_embed_69,w2v_embed_70,w2v_embed_71,w2v_embed_72,w2v_embed_73,w2v_embed_74,w2v_embed_75,w2v_embed_76,w2v_embed_77,w2v_embed_78,w2v_embed_79,w2v_embed_80,w2v_embed_81,w2v_embed_82,w2v_embed_83,w2v_embed_84,w2v_embed_85,w2v_embed_86,w2v_embed_87,w2v_embed_88,w2v_embed_89,w2v_embed_90,w2v_embed_91,w2v_embed_92,w2v_embed_93,w2v_embed_94,w2v_embed_95,w2v_embed_96,w2v_embed_97,w2v_embed_98,w2v_embed_99
0,-0.042362,-0.039715,0.007181,0.038866,-0.006313,-0.078598,-0.079255,-0.008653,0.029612,0.010084,-0.073183,0.026555,-0.071268,0.049841,-0.025774,-0.02455,-0.019266,-0.034219,-0.12329,-0.01277,-0.027847,0.000633,0.02324,0.041007,0.061013,-0.011823,-0.006278,-0.061182,0.035236,0.037876,0.052215,0.050733,0.014722,0.05016,-0.04549,0.04029,0.085568,-0.044049,0.029368,-0.048974,-0.023687,0.00814,-0.073157,-0.071556,-0.002452,-0.085522,0.02515,-0.043939,0.144571,-0.030624,-0.021723,-0.011358,0.106689,0.059158,0.04105,-0.014359,-0.023742,-0.071586,-0.032175,-0.009934,0.027509,-0.029465,0.002688,-0.104857,0.008396,0.074916,-0.028719,0.02031,-0.033819,0.005963,-0.060155,0.049079,-0.045255,-0.02253,0.011159,0.12679,0.022675,0.021327,-0.039304,-0.025899,0.069128,-0.037236,-0.037639,0.086177,-0.039807,0.013869,-0.056876,-0.009923,-0.018413,0.033779,0.08069,0.1092,-0.025721,0.005915,0.110515,0.002868,0.013912,0.022002,0.060137,-0.030933
1,0.141421,-0.114861,0.103163,-0.003875,-0.160798,-0.043488,-0.028207,-0.079774,-0.014132,-0.093262,0.105763,0.165632,-0.028266,0.060118,-0.013317,-0.012456,-0.124812,0.025938,0.069481,0.123023,0.116414,0.062964,0.082994,-0.132317,-0.024684,-0.102951,0.121082,0.139341,-0.076092,0.020539,0.090965,0.032215,0.013992,-0.119998,0.175055,-0.129248,-0.107422,-0.100876,-0.113016,-0.062153,-0.167358,0.020975,-0.066767,-0.045422,0.102992,0.063211,0.032419,-0.024463,0.175853,0.005102,0.089292,0.160903,-0.173879,-0.162997,0.17666,0.048724,-0.03357,0.084105,-0.076829,-0.1139,-0.14701,0.128365,-0.043607,-0.134294,-0.105225,0.109945,-0.044147,-0.109416,-0.028806,-0.026234,-0.057947,-0.037196,0.073096,0.025754,-0.06214,0.047933,0.093093,-0.163628,-0.134564,0.112097,-0.159129,0.009404,-0.160121,0.149012,-0.07644,-0.117868,-0.024636,-0.019827,0.134114,0.162181,-0.006245,0.047955,0.084338,0.002542,-0.102904,-0.010583,0.038864,0.147799,-0.063956,-0.175284
2,-0.022132,0.025963,0.025433,0.057263,-0.038729,-0.043358,0.015618,0.046674,-0.100802,0.053062,0.022446,-0.031445,0.021297,0.024636,-0.046013,-0.013252,0.038371,-0.023731,-0.082074,-0.049806,0.079746,0.11147,-0.007608,-0.018769,-0.000774,0.015737,-0.008107,-0.044038,-0.029853,0.016194,0.026405,0.003051,0.071301,-0.050176,-0.069992,0.023157,-0.018712,-0.084169,-0.069598,-0.120828,0.019737,0.006466,-0.023048,-0.075087,-0.041482,-0.072998,-0.013727,-0.069711,-0.026005,-0.024772,-0.043327,0.063455,0.008009,0.01395,-0.005572,-0.050421,-0.089,-0.013039,-0.008816,0.081387,0.019219,-0.043354,-0.006124,-0.068702,-0.038574,0.09451,0.00293,0.020611,-0.037687,-0.012564,-0.019056,-0.017883,-0.046193,0.018912,-0.015693,0.051907,-0.019453,0.052071,0.036939,-0.028809,0.000434,-0.044225,-0.012253,0.068899,-0.07076,0.015082,0.009602,-0.011859,0.056794,0.08297,0.000406,0.025214,-0.039015,-0.041497,-0.055624,0.003352,-0.033368,0.059007,0.010047,0.040872
3,-0.063681,0.018388,0.021117,-0.006,0.003534,-0.138214,-0.037803,-0.023168,-0.009908,0.025597,-0.014868,0.014111,0.017484,-0.071208,-0.015068,-0.00875,-0.021569,-0.075605,-0.030413,-0.002495,0.017905,0.059934,-0.009237,0.046469,0.014646,0.072013,-0.002633,-0.075258,0.018646,0.037228,0.050467,0.071071,0.034646,0.058942,-0.003772,0.011981,0.031888,-0.054018,0.007346,-0.034751,0.049198,-0.019558,-0.118879,-0.120272,0.009276,-0.065154,0.019503,-0.074546,0.023452,-0.061129,-0.064132,-0.027595,0.07331,0.068802,-0.005793,-0.022784,-0.075463,-0.035321,-0.002963,0.053059,0.006244,-0.00192,-0.007207,-0.109374,0.030132,0.081118,0.042491,-0.03787,0.01541,-0.004702,-0.122395,0.002318,-0.07185,-0.06825,-0.023416,0.14141,0.008027,0.00025,-0.039582,0.003886,-0.011436,-0.10681,-0.011748,0.118608,-0.100813,0.009755,-0.029597,-0.038503,0.039548,-0.015545,0.100134,0.037164,0.012964,0.041032,0.026897,-0.020428,0.080449,-0.0057,0.003882,0.060704
4,-0.105113,0.046602,0.068389,-0.01554,0.050488,-0.114008,-0.053554,0.01454,-0.040209,-0.007903,-0.020027,-0.004494,-0.047387,-0.010693,0.022427,-0.002459,-0.029021,0.043505,-0.111444,-0.016489,0.011555,0.023755,0.060295,0.071686,-0.024349,0.037983,0.044975,-0.073465,0.02858,0.030301,0.052946,-0.032675,0.045339,0.038135,-0.028887,0.042944,0.079069,-0.059593,0.042253,-0.053752,0.034782,-0.026121,-0.082567,-0.023223,-0.010183,-0.085475,0.028828,-0.12428,0.093037,-0.007706,-0.023519,0.005729,0.076348,0.022473,0.032121,0.01351,-0.00814,-0.035666,-0.006778,-0.00628,-0.009116,-0.03236,-0.044648,-0.085198,0.031588,0.049246,0.018568,-0.009395,-0.046026,0.029738,-0.087649,-0.03814,-0.02214,0.031563,0.005549,0.133277,0.030024,0.013612,-0.081484,-0.042314,-0.007446,-0.080258,0.008051,0.084997,-0.109621,0.069639,-0.046049,-0.039817,-0.012025,0.014157,0.0624,0.032301,0.005769,-0.012577,0.070139,0.03253,-0.020943,-0.006377,0.0133,0.025031


#### Dropping Tags columns and merging embeds

In [None]:
""" Trick to prevent this from executing twice """
try:
    check_if_w2vdf_already_concat
except NameError:
    df.drop(columns=['Tags'])
    df.reset_index(drop=True, inplace=True)
    df = pd.concat([df, w2vdf], axis=1)
    check_if_w2vdf_already_concat = 1

# del check_if_w2vdf_already_concat

In [None]:
df.shape

(20194, 92)

## Train/test data extraction + Regression model selection

The most important metrics when determinng a game's success include the number of estimated owners, peak ccu, number of pos/neg reveiws, and price.

In [None]:
y = np.array(df[['Estimated owners', 'Peak CCU', 'Positive', 'Negative', 'Price']])
X = np.array(df.drop(columns=['AppID', 'Name', 'Estimated owners', 'Peak CCU', 'Positive', 'Negative', 'Price', 'Reviews', 'Notes', 'Developers', 'Publishers', 'Tags'], axis=1))

print(X.shape)
print(y.shape)

(20194, 80)
(20194, 5)


In [None]:
#print(X[0,:]) # ensure all data is numerical

RandomForestRegressor is used to handle non-linear relationships between a game and the metrics we are predicting. MultiOutputRegressor provides easier setup for the model.

A grid search will also be done on the hyperparemeters for the random forest regressor.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 70% training data, 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA
pca = PCA(n_components=52)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)


print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(16155, 52) (16155, 5)
(4039, 52) (4039, 5)


In [None]:
# Cumulative explained variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Find the number of components for 80% variance
n_components = np.argmax(cumulative_variance >= 0.875) + 1  # Add 1 because index starts at 0

print(f"Number of components to preserve 87.5% variance: {n_components}")

Number of components to preserve 87.5% variance: 52


Disclaimer, this cell takes hours to complete!

In [None]:
perform_search = False
grid_search = None
if perform_search == True:
      # perform a grid search on hyperparameters for random forest
      # -1 to utilize all processors and speed up training time
      rf = RandomForestRegressor(random_state=42, n_jobs=-1)
      model = MultiOutputRegressor(rf, n_jobs=-1)

      param_grid = [
      {'estimator__n_estimators': [20, 50, 100, 150, 200, 250],
       'estimator__max_features': [1, 20, 'sqrt', 50, 70, 90, 110],
       'estimator__max_depth': [None, 10, 20, 30, 40, 50]}
      ]

      grid_search = GridSearchCV(model, param_grid, n_jobs=-1)
      grid_search.fit(X_train, y_train)

      pickle_model("rf_gridsearch_obj", grid_search, grid_search.best_params_, param_grid[0])
      print(grid_search.best_params_)

Best hyperparams were a max branch depth of 50, a random subset of 70 features for splitting branches, and 150 estimators/trees for random forest. These parameters are the most infuential to model capacity, generalization, and computation. Other parameters like min_samples_split were ommitted from grid search since the default is adequte to recognize patterns in the data.

In [None]:
from sklearn.metrics import r2_score

if perform_search == True:
    model = grid_search.best_estimator_
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
    rmse = np.sqrt(mse)
    r2_score_values= r2_score(y_test, y_pred, multioutput='raw_values')

    # ['Estimated owners', 'Peak CCU', 'Positive', 'Negative', 'Price']
    print("Test set Mean Squared Error:", mse)
    print("Test set Root Mean Squared Error:", rmse)
    print("Test set R2 Score:", r2_score_values)

In [None]:
# load saved model
""" You can download from link in README """
loaded_grid = unpickle_model("rf_gridsearch_obj-50-70-150")
if perform_search == False:
    model = loaded_grid.best_estimator_
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
    rmse = np.sqrt(mse)
    r2_score_values= r2_score(y_test, y_pred, multioutput='raw_values')

    # ['Estimated owners', 'Peak CCU', 'Positive', 'Negative', 'Price']
    print("Test set Mean Squared Error:", mse)
    print("Test set Root Mean Squared Error:", rmse)
    print("Test set R2 Score:", r2_score_values)

FileNotFoundError: [Errno 2] No such file or directory: './models/rf_gridsearch_obj-50-70-150.pkl.gz'

In [None]:
loaded_grid.best_params_

{'estimator__max_depth': 50,
 'estimator__max_features': 70,
 'estimator__n_estimators': 150}

## Train model with best parameters and transformed data (pca)

In [None]:
rf_pca = RandomForestRegressor(random_state=42, n_jobs=-1, max_depth=50, max_features=70, n_estimators=150, verbose=1)
model_pca = MultiOutputRegressor(rf_pca, n_jobs=-1)


model_pca.fit(X_train, y_train)
y_pred = model_pca.predict(X_test)

mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
rmse = np.sqrt(mse)
r2_score_values= r2_score(y_test, y_pred, multioutput='raw_values')

# ['Estimated owners', 'Peak CCU', 'Positive', 'Negative', 'Price']
print("Test set Mean Squared Error:", mse)
print("Test set Root Mean Squared Error:", rmse)
print("Test set R2 Score:", r2_score_values)

Test set Mean Squared Error: [8.28342097e-05 1.27397253e-05 5.76919979e-05 1.57497508e-04
 1.41143950e-03]
Test set Root Mean Squared Error: [0.00910133 0.00356928 0.00759552 0.0125498  0.03756913]
Test set R2 Score: [0.65112391 0.14897018 0.91062271 0.64516019 0.30594863]


In [51]:
predict_labels = ["Estimated owners", "Peak CCU", "Positive", "Negative", "Price"]
calc: DataMinMax = DataMinMax()
for i, label in enumerate(predict_labels):
    calc.data[label] = {
        "min": df_orig[label].min(),
        "max": df_orig[label].max(),
        "r2": r2_score_values[i],
        "rmse": rmse[i],
    }
# owners = df_orig['Estimated owners']
# calc.owners = (min())
# du

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [50]:
# Convert the data into a DataFrame
rows = []
for label in predict_labels:

    fmt_int = lambda x: f"{x:,.0f}"
    fmt_float = lambda x, precision=2: f"{x:,.{precision}f}"
    min_val = calc.data[label]["min"]
    max_val = calc.data[label]["max"]
    rmse_val = calc.data[label]["rmse"]
    range_val = max_val - min_val
    range_percent = rmse_val * 100
    prediction = rmse_val*range_val
    rows.append({
        "Metric": label,
        "Prediction": fmt_float(prediction, 2),
        "Min": fmt_float(min_val,2),
        "Max": fmt_float(max_val),
        "RMSE": fmt_float(rmse_val, 4),
        "Range (%)": fmt_float(range_percent,2),
    })

# Create DataFrame
df_result = pd.DataFrame(rows)

# Display the DataFrame
df_result

Unnamed: 0,Metric,Prediction,Min,Max,RMSE,Range (%)
0,Estimated owners,697919.96,10000.0,75000000.0,0.0093,0.93
1,Peak CCU,4629.26,1.0,872138.0,0.0053,0.53
2,Positive,10371.33,0.0,964983.0,0.0107,1.07
3,Negative,1983.94,0.0,138530.0,0.0143,1.43
4,Price,10.21,0.35,269.99,0.0379,3.79


Looking at the R2 Score, which indicates how much of the variance the model is abble to predicy, the model is able to capture underlying patterns well for the estimated owners, positive number of reviews, and negative number of reviews. This suggests that relationships between the features and target variables are relatively strong, making them easier to predict.

This is logical. Game characteristics like developers, publishers, and categories will directly influence price and peak ccu  counts more so than the other target variables. Since these aren't taken into account during training to avoid too many feature encodings, the correlation between these characteristics makes them harder to predict. **This will help us assign a score to each prediction when defining a success rating.**