We will try to build a basic system that gives some advice about which games are going to be interesting based on train data (some marks we gave to already completed games)

In [1]:
import numpy as np
import pandas as pd
import pickle

from io import StringIO
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

First, let's load the dataset and have a look at it

In [2]:
dataset = pd.read_csv('/kaggle/input/steam-games-dataset/dataset.csv')

pd.set_option('display.max_rows', 500)

# we drop this column for this particular task because 
# there are too few games that actually have this rating
dataset = dataset.drop('metacritic_rating', axis=1)

dataset

Unnamed: 0,id,name,year,reviewer_rating,positivity_ratio,to_beat_main,to_beat_extra,to_beat_completionist,extra_content_length,tags
0,96000,The Tiny Bang Story,2011.0,8.0,7.073879,3.60,3.60,3.77,0.17,Story Rich|Steampunk|Adventure|Atmospheric|Puz...
1,262410,World of Guns: Gun Disassembly,2014.0,8.0,5.208940,2.00,,28.07,26.07,Horror|First-Person|Historical|Multiplayer|Str...
2,1250410,Microsoft Flight Simulator 40th Anniversary Ed...,2020.0,6.0,3.581082,,,,,Multiplayer|Adventure|VR|Action Roguelike|Phys...
3,365450,Hacknet,2015.0,8.0,14.548520,7.06,8.73,10.75,3.69,Horror|Story Rich|Hacking|Crime|Multiplayer|Dy...
4,92800,SpaceChem,2011.0,8.0,11.440415,43.32,57.79,67.55,24.23,Automation|Strategy|Building|Puzzle|Science|Pr...
...,...,...,...,...,...,...,...,...,...,...
63538,521720,Uncharted Waters,2017.0,7.0,6.000000,27.88,29.86,,-27.88,Retro|RPG|Trading
63539,34311,Kid Chameleon™,2010.0,,6.000000,4.38,8.15,11.17,6.79,Retro|Platformer
63540,34289,Fatal Labyrinth™,2010.0,7.0,5.000000,4.00,4.58,4.60,0.60,RPG
63541,628150,Sangokushi Eiketsuden,2017.0,6.0,3.153846,,,,,RPG


Let's now start processing the dataset. The first logical step is to somehow turn the "tags" column into features. This is a good case for one-hot-encoding. We'll use MultiLabelBinarizer to complete the task

In [3]:
dataset['tags'] = dataset['tags'].str.split('|')
mlb = MultiLabelBinarizer()
one_hot_df = dataset.join(pd.DataFrame(mlb.fit_transform(dataset.pop('tags')),
                          columns=mlb.classes_,
                          index=dataset.index))
one_hot_df

Unnamed: 0,id,name,year,reviewer_rating,positivity_ratio,to_beat_main,to_beat_extra,to_beat_completionist,extra_content_length,1980s,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,eSports
0,96000,The Tiny Bang Story,2011.0,8.0,7.073879,3.60,3.60,3.77,0.17,0,...,0,0,0,0,0,0,0,0,0,0
1,262410,World of Guns: Gun Disassembly,2014.0,8.0,5.208940,2.00,,28.07,26.07,0,...,0,0,0,0,0,0,0,0,0,0
2,1250410,Microsoft Flight Simulator 40th Anniversary Ed...,2020.0,6.0,3.581082,,,,,0,...,0,0,0,0,0,0,0,0,0,0
3,365450,Hacknet,2015.0,8.0,14.548520,7.06,8.73,10.75,3.69,0,...,0,0,0,0,0,0,0,0,0,0
4,92800,SpaceChem,2011.0,8.0,11.440415,43.32,57.79,67.55,24.23,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63538,521720,Uncharted Waters,2017.0,7.0,6.000000,27.88,29.86,,-27.88,0,...,0,0,0,0,0,0,0,0,0,0
63539,34311,Kid Chameleon™,2010.0,,6.000000,4.38,8.15,11.17,6.79,0,...,0,0,0,0,0,0,0,0,0,0
63540,34289,Fatal Labyrinth™,2010.0,7.0,5.000000,4.00,4.58,4.60,0.60,0,...,0,0,0,0,0,0,0,0,0,0
63541,628150,Sangokushi Eiketsuden,2017.0,6.0,3.153846,,,,,0,...,0,0,0,0,0,0,0,0,0,0


Now we have a separate column for every tag with a 0 or 1 in it. It shows whether or not the particular game has a particular feature. Let's take a look at other columns. Their values vary a lot. It's probably a good idea to scale them. We will use MinMaxScaler to bring the values in numeric columns to a range of [0;1]. Also we need to fill the NaN values. We'll choose the simplest method and replace them with column mean values.

In [4]:
one_hot_df['reviewer_rating'].fillna(0, inplace=True)
one_hot_df['reviewer_rating'] = one_hot_df['reviewer_rating'] / 10

columns_to_scale = ['year', 'positivity_ratio', 'to_beat_main', 'to_beat_extra',
                    'to_beat_completionist', 'extra_content_length']

for column in columns_to_scale:
  if column== 'positivity_ratio':
    one_hot_df['positivity_ratio'].fillna(0, inplace=True)
  else:
    one_hot_df[column].fillna(one_hot_df[column].mean(), inplace=True)
  one_hot_df[column] = MinMaxScaler().fit_transform(one_hot_df[[column]])

one_hot_df

Unnamed: 0,id,name,year,reviewer_rating,positivity_ratio,to_beat_main,to_beat_extra,to_beat_completionist,extra_content_length,1980s,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,eSports
0,96000,The Tiny Bang Story,0.538462,0.8,0.011969,0.001705,0.000549,0.000245,0.120902,0,...,0,0,0,0,0,0,0,0,0,0
1,262410,World of Guns: Gun Disassembly,0.653846,0.8,0.008814,0.000945,0.003073,0.001831,0.122389,0,...,0,0,0,0,0,0,0,0,0,0
2,1250410,Microsoft Flight Simulator 40th Anniversary Ed...,0.884615,0.6,0.006059,0.004504,0.003073,0.002241,0.121927,0,...,0,0,0,0,0,0,0,0,0,0
3,365450,Hacknet,0.692308,0.8,0.024617,0.003348,0.001338,0.000701,0.121104,0,...,0,0,0,0,0,0,0,0,0,0
4,92800,SpaceChem,0.538462,0.8,0.019358,0.020567,0.008886,0.004409,0.122284,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63538,521720,Uncharted Waters,0.769231,0.7,0.010152,0.013235,0.004589,0.002241,0.119292,0,...,0,0,0,0,0,0,0,0,0,0
63539,34311,Kid Chameleon™,0.500000,0.0,0.010152,0.002075,0.001249,0.000728,0.121282,0,...,0,0,0,0,0,0,0,0,0,0
63540,34289,Fatal Labyrinth™,0.500000,0.7,0.008460,0.001895,0.000700,0.000299,0.120927,0,...,0,0,0,0,0,0,0,0,0,0
63541,628150,Sangokushi Eiketsuden,0.769231,0.6,0.005336,0.004504,0.003073,0.002241,0.121927,0,...,0,0,0,0,0,0,0,0,0,0


Now let's move to train set. We'll use a sample train set with two columns: a game name and a mark from 1 to 10, which shows how enjoyable was the game.

In [5]:
SAMPLE_TEST_SET = """
name;y
Adam Wolfe;7
Alan Wake;8
Alpha Protocol™;9
Assassin's Creed 2;7
Assassin's Creed™: Director's Cut Edition;7
Back to the Future: Ep 2 - Get Tannen!;8
Back to the Future: Ep 3 - Citizen Brown;8
Back to the Future: Ep 4 - Double Visions;8
Back to the Future: The Game;8
Bastion;9
Batman: Arkham Asylum Game of the Year Edition;7
Batman: Arkham City - Game of the Year Edition;7
Batman: Arkham City;7
BioShock™ Remastered;7
BioShock™;7
Borderlands Game of the Year Enhanced;8
Borderlands Game of the Year;8
Botanicula;7
Brothers - A Tale of Two Sons;10
Brutal Legend;9
Call of Juarez;6
Crashday Redline Edition;7
Crysis;8
Dead Space (2008);9
Dead Space;9
Dead State: Reanimated;8
DeathSpank: Thongs of Virtue;5
Deponia;9
Deus Ex: Game of the Year Edition;8
Disney•Pixar Cars 2: The Video Game;6
Don't Escape: 4 Days to Survive;9
Dragon Age: Origins;9
Dragon Age™: Origins Awakening;8
Drakensang: The River of Time;8
Dreamfall: The Longest Journey;9
Dreamfall Chapters;6
Duke Nukem Forever;5
Exorder;5
Fahrenheit: Indigo Prophecy Remastered;9
Fallout: A Post Nuclear Role Playing Game;8
Far Cry®;6
Firewatch;9
FlatOut 2™;9
Frozen Cortex;8
Game of Thrones - A Telltale Games Series;8
Gatling Gears;5
Gone Home;4
Gothic 1;5
Grand Theft Auto V;8
Grotesque Tactics 2 – Dungeons and Donuts;8
Halfway;9
Hunted: The Demon’s Forge™;8
INSIDE;9
Into the Breach;8
J.U.L.I.A.: Among the Stars;9
Judgment: Apocalypse Survival Simulation;8
L.A. Noire;8
LIMBO;8
Legend of Grimrock;8
Life is Strange - Episode 1;9
Life is Strange Remastered;9
Life is Strange: Before the Storm Remastered;7
Life is Strange: Before the Storm;7
LISA: The Painful;2
Lost Horizon;7
Mars: War Logs;8
Mass Effect (2007);9
Mass Effect 2 (2010 Edition);10
Mass Effect 2 (2010) Edition;10
Mass Effect™ 3 N7 Digital Deluxe Edition (2012);10
Mass Effect™ Legendary Edition;10
Max Payne 2: The Fall of Max Payne;7
Max Payne 3;7
Max Payne RU;8
Max Payne;8
Memento Mori;9
Metro 2033 Redux;8
Never Again;2
Omerta - City of Gangsters;9
Oxenfree;9
Phantom Doctrine;8
Prey;9
Primordia;8
Prototype™;7
S.T.A.L.K.E.R.: Shadow of Chernobyl;9
SOMA;10
Saints Row 2;6
Serious Sam Classic: The First Encounter;4
Serious Sam HD: The First Encounter;4
Shadowrun: Dragonfall - Director's Cut;8
Shadowrun Returns;9
Sherlock Holmes: Crimes and Punishments;8
Sherlock Holmes: The Devil's Daughter;7
Sleeping Dogs;8
Sleeping Dogs: Definitive Edition;8
South Park™: The Stick of Truth™;3
Spaceland: Sci-Fi Indie Tactics;3
Spec Ops: The Line;8
STAR WARS™ Knights of the Old Republic™;8
STAR WARS™ Knights of the Old Republic™ II - The Sith Lords™;6
Stellaris;8
Sunrider: Mask of Arcadius;3
Syberia II;8
Syberia;8
Tales from the Borderlands;9
The Book of Unwritten Tales 2;8
The Book of Unwritten Tales: The Critter Chronicles;8
The Book of Unwritten Tales;8
The Bureau: XCOM Declassified;9
The First Templar - Steam Special Edition;7
The Longest Journey;9
The Tiny Bang Story;7
The Walking Dead: A New Frontier;9
The Walking Dead: Michonne - A Telltale Miniseries;8
The Walking Dead: Season Two;9
The Walking Dead;10
The Witcher: Enhanced Edition Director's Cut;7
The Wolf Among Us;9
Tomb Raider;8
Transistor;7
UFO: Afterlight;9
Undertale;9
Valiant Hearts: The Great War™ / Soldats Inconnus : Mémoires de la Grande Guerre™;10
Vampire: The Masquerade - Bloodlines;9
Vaporum;8
Wasteland 2: Director's Cut;9
Welcome to Bummertown;8
X-COM: UFO Defense;9
XCOM: Enemy Unknown;10
XCOM® 2;10
XCOM®: Chimera Squad;8
Yesterday;8
Zanzarah: The Hidden Portal;8
"""

In [6]:
train_raw = pd.read_csv(StringIO(SAMPLE_TEST_SET), sep=';')
train_raw

Unnamed: 0,name,y
0,Adam Wolfe,7
1,Alan Wake,8
2,Alpha Protocol™,9
3,Assassin's Creed 2,7
4,Assassin's Creed™: Director's Cut Edition,7
5,Back to the Future: Ep 2 - Get Tannen!,8
6,Back to the Future: Ep 3 - Citizen Brown,8
7,Back to the Future: Ep 4 - Double Visions,8
8,Back to the Future: The Game,8
9,Bastion,9


Now let's merge our train data with the dataset and see what we get

In [7]:
train_with_info = pd.merge(one_hot_df, train_raw, on='name').drop('id', axis=1)
train_with_info

Unnamed: 0,name,year,reviewer_rating,positivity_ratio,to_beat_main,to_beat_extra,to_beat_completionist,extra_content_length,1980s,1990's,...,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,eSports,y
0,The Tiny Bang Story,0.538462,0.8,0.011969,0.001705,0.000549,0.000245,0.120902,0,0,...,0,0,0,0,0,0,0,0,0,7
1,Undertale,0.692308,0.9,0.049315,0.003205,0.001674,0.001357,0.121699,0,0,...,0,0,0,0,0,0,0,0,0,9
2,LIMBO,0.538462,0.8,0.023526,0.001695,0.000635,0.000426,0.121062,0,0,...,0,0,0,0,0,0,0,0,0,8
3,Bastion,0.538462,0.9,0.036343,0.002963,0.001414,0.001327,0.121702,0,0,...,0,0,0,0,0,0,0,0,0,9
4,Fallout: A Post Nuclear Role Playing Game,0.461538,0.8,0.025091,0.00765,0.003526,0.002118,0.121831,0,0,...,0,0,0,0,0,0,0,0,0,8
5,Transistor,0.653846,0.8,0.028094,0.002844,0.001275,0.001018,0.121444,0,0,...,0,0,0,0,0,0,0,0,0,7
6,Deponia,0.576923,0.8,0.011832,0.003861,0.001494,0.000688,0.121031,0,0,...,0,0,0,0,0,0,0,0,0,9
7,Botanicula,0.576923,0.9,0.039929,0.001653,0.000588,0.00033,0.120984,0,0,...,0,0,0,0,0,0,0,0,0,7
8,LISA: The Painful,0.653846,0.9,0.043178,0.005186,0.002328,0.001649,0.121716,0,0,...,0,0,0,0,0,0,0,0,0,2
9,The Longest Journey,0.384615,0.8,0.016123,0.008301,0.003194,0.001418,0.121136,0,0,...,0,0,0,0,0,0,0,0,0,9


We almost have a fully prepared train set. Although there is one important problem. Right now we have more columns than rows (the number of features is greater than the number of observations). Classic models aren't going to be very effective in this case. It's probably a good idea to decrease the number of features. Let's try to figure out which features are underrepresented in our train set and drop the corresponding columns.

In [8]:
# these are the columns we will not drop no matter what
fixed_column_names = ['name', 'year', 'reviewer_rating', 'positivity_ratio',
                      'to_beat_main', 'to_beat_extra', 'to_beat_completionist',
                      'extra_content_length', 'y']
feature_cutoff = 0
# find out how many games have particular features
column_stats = train_with_info.drop(fixed_column_names, axis=1).sum(axis=0)[1:].sort_values()

while feature_cutoff < len(train_with_info):
    good_columns = column_stats[column_stats >= feature_cutoff]
    if len(good_columns) + len(fixed_column_names) < len(train_with_info):
        # we've found out a cutoff value that keeps the number of features less than number of games
        good_column_names = list(good_columns.index)
        break
    else:
        # we still have too many features
        feature_cutoff += 1
        
print(f'Feature cutoff stopped at {feature_cutoff}')
print(f'{len(good_columns) + len(fixed_column_names)} features were preserved')

for (i, column) in enumerate(fixed_column_names):
  if column == 'y':
    continue
  good_column_names.insert(i, column)
good_column_names.append('y')

train_with_good_features = train_with_info[good_column_names]
train_with_good_features


Feature cutoff stopped at 5
134 features were preserved


Unnamed: 0,name,year,reviewer_rating,positivity_ratio,to_beat_main,to_beat_extra,to_beat_completionist,extra_content_length,Dating Sim,Side Scroller,...,Third Person,Sci-fi,RPG,Great Soundtrack,Story Rich,Action,Atmospheric,Adventure,Singleplayer,y
0,The Tiny Bang Story,0.538462,0.8,0.011969,0.001705,0.000549,0.000245,0.120902,0,0,...,0,0,0,1,1,0,1,1,1,7
1,Undertale,0.692308,0.9,0.049315,0.003205,0.001674,0.001357,0.121699,1,0,...,0,0,1,1,1,0,0,0,1,9
2,LIMBO,0.538462,0.8,0.023526,0.001695,0.000635,0.000426,0.121062,0,1,...,0,0,0,0,0,1,1,1,1,8
3,Bastion,0.538462,0.9,0.036343,0.002963,0.001414,0.001327,0.121702,0,0,...,0,0,1,1,1,1,1,1,1,9
4,Fallout: A Post Nuclear Role Playing Game,0.461538,0.8,0.025091,0.00765,0.003526,0.002118,0.121831,0,0,...,0,1,1,0,0,0,1,1,1,8
5,Transistor,0.653846,0.8,0.028094,0.002844,0.001275,0.001018,0.121444,0,0,...,0,1,1,1,1,1,1,1,1,7
6,Deponia,0.576923,0.8,0.011832,0.003861,0.001494,0.000688,0.121031,0,0,...,0,1,0,1,1,0,1,1,1,9
7,Botanicula,0.576923,0.9,0.039929,0.001653,0.000588,0.00033,0.120984,0,0,...,0,0,0,1,1,0,1,1,1,7
8,LISA: The Painful,0.653846,0.9,0.043178,0.005186,0.002328,0.001649,0.121716,0,1,...,0,0,1,1,1,0,1,1,1,2
9,The Longest Journey,0.384615,0.8,0.016123,0.008301,0.003194,0.001418,0.121136,0,0,...,1,1,1,1,1,1,1,1,1,9


We are all set! Now let's make some final preparations.

In [9]:
names = train_with_good_features['name']
y = train_with_good_features['y']
X = train_with_good_features.drop('name', axis=1).drop('y', axis=1)

In [10]:
!pip install catboost

[0m

We'll make a simple function that checks the quality of a model

In [11]:
from sklearn.model_selection import cross_val_score

def get_metric(model):
  model.fit(X, y)
  accuracies = cross_val_score(model, X, y = y)
  return sum(accuracies) / len(accuracies)

For demonstration purposes we'll take several basic regression models and check their performance at default settings

In [12]:
from sklearn.linear_model import (SGDRegressor, LinearRegression, Ridge,
                                  Lasso, BayesianRidge,
                                  TweedieRegressor, ElasticNet,
                                  PassiveAggressiveRegressor,
                                  ARDRegression, HuberRegressor)
from sklearn.ensemble import (HistGradientBoostingRegressor, AdaBoostRegressor,
                              RandomForestRegressor, ExtraTreesRegressor,
                              GradientBoostingRegressor, VotingRegressor,
                              StackingRegressor)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

models = [('Linear Regression', LinearRegression()),
          ('Ridge', Ridge(random_state=0)),
          ('Lasso', Lasso(random_state=0)),
          ('BayesianRidge', BayesianRidge()),
          ('TweedieRegressor', TweedieRegressor()),
          ('ElasticNet', ElasticNet(random_state=0)),
          ('SGDRegressor', SGDRegressor(random_state=0)),
          ('PassiveAggressiveRegressor', PassiveAggressiveRegressor(random_state=0)),
          ('ARDRegression',  ARDRegression()),
          ('HuberRegressor', HuberRegressor(max_iter=10000)),
          ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=0)),
          ('KernelRidge', KernelRidge()),
          ('SVR', SVR()),
          ('NuSVR', NuSVR()),
          ('LinearSVR', LinearSVR(max_iter=10000, random_state=0)),
          ('GaussianProcessRegressor', GaussianProcessRegressor(random_state=0)),
          ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=0)),
          ('RandomForestRegressor', RandomForestRegressor(random_state=0)),
          ('ExtraTreesRegressor', ExtraTreesRegressor(random_state=0)),
          ('AdaBoostRegressor', AdaBoostRegressor(random_state=0)),
          ('HistGradientBoostingRegressor',  HistGradientBoostingRegressor(random_state=0)),
          ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=0)),
          ('MLPRegressor', MLPRegressor(max_iter=10000, random_state=0)),
          ('CatBoostRegressor', CatBoostRegressor(silent=True, random_state=0)),
          ('XGBRegressor', XGBRegressor(random_state=0)),
          ('LGBMRegressor', LGBMRegressor(random_state=0))]

results = []
for model in models:
  print(f'running {model[0]}')
  result = get_metric(model[1])
  results.append((model[0], result))
  results = sorted(results, key=lambda x: x[1], reverse=True)
  if ((model[0], result) in results[:5]):
    print(f'NEW BEST: {model[0]} WITH {result}')

results

running Linear Regression
NEW BEST: Linear Regression WITH -7.351022214722036
running Ridge
NEW BEST: Ridge WITH -1.1139019871283196
running Lasso
NEW BEST: Lasso WITH -0.017856626150436528
running BayesianRidge
NEW BEST: BayesianRidge WITH -0.04156840273819331
running TweedieRegressor
NEW BEST: TweedieRegressor WITH 0.007982920822204998
running ElasticNet
NEW BEST: ElasticNet WITH -0.017856626150436528
running SGDRegressor
NEW BEST: SGDRegressor WITH -0.593784571718609
running PassiveAggressiveRegressor
running ARDRegression
running HuberRegressor


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


running GradientBoostingRegressor
running KernelRidge
running SVR
NEW BEST: SVR WITH 0.01965680224059303
running NuSVR
NEW BEST: NuSVR WITH 0.039253295811496926
running LinearSVR
running GaussianProcessRegressor
running DecisionTreeRegressor
running RandomForestRegressor
running ExtraTreesRegressor
running AdaBoostRegressor
running HistGradientBoostingRegressor
running GradientBoostingRegressor
running MLPRegressor
running CatBoostRegressor
running XGBRegressor
running LGBMRegressor


[('NuSVR', 0.039253295811496926),
 ('SVR', 0.01965680224059303),
 ('TweedieRegressor', 0.007982920822204998),
 ('Lasso', -0.017856626150436528),
 ('ElasticNet', -0.017856626150436528),
 ('BayesianRidge', -0.04156840273819331),
 ('CatBoostRegressor', -0.04719565746781742),
 ('AdaBoostRegressor', -0.06547390753879748),
 ('HistGradientBoostingRegressor', -0.11210412817299009),
 ('LGBMRegressor', -0.1434006065091754),
 ('RandomForestRegressor', -0.3679295365038864),
 ('SGDRegressor', -0.593784571718609),
 ('GradientBoostingRegressor', -0.8361904739637982),
 ('GradientBoostingRegressor', -0.8361904739637982),
 ('Ridge', -1.1139019871283196),
 ('LinearSVR', -1.1619856675020535),
 ('XGBRegressor', -1.1748967317110157),
 ('ExtraTreesRegressor', -1.201611603897721),
 ('KernelRidge', -1.3310038248680829),
 ('DecisionTreeRegressor', -1.418987805138407),
 ('MLPRegressor', -1.9116736585856509),
 ('PassiveAggressiveRegressor', -3.0981665295228016),
 ('ARDRegression', -4.1833761749617455),
 ('HuberRe

As we can see, the NuSVR regressor shows the best results. Although the nature of our data and features call for some modifications. We can tune the hyperparameters further but for demonstration let's stop at this set:

In [13]:
model = NuSVR(kernel='poly', degree=6)
get_metric(model)

0.0710030214640335

Now let's see how well the model fit the data

In [14]:
for name, predict, value in zip(names, model.predict(X), y):
    print(f'{name} - predicted {predict}, actual {value}')

The Tiny Bang Story - predicted 7.000176882611326, actual 7
Undertale - predicted 8.999644989601174, actual 9
LIMBO - predicted 8.000221869044703, actual 8
Bastion - predicted 8.999621958849627, actual 9
Fallout: A Post Nuclear Role Playing Game - predicted 7.999921529020748, actual 8
Transistor - predicted 6.999946741342952, actual 7
Deponia - predicted 8.99986179708746, actual 9
Botanicula - predicted 7.000244142547687, actual 7
LISA: The Painful - predicted 4.409186978648192, actual 2
The Longest Journey - predicted 8.999850145987716, actual 9
Yesterday - predicted 7.999603848464128, actual 8
Yesterday - predicted 7.967324663601281, actual 8
Primordia - predicted 7.999897412602434, actual 8
Don't Escape: 4 Days to Survive - predicted 8.999931894290356, actual 9
Lost Horizon - predicted 7.818979621155854, actual 7
Judgment: Apocalypse Survival Simulation - predicted 7.999886009448895, actual 8
Halfway - predicted 8.522346329131487, actual 9
Welcome to Bummertown - predicted 8.0003836

The model did a fairly good job and now we can make some predictions and see what is recomended

In [15]:
good_column_names.remove('y')
one_hot_df_trimmed = one_hot_df[good_column_names]

test_names = one_hot_df_trimmed['name']
test_X = one_hot_df_trimmed.drop('name', axis=1)

predictions = model.predict(test_X)
names_with_preds = list(zip(test_names, predictions))
names_with_preds = sorted(names_with_preds, key=lambda x: x[1], reverse=True)
for name, score in names_with_preds[:100]:
    if name not in train_with_info.name.values:
      print(f'{name} - {score}')

Mass Effect™: Andromeda Deluxe Edition - 8.883240554882935
Old World - 8.781372880802365
The Walking Dead: The Final Season - 8.606596157228328
"Warhammer 40,000: Space Wolf" - 8.59458134266501
Templar Battleforce - 8.575027431385333
Empire of Sin - 8.492079847208457
Scenner - 8.455187725020732
S.T.A.L.K.E.R.: Call of Pripyat - 8.425502721557129
Deponia: The Complete Journey - 8.41319224876878
RAM Pressure - 8.390250781883664
Everreach: Project Eden - 8.385024375072453
Batman: The Enemy Within - The Telltale Series - 8.351988958221813
Starless - 8.3492860501441
Eternal Starshine - 8.33788468101319
Robothorium - 8.325999667472153
Marvel's Guardians of the Galaxy - 8.316195763092146
Galaxy Squad - 8.309027005535498
Dead Space™ 2 - 8.302152873646978
Towertale - 8.300189908349308
Xenonauts - 8.287635522230268
Clanfolk - 8.265015667592705
Mainframe Defenders - 8.262125046076246
Star Traders: Frontiers - 8.259568664775081
Airtight City 密闭之城1.0 - 8.256706657604987
Shadow Empire - 8.2400471366