## Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

## Import Data

In [2]:
# Boardgame data
fullData = pd.read_csv("data/games.csv")
fullData.head()

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,...,Rank:partygames,Rank:childrensgames,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
0,1,Die Macher,die macher game seven sequential political rac...,1986,4.3206,7.61428,7.10363,1.57979,3,5,...,21926,21926,0,1,0,0,0,0,0,0
1,2,Dragonmaster,dragonmaster tricktaking card game base old ga...,1981,1.963,6.64537,5.78447,1.4544,3,4,...,21926,21926,0,1,0,0,0,0,0,0
2,3,Samurai,samurai set medieval japan player compete gain...,1998,2.4859,7.45601,7.23994,1.18227,2,4,...,21926,21926,0,1,0,0,0,0,0,0
3,4,Tal der Könige,triangular box luxurious large block tal der k...,1992,2.6667,6.60006,5.67954,1.23129,2,4,...,21926,21926,0,0,0,0,0,0,0,0
4,5,Acquire,acquire player strategically invest business t...,1964,2.5031,7.33861,7.14189,1.33583,2,6,...,21926,21926,0,1,0,0,0,0,0,0


## Drops columns

In [3]:
#Drop unneeded categories
bgData = fullData.drop("BGGId", axis=1)
bgData = bgData.drop("Name", axis=1)
bgData = bgData.drop("Description", axis=1)
bgData = bgData.drop("ImagePath", axis=1)
bgData = bgData.drop("Family", axis=1)
bgData = bgData.drop("GoodPlayers", axis=1) #Best players covers the same reason

#Data with null values, and not very useful
bgData = bgData.drop("ComAgeRec", axis=1)
bgData = bgData.drop("LanguageEase", axis=1)

#Drop rankings
target = bgData["AvgRating"]
bgData = bgData.drop("AvgRating", axis=1)
bgData = bgData.drop("StdDev", axis=1)
bgData = bgData.drop("BayesAvgRating", axis=1)
bgData = bgData.drop("Rank:childrensgames", axis=1)
bgData = bgData.drop("Rank:partygames", axis=1)
bgData = bgData.drop("Rank:cgs", axis=1)
bgData = bgData.drop("Rank:wargames", axis=1)
bgData = bgData.drop("Rank:thematic", axis=1)
bgData = bgData.drop("Rank:familygames", axis=1)
bgData = bgData.drop("Rank:abstracts", axis=1)
bgData = bgData.drop("Rank:strategygames", axis=1)
bgData = bgData.drop("Rank:boardgame", axis=1)

bgData.head()

Unnamed: 0,YearPublished,GameWeight,MinPlayers,MaxPlayers,BestPlayers,NumOwned,NumWant,NumWish,NumWeightVotes,MfgPlaytime,...,IsReimplementation,Kickstarted,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
0,1986,4.3206,3,5,5,7498,501,2039,761,240,...,0,0,0,1,0,0,0,0,0,0
1,1981,1.963,3,4,0,1285,72,191,54,30,...,1,0,0,1,0,0,0,0,0,0
2,1998,2.4859,2,4,3,15578,799,3450,1451,60,...,0,0,0,1,0,0,0,0,0,0
3,1992,2.6667,2,4,0,638,54,123,30,60,...,0,0,0,0,0,0,0,0,0,0
4,1964,2.5031,2,6,4,23735,548,2671,1606,90,...,0,0,0,1,0,0,0,0,0,0


In [4]:
bgData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21925 entries, 0 to 21924
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   YearPublished       21925 non-null  int64  
 1   GameWeight          21925 non-null  float64
 2   MinPlayers          21925 non-null  int64  
 3   MaxPlayers          21925 non-null  int64  
 4   BestPlayers         21925 non-null  int64  
 5   NumOwned            21925 non-null  int64  
 6   NumWant             21925 non-null  int64  
 7   NumWish             21925 non-null  int64  
 8   NumWeightVotes      21925 non-null  int64  
 9   MfgPlaytime         21925 non-null  int64  
 10  ComMinPlaytime      21925 non-null  int64  
 11  ComMaxPlaytime      21925 non-null  int64  
 12  MfgAgeRec           21925 non-null  int64  
 13  NumUserRatings      21925 non-null  int64  
 14  NumComments         21925 non-null  int64  
 15  NumAlternates       21925 non-null  int64  
 16  NumE

## Split data into training and test

In [5]:
X_train, X_test, y_train, y_test = train_test_split(bgData, target)

## Create a Random Forest Regressor Model

In [6]:
# Instantiating a Random Forest Regressor
clf = RandomForestRegressor()

clf.fit(X_train, y_train)

# Once fit is called, you can make predictions using predict()
y_preds = clf.predict(X_test)

# View preds/probabilities
y_preds

array([7.8553216, 6.750998 , 6.027681 , ..., 6.3100857, 6.6742624,
       6.5502659])

In [7]:
y_test

16817    7.72477
19302    8.46458
186      5.81768
15738    5.69260
5465     6.29459
          ...   
6506     6.35667
17176    6.10456
4556     6.62931
11440    7.76562
660      6.65921
Name: AvgRating, Length: 5482, dtype: float64

## Score the model

In [8]:
clf.score(X_test, y_test)

0.626436692241068

In [9]:
# scoring=None means default score() metric is used
print(cross_val_score(estimator=clf, 
                      X=bgData, 
                      y=target, 
                      cv=5, # use 5-fold cross-validation
                      scoring=None)) 

[0.48411644 0.56143182 0.5491594  0.53082587 0.34879804]
