In [1]:
#First import the relevant packages
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV
from datetime import date

#import dataset
df = pd.read_csv('steam.csv') #import the file. If the file is not in the same location as this script you need to write out the whole path
df.head() #Check the first 5 entries


ModuleNotFoundError: No module named 'scikeras'

In [None]:
#Prepare the data
df = df.dropna() #remove missings

#Create variable that changes releaseDate into days since 1-1-2024
d0 = pd.to_datetime('2024-01-01')
df['releaseDate'] = pd.to_datetime(df['releaseDate'],dayfirst=True)
df['age'] = (df['releaseDate'] - d0).dt.days

#select Target and features
y = df['copiesSold'] 
X = df.drop(['copiesSold','name','releaseDate','revenue','publishers','developers','steamId'],axis=1) 

#create dummies and normalize data
X = pd.get_dummies(X, drop_first=True) #create dummy variables
columns = X.columns #create index with column names (needed for last step)
scaler = MinMaxScaler() #initiate the scaler
X = scaler.fit_transform(X) #scale the data
X = pd.DataFrame(X,columns=columns) #turn back into a dataframe


In [None]:
#Prediction with linear regression

#define the cross-validation
cv = RepeatedKFold(n_splits=5, random_state=420) #splits the data into 10 folds and does the analysis 10 times

#predict with linear regression
lm = LinearRegression()
lmpred = cross_val_score(lm, X, y, scoring='neg_mean_absolute_error', cv=cv) 

MAElm = mean(absolute(lmpred)) #Calculate the overall mean absolute error

print('the average prediction error for linear regression is: %.0f' % MAElm) #Print the result

the average prediction error for linear regression is: 214926


In [None]:
#grid search with Lasso
alpha_range = np.arange(start=1,stop=10000,step=100) #define a range of alpha values

#define the aspects of the grid search
cv = RepeatedKFold(n_splits=5) #define the cross validation
param = {'alpha':alpha_range} #create a dictionary that contains the range of alpha values
LassoModel = Lasso() #define the model

#create the grid search
LassoM = GridSearchCV(LassoModel, #the lasso model we defined above
                      param_grid=param, #the alpha values we defined above
                      scoring ='neg_mean_absolute_error', #use MAE as measure of fit
                      cv=cv) #the cross validation we defined above
LassoM.fit(X,y)# Fit the random search model
print("Best alpha: ", LassoM.best_params_['alpha']) #print out the best parameter

Best alpha:  3301


In [None]:
alpha_range = np.arange(start=1,stop=100,step=1) #define a range of 100 alpha values

#define the aspects of the grid search
cv = RepeatedKFold(n_splits=5) #define the cross validation
param = {'alpha':alpha_range} #create a dictionary that contains the range of alpha values
RidgeModel = Ridge() #define the model

#create the grid search
RidgeM = GridSearchCV(RidgeModel, #the lasso model we defined above
                      param_grid=param, #the alpha values we defined above
                      scoring ='neg_mean_absolute_error', #use MAE as measure of fit
                      cv=cv) #the cross validation we defined above
RidgeM.fit(X,y)# Fit the random search model
print("Best alpha: ", RidgeM.best_params_['alpha']) #print out the best parameter

Best alpha:  27


In [None]:
LassoModel = Lasso(alpha=3301)
RidgeModel = Ridge(alpha=27)

#Run the cross fold validation again (same as earlier)
cv = RepeatedKFold(n_splits=5)
scoreslasso = cross_val_score(LassoModel, X, y, scoring='neg_mean_absolute_error', cv=cv) 
scoresridge = cross_val_score(RidgeModel, X, y, scoring='neg_mean_absolute_error', cv=cv) 

#Evaluate the model
print('the average prediction error of linear regression was 214926')
lassoMAE = mean(absolute(scoreslasso)) #Calculate the overall mean absolute error
print('the average prediction error with lasso is: %.0f' % lassoMAE) #Print the result
ridgeMAE = mean(absolute(scoresridge)) #Calculate the overall mean absolute error
print('the average prediction error with ridge is: %.0f' % ridgeMAE) #Print the result

the average prediction error of linear regression was 214926
the average prediction error with lasso is: 210943
the average prediction error with ridge is: 213472


In [None]:
#Hyperparameter tuning (warning: takes at least 18 minutes to run)
# Define the function to create the model
def build_model(n_layers=3, n_units=64, activation='softplus'): #start with some inital parameters
    nnmodel = tf.keras.models.Sequential() #define the model
    nnmodel.add(tf.keras.layers.Input(shape=(X.shape[1],)))  # Input layer

    for _ in range(n_layers): #create a for loop that goes through the number of layers, number of units, and activation types
        nnmodel.add(tf.keras.layers.Dense(n_units, activation=activation))
    
    nnmodel.add(tf.keras.layers.Dense(1))  # Output layer
    nnmodel.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae']) #compile the model and use mean absolute error as measure of fit
    return nnmodel

model = KerasRegressor(model=build_model, #for some reason you have to put the parameter distribution in here, otherwise it will give an error
                       n_layers=[1, 2, 3, 4, 5], #Number of hidden layers
                       n_units=[32, 64, 128, 256, 512], #Number of nodes per hidden layer
                       activation=['relu', 'tanh', 'softplus', 'elu'], #activation function
                       epochs=[50, 100, 150, 200], #number of epochs
                       batch_size=[16, 32, 64], #number of batches
                       verbose = 0) #this will shorten the output

param_dist = { #overview of the parameter distribution (this is the same as above)
    'n_layers': [1, 2, 3, 4, 5],
    'n_units': [32, 64, 128, 256, 512],
    'activation': ['relu', 'tanh', 'softplus', 'elu'],
    'epochs': [50, 100, 150, 200],
    'batch_size': [16, 32, 64]
}

random_search = RandomizedSearchCV(estimator=model, #randomly search through the grid
                                   param_distributions=param_dist,
                                    n_iter=10, #number of parameter combinations that are randomly chosen
                                      cv=5, #evaluate the output using 5-fold cross validation 
                                    random_state=42) #set a random seed so that the results will be the same each time

random_search.fit(X, y) #train the models
print("Best parameters found: ", random_search.best_params_) #print the best parameters

Best parameters found:  {'n_units': 32, 'n_layers': 2, 'epochs': 100, 'batch_size': 64, 'activation': 'elu'}


In [None]:
#cross validation with the best parameters
def create_baseline():
    # create model
    nnmodel = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(X.shape[1],)),  # Input layer (the shape parts counts how many features you have)
        tf.keras.layers.Dense(32, activation='elu'), #hidden layer
        tf.keras.layers.Dense(32, activation='elu'), #hidden layer
        tf.keras.layers.Dense(1)  # Output layer
    ])
    # Compile model
    nnmodel.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae']) #compile the model based 
    return nnmodel
# evaluate model  (note if you get an error, upgrade the packages! module not found keras.api_v2 error was because of tensorflow!)
estimator = KerasRegressor(build_fn=create_baseline, epochs=100, batch_size=64,verbose=0) #you have to use the kerasRegressor wrapper for nn models
kfold = RepeatedKFold(n_splits=5, n_repeats = 1) #I'm using just 5 splits and 1 repeat to save time, but more splits and repeats will be more accurate
results = cross_val_score(estimator, X, y, cv=kfold,scoring='neg_mean_absolute_error') #create the results of the cross validation
print('the average prediction error is: %.0f' % mean(absolute(results))) #print the average error

  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)


the average prediction error is: 136601
