In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
footballers = pd.read_csv('final_data.csv')

<h1>Train Test Split</h1>

<b>I divide the data into a training and test set. I use the random_state embryo to make sure that the test set will always the same if I restart programm. Thanks of this there won't no situation in which the machine learning algorithm will see new data previously belonging to the test set and after some time will know entire data set.</b>

In [3]:
X = footballers.drop("Value(mln €)", axis=1)
y = footballers["Value(mln €)"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h1 style="text-align:center">Training and evaluation of various regression models</h1>

<p style="text-align:center"><b>Selection of performance metrics</b></p></br>
<p style="text-align:center">I choosed RMSE (root mean square error) and MAE (mean absolute error) which are standard metrics for regression issues.</p>

<h2>Linear Regression</h2>

In [4]:
from sklearn.linear_model import LinearRegression
lin_reg_model = LinearRegression()

In [5]:
lin_reg_model.fit(X_train, y_train) # training the model 

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [6]:
linear_predictions = lin_reg_model.predict(X_test) # model prediction

In [7]:
# Creating and printing metrics
lin_rmse = np.sqrt(mean_squared_error(y_test, linear_predictions))
lin_mae = mean_absolute_error(y_test, linear_predictions)
print("RMSE: ",lin_rmse, "\nMAE: ",lin_mae)

RMSE:  2.575182918983292 
MAE:  1.402345048432254


<h2>SVR</h2>

In [8]:
from sklearn.svm import LinearSVR
svr_reg_model = LinearSVR(epsilon=0.5)

In [9]:
svr_reg_model.fit(X_train, y_train)



LinearSVR(C=1.0, dual=True, epsilon=0.5, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

In [10]:
svr_predictions = svr_reg_model.predict(X_test)

In [11]:
svr_rmse = np.sqrt(mean_squared_error(y_test, svr_predictions))
svr_mae = mean_absolute_error(y_test, svr_predictions)
print("RMSE: ",svr_rmse, "\nMAE: ",svr_mae)

RMSE:  9.311765545733701 
MAE:  7.453622235394366


<h2>Decision Tree</h2>

In [12]:
from sklearn.tree import DecisionTreeRegressor
tree_reg_model = DecisionTreeRegressor()

In [13]:
tree_reg_model.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [14]:
tree_predictions = tree_reg_model.predict(X_test)

In [15]:
tree_rmse = np.sqrt(mean_squared_error(y_test, tree_predictions))
tree_mae = mean_absolute_error(y_test, tree_predictions)
print("RMSE: ",tree_rmse, "\nMAE: ",tree_mae)

RMSE:  1.2520297339815334 
MAE:  0.1862437185929649


<h2>Random Forest</h2>

In [16]:
from sklearn.ensemble import RandomForestRegressor
forest_reg_model = RandomForestRegressor()

In [17]:
forest_reg_model.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [18]:
forest_predictions = forest_reg_model.predict(X_test)

In [19]:
forest_rmse = np.sqrt(mean_squared_error(y_test, forest_predictions))
forest_mae = mean_absolute_error(y_test, forest_predictions)
print("RMSE: ",forest_rmse, "\nMAE: ",forest_mae)

RMSE:  0.9992873636940944 
MAE:  0.15381375628140706


<p style="text-align:center"><b>Results summary</b></p></br>

In [45]:
print("Linear Regression\nRMSE: ",round(lin_rmse,2), "\nMAE: ",round(lin_mae,2))
print("SVR\nRMSE: ",round(svr_rmse,2), "\nMAE: ",round(svr_mae,2))
print("Decision Tree\nRMSE: ",round(tree_rmse,2), "\nMAE: ",round(tree_mae,2))
print("Random Forest\nRMSE: ",round(forest_rmse,2), "\nMAE: ",round(forest_mae,2))

Linear Regression
RMSE:  2.58 
MAE:  1.4
SVR
RMSE:  9.31 
MAE:  7.45
Decision Tree
RMSE:  1.25 
MAE:  0.19
Random Forest
RMSE:  1.0 
MAE:  0.15


<b>As you can see, Decision Tree and Random Forest models perform very well. I will now focus on tuning the random forest model for even better results.</b>

<h1 style="text-align:center">Model tuning</h1>

In [22]:
from pprint import pprint # show parameters used by our current model

In [30]:
# parameters currently used in my random forest model
pprint(forest_reg_model.get_params())

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [27]:
from sklearn.model_selection import RandomizedSearchCV

<p><b>I set ranges for several main parameters:</b></p></br>
<ul>
    <li><b>n_estimators</b> - number of trees in forest</li>
    <li><b>max_features</b> - number of features to consider at every split</li>
    <li><b>max_depth</b> - maximum number of levels in tree</li>
    <li><b>min_samples_split</b> - minimum number of samples required to split a node</li>
    <li><b>min_samples_leaf</b> - minimum number of samples required at each leaf node</li>
    <li><b>bootstrap</b> - method of selecting samples for training each tree</li>
</ul></br>
<p>The following settings give me 20 * 2 * 11 * 3 * 3 * 2 = 7920 settings. But the benefit of a random search is that I am not trying every combination, but select at random to sample a wide range of values. In my case, I will try 300 combinations from 7920.</p>

In [31]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 20)]

max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

In [32]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [34]:
rf_random = RandomizedSearchCV(estimator=forest_reg_model, 
                               param_distributions=random_grid,
                               n_iter = 100, 
                               scoring='neg_mean_absolute_error', 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs=-1,
                               return_train_score=True)

In [35]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 17.8min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='neg_mean_absolute_error',
     

In [36]:
rf_random.best_params_

{'n_estimators': 170,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 30,
 'bootstrap': True}

<p style="text-align:center"><b>To determine if random search yielded a better model, I compare my base model with the best random search model.</b></p>

In [43]:
best_model = rf_random.best_estimator_
best_predictions = best_model.predict(X_test)
best_rmse = np.sqrt(mean_squared_error(y_test, best_predictions))
best_mae = mean_absolute_error(y_test, best_predictions)

In [44]:
print("RMSE: ",best_rmse, "\nMAE: ",best_mae)

RMSE:  0.9150648251836808 
MAE:  0.14243709355601536


<h3 style="text-align:center;">As you can see, tuned model achieve better results</h3></br>
<p><b>BEFORE:</b></p></br>
<p><b>RMSE:</b> 1.0  <b>MAE:</b> 0.15</p></br>
<p><b>AFTER:</b></p></br>
<p><b>RMSE:</b> 0.92 <b>MAE:</b> 0.14</p>