In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import pickle

In [2]:
# Relative location of the player CSV; it is resolved against the
# notebook's working directory when read.
path = 'merged_players_df_final.csv'

# Parse the CSV into the DataFrame that the cells below read from.
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Feature-selection sweep: for each correlation bound 0.0, 0.1, ..., 0.8 keep
# the numeric columns whose correlation with 'market_value_in_eur' is above
# +bound or below -bound, fit a gradient-boosting regressor on them, and
# print the held-out score together with the selected columns.

# The correlations do not depend on the bound, so compute them once instead of
# rebuilding the full correlation matrix several times per iteration.
target_corr = df.corr(numeric_only=True)['market_value_in_eur']

# Later cells read `merged_players_df1`, so bind it (and the target) up front
# rather than as a side effect of the loop body.
merged_players_df1 = df.copy()
y = merged_players_df1['market_value_in_eur']

for i in range(9):
    correlation_bound = i / 10

    # Positively correlated columns first, then negatively correlated ones;
    # this ordering fixes the column order of X.
    big_columns = target_corr[target_corr > correlation_bound]
    small_columns = target_corr[target_corr < -correlation_bound]

    # Drop the target itself and the 'player_id' column from the candidates.
    effective_columns = [
        column
        for column in pd.concat([big_columns, small_columns]).index
        if column not in ('market_value_in_eur', 'player_id')
    ]

    # A high bound can leave no features at all; fitting on zero columns
    # raises, so report the bound and move on to the next one.
    if not effective_columns:
        print("correlation bound: ", correlation_bound)
        print("No columns pass this correlation bound; skipping")
        continue

    # Missing values are replaced with the sentinel -1000 for this sweep.
    X = merged_players_df1[effective_columns].fillna(-1000)

    # Split test and training data (30% held out, fixed seed).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=31)

    # Model data
    clf = GradientBoostingRegressor(random_state=0)
    clf.fit(X_train, y_train)

    # Assess the model on the held-out rows. For a regressor `score` returns
    # R^2, which is what the "Accuracy" label below reports.
    print("correlation bound: ", correlation_bound)
    print("Accuracy: {}".format(clf.score(X_test, y_test)))
    print(effective_columns)


correlation bound:  0.0
Accuracy: 0.901611715440991
['games_2022', 'minutes_played_2022', 'goals_2022', 'assists_2022', 'goals_against_2022', 'goals_for_2022', 'clean_sheet_2022', 'last_season', 'squad_size', 'highest_market_value_in_eur', 'yellow_cards_2022', 'red_cards_2022', 'current_club_domestic_competition_id_ES1', 'current_club_domestic_competition_id_FR1', 'current_club_domestic_competition_id_GB1', 'current_club_domestic_competition_id_IT1', 'current_club_domestic_competition_id_L1', 'position_Attack', 'position_Midfield', 'sub_position_Attacking Midfield', 'sub_position_Central Midfield', 'sub_position_Centre-Forward', 'sub_position_Defensive Midfield', 'sub_position_Left Winger', 'sub_position_Right Winger', 'sub_position_Second Striker', 'foot_left', 'foot_right', 'height_in_cm', 'age', 'current_club_domestic_competition_id_BE1', 'current_club_domestic_competition_id_DK1', 'current_club_domestic_competition_id_GR1', 'current_club_domestic_competition_id_NL1', 'current_club_

In [4]:
# Feature sets used by the two models trained below.

# Model 1: the wide feature set. Recorded score: 0.901611715440991
# NOTE(review): 'highest_market_value_in_eur' looks closely tied to the
# target 'market_value_in_eur' -- confirm it is acceptable as a feature.
model_1_columns = [
    'games_2022',
    'minutes_played_2022',
    'goals_2022',
    'assists_2022',
    'goals_against_2022',
    'goals_for_2022',
    'clean_sheet_2022',
    'last_season',
    'squad_size',
    'highest_market_value_in_eur',
    'yellow_cards_2022',
    'red_cards_2022',
    'current_club_domestic_competition_id_ES1',
    'current_club_domestic_competition_id_FR1',
    'current_club_domestic_competition_id_GB1',
    'current_club_domestic_competition_id_IT1',
    'current_club_domestic_competition_id_L1',
    'position_Attack',
    'position_Midfield',
    'sub_position_Attacking Midfield',
    'sub_position_Central Midfield',
    'sub_position_Centre-Forward',
    'sub_position_Defensive Midfield',
    'sub_position_Left Winger',
    'sub_position_Right Winger',
    'sub_position_Second Striker',
    'foot_left',
    'foot_right',
    'height_in_cm',
    'age',
    'current_club_domestic_competition_id_BE1',
    'current_club_domestic_competition_id_DK1',
    'current_club_domestic_competition_id_GR1',
    'current_club_domestic_competition_id_NL1',
    'current_club_domestic_competition_id_PO1',
    'current_club_domestic_competition_id_RU1',
    'current_club_domestic_competition_id_SC1',
    'current_club_domestic_competition_id_TR1',
    'current_club_domestic_competition_id_UKR1',
    'position_Defender',
    'position_Goalkeeper',
    'position_Missing',
    'sub_position_Centre-Back',
    'sub_position_Goalkeeper',
    'sub_position_Left Midfield',
    'sub_position_Left-Back',
    'sub_position_Right Midfield',
    'sub_position_Right-Back',
    'foot_both',
]

# Model 2: the reduced feature set. Recorded score: 0.7902370536576423
model_2_columns = [
    'games_2022',
    'minutes_played_2022',
    'goals_2022',
    'assists_2022',
    'goals_against_2022',
    'goals_for_2022',
    'clean_sheet_2022',
    'last_season',
    'highest_market_value_in_eur',
    'yellow_cards_2022',
    'current_club_domestic_competition_id_GB1',
]


In [6]:
#MODEL 1 TRAIN
# Compare several regressors on the model-1 feature set and keep the Random
# Forest as `model_1`.

# Fill missing numeric values with each column's median. This reads `df`
# directly instead of a variable that an earlier cell only binds inside its
# loop body; `fillna` already returns a new frame, so no explicit copy is
# needed.
# NOTE(review): the medians are computed on all rows before the train/test
# split, so held-out rows influence the imputation values.
merged_players_df1_fillnan = df.fillna(df.median(numeric_only=True))

y = merged_players_df1_fillnan['market_value_in_eur']
X = merged_players_df1_fillnan[model_1_columns]

# Candidate estimators; seeds are fixed where the estimator is randomised.
models = [
    ("Gradient Boosting", GradientBoostingRegressor(random_state=0)),
    ("Random Forest", RandomForestRegressor(random_state=0)),
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge()),
    ("Lasso Regression", Lasso()),
    ("ElasticNet Regression", ElasticNet()),
    ("K-Nearest Neighbors", KNeighborsRegressor()),
    ("Decision Tree", DecisionTreeRegressor(random_state=0))
]

# 30% of the rows are held out, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=31)

for model_name, model in models:
    model.fit(X_train, y_train)
    # For regressors `score` is R^2 on the held-out rows; the printed label
    # keeps the notebook's existing "Accuracy" wording.
    accuracy = model.score(X_test, y_test)
    print("{} Accuracy: {}".format(model_name, accuracy))


# Reuse the Random Forest that the loop above already fitted on this exact
# split with the same seed, rather than training an identical one again.
model_1 = dict(models)["Random Forest"]

print("Accuracy: {}".format(model_1.score(X_test, y_test)))
print(model_1_columns)

Gradient Boosting Accuracy: 0.9012682781957022
Random Forest Accuracy: 0.9058636864564906
Linear Regression Accuracy: 0.7437248517828065
Ridge Regression Accuracy: 0.743730553396285


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  model = cd_fast.enet_coordinate_descent(


Lasso Regression Accuracy: 0.7437250373754671
ElasticNet Regression Accuracy: 0.7419419440397872
K-Nearest Neighbors Accuracy: 0.7365986295439977
Decision Tree Accuracy: 0.8073655781118528
Accuracy: 0.9058636864564906
['games_2022', 'minutes_played_2022', 'goals_2022', 'assists_2022', 'goals_against_2022', 'goals_for_2022', 'clean_sheet_2022', 'last_season', 'squad_size', 'highest_market_value_in_eur', 'yellow_cards_2022', 'red_cards_2022', 'current_club_domestic_competition_id_ES1', 'current_club_domestic_competition_id_FR1', 'current_club_domestic_competition_id_GB1', 'current_club_domestic_competition_id_IT1', 'current_club_domestic_competition_id_L1', 'position_Attack', 'position_Midfield', 'sub_position_Attacking Midfield', 'sub_position_Central Midfield', 'sub_position_Centre-Forward', 'sub_position_Defensive Midfield', 'sub_position_Left Winger', 'sub_position_Right Winger', 'sub_position_Second Striker', 'foot_left', 'foot_right', 'height_in_cm', 'age', 'current_club_domestic_c

In [7]:
# Rebuild the model-1 features/target and the same seeded 70/30 split so that
# this cell fits on the training rows regardless of what X/y held before.
X = merged_players_df1_fillnan.loc[:, model_1_columns]
y = merged_players_df1_fillnan.loc[:, 'market_value_in_eur']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=31,
)

# Export Model
# Fit on the training split, then serialise the fitted estimator to disk.
model_1.fit(X_train, y_train)

with open('model_complicated.pkl', mode='wb') as file:
    pickle.dump(model_1, file)

In [9]:
#MODEL 2 TRAIN
# Compare several regressors on the reduced model-2 feature set and keep the
# Gradient Boosting regressor as `model_2`.

# Fill missing numeric values with each column's median. This reads `df`
# directly instead of a variable that an earlier cell only binds inside its
# loop body; `fillna` already returns a new frame, so no explicit copy is
# needed.
# NOTE(review): the medians are computed on all rows before the train/test
# split, so held-out rows influence the imputation values.
merged_players_df1_fillnan = df.fillna(df.median(numeric_only=True))

y = merged_players_df1_fillnan['market_value_in_eur']
X = merged_players_df1_fillnan[model_2_columns]

# Candidate estimators; seeds are fixed where the estimator is randomised.
models = [
    ("Gradient Boosting", GradientBoostingRegressor(random_state=0)),
    ("Random Forest", RandomForestRegressor(random_state=0)),
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge()),
    ("Lasso Regression", Lasso()),
    ("ElasticNet Regression", ElasticNet()),
    ("K-Nearest Neighbors", KNeighborsRegressor()),
    ("Decision Tree", DecisionTreeRegressor(random_state=0))
]

# 30% of the rows are held out, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=31)

for model_name, model in models:
    model.fit(X_train, y_train)
    # For regressors `score` is R^2 on the held-out rows; the printed label
    # keeps the notebook's existing "Accuracy" wording.
    accuracy = model.score(X_test, y_test)
    print("{} Accuracy: {}".format(model_name, accuracy))


# Reuse the Gradient Boosting model that the loop above already fitted on this
# exact split with the same seed, rather than training an identical one again.
model_2 = dict(models)["Gradient Boosting"]

print("Accuracy: {}".format(model_2.score(X_test, y_test)))
print(model_2_columns)

Gradient Boosting Accuracy: 0.7902370536576423
Random Forest Accuracy: 0.7800739866279498
Linear Regression Accuracy: 0.715557898961269
Ridge Regression Accuracy: 0.7155581707629728
Lasso Regression Accuracy: 0.7155579168016015
ElasticNet Regression Accuracy: 0.7126957699395704
K-Nearest Neighbors Accuracy: 0.731291721215008
Decision Tree Accuracy: 0.6459253797930414
Accuracy: 0.7902370536576423
['games_2022', 'minutes_played_2022', 'goals_2022', 'assists_2022', 'goals_against_2022', 'goals_for_2022', 'clean_sheet_2022', 'last_season', 'highest_market_value_in_eur', 'yellow_cards_2022', 'current_club_domestic_competition_id_GB1']


In [10]:
# Rebuild the model-2 features/target and the same seeded 70/30 split so that
# this cell fits on the training rows regardless of what X/y held before.
X = merged_players_df1_fillnan.loc[:, model_2_columns]
y = merged_players_df1_fillnan.loc[:, 'market_value_in_eur']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=31,
)

# Export Model
# Fit on the training split, then serialise the fitted estimator to disk.
model_2.fit(X_train, y_train)

with open('model_vanilla.pkl', mode='wb') as file:
    pickle.dump(model_2, file)