In [1]:
# imports and setup

import numpy as np
import pandas as pd

# sklearn imports
from sklearn import linear_model
from sklearn import metrics

# helper for logistic regression explanations
from scipy.special import expit
from scipy.special import logit
from sklearn.preprocessing import MinMaxScaler


#Step 1 - Load the data

In [2]:
# Read the player table and discard the unnamed leading column
# (presumably an index saved with the CSV; it is not used as a feature).
df = pd.read_csv("player_data_updated.csv").drop(columns='Unnamed: 0')

In [3]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,player name,age,citizenship,first_club,current_club,date_of_birth,place_of_birth,position,height,foot,transfer_count,current value-EUR,highest value-EUR
0,Joaquín,39,Spain,Betis B,Real Betis Balompié,"Jul 21, 1981",El Puerto de Santa María,attack - Right Winger,1.81,2,6,1500000,28000000
1,Jorge Molina,39,Spain,CD Alcoyano,Granada CF,"Apr 22, 1982",Alcoi,attack - Centre-Forward,1.88,2,8,1500000,3500000
2,Leonardo Burián,37,Uruguay,Nacional,Club Atlético Colón,"Jan 21, 1984",Melo,Goalkeeper,1.87,2,11,1100000,1100000
3,Denis Onyango,36,Uganda South Africa,Saint George SA,Mamelodi Sundowns FC,"May 15, 1985",Kampala,Goalkeeper,1.85,3,6,750000,1000000
4,Sebastián Tagliabúe,36,United Arab Emirates Argentina,Colegiales,Al-Nasr (Dubai),"Feb 22, 1985",Olivos,attack - Centre-Forward,1.81,2,8,800000,2000000


In [4]:
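## Step 2 - Identify the learning problem
## I use LinearRegression, evaluated with the R^2 score, because this is a regression problem:
## is it possible to predict how much a football player will cost in the future?
## (The target used in this notebook is the player's highest recorded value, 'highest value-EUR'.)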

In [5]:
##Step 3 - Preparation of the dataset

In [6]:
# Predictors: numeric attributes together with the categorical descriptors.
X = df[[
    'age', 'height', 'transfer_count',
    'citizenship', 'first_club', 'current_club', 'position',
    'foot', 'current value-EUR',
]]
# Regression target.
y = df['highest value-EUR']

In [7]:
# Preview the selected feature columns.
X.head()


Unnamed: 0,age,height,transfer_count,citizenship,first_club,current_club,position,foot,current value-EUR
0,39,1.81,6,Spain,Betis B,Real Betis Balompié,attack - Right Winger,2,1500000
1,39,1.88,8,Spain,CD Alcoyano,Granada CF,attack - Centre-Forward,2,1500000
2,37,1.87,11,Uruguay,Nacional,Club Atlético Colón,Goalkeeper,2,1100000
3,36,1.85,6,Uganda South Africa,Saint George SA,Mamelodi Sundowns FC,Goalkeeper,3,750000
4,36,1.81,8,United Arab Emirates Argentina,Colegiales,Al-Nasr (Dubai),attack - Centre-Forward,2,800000


In [8]:
# Preview the target column ('highest value-EUR').
y.head()

0    28000000
1     3500000
2     1100000
3     1000000
4     2000000
Name: highest value-EUR, dtype: int64

##Step 4 - Split the dataset into train & test sets

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train Size: ", X_train.shape[0])
print("Test Size: ", X_test.shape[0])
# Only the final expression of a cell is rendered automatically, so the feature
# preview needs an explicit display() call (available as a built-in inside
# IPython/Jupyter kernels). Without it the X_train preview is silently discarded.
display(X_train.head())
y_train.head()

Train Size:  6570
Test Size:  1643


3550      700000
5744     2000000
3922      800000
1489      800000
3070    15000000
Name: highest value-EUR, dtype: int64

##Step 5 - Scaling

In [10]:
# MinMaxScaler is already imported in the notebook's first (imports) cell.
# Select the numeric features through the public dtype-selection API.
X_train_numeric = X_train.select_dtypes(include='number').copy()
# Reuse the training column list so the test frame has the same columns in the same order.
X_numeric_cols = X_train_numeric.columns
X_test_numeric  = X_test[X_numeric_cols].copy()
# Fit the scaler on the training rows only, then apply the same fitted scaler to the test rows.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_numeric), columns=X_numeric_cols, index=X_train.index)
X_test_scaled  = pd.DataFrame(scaler.transform(X_test_numeric), columns=X_numeric_cols, index=X_test.index)
X_train_scaled.head()

Unnamed: 0,age,height,transfer_count,foot,current value-EUR
3550,0.434783,0.489362,0.145833,0.666667,0.003012
5744,0.304348,0.638298,0.125,0.666667,0.016064
3922,0.391304,0.255319,0.0625,0.333333,0.002008
1489,0.521739,0.744681,0.083333,0.333333,0.002008
3070,0.434783,0.574468,0.0625,0.666667,0.146586


In [11]:
# Per-column maxima of the unscaled numeric training features, smallest first.
X_train_numeric.max().sort_values().head()

height               2.030000e+00
foot                 3.000000e+00
age                  3.900000e+01
transfer_count       4.800000e+01
current value-EUR    1.000000e+08
dtype: float64

##Step 6 - Encoding

In [12]:
# One-hot encode the object-typed columns over the full feature frame, so the
# train and test parts end up with an identical dummy-column layout.
X_discrete = X.select_dtypes(include='object').copy()
X_discrete_encoded = pd.get_dummies(X_discrete, prefix_sep="__")
# Pull each split's rows back out by index label.
X_train_discrete_encoded = X_discrete_encoded.loc[X_train.index]
X_test_discrete_encoded = X_discrete_encoded.loc[X_test.index]
X_train_discrete_encoded.head()

Unnamed: 0,citizenship__Albania,citizenship__Albania Belgium,citizenship__Albania England,citizenship__Albania Greece,citizenship__Albania Italy,citizenship__Albania Kosovo,citizenship__Albania North Macedonia,citizenship__Albania Switzerland,citizenship__Algeria,citizenship__Algeria Belgium,...,position__attack - Centre-Forward,position__attack - Left Winger,position__attack - Right Winger,position__attack - Second Striker,position__midfield,position__midfield - Attacking Midfield,position__midfield - Central Midfield,position__midfield - Defensive Midfield,position__midfield - Left Midfield,position__midfield - Right Midfield
3550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1489,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [13]:
##Step 7 - Merge the scaled and encoded features

In [14]:
# Put the scaled numeric columns and the one-hot columns side by side,
# aligning rows on the shared index.
X_train_processed = pd.concat([X_train_scaled, X_train_discrete_encoded], axis="columns")
X_test_processed = pd.concat([X_test_scaled, X_test_discrete_encoded], axis="columns")
X_train_processed.head()

Unnamed: 0,age,height,transfer_count,foot,current value-EUR,citizenship__Albania,citizenship__Albania Belgium,citizenship__Albania England,citizenship__Albania Greece,citizenship__Albania Italy,...,position__attack - Centre-Forward,position__attack - Left Winger,position__attack - Right Winger,position__attack - Second Striker,position__midfield,position__midfield - Attacking Midfield,position__midfield - Central Midfield,position__midfield - Defensive Midfield,position__midfield - Left Midfield,position__midfield - Right Midfield
3550,0.434783,0.489362,0.145833,0.666667,0.003012,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5744,0.304348,0.638298,0.125,0.666667,0.016064,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3922,0.391304,0.255319,0.0625,0.333333,0.002008,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1489,0.521739,0.744681,0.083333,0.333333,0.002008,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3070,0.434783,0.574468,0.0625,0.666667,0.146586,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


##Step 8 - Dimensionality Reduction PCA

In [15]:
from sklearn.decomposition import PCA

print("Num Features Pre-PCA:", X_train_processed.shape[1])

# A fractional n_components asks PCA to keep enough leading components to
# explain that share of the variance (here 90%).
pca = PCA(n_components=0.9)
# Learn the projection from the training rows only, then reuse it for the test rows.
X_train_reduced = pca.fit_transform(X_train_processed)
X_test_reduced = pca.transform(X_test_processed)

print("Num Features Post-PCA:", X_train_reduced.shape[1])


Num Features Pre-PCA: 5205
Num Features Post-PCA: 1717


In [16]:
# Display the PCA-projected training matrix (a NumPy array, one row per training sample).
X_train_reduced

array([[-2.01684716e-01, -5.13484827e-01,  7.50852279e-01, ...,
         5.79360326e-03,  3.88069939e-03,  5.07724830e-03],
       [-4.91116281e-02, -7.64037866e-02, -1.39495559e-01, ...,
         1.36613050e-02, -1.89720339e-03, -1.85874018e-02],
       [-1.41963321e-01, -1.97120054e-01, -2.69633211e-01, ...,
        -3.86649552e-02,  4.11610706e-02, -1.94872888e-02],
       ...,
       [-1.17082825e-01, -1.68294571e-01, -2.28225525e-01, ...,
         3.82631456e-03,  8.39077087e-03, -7.59462498e-03],
       [-4.43574333e-01,  7.62966831e-01,  1.96702878e-01, ...,
        -1.96885555e-02,  1.95401492e-02, -1.25798334e-02],
       [-8.99742023e-02, -1.39005227e-01, -2.08058846e-01, ...,
         7.96012376e-04,  2.12375289e-03,  3.96116462e-04]])

##Step 9 - Train Our Model

In [17]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the PCA-reduced training matrix. fit() is left as
# the cell's final expression, so its return value is echoed as the cell output.
model = LinearRegression()
model.fit(X=X_train_reduced, y=y_train)

LinearRegression()

##Step 10 - Measure the performance

In [18]:
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, r2_score
def describe_output_for_model(model, X_test, y_test, model_type):
    """Print test-set metrics and a table of absolute coefficient sizes for a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and ``coef_``; ``score`` is also called when
        ``model_type == "classification"``.
    X_test : array-like
        Test inputs, in the same representation the model was fitted on.
    y_test : array-like
        True targets for ``X_test``.
    model_type : str
        ``"classification"`` prints accuracy, the confusion matrix and the
        classification report; any other value is treated as regression and
        prints MAE and R^2.

    Returns
    -------
    None
        All results are reported through ``print``.
    """
    print("Predicting...")
    print("\tdisplaying information re: the '%s' model ...\n" %( model_type))
    y_pred = model.predict(X_test)
    is_classifier = model_type == "classification"

    if is_classifier:
        print("Model Accuracy: ", model.score(X_test, y_test))
        conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
        print(conf_mat)

        print("\nClassification Report: \n")
        print(classification_report(y_true=y_test, y_pred=y_pred))
    else:
        print("MAE: ", mean_absolute_error(y_true=y_test, y_pred=y_pred))
        print("R^2 : ", r2_score(y_test, y_pred))

    # Coefficients: classifiers store one row of weights per class, and only the
    # first row is reported; regressors are read as a flat weight vector.
    raw_weights = model.coef_[0] if is_classifier else model.coef_
    weights = np.abs(np.asarray(raw_weights))

    # 'Attributes' is the positional index of each model input. It is built as its
    # own integer column so it is not coerced to float alongside the coefficients.
    # NOTE: if the model was fitted on transformed inputs (e.g. PCA components),
    # these positions refer to the transformed columns, not the original features.
    coef_table = pd.DataFrame({
        'Attributes': np.arange(weights.shape[0]),
        'Coefficient': weights,
    })
    # Stable ascending sort: smallest |coefficient| first, ties keep input order.
    coef_table = coef_table.sort_values('Coefficient', kind='mergesort').reset_index(drop=True)

    print('\nCoefficient for every feature:\n')
    print(coef_table)

# Report held-out metrics and coefficient sizes for the fitted regressor on the PCA-reduced test matrix.
describe_output_for_model(model=model, X_test=X_test_reduced, y_test=y_test, model_type="regression")

Predicting...
	displaying information re: the 'regression' model ...

MAE:  1754025.1878160443
R^2 :  0.939562357353926

Coefficient for every feature:

      Attributes   Coefficient
0         1277.0  1.313232e+03
1          817.0  1.438696e+03
2         1144.0  1.525927e+03
3          310.0  3.507221e+03
4         1194.0  5.241851e+03
...          ...           ...
1712        47.0  2.444195e+07
1713        38.0  4.100279e+07
1714        46.0  4.309042e+07
1715        45.0  4.484021e+07
1716        44.0  5.517137e+07

[1717 rows x 2 columns]
