In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## 1. Dataset Loading and Pre-processing

In [None]:
# Path to the input CSV, relative to the notebook's working directory.
DATASET_PATH = "./male_players.csv"
dataset_df = pd.read_csv(DATASET_PATH)
# Preview the first rows of the freshly loaded frame.
dataset_df.head()

Let's have a look at the columns we have to work with.

In [None]:
# List every column name in the loaded frame.
dataset_df.columns.to_list()

We take the mean of each group of related attributes, giving one summary score per attribute group.

In [None]:
# Member columns of each attribute family. This table is the single source of
# truth: it drives both the `<family>_mean` summary columns and the list of raw
# columns dropped afterwards, so the two can no longer drift apart.
ATTRIBUTE_GROUPS = {
    'attacking': ['attacking_crossing',
                  'attacking_finishing',
                  'attacking_heading_accuracy',
                  'attacking_short_passing',
                  'attacking_volleys'],
    'skill': ['skill_dribbling',
              'skill_curve',
              'skill_fk_accuracy',
              'skill_long_passing',
              'skill_ball_control'],
    'movement': ['movement_acceleration',
                 'movement_sprint_speed',
                 'movement_agility',
                 'movement_reactions',
                 'movement_balance'],
    'power': ['power_shot_power',
              'power_jumping',
              'power_stamina',
              'power_strength',
              'power_long_shots'],
    'mentality': ['mentality_aggression',
                  'mentality_interceptions',
                  'mentality_positioning',
                  'mentality_vision',
                  'mentality_penalties',
                  'mentality_composure'],
    'defending': ['defending_marking_awareness',
                  'defending_standing_tackle',
                  'defending_sliding_tackle'],
    'goalkeeping': ['goalkeeping_diving',
                    'goalkeeping_handling',
                    'goalkeeping_kicking',
                    'goalkeeping_positioning',
                    'goalkeeping_reflexes',
                    'goalkeeping_speed'],
}

# One summary column per family: the row-wise mean of its member columns.
for group_name, group_columns in ATTRIBUTE_GROUPS.items():
    dataset_df[f'{group_name}_mean'] = dataset_df[group_columns].mean(axis=1)

# Remove the original columns now that each family is summarised by its mean.
# Reassigning (rather than inplace=True) keeps the step an ordinary expression.
columns_to_remove = [column
                     for group_columns in ATTRIBUTE_GROUPS.values()
                     for column in group_columns]
dataset_df = dataset_df.drop(columns=columns_to_remove)

# Display the modified DataFrame
dataset_df.head()


We use label encoding to encode the categorical variables that might decide the market value of a player.

In [None]:
# Replace each categorical column with integer codes; a fresh encoder is fitted
# per column so the code assignments of different columns are independent.
for categorical_column in ('preferred_foot', 'work_rate', 'body_type'):
    encoder = LabelEncoder()
    dataset_df[categorical_column] = encoder.fit_transform(dataset_df[categorical_column])

Let's have a look at the values of the changed attributes.

In [None]:
# Show the three columns that were just label-encoded.
dataset_df[['preferred_foot', 'work_rate', 'body_type']]

Let's now visualize how the independent variables are correlated with our dependent variable, **value_eur**.

In [None]:
selected_column = 'value_eur'

# Correlate only the numeric columns with the target. Calling corrwith on the
# whole frame raises on pandas >= 2.0 as soon as a non-numeric (e.g. string)
# column is present, because non-numeric columns are no longer skipped silently.
correlations = dataset_df.select_dtypes(include='number').corrwith(dataset_df[selected_column])

plt.figure(figsize=(20, 10))
sns.barplot(x=correlations.index, y=correlations.values, palette='viridis')
plt.xticks(rotation=45, ha="right")

# Set plot labels and title
plt.xlabel('Columns')
plt.ylabel('Correlation Coefficient')
plt.title(f'Correlation of Column {selected_column} with Other Columns')

# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

Let's remove the unnecessary columns: those whose absolute correlation with **value_eur** does not exceed a chosen threshold.

In [None]:
def keepOnlyDataOverAThreshold(data, selected_column, threshold):
    """Return a copy of `data` keeping only columns correlated with a target.

    A column is kept when the absolute value of its correlation with
    `data[selected_column]` is strictly greater than `threshold`. Non-numeric
    columns and columns whose correlation is NaN are dropped. The target
    column itself is kept whenever `threshold` is below 1, since it correlates
    perfectly with itself.

    Args:
        data: DataFrame to filter. It is not modified.
        selected_column: name of the target column in `data`.
        threshold: minimum absolute correlation (exclusive) for a column to stay.

    Returns:
        A new DataFrame with the surviving columns in their original order.

    Raises:
        KeyError: if `selected_column` is not a column of `data`.
    """
    # Work on the frame that was passed in, not on a notebook-level global, so
    # the function gives the right answer for any DataFrame.
    # Restricting to numeric columns keeps corrwith from raising on string
    # columns under pandas >= 2.0.
    numeric_data = data.select_dtypes(include="number")
    correlations = numeric_data.corrwith(data[selected_column])

    columns_to_keep = set(correlations[correlations.abs() > threshold].index)
    columns_to_delete = [column for column in data.columns
                         if column not in columns_to_keep]
    return data.drop(columns=columns_to_delete)

In [None]:
# Keep only the columns whose absolute correlation with value_eur exceeds 0.1.
dataset_df = keepOnlyDataOverAThreshold(dataset_df, 'value_eur', 0.1)
dataset_df.head()

In [None]:
# (rows, columns) remaining after the correlation filter.
dataset_df.shape

Let's remove all the rows which have N/A values

In [None]:
# Drop every row that contains at least one missing value.
dataset_df = dataset_df.dropna()

In [None]:
# (rows, columns) remaining after rows with missing values were dropped.
dataset_df.shape

Let's scale the dataset

In [None]:
# StandardScaler is imported with the other scikit-learn imports in the
# notebook's first cell rather than here, so all dependencies are visible up front.
columns = dataset_df.columns.to_list()

# Standardise every remaining column to zero mean and unit variance.
# NOTE(review): `columns` includes the target `value_eur`, so the MSE/RMSE
# reported later are in standardised units, not euros. The scaler is also
# fitted on all rows before the train/test split, so test rows influence the
# scaling statistics - consider fitting on the training split only.
scaler = StandardScaler()
dataset_df[columns] = scaler.fit_transform(dataset_df[columns])

dataset_df.head()

## 2. Training Various Models

First let's perform the train test split

In [None]:
# Separate the feature matrix from the regression target.
X = dataset_df.drop(columns=["value_eur"])
y = dataset_df["value_eur"]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### 2.1. Linear Regression

In [None]:
def linearReg(X_train, X_test, y_train, y_test):
  """Fit a LinearRegression on the training split and print test-set metrics.

  Prints the mean squared error, its square root, and the R-squared score of
  the predictions on `X_test` against `y_test`. Returns None.
  """
  predictions = LinearRegression().fit(X_train, y_train).predict(X_test)

  test_mse = mean_squared_error(y_test, predictions)
  report = (
      ('Mean Squared Error', test_mse),
      ('Root Mean Squared Error', np.sqrt(test_mse)),
      ('R-squared', r2_score(y_test, predictions)),
  )

  for label, value in report:
    print(f'{label}: {value}')

In [None]:
# Baseline: linear regression on the current train/test split.
linearReg(X_train, X_test, y_train, y_test)

#### 2.2. Stochastic Gradient Descent Regressor

In [None]:
def sgdRegressor(X_train, X_test, y_train, y_test, iterations=1000, random_state=42):
  """Fit an SGDRegressor on the training split and print test-set metrics.

  Args:
    X_train, y_train: training features and targets passed to `fit`.
    X_test, y_test: held-out features and targets used for the metrics.
    iterations: forwarded as `max_iter` to SGDRegressor. Defaults to 1000
      (scikit-learn's own default) so the function can be called without it.
    random_state: seed forwarded to SGDRegressor so that repeated runs print
      the same metrics. Pass None for an unseeded fit.

  Returns:
    None. The mean squared error, its square root and the R-squared score
    are printed.
  """
  model = SGDRegressor(max_iter=iterations, random_state=random_state)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_test, y_pred)

  print(f'Mean Squared Error: {mse}')
  print(f'Root Mean Squared Error: {rmse}')
  print(f'R-squared: {r2}')

In [None]:
# SGD regression on the current split; 10000 is passed through as max_iter.
sgdRegressor(X_train, X_test, y_train, y_test, 10000)

#### 2.3. Polynomial Regression

In [None]:
# Expand the features with every polynomial term up to degree 3 (no constant
# bias column), then split the expanded matrix with the same seed and test
# fraction as before.
# Note: this overwrites X_train / X_test / y_train / y_test; the decision-tree
# section re-creates them from the un-expanded X.
small_X = X
poly_features = PolynomialFeatures(
    degree=3,
    include_bias=False,
).fit_transform(small_X)

X_train, X_test, y_train, y_test = train_test_split(
    poly_features,
    y,
    test_size=0.2,
    random_state=42,
)

Linear Regression with polynomial features

In [None]:
# Same linear model, now fitted on the polynomial-feature split.
linearReg(X_train, X_test, y_train, y_test)

Stochastic Gradient Descent regression with polynomial features

In [None]:
# SGD regression on the polynomial-feature split. The iteration budget must be
# passed explicitly (omitting it raised a TypeError); 10000 matches the earlier
# SGD run so the two results are comparable.
sgdRegressor(X_train, X_test, y_train, y_test, 10000)

#### 2.4. Decision Tree

Let's first re-initialize the train and test datasets.

In [None]:
# Re-split the un-expanded feature matrix X (same test fraction and seed as the
# first split), replacing the polynomial-feature split created above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# create a regressor object 
regressor = DecisionTreeRegressor(random_state = 0)  
  
# fit the regressor with X and Y data 
regressor.fit(X_train, y_train) 

#test the regressor
y_reg = regressor.predict(X_test)

# evaluate model
var_y=y_test.var()
mse = mean_squared_error(y_test, y_reg)
nmse=mse/var_y
r2 = r2_score(y_test, y_reg)
print(f'MSE= {mse}, NMSE= {nmse}, R2= {r2}')

In [None]:
# Build a combined array `x` from the numeric columns and the label-encoded
# object-dtype columns of dataset_df.
# NOTE(review): `x` (and label_encoder / x_categorical / x_numerical) is not
# used anywhere else in this cell - the forest below is trained on X_train.
# These four lines look like leftovers; confirm no later cell needs them
# before removing.
label_encoder = LabelEncoder()
x_categorical = dataset_df.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = dataset_df.select_dtypes(exclude=['object']).values
x = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1).values

# Random forest of 10 trees with a fixed seed; out-of-bag scoring is enabled.
# NOTE(review): the out-of-bag score is never read in this cell.
regressor_rf = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)

# Fit the regressor on the training split (X_train, y_train).
regressor_rf.fit(X_train, y_train)

# Predict on the held-out split.
y_reg_rf = regressor_rf.predict(X_test)

# Evaluate: NMSE is the MSE divided by the variance of the test targets.
# Note: var_y, mse, nmse and r2 overwrite the decision-tree values of the same names.
var_y=y_test.var()
mse = mean_squared_error(y_test, y_reg_rf)
nmse=mse/var_y
r2 = r2_score(y_test, y_reg_rf)
print(f'MSE= {mse}, NMSE= {nmse}, R2= {r2}')

### 2.5. 