### Importation

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import joblib
import pickle as pkl

### Data prep and extraction

In [2]:
#loading the datasets
stats = pd.read_csv("male_players (legacy).csv")

  stats = pd.read_csv("male_players (legacy).csv")


In [3]:
stats.head()

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url
0,158023,/player/158023/lionel-messi/150002,15,2,2014-09-18,L. Messi,Lionel Andrés Messi Cuccittini,CF,93,95,...,62+3,62+3,62+3,54+3,45+3,45+3,45+3,54+3,15+3,https://cdn.sofifa.net/players/158/023/15_120.png
1,20801,/player/20801/c-ronaldo-dos-santos-aveiro/150002,15,2,2014-09-18,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"LW, LM",92,92,...,63+3,63+3,63+3,57+3,52+3,52+3,52+3,57+3,16+3,https://cdn.sofifa.net/players/020/801/15_120.png
2,9014,/player/9014/arjen-robben/150002,15,2,2014-09-18,A. Robben,Arjen Robben,"RM, LM, RW",90,90,...,64+3,64+3,64+3,55+3,46+3,46+3,46+3,55+3,14+3,https://cdn.sofifa.net/players/009/014/15_120.png
3,41236,/player/41236/zlatan-ibrahimovic/150002,15,2,2014-09-18,Z. Ibrahimović,Zlatan Ibrahimović,ST,90,90,...,65+3,65+3,61+3,56+3,55+3,55+3,55+3,56+3,17+3,https://cdn.sofifa.net/players/041/236/15_120.png
4,167495,/player/167495/manuel-neuer/150002,15,2,2014-09-18,M. Neuer,Manuel Peter Neuer,GK,90,90,...,40+3,40+3,36+3,36+3,38+3,38+3,38+3,36+3,87+3,https://cdn.sofifa.net/players/167/495/15_120.png


In [4]:
# dropping columns with more than 30% na values
threshold = 0.3 * stats.shape[0]
stats = stats.dropna(thresh=threshold, axis=1)
stats.drop(columns=['value_eur', 'wage_eur', 'release_clause_eur', 'international_reputation'], inplace=True)
# dropping the wage columns because they are rather dependant on the overall rating

### Subsetting data into the most important feautures

In [5]:
# picking numeric values for the independent variables only because they are generally more important in football rating
nums = stats.select_dtypes(include=np.number)

In [6]:
# imputing missing values in the numeric data
imputer = SimpleImputer(strategy='mean')
imputed_numeric_data = pd.DataFrame(imputer.fit_transform(nums), columns=nums.columns, index=nums.index)

In [7]:
# Separate the data into dependent and independent sets
y = imputed_numeric_data['overall']
X = imputed_numeric_data.drop(columns=['overall'])

In [8]:
# picking the top 13 most important features
important = SelectKBest(f_regression, k=13)
X_selected = important.fit_transform(X, y)

selected_features = X.columns[important.get_support()]
stats[selected_features]

Unnamed: 0,potential,age,shooting,passing,dribbling,physic,attacking_short_passing,skill_long_passing,skill_ball_control,movement_reactions,power_shot_power,mentality_vision,mentality_composure
0,95,27,89.0,86.0,96.0,63.0,89,76,96,94,80,90,
1,92,29,93.0,81.0,91.0,79.0,82,72,92,90,94,81,
2,90,30,86.0,83.0,92.0,64.0,86,76,90,89,86,84,
3,90,32,91.0,81.0,86.0,86.0,84,76,90,85,93,83,
4,90,28,,,,,42,41,31,89,42,20,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
161578,61,18,48.0,40.0,47.0,44.0,38,33,45,39,64,42,40.0
161579,58,19,25.0,29.0,34.0,57.0,30,25,32,42,33,31,35.0
161580,58,19,36.0,43.0,46.0,53.0,51,43,35,50,51,40,35.0
161581,70,17,50.0,36.0,46.0,42.0,40,26,43,45,42,44,43.0


In [9]:
# selected features
X = pd.DataFrame(X_selected, columns=selected_features, index=X.index)
column_names = X.columns.tolist()

In [10]:
# Print shape and column names
print("Shape of X:", X.shape)
print("Selected features:", column_names)

Shape of X: (161583, 13)
Selected features: ['potential', 'age', 'shooting', 'passing', 'dribbling', 'physic', 'attacking_short_passing', 'skill_long_passing', 'skill_ball_control', 'movement_reactions', 'power_shot_power', 'mentality_vision', 'mentality_composure']


### Training the models

In [11]:
# loading the test data set
players_22 = pd.read_csv("players_22.csv")

  players_22 = pd.read_csv("players_22.csv")


In [12]:
# splitting the data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# models
models = {
    'GradientBoosting': GradientBoostingRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'LinearRegression': LinearRegression()
}

In [14]:
# setting parameters
params = {
    'GradientBoosting': {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7]
    },
    'DecisionTree': {
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10]
    },
    'LinearRegression': {}
}

### Measuring performance and fine-tuning

In [15]:
# performing GridSearch
best_models = {}
for model_name in models:
    grid_search = GridSearchCV(models[model_name], params[model_name], cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

In [16]:
# evaluating models
for model_name in best_models:
    model = best_models[model_name]
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} - RMSE: {rmse}, MAE: {mae}, R2: {r2}")

GradientBoosting - RMSE: 1.1304731261386702, MAE: 0.7747315221243628, R2: 0.974215530918132
DecisionTree - RMSE: 2.1772134348278573, MAE: 1.5565700786963894, R2: 0.9043599665815039
LinearRegression - RMSE: 2.0756367535648024, MAE: 1.6090615319106163, R2: 0.9130758580143429


In [17]:
# evaluating models
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name}:\n RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

GradientBoosting:
 RMSE: 1.1305, MAE: 0.7747, R2: 0.9742
DecisionTree:
 RMSE: 2.1772, MAE: 1.5566, R2: 0.9044
LinearRegression:
 RMSE: 2.0756, MAE: 1.6091, R2: 0.9131


### Testing with FIFA22 data

In [18]:
players_22_processed = players_22.select_dtypes(include=[np.number]).dropna(axis=1)
common_features = list(set(X.columns) & set(players_22_processed.columns))   # checking common features

print("Common features:", common_features)

Common features: ['skill_ball_control', 'mentality_vision', 'power_shot_power', 'potential', 'movement_reactions', 'mentality_composure', 'age', 'attacking_short_passing', 'skill_long_passing']


In [19]:
# subseting
X_common = X[common_features]
X_test_22 = players_22_processed[common_features]

# making sure the "overall" variable is present
if 'overall' in players_22_processed.columns:
    y_test_22 = players_22_processed['overall']
else:
    raise KeyError("The 'overall' column is not found in the new season data.")

In [20]:
# retraining the models
models = {
    'GradientBoosting': GradientBoostingRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'LinearRegression': LinearRegression()}
best_models = {}

for model_name, model in models.items():
    X_train, X_val, y_train, y_val = train_test_split(X_common, y, test_size=0.2, random_state=42)   # splitting the data
    
    model.fit(X_train, y_train)   # training the model
    val_score = model.score(X_val, y_val)   # evaluate on validation set
    print(f"{model_name} - Validation R2 Score: {val_score:.4f}")
    
    best_models[model_name] = model

GradientBoosting - Validation R2 Score: 0.9551
DecisionTree - Validation R2 Score: 0.9290
LinearRegression - Validation R2 Score: 0.8998


In [21]:
# evaluating each model on the new dataset
for model_name, model in best_models.items():
    y_pred_22 = model.predict(X_test_22)
    
    rmse_22 = np.sqrt(mean_squared_error(y_test_22, y_pred_22))
    mae_22 = mean_absolute_error(y_test_22, y_pred_22)
    r2_22 = r2_score(y_test_22, y_pred_22)
    
    print(f"{model_name} for player22 subset:")
    print(f"RMSE: {rmse_22:.4f}, MAE: {mae_22:.4f}, R2: {r2_22:.4f}")
    print()

GradientBoosting for player22 subset:
RMSE: 1.4422, MAE: 1.0219, R2: 0.9561

DecisionTree for player22 subset:
RMSE: 0.8241, MAE: 0.2316, R2: 0.9857

LinearRegression for player22 subset:
RMSE: 2.2118, MAE: 1.6933, R2: 0.8966



In [22]:
# ensemble model
ensemble_model = VotingRegressor([
    ('gb', best_models['GradientBoosting']),
    ('dt', best_models['DecisionTree']),
    ('lr', best_models['LinearRegression'])])

# fitting the ensemble model
ensemble_model.fit(X_train, y_train)

# predicting based on the test set
y_pred_ensemble = ensemble_model.predict(X_val)

In [23]:
# evaluating the ensemble model
rmse_ensemble = np.sqrt(mean_squared_error(y_val, y_pred_ensemble))
mae_ensemble = mean_absolute_error(y_val, y_pred_ensemble)
r2_ensemble = r2_score(y_val, y_pred_ensemble)

print("Ensemble Model Performance on Validation Set:")
print(f"RMSE: {rmse_ensemble:.4f}, MAE: {mae_ensemble:.4f}, R2: {r2_ensemble:.4f}")

Ensemble Model Performance on Validation Set:
RMSE: 1.5485, MAE: 1.1454, R2: 0.9516


In [24]:
# testing on the FIFA22 data
y_pred_22_ensemble = ensemble_model.predict(X_test_22)

rmse_22_ensemble = np.sqrt(mean_squared_error(y_test_22, y_pred_22_ensemble))
mae_22_ensemble = mean_absolute_error(y_test_22, y_pred_22_ensemble)
r2_22_ensemble = r2_score(y_test_22, y_pred_22_ensemble)

print("Ensemble Model Performance on FIFA 22 data:")
print(f"RMSE: {rmse_22_ensemble:.4f}, MAE: {mae_22_ensemble:.4f}, R2: {r2_22_ensemble:.4f}")

Ensemble Model Performance on FIFA 22 data:
RMSE: 1.2045, MAE: 0.8864, R2: 0.9693


### Saving the best model

In [25]:
joblib.dump(ensemble_model, 'fifa_rating_predictor.pkl')

['fifa_rating_predictor.pkl']

### Creating preprocessing and prediction functions

In [26]:
# preprocessing function
def preprocess(data):
    input_df = pd.DataFrame([data])
    for feature in common_features:
        if feature not in input_df.columns:
            input_df[feature] = 0
    input_df = input_df[common_features]
    return input_df

In [27]:
# prediction function
def predict(data):
    processed_input = preprocess(data)
    prediction = model.predict(processed_input)
    return prediction[0]

### Deploying the model to Streamlit