In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold,GridSearchCV
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Data Preparation and feature extraction



In [2]:
fifa = pd.read_csv('/content/drive/My Drive/Colab Notebooks/players_21.csv')


In [3]:
fifa22 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/players_22.csv')

  fifa22 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/players_22.csv')


In [None]:
fifa.head()

In [5]:
#Getting information on each column to see which ones to drop based on the number
#of items under each column
fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Columns: 110 entries, sofifa_id to nation_flag_url
dtypes: float64(16), int64(44), object(50)
memory usage: 15.9+ MB


In [None]:
#List all the columns in the dataset
column_names = fifa.columns.tolist()
column_names

In [7]:
#Removing the unecessary columns
columns_to_drop=['long_name','player_url','dob','club_team_id',
                 'club_team_id','club_contract_valid_until',
                 'nationality_id','nation_position','release_clause_eur','player_tags',
                 'player_traits','player_face_url','club_logo_url',
                 'club_flag_url','nation_logo_url','nation_flag_url','nation_jersey_number',
                 'club_loaned_from','nation_team_id','goalkeeping_speed']

fifa.drop(columns=columns_to_drop, axis=1, inplace=True)

In [None]:
fifa.info()

In [9]:
corr_matrix = fifa.corr()

  corr_matrix = fifa.corr()


In [None]:
corr_matrix

In [None]:
#checking for the correlation between the rating and the other variables
corr_matrix['overall']

In [12]:
#Columns to drop after looking at the correlations with the overall rating
drop_columns = ['sofifa_id','height_cm','weak_foot', 'skill_moves','pace','attacking_finishing',
                'attacking_heading_accuracy','attacking_volleys','skill_dribbling',
                'skill_fk_accuracy','movement_acceleration','movement_sprint_speed',
                'movement_agility','movement_balance','power_jumping','power_stamina',
                'power_strength','mentality_interceptions','mentality_positioning',
                'mentality_penalties','defending_marking_awareness','defending_standing_tackle',
                'defending_sliding_tackle','goalkeeping_diving','goalkeeping_handling',
                'goalkeeping_positioning','goalkeeping_reflexes','sofifa_id','league_level',
                'club_jersey_number','goalkeeping_kicking','club_name','league_name','club_joined','short_name',
                'ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm','lwb','ldm',
                'cdm','rdm','rwb','lb','lcb','cb','rcb','rb','player_positions','club_position','nationality_name','preferred_foot',
                'work_rate','body_type','real_face','gk','power_long_shots','mentality_aggression',
                'skill_ball_control','skill_long_passing','skill_curve','attacking_crossing','player_positions','club_position','nationality_name','preferred_foot',
                 'work_rate','body_type','real_face','gk','power_long_shots','mentality_aggression',
                 'skill_ball_control','skill_long_passing','skill_curve','attacking_crossing']

fifa.drop(columns=drop_columns, axis=1, inplace=True)

In [None]:
fifa.info()

In [14]:
#Identify the columns with missing values
columns_with_missing_values = fifa.columns[fifa.isnull().any()]

In [15]:
columns_with_missing_values

Index(['value_eur', 'wage_eur', 'shooting', 'passing', 'dribbling',
       'defending', 'physic'],
      dtype='object')

In [16]:
#impute the missing values
for column in columns_with_missing_values:
  if fifa[column].dtype == 'object':
    fifa[column].fillna(fifa[column].mode()[0], inplace=True)
  else:
    fifa[column].fillna(fifa[column].mean(),inplace=True)

In [None]:
fifa.info()

In [18]:
#X contains the independent varaibles and y is the dependent variable
dependent_variable = 'overall'
X = fifa.drop(dependent_variable,axis=1)
y = fifa[dependent_variable]

In [None]:
fifa.head()

In [None]:
#verify that the dataframe is updated
fifa.info()

#Feature Engineering


In [21]:
#Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [22]:
#Convert scaled features back to a dataframe
new_fifa = pd.DataFrame(X_scaled,columns=X.columns)

In [23]:
new_fifa

Unnamed: 0,potential,value_eur,wage_eur,age,weight_kg,international_reputation,shooting,passing,dribbling,defending,physic,attacking_short_passing,movement_reactions,power_shot_power,mentality_vision,mentality_composure
0,3.586563,13.073165,27.857178,1.655055,-0.427506,10.801035,3.009606,3.493625,3.433013,-0.860418,0.058838,2.216206,3.554438,2.119026,3.000047,3.137573
1,3.422893,7.809992,10.662997,2.080838,1.131240,10.801035,3.085367,2.461857,2.800094,-1.054260,1.363898,1.598583,3.664174,2.719163,2.052781,3.055051
2,3.259222,14.047827,11.674420,1.229273,0.706127,8.037314,2.933845,2.152326,2.378148,-0.537348,1.907673,1.735833,3.444701,2.344077,1.834181,2.477402
3,3.259222,16.776879,13.191553,0.590598,-0.994323,10.801035,2.479280,2.977741,3.327526,-0.989646,-0.593692,1.941707,3.225227,1.668922,2.635714,2.890009
4,3.259222,16.387015,18.248665,0.803490,-0.710915,8.037314,2.555040,3.699979,2.694607,0.819544,1.472653,2.422080,3.225227,2.494112,2.927180,2.724966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,-3.123941,-0.368070,-0.412078,-0.899642,-0.710915,-0.253847,-2.217897,-3.212867,-3.740069,-0.085051,-1.789997,-1.901280,-1.493455,-1.706851,-2.100615,-1.483625
18940,-2.960270,-0.368070,-0.412078,-0.899642,-0.994323,-0.253847,-1.536049,-0.839801,-1.630339,-0.537348,-1.789997,-0.254285,-1.273981,-1.481799,-0.060350,-1.896232
18941,-3.942295,-0.371319,-0.361507,0.590598,0.564423,-0.253847,-1.157245,-0.839801,-1.735826,-0.731190,-0.593692,-0.185661,-1.932402,-0.656610,-0.716150,-1.896232
18942,-0.668878,-0.360272,-0.437363,-1.751207,-2.411365,-0.253847,-0.475396,-1.768392,-0.997420,-2.281924,-2.660037,-0.940533,-0.944771,-0.581593,-0.351817,-1.071018


In [24]:
new_fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   potential                 18944 non-null  float64
 1   value_eur                 18944 non-null  float64
 2   wage_eur                  18944 non-null  float64
 3   age                       18944 non-null  float64
 4   weight_kg                 18944 non-null  float64
 5   international_reputation  18944 non-null  float64
 6   shooting                  18944 non-null  float64
 7   passing                   18944 non-null  float64
 8   dribbling                 18944 non-null  float64
 9   defending                 18944 non-null  float64
 10  physic                    18944 non-null  float64
 11  attacking_short_passing   18944 non-null  float64
 12  movement_reactions        18944 non-null  float64
 13  power_shot_power          18944 non-null  float64
 14  mental

#Training Models

In [25]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [26]:
X_train

array([[-1.15989095, -0.36417109, -0.43736346, ..., -1.70685085,
        -1.44481562, -3.0515316 ],
       [ 0.47681747, -0.33493124, -0.42472068, ..., -1.25674773,
         0.15824944, -0.49336849],
       [-0.66887843, -0.24721169, -0.10865118, ...,  0.99376783,
         0.88691538,  1.3221021 ],
       ...,
       [ 1.62251336,  0.07767555, -0.10865118, ...,  0.91875064,
         0.74118219,  0.4968882 ],
       [ 1.13150084,  1.70211173,  1.20619792, ...,  0.16857879,
         1.54271472,  0.90949515],
       [-1.97824516, -0.34727696, -0.36150678, ..., -0.28152432,
         0.15824944, -0.08076154]])

#GridSearch

In [30]:
# Step 3: Create a Parameter Grid
cv=KFold(n_splits=5)
rf_parameters = {
   'n_estimators': [200, 300, 400],
   'min_samples_split': [2, 5, 10],
    'max_depth': [None, 10, 20, 30]
}

xgb_parameters = {
   'n_estimators': [200, 300, 400],
   'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [None, 10, 20, 30]
}

gb_parameters = {
   'n_estimators': [200, 300, 400],
   'learning_rate': [0.01, 0.1, 0.2],
   'max_depth': [None, 10, 20, 30]
}

In [31]:
rf_regressor = RandomForestRegressor()
xgb_model =  xgb.XGBRegressor()
gradient_boost = GradientBoostingRegressor()

rf_grid_search = GridSearchCV(estimator=rf_regressor, param_grid=rf_parameters, scoring='neg_mean_squared_error', cv=5)
rf_grid_search.fit(X_train, y_train)

xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_parameters, scoring='neg_mean_squared_error', cv=5)
xgb_grid_search.fit(X_train, y_train)

gb_grid_search = GridSearchCV(estimator=gradient_boost, param_grid=gb_parameters, scoring='neg_mean_squared_error', cv=5)
gb_grid_search.fit(X_train, y_train)

In [32]:
#Best parameters from the hypertuning process for RandomForestRegressor
rf_best_params = rf_grid_search.best_params_
rf_best_params

{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 400}

In [33]:
#Best parameters from the hypertuning process for XGBoostingRegressor
xgb_best_params = xgb_grid_search.best_params_
xgb_best_params

{'learning_rate': 0.1, 'max_depth': None, 'n_estimators': 400}

In [34]:
#Best parameters from the hypertuning process for GradientBoostingRegressor
gb_best_params = gb_grid_search.best_params_
gb_best_params

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300}

#Cross Validation

In [37]:
rf_cv_scores = cross_val_score(rf_regressor,X_train,y_train,cv=cv,scoring='neg_mean_squared_error')
rf_rmse= np.sqrt(-rf_cv_scores.mean())
print(rf_rmse)

xgb_cv_scores = cross_val_score(xgb_model,X_train,y_train,cv=cv,scoring='neg_mean_squared_error')
xgb_rmse= np.sqrt(-xgb_cv_scores.mean())
print(xgb_rmse)

gb_cv_scores = cross_val_score(gradient_boost,X_train,y_train,cv=cv,scoring='neg_mean_squared_error')
gb_rmse= np.sqrt(-gb_cv_scores.mean())
print(gb_rmse)

0.5483116764124828
0.5523406125642079
0.8118976693311226


In [38]:
#Retraining models with best hyperarameters
rf_regressor.fit(X_train,y_train)

xgb_model.fit(X_train,y_train)

gradient_boost.fit(X_train,y_train)

In [39]:
final_rf_pred=np.round(rf_regressor.predict(X_test))

final_xgb_pred=np.round(xgb_model.predict(X_test))

final_gb_pred=np.round(gradient_boost.predict(X_test))

In [43]:
from sklearn.metrics import r2_score

rf_mae=mean_absolute_error(y_test,final_rf_pred)
rf_rmse=np.sqrt(mean_squared_error(y_test,final_rf_pred))
rf_r2=r2_score(y_test,final_rf_pred)
print(rf_mae)
print(rf_rmse)
print(rf_r2)

xgb_mae=mean_absolute_error(y_test,final_xgb_pred)
xgb_rmse=np.sqrt(mean_squared_error(y_test,final_xgb_pred))
xgb_r2=r2_score(y_test,final_xgb_pred)
print(xgb_mae)
print(xgb_rmse)
print(xgb_r2)

gb_mae=mean_absolute_error(y_test,final_gb_pred)
gb_rmse=np.sqrt(mean_squared_error(y_test,final_gb_pred))
gb_r2=r2_score(y_test,final_gb_pred)
print(gb_mae)
print(gb_rmse)
print(gb_r2)

0.2301398785959356
0.5729911147914787
0.993034496705326
0.24096067564001056
0.5444118571963732
0.9937120094855957
0.5143837424122459
0.8548682256896352
0.9844955959622568


In [44]:
import pickle

best_model = xgb_model
with open('best_model.pkl', 'wb') as file:
  pickle.dump(best_model, file)

with open('scaler.pkl', 'wb') as file:
  pickle.dump(scaler, file)



```
# This is formatted as code
```

#Testing the fifa_22 model with

In [None]:
#List all the columns in the dataset
column_names = fifa22.columns.tolist()
column_names

In [46]:
#Removing the unecessary columns
columns_to_drop22=['long_name','club_team_id','player_url','dob','club_team_id',
                 'club_team_id','club_contract_valid_until',
                 'nationality_id','nation_position','release_clause_eur','player_tags',
                 'player_traits','player_face_url','club_logo_url',
                 'club_flag_url','nation_logo_url','nation_flag_url','nation_jersey_number',
                 'club_loaned_from','nation_team_id','goalkeeping_speed']

fifa22.drop(columns=columns_to_drop22, axis=1, inplace=True)

In [47]:
#Columns to drop after looking at the correlations with the overall rating
drop_columns22 = ['sofifa_id','height_cm','weak_foot', 'skill_moves','pace','attacking_finishing',
                'attacking_heading_accuracy','attacking_volleys','skill_dribbling',
                'skill_fk_accuracy','movement_acceleration','movement_sprint_speed',
                'movement_agility','movement_balance','power_jumping','power_stamina',
                'power_strength','mentality_interceptions','mentality_positioning',
                'mentality_penalties','defending_marking_awareness','defending_standing_tackle',
                'defending_sliding_tackle','goalkeeping_diving','goalkeeping_handling',
                'goalkeeping_positioning','goalkeeping_reflexes','sofifa_id','league_level',
                'club_jersey_number','goalkeeping_kicking','club_name','league_name','club_joined','short_name']

fifa22.drop(columns=drop_columns22, axis=1, inplace=True)

In [48]:
#Identify the columns with missing values
columns_with_missing_values22 = fifa22.columns[fifa22.isnull().any()]

In [49]:
columns_with_missing_values22

Index(['value_eur', 'wage_eur', 'club_position', 'shooting', 'passing',
       'dribbling', 'defending', 'physic'],
      dtype='object')

In [50]:
#impute the missing values
for column in columns_with_missing_values22:
  if fifa22[column].dtype == 'object':
    fifa22[column].fillna(fifa22[column].mode()[0], inplace=True)
  else:
    fifa22[column].fillna(fifa22[column].mean(),inplace=True)

In [51]:
#Identify the categorical variables
categorical22 = [col for col in fifa22.columns if fifa22[col].dtype == 'object']

In [52]:
#Encode categorical variables
label_encoder22 = LabelEncoder()
for col in categorical22:
  fifa22[col] = label_encoder22.fit_transform(fifa22[col])

In [None]:
#verify that the dataframe is updated
fifa22.info()

In [54]:
#X contains the independent varaibles and y is the dependent variable
dependent_variable = 'overall'
X22 = fifa22.drop(dependent_variable,axis=1)
y22 = fifa22[dependent_variable]

In [55]:
#Scale the data
scaler22 = StandardScaler()
scaled_features22 = scaler22.fit_transform(X22)

In [56]:
#Convert scaled features back to a dataframe
new_fifa22 = pd.DataFrame(scaled_features22, columns=X22.columns)

In [57]:
new_fifa22.insert(0, 'overall', y22)

In [None]:
new_fifa22

In [60]:
column_dropping22 = ['player_positions','club_position','nationality_name','preferred_foot',
                   'work_rate','body_type','real_face','gk','power_long_shots','mentality_aggression',
                   'skill_ball_control','skill_long_passing','skill_curve','attacking_crossing','ls','st',
                   'rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm','lwb','ldm',
                   'cdm','rdm','rwb','lb','lcb','cb','rcb','rb']

new_fifa22.drop(columns=column_dropping22, axis=1, inplace=True)

In [None]:
new_fifa22.info()

In [62]:
X_2022 = new_fifa22.drop(columns=['overall'], axis = 1)
Y_2022 = new_fifa22['overall']

In [64]:
best_model = xgb_model

In [65]:
y_pred_2022 = best_model.predict(X_2022)
rmse_2022 = np.sqrt(mean_squared_error(y_pred_2022,Y_2022))
mae_2022 = mean_absolute_error(y_pred_2022,Y_2022)

print('RMSE for best model on 2022 data', rmse_2022)
print('MAE for best model on 2022 data',mae_2022)

results = pd.DataFrame({'Predicted': np.round(y_pred_2022), 'Actual': Y_2022})

results

RMSE for best model on 2022 data 0.6995317675406777
MAE for best model on 2022 data 0.5338447698096153


Unnamed: 0,Predicted,Actual
0,90.0,93
1,91.0,92
2,90.0,91
3,90.0,91
4,91.0,91
...,...,...
19234,47.0,47
19235,49.0,47
19236,49.0,47
19237,50.0,47
