In [1]:
# Importing of necessary Python libraries and features to be used in the code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Loading of the datasets to be used for training**

```

```



In [5]:
training_data = pd.read_csv("/content/drive/MyDrive/male_players (legacy).csv" ,index_col = 0)

  training_data = pd.read_csv("/content/drive/MyDrive/male_players (legacy).csv" ,index_col = 0)


**This gives a brief description of what the data entails**

In [6]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 161583 entries, 158023 to 271608
Columns: 109 entries, player_url to player_face_url
dtypes: float64(18), int64(44), object(47)
memory usage: 135.6+ MB


**Dropping unnecessary columns**

In [7]:
columns_to_drop = ['player_url','player_face_url','fifa_update_date','fifa_version']
training_data = training_data.drop(columns_to_drop, axis=1 )


:**Displaying the numerical and categorical features**

In [8]:
num_cols = training_data.select_dtypes(include=['int64', 'float64']).columns
cat_cols = training_data.select_dtypes(include=['object']).columns

**Dropping columns with more than 30% threshold missing values in the training dataset**

In [9]:
missing_percentage_train = training_data.isnull().sum() / len(training_data) * 100
columns_to_drop_training_data = missing_percentage_train[missing_percentage_train > 30].index
training_data.drop(columns_to_drop_training_data, axis=1, inplace=True)

In [10]:
columns_to_drop_training_data

Index(['club_loaned_from', 'nation_team_id', 'nation_position',
       'nation_jersey_number', 'release_clause_eur', 'player_tags',
       'player_traits', 'goalkeeping_speed'],
      dtype='object')

In [11]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 161583 entries, 158023 to 271608
Data columns (total 97 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   fifa_update                     161583 non-null  int64  
 1   short_name                      161583 non-null  object 
 2   long_name                       161583 non-null  object 
 3   player_positions                161583 non-null  object 
 4   overall                         161583 non-null  int64  
 5   potential                       161583 non-null  int64  
 6   value_eur                       159530 non-null  float64
 7   wage_eur                        159822 non-null  float64
 8   age                             161583 non-null  int64  
 9   dob                             161583 non-null  object 
 10  height_cm                       161583 non-null  int64  
 11  weight_kg                       161583 non-null  int64  
 12  league_id       

**Separating training data into numeric and categorical features**

---



In [12]:
numeric_features_training_data = training_data.select_dtypes(include=['number'])
categorical_features_training_data = training_data.select_dtypes(exclude=['number'])

In [13]:
numeric_features_training_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 161583 entries, 158023 to 271608
Data columns (total 57 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   fifa_update                     161583 non-null  int64  
 1   overall                         161583 non-null  int64  
 2   potential                       161583 non-null  int64  
 3   value_eur                       159530 non-null  float64
 4   wage_eur                        159822 non-null  float64
 5   age                             161583 non-null  int64  
 6   height_cm                       161583 non-null  int64  
 7   weight_kg                       161583 non-null  int64  
 8   league_id                       159810 non-null  float64
 9   league_level                    158866 non-null  float64
 10  club_team_id                    159810 non-null  float64
 11  club_jersey_number              159810 non-null  float64
 12  club_contract_va

In [14]:
categorical_features_training_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 161583 entries, 158023 to 271608
Data columns (total 40 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   short_name        161583 non-null  object
 1   long_name         161583 non-null  object
 2   player_positions  161583 non-null  object
 3   dob               161583 non-null  object
 4   league_name       159810 non-null  object
 5   club_name         159810 non-null  object
 6   club_position     159810 non-null  object
 7   club_joined_date  150272 non-null  object
 8   nationality_name  161583 non-null  object
 9   preferred_foot    161583 non-null  object
 10  work_rate         161583 non-null  object
 11  body_type         161583 non-null  object
 12  real_face         161583 non-null  object
 13  ls                161583 non-null  object
 14  st                161583 non-null  object
 15  rs                161583 non-null  object
 16  lw                161583 non-null  obj

**Using SimpleImputer to impute missing values for both numeric and categorical features in the training data**

In [15]:
numeric_imputer = SimpleImputer(strategy='mean')
training_data[numeric_features_training_data.columns] = numeric_imputer.fit_transform(training_data[numeric_features_training_data.columns])

categorical_imputer = SimpleImputer(strategy='most_frequent')
training_data[categorical_features_training_data.columns] = categorical_imputer.fit_transform(training_data[categorical_features_training_data.columns])

**Encoding the categorical features in the columns with LabelEncoder**

In [17]:
label_encoder = LabelEncoder()
for column in categorical_features_training_data.columns:
    training_data[column] = training_data[column].astype(str)
    training_data[column] = label_encoder.fit_transform(training_data[column])

**Seperating the data into features(X) and target variable(y)**




In [18]:
X = training_data.drop(columns=['overall'])
y = training_data['overall']

**Performing correlation to check the features with high correlation to the target variable**

In [19]:
corr = X.corrwith(y)
sorted_corr = corr.abs().sort_values(ascending=False)
corr_list = list(zip(sorted_corr.index, sorted_corr.tolist()))

In [20]:
corr_df = pd.DataFrame(corr_list, columns=['Feature', 'Correlation'])

In [21]:
top_features = corr_df.head(7)['Feature'].tolist()
X_subset = X[top_features]
X_subset

Unnamed: 0_level_0,movement_reactions,potential,passing,wage_eur,mentality_composure,value_eur,dribbling
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
158023,94.0,95.0,86.000000,550000.0,57.816892,100500000.0,96.000000
20801,90.0,92.0,81.000000,375000.0,57.816892,79000000.0,91.000000
9014,89.0,90.0,83.000000,275000.0,57.816892,54500000.0,92.000000
41236,85.0,90.0,81.000000,275000.0,57.816892,52500000.0,86.000000
167495,89.0,90.0,56.938175,300000.0,57.816892,63500000.0,62.081872
...,...,...,...,...,...,...,...
269011,39.0,61.0,40.000000,700.0,40.000000,110000.0,47.000000
269019,42.0,58.0,29.000000,750.0,35.000000,110000.0,34.000000
271093,50.0,58.0,43.000000,500.0,35.000000,110000.0,46.000000
271555,45.0,70.0,36.000000,500.0,43.000000,150000.0,46.000000


**Scaling the data**

In [22]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [23]:
X_subset_scaled = scaler.fit_transform(X_subset)

**Dividing the data into training and testing sets for cross-validation**

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Performing hyperparameter tuning/ evaluation of XGBoost Regressor model using Cross-Validation**

In [26]:
from xgboost import XGBRegressor
xgb_model = XGBRegressor(random_state=42)

#Specifying the number of splits for the K-fold cross-validation
no_of_splits = 3

params = {'n_estimators': [10, 20, 30],'learning_rate': [0.1, 0.01, 0.001]}
xgb_grid = GridSearchCV(xgb_model, params, cv = no_of_splits, scoring='neg_mean_absolute_error', n_jobs=-1)

xgb_grid.fit(X_train, y_train)
xgb_model = xgb_grid.best_estimator_

y_pred = xgb_grid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print('Cross-Validation Mean Squared Error Scores for XGBoost:', mse)
print('Cross-Validation Mean Absolute Error Scores for XGBoost:', mae)
print('Cross-Validation R-squared Scores for XGBoost:', r2)

Cross-Validation Mean Squared Error Scores for XGBoost: 0.7732190990952504
Cross-Validation Mean Absolute Error Scores for XGBoost: 0.6390315325952097
Cross-Validation R-squared Scores for XGBoost: 0.9843994366634271


**Performing Hyperparameter tuning/ evaluation of RandomForestRegressor model using Cross-Validation** **bold text**

In [30]:
from sklearn.ensemble import RandomForestRegressor
rfr_model = RandomForestRegressor(random_state=42)

#Specifying the number of splits for the K-fold cross validation
no_of_splits = 3


params = {'n_estimators': [10, 20, 30],'max_depth':[10,30,40]}
rfr_grid = GridSearchCV(rfr_model, params, cv = no_of_splits, scoring='neg_mean_absolute_error', n_jobs=-1)

rfr_grid.fit(X_train, y_train)
rfr_model = rfr_grid.best_estimator_

y_pred = rfr_grid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print('Cross-Validation Mean Squared Error Scores for RandomForestRegressor:', mse)
print('Cross-Validation Mean Absolute Error Scores for RandomForestRegressor:', mae)
print('Cross-Validation R-squared Scores for RandomForestRegressor:', r2)

Cross-Validation Mean Squared Error Scores for RandomForestRegressor: 0.24943889641816144
Cross-Validation Mean Absolute Error Scores for RandomForestRegressor: 0.28179449568523623
Cross-Validation R-squared Scores for RandomForestRegressor: 0.9949672902457664


In [28]:
from sklearn.ensemble import GradientBoostingRegressor
gbr_model = GradientBoostingRegressor(random_state=42)

#Specifying the number of splits for the K-fold cross validation
no_of_splits = 3


params = {'n_estimators': [10, 20, 30],'learning_rate': [0.1, 0.01, 0.001]}
gbr_grid = GridSearchCV(gbr_model, params, cv = no_of_splits, scoring='neg_mean_absolute_error', n_jobs=-1)

gbr_grid.fit(X_train, y_train)
gbr_model = gbr_grid.best_estimator_

y_pred = gbr_grid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print('Cross-Validation Mean Squared Error Scores for GradientBoostRegressor:', mse)
print('Cross-Validation Mean Absolute Error Scores for GradientBoostRegressor:', mae)
print('Cross-Validation R-squared Scores for GradientBoostRegressor:', r2)

Cross-Validation Mean Squared Error Scores for GradientBoostRegressor: 2.095194130229662
Cross-Validation Mean Absolute Error Scores for GradientBoostRegressor: 1.05838765079411
Cross-Validation R-squared Scores for GradientBoostRegressor: 0.9577271063669922


In [32]:
import joblib

#Saving the trained model
joblib.dump(rfr_model, 'Player Ratings.pkl')

['Player Ratings.pkl']

In [33]:
#Saving the scaler
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

**Loading of the players_22-1 dataset**

In [34]:
testing_data = pd.read_csv("/content/drive/MyDrive/players_22-1.csv",)

  testing_data = pd.read_csv("/content/drive/MyDrive/players_22-1.csv",)


In [35]:
testing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Columns: 110 entries, sofifa_id to nation_flag_url
dtypes: float64(16), int64(44), object(50)
memory usage: 16.1+ MB


**Dropping unneccessary columns**

In [36]:
columns_to_drop = ['player_url','player_face_url','club_logo_url','club_flag_url','nation_logo_url','nation_flag_url']
testing_data = testing_data.drop(columns_to_drop, axis=1 )

In [37]:
#Dropping columns with more than 30% of their data missing
missing_percentage_test = testing_data.isnull().sum() / len(testing_data) * 100
columns_to_drop_testing_data = missing_percentage_test[missing_percentage_test > 30].index
testing_data.drop(columns_to_drop_testing_data, axis=1, inplace=True)

In [38]:
columns_to_drop_testing_data

Index(['club_loaned_from', 'nation_team_id', 'nation_position',
       'nation_jersey_number', 'player_tags', 'player_traits',
       'goalkeeping_speed'],
      dtype='object')

**Separating the testing dataset into numeric and categorical features**

---



In [42]:
numeric_features_testing_data = testing_data.select_dtypes(include=['number'])
categorical_features_testing_data = testing_data.select_dtypes(exclude=['number'])

**Imputing the missing features of the testing dataset.**

In [43]:
numeric_imputer = SimpleImputer(strategy='mean')
testing_data[numeric_features_testing_data.columns] = numeric_imputer.fit_transform(testing_data[numeric_features_testing_data.columns])

categorical_imputer = SimpleImputer(strategy='most_frequent')
testing_data[categorical_features_testing_data.columns] = categorical_imputer.fit_transform(testing_data[categorical_features_testing_data.columns])

In [44]:
#testing_data[categorical_features_testing_data.columns] = pd.DataFrame(training_data[categorical_features_training_data.columns], columns=training_data.columns)
#testing_data[categorical_features_testing_data.columns] = pd.DataFrame(training_data[categorical_features_training_data.columns],
                                                                     # columns=categorical_features_testing_data.columns)

testing_data = testing_data.reset_index(drop=True)
training_data = training_data.reset_index(drop=True)

# Now perform the column replacement
testing_data[categorical_features_testing_data.columns] = pd.DataFrame(training_data[categorical_features_training_data.columns],
                                                                      columns=categorical_features_testing_data.columns)


**Encoding the Categorical Features with LabelEncoder**

In [45]:
label_encoder = LabelEncoder()
for column in categorical_features_testing_data.columns:
    testing_data[column] = testing_data[column].astype(str)
    testing_data[column] = label_encoder.fit_transform(testing_data[column])


In [46]:
X_players22 = testing_data.drop(columns=['overall'])
y_players22 = testing_data['overall']

In [47]:
X_players22.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 96 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    19239 non-null  float64
 1   short_name                   19239 non-null  int64  
 2   long_name                    19239 non-null  int64  
 3   player_positions             19239 non-null  int64  
 4   potential                    19239 non-null  float64
 5   value_eur                    19239 non-null  float64
 6   wage_eur                     19239 non-null  float64
 7   age                          19239 non-null  float64
 8   dob                          19239 non-null  int64  
 9   height_cm                    19239 non-null  float64
 10  weight_kg                    19239 non-null  float64
 11  club_team_id                 19239 non-null  float64
 12  club_name                    19239 non-null  int64  
 13  league_name     

In [48]:
fitted_feature_names = rfr_grid.feature_names_in_

# Check the current feature names in X_players22
current_feature_names = X_players22.columns

# Rename columns in X_players22 if necessary
X_players22 = X_players22.rename(columns={
    'sofifa_id': 'league_id',
    'club_joined': 'club_joined_date',
    'club_contract_valid_until': 'club_contract_valid_until_year',
    'release_clause_eur': 'fifa_update'
})

# Add missing columns with default values (e.g., NaN or a suitable default)
for feature in fitted_feature_names:
    if feature not in X_players22.columns:
        X_players22[feature] = pd.NA  # or an appropriate default value

# Reorder columns to match the fitted feature names
X_players22 = X_players22[fitted_feature_names]

# Ensure there are no extra columns
X_players22 = X_players22.loc[:, fitted_feature_names]

# Now, predict
y_pred_players22 = rfr_grid.predict(X_players22)


***The RandomForestRegressor model was chosen as the best model since compared to the other models, it had the lowest Mean Absolute Score. ***

**Saving the y_test and y_pred values in a CSV file**

In [49]:
data = {'y_test': y_players22, 'y_pred': y_pred_players22}
new_df = pd.DataFrame(data)
new_df.to_csv('y_test_and_y_pred.csv', index=False)

**Evaluating the model's performance on the player_22 dataset with metrics such as MSE, MAE and R squared scores**

In [50]:
mse_players22 = mean_squared_error(y_players22, y_pred_players22)
mae_players22 = mean_absolute_error(y_players22, y_pred_players22)
r2_players22 = r2_score(y_players22, y_pred_players22)
print('Mean Squared Error for Players_22 dataset:', mse_players22)
print('Mean Absolute Error for Players_22 dataset:', mae_players22)
print('R-squared score for Players_22 dataset:', r2_players22)


Mean Squared Error for Players_22 dataset: 12.26564146874013
Mean Absolute Error for Players_22 dataset: 2.5475706336721538
R-squared score for Players_22 dataset: 0.7408765383785201
