In [2]:
import numpy as np
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [32]:
df = pd.read_csv('/content/drive/My Drive/Intro to AI/Midsem Project/players_21.csv')

# ***1. Data processing & feature extracting***

Before training the models, we cleanned the data in order to make it usable for the training processes. This process involved picking the parts of the dataset that would be most relevant to our purpose.



***Dropping columns with too many null values***

> We deleted the columns with a high number of null values, because keeping them would create inconsistency in the dataset, which would reduce the accuracy of the predictions.

>Our threshold was 30% null values, which we belive is adequate for a dataset of this size.


In [33]:
# Listing the columns with a high number of null values (over 30%)
column_names = list(df.columns.values)
nulls = []

for c in column_names:
  percentage_null = (df[c].isnull().sum() / len(df[c])) * 100

  if percentage_null >= 30.0:
    nulls.append(c)
    print(f"Null values in '{c}' column : {percentage_null:.2f}%")


Null values in 'club_loaned_from' column : 96.00%
Null values in 'nation_team_id' column : 94.05%
Null values in 'nation_position' column : 94.05%
Null values in 'nation_jersey_number' column : 94.05%
Null values in 'player_tags' column : 92.57%
Null values in 'player_traits' column : 56.11%
Null values in 'goalkeeping_speed' column : 89.00%
Null values in 'nation_logo_url' column : 94.05%


In [34]:
# Dropping the columns with a lot of null values
data = df.drop(nulls, axis=1)


***Processing the data***
>We make sure that all the data is numbers by encoding all data elements that are not, to make all the data usable by the models

>We used the factorize() method of the pandas library to do that.

In [35]:
# Imputing & transforming the data

# ** Isolating numerical & categorical data
numerical = data.select_dtypes(exclude=['object'])
categorical = data.select_dtypes(include=['object'])

# ** Factorizing categorical values (turning them into numbers)
cat_columns = list(categorical.columns.values)
for col in cat_columns:
  categorical[col], b = pd.factorize(categorical[col])

# ** Filling blanks in the numerical columns (imputing)
numerical.fillna(numerical.mean(), inplace =True)


In [36]:
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    18944 non-null  int64  
 1   overall                      18944 non-null  int64  
 2   potential                    18944 non-null  int64  
 3   value_eur                    18944 non-null  float64
 4   wage_eur                     18944 non-null  float64
 5   age                          18944 non-null  int64  
 6   height_cm                    18944 non-null  int64  
 7   weight_kg                    18944 non-null  int64  
 8   club_team_id                 18944 non-null  float64
 9   league_level                 18944 non-null  float64
 10  club_jersey_number           18944 non-null  float64
 11  club_contract_valid_until    18944 non-null  float64
 12  nationality_id               18944 non-null  int64  
 13  weak_foot       

In [37]:
categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 45 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   player_url        18944 non-null  int64
 1   short_name        18944 non-null  int64
 2   long_name         18944 non-null  int64
 3   player_positions  18944 non-null  int64
 4   dob               18944 non-null  int64
 5   club_name         18944 non-null  int64
 6   league_name       18944 non-null  int64
 7   club_position     18944 non-null  int64
 8   club_joined       18944 non-null  int64
 9   nationality_name  18944 non-null  int64
 10  preferred_foot    18944 non-null  int64
 11  work_rate         18944 non-null  int64
 12  body_type         18944 non-null  int64
 13  real_face         18944 non-null  int64
 14  ls                18944 non-null  int64
 15  st                18944 non-null  int64
 16  rs                18944 non-null  int64
 17  lw                18944 non-nul

In [38]:
# Putting the new dataset together
categorical = categorical.reset_index(drop=True)
numerical = numerical.reset_index(drop=True)

data = pd.concat([categorical, numerical], axis=1)


In [39]:
# Picking the appropriate attributes for the model (1/2)
X = data.drop('overall', axis=1)
y = data['overall']

X.drop([
        'dob', 'short_name', 'long_name', 'club_name', 'club_joined', 'player_url',
        'player_face_url', 'club_logo_url','wage_eur', 'sofifa_id', 'international_reputation'
       ],
       axis=1, inplace = True)


In [40]:
# Importing the models
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [41]:
# Fitting the models
lin_reg = LinearRegression()
lin_reg.fit(X, y)

rf_reg = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_reg.fit(X, y)

#kmn_reg = KernelRidge(kernel='rbf')
#kmn_reg.fit(X, y)

xgb_reg = XGBRegressor(n_estimators=100, n_jobs=-1)
xgb_reg.fit(X, y)

gb_reg = GradientBoostingRegressor(n_estimators=100)
gb_reg.fit(X, y)

bagging_reg = BaggingRegressor(estimator=LinearRegression(), n_estimators=100, n_jobs=-1)
bagging_reg.fit(X, y)


In [42]:
# Feature importance for RFRegressor
feature_importances = rf_reg.feature_importances_
sorted_feature_importances = np.argsort(feature_importances)[::-1]

# Print the most important features
for i in sorted_feature_importances[:15]:
  print(X.columns[i])


value_eur
release_clause_eur
age
potential
movement_reactions
gk
defending
club_flag_url
mentality_interceptions
league_name
goalkeeping_positioning
lam
attacking_crossing
goalkeeping_reflexes
goalkeeping_diving


In [43]:
# Feature importance for GBRegressor

feature_importances = gb_reg.feature_importances_
gb_reg.feature_importances_
sorted_feature_importances = np.argsort(feature_importances)[::-1]

# Print the most important features
for i in sorted_feature_importances[:15]:
  print(X.columns[i])


value_eur
movement_reactions
release_clause_eur
age
potential
attacking_crossing
defending
physic
goalkeeping_positioning
passing
defending_sliding_tackle
club_flag_url
goalkeeping_handling
rwb
lm


In [44]:
# Feature importance for XGBRegressor
feature_importances = xgb_reg.feature_importances_
sorted_feature_importances = np.argsort(feature_importances)[::-1]

# Print the most important features
for i in sorted_feature_importances[:15]:
  print(X.columns[i])


value_eur
release_clause_eur
movement_reactions
age
potential
defending
physic
attacking_crossing
lm
lb
lam
goalkeeping_positioning
ldm
passing
goalkeeping_reflexes


In [45]:
# *** Identifying columns with high a correlation to the overall rating
corr_matrix = X.corrwith(y)
corr_matrix.sort_values(ascending=False)

low_corr = corr_matrix[abs(corr_matrix)<0.5].index.tolist()
high_corr = []
for c in X.columns.values:
  if c not in low_corr:
    high_corr.append(c)

high_corr


['potential',
 'value_eur',
 'release_clause_eur',
 'passing',
 'dribbling',
 'attacking_short_passing',
 'movement_reactions',
 'power_shot_power',
 'mentality_vision',
 'mentality_composure']

In [46]:
# Final selection of features for the prediction
pred_columns = [
                'potential', 'passing', 'dribbling', 'defending', 'physic',
                'movement_reactions', 'power_shot_power', 'mentality_composure',
                'value_eur', 'release_clause_eur'
                ]
X = data[pred_columns]


In [47]:
# Scaling the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)


In [48]:
# Re-Fitting the models
lin_reg = LinearRegression()
lin_reg.fit(X, y)

rf_reg = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_reg.fit(X, y)

#kmn_reg = KernelRidge(kernel='rbf')
#kmn_reg.fit(X, y)

xgb_reg = XGBRegressor(n_estimators=100, n_jobs=-1)
xgb_reg.fit(X, y)

gb_reg = GradientBoostingRegressor(n_estimators=100)
gb_reg.fit(X, y)

bagging_reg = BaggingRegressor(estimator=LinearRegression(), n_estimators=100, n_jobs=-1)
bagging_reg.fit(X, y)


In [49]:
# Importing metrics for evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [50]:
# Importing metrics for evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluating the accuracy of each model
print("Accuracy Scores for each model: \n")

y_pred = lin_reg.predict(X)
print(lin_reg.__class__.__name__ + ":")
print("Mean Absolute error: " + str(mean_absolute_error(y_pred, y)))
print("Mean squared error: " + str(mean_squared_error(y_pred, y)))
print("R-squared Test: " + str(r2_score(y_pred, y)) + '\n')

y_pred = rf_reg.predict(X)
print(rf_reg.__class__.__name__ + ":")
print("Mean Absolute error: " + str(mean_absolute_error(y_pred, y)))
print("Mean squared error: " + str(mean_squared_error(y_pred, y)))
print(rf_reg.__class__.__name__ + ": " + str(r2_score(y_pred, y)) + '\n')

#y_pred = kmn_reg.predict(X)
#print(kmn_reg.__class__.__name__ + ":")
#print("Mean Absolute error: " + str(mean_absolute_error(y_pred, y)))
#print("Mean squared error: " + str(mean_squared_error(y_pred, y)))
#print("R-squared Test: " + str(r2_score(y_pred, y)))

y_pred = xgb_reg.predict(X)
print(xgb_reg.__class__.__name__ + ":")
print("Mean Absolute error: " + str(mean_absolute_error(y_pred, y)))
print("Mean squared error: " + str(mean_squared_error(y_pred, y)))
print("R-squared Test: " + str(r2_score(y_pred, y)) + '\n')

y_pred = gb_reg.predict(X)
print(gb_reg.__class__.__name__ + ":")
print("Mean Absolute error: " + str(mean_absolute_error(y_pred, y)))
print("Mean squared error: " + str(mean_squared_error(y_pred, y)))
print("R-squared Test: " + str(r2_score(y_pred, y)) + '\n')

y_pred = bagging_reg.predict(X)
print(bagging_reg.__class__.__name__ + ":")
print("Mean Absolute error: " + str(mean_absolute_error(y_pred, y)))
print("Mean squared error: " + str(mean_squared_error(y_pred, y)))
print("R-squared Test: " + str(r2_score(y_pred, y)) + '\n')


Accuracy Scores for each model: 

LinearRegression:
Mean Absolute error: 2.0003627965881443
Mean squared error: 6.571062047909092
R-squared Test: 0.8452347268824237

RandomForestRegressor:
Mean Absolute error: 0.20473975929054064
Mean squared error: 0.1258211307010135
RandomForestRegressor: 0.9973926095036525

XGBRegressor:
Mean Absolute error: 0.4853069123786849
Mean squared error: 0.4479333432709801
R-squared Test: 0.9907549165969389

GradientBoostingRegressor:
Mean Absolute error: 1.026409224098909
Mean squared error: 1.9134036869632396
R-squared Test: 0.9584548567430238

BaggingRegressor:
Mean Absolute error: 2.0003939149725025
Mean squared error: 6.571080827932227
R-squared Test: 0.8452383451594596



In [51]:
# Looking for the best number of estimators for the prefered models (RandomForest, GradientBoosting, and XGB regressors)
best_n_rf = 0
best_n_gb = 0
best_n_xgb = 0

best_score1 = 0
best_score2 = 0
best_score3 = 0



# *** For RandomForest ***
for n1 in range(150, 180):
  rf = RandomForestRegressor(n_estimators=n1, random_state=42, n_jobs=-1)
  rf.fit(X, y)
  pred = rf.predict(X)
  score = r2_score(pred, y)

  if score > best_score1:
    best_score1 = score
    best_n_rf = n1

print("Best n_estimator for RFRegressor: " + str(best_n_rf))
print("Score: " + str(best_score1) + "\n")



# *** For GradientBoosting ***
for n2 in range(150, 180):
  gb = GradientBoostingRegressor(n_estimators=n2)

  gb.fit(X, y)
  pred = gb.predict(X)
  score = r2_score(pred, y)

  if score > best_score2:
    best_score2 = score
    best_n_gb = n2

print("Best n_estimator for GBRegressor: " + str(best_n_gb))
print("Score: " + str(best_score2) + "\n")



# *** For XGB ***
for n3 in range(150, 180):
  xgb = XGBRegressor(n_estimators=n3, n_jobs=-1)

  xgb.fit(X, y)
  pred = xgb.predict(X)
  score = r2_score(pred, y)

  if score > best_score3:
    best_score3 = score
    best_n_xgb = n3

print("Best n_estimator for XGBRegressor: " + str(best_n_xgb))
print("Score: " + str(best_score3) + "\n")


Best n_estimator for RFRegressor: 177
Score: 0.9974515978358872

Best n_estimator for GBRegressor: 179
Score: 0.9684994658597459

Best n_estimator for XGBRegressor: 179
Score: 0.9941120473951067



In [53]:
# Cross validation Training - RandomForest model
from sklearn.model_selection import KFold


kfold = KFold(n_splits=5, shuffle=True)
rf_reg = RandomForestRegressor(n_estimators=best_n_rf, random_state=42, n_jobs=-1)
rf_mae_scores = []
rf_r2_scores = []


for train_index, test_index in kfold.split(X):

  Xtrain, Xtest = X[train_index], X[test_index]
  ytrain, ytest = y[train_index], y[test_index]

  rf_reg.fit(Xtrain, ytrain)
  y_pred = rf_reg.predict(Xtest)

  mae_scor = mean_absolute_error(y_pred, ytest)
  r2_scor = r2_score(y_pred, ytest)

  rf_mae_scores.append(mae_scor)
  rf_r2_scores.append(r2_scor)


avg_mae = np.mean(rf_mae_scores)
avg_r2 = np.mean(rf_r2_scores)

print('Average Cross Validation scores:')
print("MAE: " + str(avg_mae))
print("R2: " + str(avg_r2))


Average Cross Validation scores:
MAE: 0.5671469347662892
R2: 0.9800164715122828


In [54]:
# Cross validation testing - Voting Regressor Model
from sklearn.ensemble import VotingRegressor


models = [
          ('rf', RandomForestRegressor(n_estimators=best_n_rf, random_state=42)),
          ('xgb', XGBRegressor(n_estimators=best_n_xgb)),
          ('gbr', GradientBoostingRegressor(n_estimators=best_n_gb))
         ]

voting_reg = VotingRegressor(estimators=models, n_jobs=-1)
voting_mae_scores = []
voting_r2_scores = []


for train_index, test_index in kfold.split(X):

  Xtrain, Xtest = X[train_index], X[test_index]
  ytrain, ytest = y[train_index], y[test_index]

  voting_reg.fit(Xtrain, ytrain)
  y_pred = voting_reg.predict(Xtest)

  mae_scor = mean_absolute_error(y_pred, ytest)
  r2_scor = r2_score(y_pred, ytest)

  voting_mae_scores.append(mae_scor)
  voting_r2_scores.append(r2_scor)


avg_mae = np.mean(voting_mae_scores)
avg_r2 = np.mean(voting_r2_scores)

print('Average Cross Validation scores:')
print("MAE: " + str(avg_mae))
print("R2: " + str(avg_r2))


Average Cross Validation scores:
MAE: 0.641075796809699
R2: 0.9803705111687269


In [55]:
# Loading and treating the FIFA22 dataset
df2 = pd.read_csv('/content/drive/My Drive/Intro to AI/Midsem Project/players_22.csv')

# Data Selection
data2 = df2[pred_columns]
num = data2.select_dtypes(exclude=['object'])
cat = data2.select_dtypes(include=['object'])

# Data processing
cat_cols = list(cat.columns.values)
for col in cat_cols:
  cat[col], b = pd.factorize(cat[col])

num.fillna(num.mean(), inplace =True)

# New dataset
cat = cat.reset_index(drop=True)
num = num.reset_index(drop=True)
data2 = pd.concat([cat, num], axis=1)

# Prediction variables
X = scaler.fit_transform(data2)
y = df2['overall']


  df2 = pd.read_csv('/content/drive/My Drive/Intro to AI/Midsem Project/players_22.csv')


In [58]:
# Accuracy of the model on the FIFA22 dataset
print("Accuracy Scores for the Random Forest model on the FIFA 22 dataset: ")
y_pred = rf_reg.predict(X)
print("R-squared test: " + str(r2_score(y_pred, y)))
print("Mean Absolute Error: " + str(mean_absolute_error(y_pred, y)))


Accuracy Scores for the Random Forest model on the FIFA 22 dataset: 
R-squared test: 0.8444030967093235
Mean Absolute Error: 1.3444571599061814


In [59]:
# Accuracy of the model on the FIFA22 dataset
print("Accuracy Scores for the Voting Regressor model on the FIFA 22 dataset: ")
y_pred = voting_reg.predict(X)
print("R-squared test: " + ": " + str(r2_score(y_pred, y)))
print("Mean Absolute Error: " + ": " + str(mean_absolute_error(y_pred, y)))


Accuracy Scores for the Voting Regressor model on the FIFA 22 dataset: 
R-squared test: : 0.8517660373850923
Mean Absolute Error: : 1.3917321853737024


In [62]:
# Saving the model & scaler in files (for later use)
import pickle

with open('/content/drive/My Drive/Intro to AI/Midsem Project/rf_reg.pkl', 'wb') as f:
  pickle.dump(rf_reg, f)

with open('/content/drive/My Drive/Intro to AI/Midsem Project/scaler.pkl', 'wb') as f:
  pickle.dump(scaler, f)
