In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectFromModel
from category_encoders import TargetEncoder

In [38]:
def preprocess_data(file_path, n_features=1000):
    # Load the CSV file
    fifdb = pd.read_csv(file_path)
    
    print(f"Initial shape: {fifdb.shape}")
    
    # Dropping specified columns
    columns_to_drop = [
        'player_url', 'value_eur', 'wage_eur', 'player_face_url', 'club_flag_url', 'nation_logo_url', 
        'nation_flag_url', 'club_logo_url', 'club_team_id'
    ]
    new_fifdb = fifdb.drop(columns_to_drop, axis=1)
    
    print(f"Shape after dropping columns: {new_fifdb.shape}")
    
    # Extracting the 'overall' column as the target variable (y)
    y = new_fifdb['overall']
    
    # Dropping 'overall' from the features
    new_fifdb.drop('overall', axis=1, inplace=True)
    
    # Identify numeric and categorical columns
    numeric_features = new_fifdb.select_dtypes(include=[np.number]).columns
    categorical_features = new_fifdb.select_dtypes(include=['object']).columns
    
    print(f"Number of numeric features: {len(numeric_features)}")
    print(f"Number of categorical features: {len(categorical_features)}")
    
    # Handle high-cardinality categorical variables
    for col in categorical_features:
        if new_fifdb[col].nunique() > 100:  # Adjust this threshold as needed
            top_100 = new_fifdb[col].value_counts().nlargest(100).index
            new_fifdb[col] = new_fifdb[col].where(new_fifdb[col].isin(top_100), 'Other')
    
    # Defining pipelines for numeric and categorical data
    numeric_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("standardize", StandardScaler())
    ])
    
    categorical_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoding", TargetEncoder())
    ])
    
    # Combining pipelines into a ColumnTransformer
    preprocessor = ColumnTransformer([
        ("num", numeric_pipe, numeric_features),
        ("cat", categorical_pipe, categorical_features)
    ])
    
    # Feature selection
    selector = SelectFromModel(RandomForestRegressor(n_estimators=100, random_state=42), 
                               max_features=n_features, threshold=-np.inf)
    
    # Combine preprocessing and feature selection
    full_pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("select", selector)
    ])
    
    # Fit and transform the data
    X = full_pipeline.fit_transform(new_fifdb, y)
    
    print(f"Final shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")
    
    return X, y, full_pipeline

In [40]:
X, y, pipeline = preprocess_data('players_21.csv', n_features=10)

Initial shape: (18944, 110)
Shape after dropping columns: (18944, 101)
Number of numeric features: 56
Number of categorical features: 44
Final shape of X: (18944, 10)
Shape of y: (18944,)


In [41]:
print(X, y)

[[ 3.58656347  1.65505537 13.6766265  ... 69.20293324 69.20293324
  71.15777312]
 [ 3.42289263  2.08083818  7.31050289 ... 69.20293324 69.20293324
  70.1066804 ]
 [ 3.25922178  1.22927256 13.02473545 ... 69.85938385 69.85938385
  70.1066804 ]
 ...
 [-3.94229527  0.59059834 -0.41573031 ... 61.84210527 61.84210527
  72.92971246]
 [-0.66887843 -1.75120712 -0.40371107 ... 60.95833333 60.95833333
  57.14486902]
 [-0.66887843 -1.75120712 -0.40371107 ... 60.95833333 60.95833333
  61.79867987]] 0        93
1        92
2        91
3        91
4        91
         ..
18939    47
18940    47
18941    47
18942    47
18943    47
Name: overall, Length: 18944, dtype: int64


In [44]:
X.shape

(18944, 10)

In [46]:
y.shape

(18944,)

In [303]:
y.iloc[25:50]

25    88
26    88
27    88
28    87
29    87
30    87
31    87
32    87
33    87
34    87
35    87
36    87
37    87
38    87
39    87
40    87
41    87
42    87
43    87
44    87
45    87
46    87
47    87
48    86
49    86
Name: overall, dtype: int64

### Train-Test Split

In [48]:
from sklearn.model_selection import train_test_split

In [50]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [54]:
dt = DecisionTreeClassifier(criterion='entropy')
knn = KNeighborsClassifier(n_neighbors=7)
sv = SVC(probability=True)
nb = GaussianNB()

### Regression
**Regressors used**
* Multilinear Regression
* Decision Tree

#### Multilinear Regression

In [58]:
from sklearn.linear_model import LinearRegression

In [60]:
l = LinearRegression()

In [62]:
# Learning association between X and Y
l.fit(Xtrain, Ytrain)

In [64]:
y_pred = l.predict(Xtest)

In [66]:
intercept = l.intercept_
coefficients = l.coef_

In [68]:
print(f"Intercept: {intercept}")
print(f"Coefficients: {coefficients}")

Intercept: 45.82959096140644
Coefficients: [ 3.19462873e+00  2.75028047e+00  5.92812627e-01  3.29345041e-01
  1.87709020e+00  2.14430273e-01 -6.53364175e+12  3.26661915e+12
  3.26702261e+12  8.81850569e-02]


In [70]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

In [72]:
print(f"""
Mean Absolute Error = {mean_absolute_error(y_pred,Ytest)},
Mean Squared Error = {mean_squared_error(y_pred,Ytest)},
Root Mean Squared Error = {np.sqrt(mean_squared_error(y_pred,Ytest))},
R2 Score = {r2_score(y_pred,Ytest)}
""")


Mean Absolute Error = 1.640427891932025,
Mean Squared Error = 4.432451044256416,
Root Mean Squared Error = 2.1053387006029256,
R2 Score = 0.8968363084049331



#### Decision Tree

In [75]:
from sklearn.tree import DecisionTreeRegressor
dtree = DecisionTreeRegressor(max_depth = 15)
dtree.fit(Xtrain, Ytrain)

# Model testing
y_pred = dtree.predict(Xtest)

In [77]:
print(f"""
Mean Absolute Error = {mean_absolute_error(y_pred,Ytest)},
Mean Squared Error = {mean_squared_error(y_pred,Ytest)},
Root Mean Squared Error = {np.sqrt(mean_squared_error(y_pred,Ytest))},
R2 Score = {r2_score(y_pred,Ytest)}
""")


Mean Absolute Error = 0.6477816709240474,
Mean Squared Error = 1.171374955028769,
Root Mean Squared Error = 1.0823007692082498,
R2 Score = 0.9750480247423183



### Ensemble Classification

**Classifers used**
* Boosting

#### Boosting

In [82]:
import pickle as pkl
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve

In [84]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [86]:
ada=AdaBoostClassifier(base_estimator=dt,n_estimators=100)
gb=GradientBoostingClassifier(n_estimators=100)

In [88]:
for model in (ada,gb):
 model.fit(Xtrain, Ytrain)
 pkl.dump(model, open('./' + model.__class__.__name__ + '.pkl', 'wb'))
 y_pred = model.predict(Xtest)
 print(model.__class__.__name__, confusion_matrix(Ytest, y_pred), classification_report(Ytest, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AdaBoostClassifier [[1 0 1 ... 0 0 0]
 [1 2 4 ... 0 0 0]
 [0 2 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]]               precision    recall  f1-score   support

          47       0.25      0.33      0.29         3
          48       0.33      0.18      0.24        11
          49       0.05      0.12      0.07         8
          50       0.35      0.48      0.41        23
          51       0.65      0.41      0.50        37
          52       0.38      0.39      0.39        46
          53       0.38      0.41      0.40        41
          54       0.15      0.19      0.17        36
          55       0.29      0.24      0.26        66
          56       0.30      0.32      0.31        63
          57       0.44      0.39      0.41        96
          58       0.39      0.39      0.39       105
          59       0.41      0.36      0.38       107
          60       0.50      0.43      0.46       161
          61       0.45      0.51      0.48       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Ensemble Modelling
**Modelling techniques used:**
* RandomForest

#### RandomForest

In [91]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [94]:
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)
rf_model.fit(Xtrain, Ytrain)

In [93]:
y_pred = rf_model.predict(Xtest)
accuracy = accuracy_score(Ytest, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(Ytest, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(Ytest, y_pred))

Accuracy: 0.6125626814462919
Classification Report:
              precision    recall  f1-score   support

          47       0.50      0.33      0.40         3
          48       0.00      0.00      0.00        11
          49       0.00      0.00      0.00         8
          50       0.39      0.48      0.43        23
          51       0.52      0.59      0.56        37
          52       0.47      0.37      0.41        46
          53       0.30      0.44      0.36        41
          54       0.25      0.28      0.26        36
          55       0.41      0.29      0.34        66
          56       0.39      0.38      0.38        63
          57       0.45      0.40      0.42        96
          58       0.43      0.51      0.47       105
          59       0.53      0.36      0.43       107
          60       0.54      0.63      0.58       161
          61       0.49      0.51      0.50       148
          62       0.58      0.49      0.53       210
          63       0.59      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [71]:
with open('rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print("Model saved as rf_model.pkl")

Model saved as rf_model.pkl


### Hyperparameter tuning

In [448]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define base estimator
rf_base_estimator = RandomForestRegressor()

# Create ensemble model
ensemble = AdaBoostRegressor(base_estimator=rf_base_estimator)

# Define hyperparameter distributions
param_distributions = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'base_estimator__n_estimators': [100, 200, 300],
    'base_estimator__max_depth': [None, 10, 20]
}

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(ensemble, param_distributions, n_iter=10, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit RandomizedSearchCV
random_search.fit(Xtrain, Ytrain)

# Make predictions
y_pred = random_search.predict(Xtest)

# Evaluate the model
mse = mean_squared_error(Ytest, y_pred)
r2 = r2_score(Ytest, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")



In [None]:
# Retrieve feature importances from the best estimator
importances = grid_search.best_estimator_.base_estimator.feature_importances_

# Create an array of feature names
feature_names = np.array([f"feature_{i}" for i in range(X.shape[1])])

# Create a DataFrame with feature names and importances
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Sort the DataFrame by importance
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Print top 20 important features
print(feature_importances.head(20))

In [73]:
# pip install streamlit