In [54]:
import pandas as pd
import numpy as np
import openpyxl
import json
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

# Split data
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Evaluate Model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, f1_score

### Import Datas

In [2]:
df_pl = pd.read_excel('Premier-League-Player-Stats-2024-2025.xlsx')
df_indo = pd.read_excel('Liga-1-Indonesia-Player-Stats-2024-2025.xlsx')

### Check Datas

In [None]:
def checking_dataframe(df):
    print('Shape:',df.shape)
    print('---' * 18)
    print('missing_values:')
    print(df.isnull().sum())
    print('---' * 18)
    print('duplicates:',df.duplicated().sum())
    print('---' * 18)
    print('dtypes:')
    print(df.dtypes)
    
checking_dataframe(df_pl)

Shape: (790, 14)
------------------------------------------------------
missing_values:
Name                   0
Date of Birth          0
Nationality            0
Height (cm)            0
Position               0
Appearances            0
Minutes played         0
Goals                  0
Assists                0
Clean sheets           0
Goals conceded         0
Yellow cards           0
Second yellow cards    0
Red cards              0
dtype: int64
------------------------------------------------------
duplicates: 20
------------------------------------------------------
dtypes:
Name                   object
Date of Birth          object
Nationality            object
Height (cm)             int64
Position               object
Appearances             int64
Minutes played         object
Goals                   int64
Assists                 int64
Clean sheets            int64
Goals conceded          int64
Yellow cards            int64
Second yellow cards     int64
Red cards               in

In [21]:
position_mapping = {
    'Centre-Back': 'Defender',
    'Left-Back': 'Defender',
    'Right-Back': 'Defender',
    'Defensive Midfield': 'Midfielder',
    'Central Midfield': 'Midfielder',
    'Attacking Midfield': 'Midfielder',
    'Left Midfield': 'Midfielder',
    'Right Midfield': 'Midfielder',
    'Left Winger': 'Forward',
    'Right Winger': 'Forward',
    'Second Striker': 'Forward',
    'Centre-Forward': 'Forward',
    'Attack': 'Forward'
    }

In [22]:
def cleaning_feature(df, position_mapping):
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    
    # New Column 'Age'
    df['Date of Birth'] = df['Date of Birth'].replace(0, np.nan)
    df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')
    df['Age'] = df['Date of Birth'].apply(lambda x: (datetime.now() - x).days // 365 if pd.notnull(x) else 0)
    
    # 'Minutes played' to int
    df['Minutes played'] = (
    df['Minutes played']
    .astype(str)
    .str.replace("'","", regex=False)
    .str.replace(".","", regex=False)
    .str.strip()
    .astype(int)
    )

    # Map 'Position'
    df['Position'] =  df['Position'].replace(position_mapping).astype(str)
    
    # New Column 'G+A per 90'
    df['Goals per 90'] = df['Goals'] / (df['Minutes played'] / 90)
    df['Assists per 90'] = df['Assists'] / (df['Minutes played'] / 90)
    df['G+A per 90'] = df['Goals per 90'] + df['Assists per 90']

    # Handle NaN
    df.fillna(0, inplace=True)
    
    return df
    

### New Premier League Dataframe

In [23]:
df_pl = cleaning_feature(df_pl, position_mapping)

### Label Premier League Dataframe

In [24]:
df_pl['label'] = 0

df_pl.loc[
    (df_pl['Age'] <= 23) & 
    (df_pl['Height (cm)'] >= 174) & 
    (df_pl['Minutes played'] >= 600) & 
    (df_pl['Yellow cards'] <= 5) & 
    (df_pl['Second yellow cards'] <= 1) & 
    (df_pl['Red cards'] <= 1) & 
    (df_pl['G+A per 90'] >=0.3) & 
    ((df_pl['Position'] == 'Forward') | (df_pl['Position'] == 'Midfielder')),
    'label'
] = 1

df_pl.loc[
    (df_pl['Age'] <= 23) & 
    (df_pl['Height (cm)'] >= 180) & 
    (df_pl['Minutes played'] >= 500) & 
    (df_pl['Yellow cards'] <= 5) & 
    (df_pl['Second yellow cards'] <= 1) & 
    (df_pl['Red cards'] <= 1) & 
    (df_pl['Clean sheets'] >= 1) &
    (df_pl['Goals conceded'] <= 10) &
    (df_pl['Position'] == 'Goalkeeper'),
    'label'
] = 1

In [76]:
print(df_pl['label'].value_counts(normalize=True))

label
0    0.976623
1    0.023377
Name: proportion, dtype: float64


### Prepare Train and Test Data

In [25]:
features = ['Age', 'Height (cm)', 'Yellow cards', 
            'Second yellow cards', 'Red cards',
            'G+A per 90', 'Minutes played',
            'Goals conceded', 'Clean sheets']

In [26]:
X = df_pl[features]
y = df_pl['label']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

### Tuning `Logisitic Regression`

In [28]:
pipe_logreg = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

param_logreg = {
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'clf__class_weight': [None, 'balanced']
}

grid_logreg = GridSearchCV(pipe_logreg, param_logreg, cv=5, n_jobs=-1)
grid_logreg.fit(X_train, y_train)

print("Logistic Regression Best Params:", grid_logreg.best_params_)

Logistic Regression Best Params: {'clf__C': 0.01, 'clf__class_weight': None, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}


### Create `Logisitic Regression` Model

In [None]:
best_params_logreg = grid_logreg.best_params_

# Clean prefix 'clf__'
clean_params = {k.replace('clf__', ''): v for k, v in best_params_logreg.items()}

# Final Pipeline and Re-train
model_logreg = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(**clean_params, max_iter=1000))
])

model_logreg.fit(X, y)

### Tuning `Random Forest`

In [30]:
pipe_rf = Pipeline([
    ('clf', RandomForestClassifier(random_state=42))
])

param_rf = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [None, 5, 10],
    'clf__min_samples_split': [2, 5]
}

grid_rf = GridSearchCV(pipe_rf, param_rf, cv=5, n_jobs=-1)
grid_rf.fit(X_train, y_train)

print("Random Forest Best Params:", grid_rf.best_params_)

Random Forest Best Params: {'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}


### Create `Random Forest` Model

In [None]:
best_params_rf = grid_rf.best_params_

# Clean prefix 'clf__'
clean_params = {k.replace('clf__', ''): v for k, v in best_params_rf.items()}

# Final Pipeline and Re-train
model_rf = Pipeline([
    ('clf', RandomForestClassifier(**clean_params, random_state=42))
])

model_rf.fit(X, y)

### Tuning `Support Vector Machine`

In [32]:
pipe_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC())
])

param_svm = {
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['linear', 'rbf'],
    'clf__gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(pipe_svm, param_svm, cv=5, n_jobs=-1)
grid_svm.fit(X_train, y_train)

print("SVM Best Params:", grid_svm.best_params_)

SVM Best Params: {'clf__C': 0.1, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}


### Create `Support Vector Machine` Model

In [None]:
best_params_svm = grid_svm.best_params_

# Clean prefix 'clf__'
clean_params = {k.replace('clf__', ''): v for k, v in best_params_svm.items()}

# Final Pipeline and Re-train
model_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC(probability=True,**clean_params))
])

model_svm.fit(X, y)

### Tuning `K-Nearest Neighbors`

In [34]:
pipe_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier())
])

param_knn = {
    'clf__n_neighbors': [3, 5, 7, 9],
    'clf__weights': ['uniform', 'distance'],
    'clf__metric': ['euclidean', 'manhattan']
}

grid_knn = GridSearchCV(pipe_knn, param_knn, cv=5, n_jobs=-1)
grid_knn.fit(X_train, y_train)

print("KNN Best Params:", grid_knn.best_params_)

KNN Best Params: {'clf__metric': 'euclidean', 'clf__n_neighbors': 3, 'clf__weights': 'uniform'}


### Create `K-Nearest Neighbors` Model

In [None]:
best_params_knn = grid_knn.best_params_

# Clean prefix 'clf__'
clean_params = {k.replace('clf__', ''): v for k, v in best_params_knn.items()}

# Final Pipeline and Re-train
model_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier(**clean_params))
])

model_knn.fit(X, y)

In [36]:
models = {
    'Logistic Regression': model_logreg,
    'Random Forest': model_rf,
    'SVM': model_svm,
    'KNN': model_knn
}

### Evaluating Model

In [61]:
for name, model in models.items():
    acc = accuracy_score(y_test, model.predict(X_test))
    report = classification_report(y_test, model.predict(X_test))
    conf = confusion_matrix(y_test, model.predict(X_test))
    print(f"{name} Accuracy : {acc*100:.2f}%")
    print(f"{report}")
    print(f"{name} Confusion Matrix")
    print(f"{conf}")
    print("---" * 40)

Logistic Regression Accuracy : 96.75%
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       149
           1       0.00      0.00      0.00         5

    accuracy                           0.97       154
   macro avg       0.48      0.50      0.49       154
weighted avg       0.94      0.97      0.95       154

Logistic Regression Confusion Matrix
[[149   0]
 [  5   0]]
------------------------------------------------------------------------------------------------------------------------
Random Forest Accuracy : 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       149
           1       1.00      1.00      1.00         5

    accuracy                           1.00       154
   macro avg       1.00      1.00      1.00       154
weighted avg       1.00      1.00      1.00       154

Random Forest Confusion Matrix
[[149   0]
 [  0   5]]
----------------------------------------

### Checking Cross Validation of Model

In [65]:
# F1 : binary classification | balanced and imbalanced dataset
# Accuracy : binary classification and multiclass classification | balanced dataset | how many right predictions
# ROC_AUC : binary classification | imbalanced dataset | mean : 0.5 (bad) - 1.0 (perfect)

# Mean : Mean Model Performance
# Std : Performance Between Folds (the less, more stable)
print('cross validation with roc score:')
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')  # 'accuracy', 'roc_auc', f1-score
    print(f"{name} | Mean: {scores.mean():.4f} & Std: {scores.std():.4f}")

cross validation with roc score:
Logistic Regression | Mean: 0.5000 & Std: 0.0000
Random Forest | Mean: 0.9644 & Std: 0.0668
SVM | Mean: 0.6898 & Std: 0.1536
KNN | Mean: 0.8013 & Std: 0.1186


### Predict Accuracy Data Train and Test

In [None]:
train_acc = accuracy_score(y_train, model_rf.predict(X_train))
test_acc = accuracy_score(y_test, model_rf.predict(X_test))

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy: {test_acc:.4f}")

### Liga 1 Indonesia Player Prediction

In [45]:
df_indo = cleaning_feature(df_indo, position_mapping)

In [46]:
X_indo = df_indo[features]

In [47]:
prediction = model_rf.predict(X_indo)

In [48]:
df_indo['Predictions'] = prediction
df_indo['Predictions'] = df_indo['Predictions'].apply(lambda x: 'Potential' if x == 1 else 'Not Potential')

In [71]:
print('Potential Player Count: ',df_indo[df_indo['Predictions'] == 'Potential'].shape[0])
df_indo[df_indo['Predictions'] == 'Potential']

Potential Player Count:  0


Unnamed: 0,Name,Date of Birth,Nationality,Height (cm),Position,Appearances,Minutes played,Goals,Assists,Clean sheets,Goals conceded,Yellow cards,Second yellow cards,Red cards,Age,Goals per 90,Assists per 90,G+A per 90,Predictions


### Single Player Prediction

In [59]:
def predict_new_player(player_dict, model, position_mapping):
    # Calculate Age
    dob = datetime.strptime(player_dict['Date of Birth'], "%Y-%m-%d")
    age = (datetime.now() - dob).days // 365

    # Position
    position = position_mapping.get(player_dict['Position'], player_dict['Position'])

    # G+A per 90
    minutes_played = int(player_dict['Minutes played'])
    goals_per_90 = player_dict['Goals'] / (minutes_played / 90)
    assists_per_90 = player_dict['Assists'] / (minutes_played / 90)
    ga_per_90 = goals_per_90 + assists_per_90

    # Features
    features = [
        age,
        player_dict['Height (cm)'],
        player_dict['Yellow cards'],
        player_dict['Second yellow cards'],
        player_dict['Red cards'],
        ga_per_90,
        minutes_played,
        player_dict['Goals conceded'],
        player_dict['Clean sheets']
    ]

    # 5. Prediction (array 2D)
    prediction = model.predict([features])[0]

    result = {
        "Name": player_dict["Name"],
        "Age": age,
        "Position": player_dict['Position'],
        "Result": "Potential" if prediction == 1 else "Not Potential"
        
    }
    
    return json.dumps(result, indent=4)

In [72]:
player = {
    'Name': 'Arthur',
    'Date of Birth': '2005-05-15',
    'Height (cm)': 180,
    'Yellow cards': 1,
    'Second yellow cards': 0,
    'Red cards': 0,
    'Goals': 12,
    'Assists': 10,
    'Minutes played': 1200,
    'Clean sheets': 0,
    'Goals conceded': 0,
    'Position': 'Attacking Midfield'
}

result = predict_new_player(player, model_rf, position_mapping)
print(result)

{
    "Name": "Arthur",
    "Age": 20,
    "Position": "Attacking Midfield",
    "Result": "Potential"
}
