#Finding the best model:

1- Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

2- Load and preprocess(Drop, Encode and scale) dataset

In [3]:
df = pd.read_csv("enriched_d_train.csv")

# Drop unused columns
df = df.drop(columns=[
    'Program Start Date', 'Program End Date',
    'Technology Type', 'Education Speaciality', 'University Degree Score System',
    'Job Type', 'Still Working', 'College', 'University Degree Score'
], errors='ignore')


# Encode categorical variables
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Split features and target
X = df.drop(columns='Y')
y = df['Y']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)

# Scale numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

3- Define models

LogisricRegression, RandomForest, XGBoost, KNN, SVM

In [4]:
# Define models and hyperparameter grids
models = {
    'LogisticRegression': (LogisticRegression(max_iter=1000), {
        'C': [0.1, 1, 10]
    }),
    'RandomForest': (RandomForestClassifier(), {
        'n_estimators': [100, 200],
        'max_depth': [None, 5]
    }),
    'XGBoost': (XGBClassifier(eval_metric='logloss'), {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.05, 0.1]
    }),
    'KNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7]
    }),
    'SVM': (SVC(probability=True), {
        'C': [0.1, 1],
        'kernel': ['linear', 'rbf']
    })
}

4- GridSearch for every model using for loop and evaluate

In [5]:
# Loop through models and perform GridSearchCV
for name, (model, params) in models.items():
    print(f"\nTraining {name}...")
    grid = GridSearchCV(model, params, cv=5, scoring='f1', n_jobs=-1)
    grid.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = grid.best_estimator_.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Best Params: {grid.best_params_}")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(classification_report(y_test, y_pred))


Training LogisticRegression...
Best Params: {'C': 10}
Accuracy: 0.8667
F1 Score: 0.5390
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       819
           1       0.60      0.49      0.54       156

    accuracy                           0.87       975
   macro avg       0.75      0.71      0.73       975
weighted avg       0.86      0.87      0.86       975


Training RandomForest...
Best Params: {'max_depth': None, 'n_estimators': 100}
Accuracy: 0.8985
F1 Score: 0.6452
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       819
           1       0.73      0.58      0.65       156

    accuracy                           0.90       975
   macro avg       0.83      0.77      0.79       975
weighted avg       0.89      0.90      0.89       975


Training XGBoost...
Best Params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Accuracy: 0.8933
F1 Score: 0.6463
              

#Since XGBoost got the best combined results for Accuracy(89) and F1(65) we will be starting with it

1-Enhancing XGBoost with GridSearch then constructing a pipeline with oversampling using SMOTE

In [317]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Define best XGBoost model after GridSearch and manual tuning (locally)
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.6,
    min_child_weight=12.6,
    gamma=0,
    eval_metric='logloss',
    random_state=42
)

# Pipeline: Scaling + SMOTE + XGBoost
pipeline = ImbPipeline([
    ('scaler', StandardScaler()), #Scaling now in pipeline instead of before so we can export it(No double scale!)
    ('smote', SMOTE(random_state=42)),
    ('clf', xgb_model)
])

# Train model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))


Accuracy: 0.9087
F1 Score: 0.7063
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       819
           1       0.73      0.69      0.71       156

    accuracy                           0.91       975
   macro avg       0.83      0.82      0.83       975
weighted avg       0.91      0.91      0.91       975



2- Add Threshold tuning


In [320]:
y_probs = pipeline.predict_proba(X_test)[:, 1]

thresholds = np.arange(0.3, 0.71, 0.01)
f1_scores = [f1_score(y_test, y_probs > t) for t in thresholds]
best_thresh = thresholds[np.argmax(f1_scores)]

print(f"\nBest Threshold: {best_thresh:.2f}")
print(f"Max F1 Score: {max(f1_scores):.4f}")

# Predict with best threshold
y_pred = (y_probs > best_thresh).astype(int)

# === Evaluation ===
print("\nFinal Evaluation on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))


Best Threshold: 0.47
Max F1 Score: 0.7284

Final Evaluation on Test Set:
Accuracy: 0.9128
F1 Score: 0.7284
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       819
           1       0.73      0.73      0.73       156

    accuracy                           0.91       975
   macro avg       0.84      0.84      0.84       975
weighted avg       0.91      0.91      0.91       975



# Exporting the pipeline

In [11]:
import joblib

joblib.dump(pipeline,'final_model_pipeline.pkl')

['final_model_pipeline.pkl']