In [27]:
# ------------------------------

# 1. Load libraries
# ------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
# ------------------------------
# 2. Load processed data
# ------------------------------
train_data = pd.read_csv('processed_train_data.csv')
test_data = pd.read_csv('processed_test_data.csv')

print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)

Train shape: (20849, 68)
Test shape: (5240, 68)


In [28]:
# P(y=1 | unregistered=1)
id_columns = ['id_student', 'code_module', 'code_presentation','unregistered','date_unregistration']
p_y1_given_unreg1 = train_data[train_data['unregistered']==1]['final_result_bin'].mean()
print("P(Withdrawn | Unregistered=1):", p_y1_given_unreg1)

# P(y=1 | unregistered=0)
p_y1_given_unreg0 = train_data[train_data['unregistered']==0]['final_result_bin'].mean()
print("P(Withdrawn | Unregistered=0):", p_y1_given_unreg0)


P(Withdrawn | Unregistered=1): 0.9987531172069826
P(Withdrawn | Unregistered=0): 0.004087854222961269


In [29]:
# P(y=1 | unregistered=1)
id_columns = ['id_student', 'code_module', 'code_presentation','unregistered','date_unregistration']
p_y1_given_unreg1 = train_data[train_data['date_unregistration']<0]['final_result_bin'].mean()
print("P(Withdrawn | Unregistered=1):", p_y1_given_unreg1)

# P(y=1 | unregistered=0)
p_y1_given_unreg0 = train_data[train_data['date_unregistration']>0]['final_result_bin'].mean()
print("P(Withdrawn | Unregistered=0):", p_y1_given_unreg0)


P(Withdrawn | Unregistered=1): 0.99880810488677
P(Withdrawn | Unregistered=0): 0.9997760859829825


In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20849 entries, 0 to 20848
Data columns (total 68 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id_student                   20849 non-null  int64  
 1   gender                       20849 non-null  int64  
 2   highest_education            20849 non-null  int64  
 3   age_band                     20849 non-null  int64  
 4   num_of_prev_attempts         20849 non-null  int64  
 5   studied_credits              20849 non-null  int64  
 6   disability                   20849 non-null  int64  
 7   date_registration            20849 non-null  float64
 8   date_unregistration          20849 non-null  float64
 9   module_presentation_length   20849 non-null  int64  
 10  unregistered                 20849 non-null  int64  
 11  final_result_bin             20849 non-null  int64  
 12  imd_band_numeric             20849 non-null  float64
 13  year            

In [13]:
train_data

Unnamed: 0,id_student,gender,highest_education,age_band,num_of_prev_attempts,studied_credits,disability,date_registration,date_unregistration,module_presentation_length,...,sharedsubpage_click_ratio,subpage_click_ratio,url_click_ratio,first_click_lag,engagement_span,n_total_assessments,assessment_completion_ratio,has_previous_attempts,n_activity_types,code_module
0,11391,0,3,2,0,240,0,-159.0,0.0,268,...,0.0,0.043478,0.001890,-5.0,59.0,6,0.333333,0,9,AAA
1,28400,1,3,1,0,60,0,-53.0,0.0,268,...,0.0,0.091181,0.047833,-10.0,63.0,6,0.333333,0,9,AAA
2,30268,1,2,1,0,60,1,-92.0,12.0,268,...,0.0,0.078292,0.014235,-10.0,22.0,6,0.000000,0,9,AAA
3,38053,0,2,1,0,60,0,-110.0,0.0,268,...,0.0,0.037811,0.015920,-10.0,70.0,6,0.166667,0,9,AAA
4,45642,1,2,0,0,120,0,-29.0,0.0,268,...,0.0,0.087838,0.042230,-9.0,66.0,6,0.333333,0,9,AAA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20844,2608143,0,3,1,0,30,0,-45.0,48.0,269,...,0.0,0.108108,0.000000,9.0,39.0,10,0.000000,0,7,GGG
20845,2620947,1,2,0,0,30,1,-23.0,0.0,269,...,0.0,0.107143,0.000000,-4.0,64.0,10,0.100000,0,7,GGG
20846,2640965,1,1,0,0,30,0,-4.0,0.0,269,...,0.0,0.219512,0.000000,-4.0,23.0,10,0.000000,0,7,GGG
20847,2645731,1,1,1,0,30,0,-23.0,0.0,269,...,0.0,0.063291,0.000000,19.0,39.0,10,0.000000,0,7,GGG


In [15]:
# ------------------------------
# 3. Define features and target
# ------------------------------
# IMPORTANT: replace 'final_result_bin' with your actual target column name if different
target_column = 'final_result_bin'
id_columns = ['id_student', 'code_module', 'code_presentation','unregistered','date_unregistration']
# ------------------------------
# 4. Get unique modules
# ------------------------------
all_modules = train_data['code_module'].unique()
print("Modules to model separately:", all_modules)

Modules to model separately: ['AAA' 'BBB' 'CCC' 'DDD' 'EEE' 'FFF' 'GGG']


In [17]:
# ------------------------------
# 5. Make output folders
# ------------------------------
os.makedirs("models", exist_ok=True)
os.makedirs("plots", exist_ok=True)
os.makedirs("results", exist_ok=True)
plt.rcParams['font.family'] = 'Times New Roman'
# ------------------------------
# 6. Store overall results
# ------------------------------
all_results = []

# ------------------------------
# 7. Loop over modules
# ------------------------------
for module in all_modules:
    print("\n===================================")
    print(f"Training for module: {module}")
    print("===================================")

    # Filter module-specific data
    train_mod = train_data[train_data['code_module'] == module].copy()
    test_mod  = test_data[test_data['code_module'] == module].copy()

    if len(train_mod) < 50 or len(test_mod) < 20:
        print(f"Skipping module {module} (insufficient samples)")
        continue

    # Define features
    features = [col for col in train_mod.columns if col not in id_columns + [target_column]]
    X_train, y_train = train_mod[features], train_mod[target_column]
    X_test, y_test = test_mod[features], test_mod[target_column]

    print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")
    print(f"Features used ({len(features)}): {features[:5]}...")

    # ------------------------------
    # 7.1 Define GridSearch
    # ------------------------------
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5 ,10]
    }
    clf = RandomForestClassifier(random_state=42)
    grid = GridSearchCV(
        clf,
        param_grid,
        cv=3,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    best_params = grid.best_params_

    print(f"Best params for {module}: {best_params}")

    # ------------------------------
    # 7.2 Save model
    # ------------------------------
    with open(f"models/{module}_rf_model.pkl", 'wb') as f:
        pickle.dump(best_model, f)
    print(f"Saved model for {module}")

    # ------------------------------
    # 7.3 Predict & Evaluate
    # ------------------------------
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1 Score:  {f1:.3f}")
    print(f"AUC-ROC:   {auc:.3f}")

    # ------------------------------
    # 7.4 Confusion Matrix Plot
    # ------------------------------
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(16,9))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {module}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(f"plots/{module}_confusion_matrix_rf.png",dpi=600)
    plt.close()

    # ------------------------------
    # 7.5 Feature Importances Plot
    # ------------------------------
    importances = pd.Series(best_model.feature_importances_, index=features).sort_values(ascending=False)
    plt.figure(figsize=(16,9))
    importances.plot(kind='bar')
    plt.title(f'Feature Importances - {module}')
    plt.ylabel('Importance Score')
    plt.tight_layout()
    plt.savefig(f"plots/{module}_feature_importances_rf.png",dpi=600)
    plt.close()

    # ------------------------------
    # 7.6 Combined Plot
    # ------------------------------
    fig, ax = plt.subplots(1,2, figsize=(16,12))
    # Confusion Matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax[0])
    ax[0].set_title('Confusion Matrix')
    ax[0].set_xlabel('Predicted')
    ax[0].set_ylabel('Actual')
    # Feature Importances
    importances.plot(kind='bar', ax=ax[1])
    ax[1].set_title('Feature Importances')
    ax[1].set_ylabel('Importance Score')
    plt.tight_layout()
    plt.savefig(f"plots/{module}_combined_rf.png",dpi=600)
    plt.close()

    print(f"Saved plots for {module}")

    # ------------------------------
    # 7.7 Store Results
    # ------------------------------
    all_results.append({
            'module': module,
            'train_samples': len(X_train),
            'test_samples': len(X_test),
            'accuracy': round(acc, 3),
            'precision': round(prec, 3),
            'recall': round(rec, 3),
            'f1_score': round(f1, 3),
            'auc_roc': round(auc, 3),
            'best_params': best_params,
            'n_features': len(features)
    })


    # ------------------------------
    # 7.8 Save Feature List
    # ------------------------------
    with open(f"models/{module}_features_rf.txt", 'w') as f:
        for feat in features:
            f.write(f"{feat}\n")
    print(f"Saved feature list for {module}")

# ------------------------------
# 8. Save results to Excel
# ------------------------------
results_df = pd.DataFrame(all_results)
results_df.to_excel('results/module_level_results_rf.xlsx', index=False)
print("\nAll module-level results saved to Excel!")
print(results_df)


Training for module: AAA
Training samples: 490, Testing samples: 121
Features used (63): ['gender', 'highest_education', 'age_band', 'num_of_prev_attempts', 'studied_credits']...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best params for AAA: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300}
Saved model for AAA
Accuracy:  0.851
Precision: 0.800
Recall:    0.190
F1 Score:  0.308
AUC-ROC:   0.788
Saved plots for AAA
Saved feature list for AAA

Training for module: BBB
Training samples: 5021, Testing samples: 1294
Features used (63): ['gender', 'highest_education', 'age_band', 'num_of_prev_attempts', 'studied_credits']...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best params for BBB: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Saved model for BBB
Accuracy:  0.850
Precision: 0.796
Recall:    0.657
F1 Score:  0.720
AUC-ROC:   0.875
Saved plots for BBB
Saved feature list for BBB

Training for module: CCC
Training sample

In [23]:
# -----------------------------------
# 6. Store results
# -----------------------------------
all_results = []

# -----------------------------------
# 7. Loop over modules
# -----------------------------------
for module in all_modules:
    print("\n==========================")
    print(f"Module: {module}")
    print("==========================")

    # Filter data
    train_mod = train_data[train_data['code_module'] == module].copy()
    test_mod  = test_data[test_data['code_module'] == module].copy()

    if len(train_mod) < 50 or len(test_mod) < 20:
        print(f"Skipping {module} (too few samples)")
        continue

    # Define features
    features = [col for col in train_mod.columns if col not in id_columns + [target_column]]
    X_train, y_train = train_mod[features], train_mod[target_column]
    X_test, y_test = test_mod[features], test_mod[target_column]

    print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

    # -----------------------------------
    # 7.1 GridSearchCV
    # -----------------------------------
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100]
    }
    logit = LogisticRegression(penalty='l2', solver='liblinear', random_state=42, max_iter=500)
    grid = GridSearchCV(logit, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    best_params = grid.best_params_
    print(f"Best C for {module}: {best_params['C']}")

    # -----------------------------------
    # 7.2 Save Model
    # -----------------------------------
    with open(f"models/{module}_logit_model.pkl", 'wb') as f:
        pickle.dump(best_model, f)
    print(f"Model saved for {module}")

    # -----------------------------------
    # 7.3 Predict & Evaluate
    # -----------------------------------
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1 Score:  {f1:.3f}")
    print(f"AUC-ROC:   {auc:.3f}")

    # -----------------------------------
    # 7.4 Confusion Matrix
    # -----------------------------------
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(16,9))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {module}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(f"plots/{module}_confusion_matrix_logit.png",dpi=600)
    plt.close()

    # -----------------------------------
    # 7.5 Feature Importance (Coefficients)
    # -----------------------------------
    coef_series = pd.Series(best_model.coef_[0], index=features).sort_values(ascending=False)
    plt.figure(figsize=(16,9))
    coef_series.plot(kind='bar')
    plt.title(f'Feature Coefficients - {module}')
    plt.ylabel('Coefficient')
    plt.tight_layout()
    plt.savefig(f"plots/{module}_feature_coefficients_logit.png",dpi=600)
    plt.close()

    # -----------------------------------
    # 7.6 Combined Plot
    # -----------------------------------
    fig, ax = plt.subplots(1,2, figsize=(16,10))
    # Confusion Matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax[0])
    ax[0].set_title('Confusion Matrix')
    ax[0].set_xlabel('Predicted')
    ax[0].set_ylabel('Actual')
    # Coefficients
    coef_series.plot(kind='bar', ax=ax[1])
    ax[1].set_title('Feature Coefficients')
    ax[1].set_ylabel('Coefficient')
    plt.tight_layout()
    plt.savefig(f"plots/{module}_combined_logit.png",dpi=600)
    plt.close()

    print(f"Plots saved for {module}")

    # -----------------------------------
    # 7.7 Store results
    # -----------------------------------
    all_results.append({
            'module': module,
            'train_samples': len(X_train),
            'test_samples': len(X_test),
            'accuracy': round(acc, 3),
            'precision': round(prec, 3),
            'recall': round(rec, 3),
            'f1_score': round(f1, 3),
            'auc_roc': round(auc, 3),
            'best_params': best_params,
            'n_features': len(features)
        })

    # -----------------------------------
    # 7.8 Save feature list
    # -----------------------------------
    with open(f"models/{module}_features_logit.txt", 'w') as f:
        for feat in features:
            f.write(f"{feat}\n")
    print(f"Feature list saved for {module}")

# -----------------------------------
# 8. Save all results
# -----------------------------------
results_df = pd.DataFrame(all_results)
results_df.to_excel('results/module_level_results_logit.xlsx', index=False)
print("\nAll module results saved to Excel!")
print(results_df)


Module: AAA
Training samples: 490, Testing samples: 121
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best C for AAA: 0.01
Model saved for AAA
Accuracy:  0.860
Precision: 1.000
Recall:    0.190
F1 Score:  0.320
AUC-ROC:   0.729
Plots saved for AAA
Feature list saved for AAA

Module: BBB
Training samples: 5021, Testing samples: 1294
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best C for BBB: 10
Model saved for BBB
Accuracy:  0.857
Precision: 0.796
Recall:    0.689
F1 Score:  0.738
AUC-ROC:   0.877
Plots saved for BBB
Feature list saved for BBB

Module: CCC
Training samples: 2848, Testing samples: 706
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best C for CCC: 100
Model saved for CCC
Accuracy:  0.795
Precision: 0.787
Recall:    0.706
F1 Score:  0.744
AUC-ROC:   0.866
Plots saved for CCC
Feature list saved for CCC

Module: DDD
Training samples: 3986, Testing samples: 993
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best C for D

In [21]:
# -----------------------------------
# 6. Store results
# -----------------------------------
all_results = []

# -----------------------------------
# 7. Loop over modules
# -----------------------------------
for module in all_modules:
    print("\n==========================")
    print(f"Module: {module}")
    print("==========================")

    # Filter data
    train_mod = train_data[train_data['code_module'] == module].copy()
    test_mod  = test_data[test_data['code_module'] == module].copy()

    if len(train_mod) < 50 or len(test_mod) < 20:
        print(f"Skipping {module} (too few samples)")
        continue

    # -----------------------------------
    # Define numeric features only
    # -----------------------------------
    features = [col for col in train_mod.columns if col not in id_columns + [target_column]]
    X_train, y_train = train_mod[features], train_mod[target_column]
    X_test, y_test = test_mod[features], test_mod[target_column]

    print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

    # -----------------------------------
    # 7.1 GridSearchCV with XGBoost
    # -----------------------------------
    from xgboost import XGBClassifier
    import pickle

    param_grid = {
        'max_depth': [3, 5, 7],
        'n_estimators': [50, 100],
        'learning_rate': [0.05, 0.1]
    }

    xgb_model = XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )

    grid = GridSearchCV(
        xgb_model,
        param_grid,
        cv=3,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    best_params = grid.best_params_
    print(f"Best params for {module}: {best_params}")

    # -----------------------------------
    # 7.2 Save Model
    # -----------------------------------
    with open(f"models/{module}_xgb_model.pkl", 'wb') as f:
        pickle.dump(best_model, f)
    print(f"Model saved for {module}")

    # -----------------------------------
    # 7.3 Predict & Evaluate
    # -----------------------------------
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1 Score:  {f1:.3f}")
    print(f"AUC-ROC:   {auc:.3f}")

    # -----------------------------------
    # 7.4 Confusion Matrix
    # -----------------------------------
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(16,9))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {module}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(f"plots/{module}_confusion_matrix_xgb.png", dpi=600)
    plt.close()

    # -----------------------------------
    # 7.5 Feature Importance
    # -----------------------------------
    importances = pd.Series(best_model.feature_importances_, index=features).sort_values(ascending=False)
    plt.figure(figsize=(16,9))
    importances.plot(kind='bar')
    plt.title(f'Feature Importances - {module}')
    plt.ylabel('Importance Score')
    plt.tight_layout()
    plt.savefig(f"plots/{module}_feature_importances_xgb.png", dpi=600)
    plt.close()

    # -----------------------------------
    # 7.6 Combined Plot
    # -----------------------------------
    fig, ax = plt.subplots(1,2, figsize=(16,10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax[0])
    ax[0].set_title('Confusion Matrix')
    ax[0].set_xlabel('Predicted')
    ax[0].set_ylabel('Actual')
    importances.plot(kind='bar', ax=ax[1])
    ax[1].set_title('Feature Importances')
    ax[1].set_ylabel('Importance Score')
    plt.tight_layout()
    plt.savefig(f"plots/{module}_combined_xgb.png", dpi=600)
    plt.close()

    print(f"Plots saved for {module}")

    # -----------------------------------
    # 7.7 Store results
    # -----------------------------------
    all_results.append({
        'module': module,
        'train_samples': len(X_train),
        'test_samples': len(X_test),
        'accuracy': round(acc, 3),
        'precision': round(prec, 3),
        'recall': round(rec, 3),
        'f1_score': round(f1, 3),
        'auc_roc': round(auc, 3),
        'best_params': best_params,
        'n_features': len(features)
    })


    # -----------------------------------
    # 7.8 Save feature list
    # -----------------------------------
    with open(f"models/{module}_features_xgb.txt", 'w') as f:
        for feat in features:
            f.write(f"{feat}\n")
    print(f"Feature list saved for {module}")

# -----------------------------------
# 8. Save all results
# -----------------------------------
results_df = pd.DataFrame(all_results)
results_df.to_excel('results/module_level_results_xgb.xlsx', index=False)
print("\nAll module results saved to Excel!")
print(results_df)



Module: AAA
Training samples: 490, Testing samples: 121
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for AAA: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 50}
Model saved for AAA
Accuracy:  0.843
Precision: 0.667
Recall:    0.190
F1 Score:  0.296
AUC-ROC:   0.751
Plots saved for AAA
Feature list saved for AAA

Module: BBB
Training samples: 5021, Testing samples: 1294
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for BBB: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 50}
Model saved for BBB
Accuracy:  0.848
Precision: 0.784
Recall:    0.662
F1 Score:  0.718
AUC-ROC:   0.879
Plots saved for BBB
Feature list saved for BBB

Module: CCC
Training samples: 2848, Testing samples: 706
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for CCC: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Model saved for CCC
Accuracy:  0.796
Precision: 0.778
Recall:    0.726
F1 Score:  0.751
AUC-ROC:   0.873
Plots saved for CCC
Feature list saved for CCC

Module: DDD
Training samples: 3986, Testing samples: 993
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for DDD: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Model saved for DDD
Accuracy:  0.810
Precision: 0.804
Recall:    0.651
F1 Score:  0.719
AUC-ROC:   0.855
Plots saved for DDD
Feature list saved for DDD

Module: EEE
Training samples: 1885, Testing samples: 480
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for EEE: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50}
Model saved for EEE
Accuracy:  0.867
Precision: 0.738
Recall:    0.687
F1 Score:  0.712
AUC-ROC:   0.884
Plots saved for EEE
Feature list saved for EEE

Module: FFF
Training samples: 4986, Testing samples: 1231
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for FFF: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50}
Model saved for FFF
Accuracy:  0.825
Precision: 0.818
Recall:    0.587
F1 Score:  0.683
AUC-ROC:   0.851
Plots saved for FFF
Feature list saved for FFF

Module: GGG
Training samples: 1633, Testing samples: 415
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for GGG: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Model saved for GGG
Accuracy:  0.913
Precision: 0.727
Recall:    0.348
F1 Score:  0.471
AUC-ROC:   0.785
Plots saved for GGG
Feature list saved for GGG

All module results saved to Excel!
  module  train_samples  test_samples  accuracy  precision  recall  f1_score  \
0    AAA            490           121     0.843      0.667   0.190     0.296   
1    BBB           5021          1294     0.848      0.784   0.662     0.718   
2    CCC           2848           706     0.796      0.778   0.726     0.751   
3    DDD           3986           993     0.810      0.804   0.651     0.719   
4    EEE           1885           480     0.867      0.738   0.687     0.712   
5    FFF           4986          1231     0.825      0.818   0.587     0.683   
6    GGG           1633           415     0.913      0.727   0.348     0.471   

   auc_roc                                        best_params  n_features  
0    0.751  {'