In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,r2_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, classification_report, confusion_matrix, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.preprocessing import StandardScaler

StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 3, Finished, Available, Finished)

In [2]:
# Loading dataset

path = 'Tables/dbo/Student_variables'
df = spark.read.format('Delta').load(path)

StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 4, Finished, Available, Finished)

In [3]:
df = df.toPandas()
df.head()

StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 5, Finished, Available, Finished)

Unnamed: 0,StudentNumber,Age,IsChurchMemberCMIS,Num_Courses,Dropped_Courses,avg_percentage_score,courses_under_70,HDI_rank,Human_Development_Index_HDI_2023,is_enrolled_next_term
0,22226875,-0.821131,1,2.413365,9.587846,0.347521,0.682981,-1.657251,1.553922,0
1,22304618,1.232757,1,1.222608,9.587846,-0.542229,4.535895,0.919295,-0.937672,0
2,22548910,0.39253,0,1.222608,9.587846,-0.685254,4.535895,-0.442828,0.495934,0
3,23035941,2.072985,1,2.413365,14.460631,0.73712,-0.601324,-1.49314,1.428717,0
4,23062756,-1.194566,1,1.222608,14.460631,0.213112,4.535895,0.919295,-0.937672,0


In [4]:
# 1. Definig Features (X) and Label (y)

target_column = "is_enrolled_next_term"
X = df.drop(columns=[target_column, "StudentNumber"])  # quitamos ID y target
y = df[target_column]

StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 6, Finished, Available, Finished)

In [5]:

# 2. Spliting on train/test
# ============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 7, Finished, Available, Finished)

In [6]:
# 3. Defining models to be evaluated

from sklearn.utils.class_weight import compute_sample_weight

modelos = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced"),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(
        eval_metric="logloss",
        random_state=42
    ),
}


StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 8, Finished, Available, Finished)

In [7]:
# 4. Train and evaluate

resultados = []

for nombre, modelo in modelos.items():
    sample_weights = compute_sample_weight("balanced", y_train)
    modelo.fit(X_train, y_train, sample_weight=sample_weights)
    y_pred = modelo.predict(X_test)
    y_proba = modelo.predict_proba(X_test)[:, 1]

    resultados.append({
        "Modelo": nombre,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba)
    })

StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 9, Finished, Available, Finished)

In [8]:
# 5. Showing results

resultados_df = pd.DataFrame(resultados).sort_values(by="AUC", ascending=False)
print("\nResultados comparativos:")
print(resultados_df)

StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 10, Finished, Available, Finished)


Resultados comparativos:
                Modelo  Accuracy  Precision    Recall        F1       AUC
2    Gradient Boosting  0.603815   0.557010  0.771804  0.647047  0.650490
3              XGBoost  0.603948   0.558366  0.757063  0.642708  0.650432
0  Logistic Regression  0.589988   0.542165  0.826798  0.654891  0.637997
1        Random Forest  0.555531   0.528358  0.515827  0.522018  0.588400


### Logistic Regression

In [10]:
# Training the model

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Defining features and Labels
X = df[['Age', 'IsChurchMemberCMIS', 'Num_Courses', 'Dropped_Courses',
        'avg_percentage_score', 'courses_under_70', 'HDI_rank',
        'Human_Development_Index_HDI_2023']]

y = df['is_enrolled_next_term']

# Spliting the dataset train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initializing and training the model
model = LogisticRegression(max_iter = 500, solver='lbfgs', class_weight='balanced')
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]


StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 12, Finished, Available, Finished)

In [11]:
# Model Evaluation

from scipy.stats import chi2

# Metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Coeficients
coef_df = pd.DataFrame({
    'Variable': X.columns,
    'Coeficient': model.coef_[0]
})
print("\nModel Coeficients:")
print(coef_df)


StatementMeta(, 5aba3e94-9b77-4b70-bb9c-67165cde94f7, 13, Finished, Available, Finished)

Accuracy: 0.5951

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.44      0.53     11889
           1       0.55      0.77      0.64     10603

    accuracy                           0.60     22492
   macro avg       0.62      0.60      0.59     22492
weighted avg       0.62      0.60      0.58     22492


Confusion Matrix:
[[5198 6691]
 [2417 8186]]

Model Coeficients:
                           Variable  Coeficient
0                               Age    0.010143
1                IsChurchMemberCMIS    0.068093
2                       Num_Courses    0.151012
3                   Dropped_Courses   -0.024384
4              avg_percentage_score    0.576568
5                  courses_under_70   -0.029258
6                          HDI_rank   -0.037372
7  Human_Development_Index_HDI_2023   -0.160804
