At this point we have already completed all data cleaning and preprocessing, so we'll focus on training the first model.


- Invert and encode target variable

Currently our target variable indicates whether the student finished the grade. Since our model will predict dropout, we have to invert the values and rename the variable.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the post-EDA dataset.
# The path is a raw string because it contains backslashes (`\M`, `\#`, `\e`)
# that are invalid escape sequences in a regular literal and raise a
# SyntaxWarning on Python 3.12+. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(r'G:\Mi unidad\###_ ML Zoomcamp 2024\enape_post_eda.csv')

In [3]:
# Class balance of the raw target before it is inverted.
df['finished_grade'].value_counts()

finished_grade
True     19585
False      388
Name: count, dtype: int64

In [4]:
# Invert the target: rows that did NOT finish the grade become 1 (dropout),
# all others become 0.
# NOTE(review): this cell is not idempotent; running it twice flips the labels back.
df['finished_grade'] = df['finished_grade'].eq(False).astype(int)

In [5]:
# Class balance after the inversion (1 = dropout).
df['finished_grade'].value_counts()

finished_grade
0    19585
1      388
Name: count, dtype: int64

In [6]:
# Rename the target variable to "y".
# An exact-name rename is used instead of `columns.str.replace`, which does
# substring replacement and would also rewrite any other column whose name
# merely contains "finished_grade".
df = df.rename(columns={'finished_grade': 'y'})

In [7]:
# Confirm the renamed target kept the same class counts.
df['y'].value_counts()

y
0    19585
1      388
Name: count, dtype: int64

Now, we'll split our dataset and extract target variable.

We'll use a 60/20/20 % split (train / validation / test).

In [8]:
# Expected number of rows in a 20 % split.
0.2 * len(df)

3994.6000000000004

In [9]:
# Hold out 20 % of the rows for testing, then take 25 % of the remainder
# (20 % of the whole) as validation; 60 % is left for training.
# NOTE(review): the split is not stratified although the positive class is
# rare; consider `stratify=` so each split keeps a similar dropout rate.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Give every split a clean 0..n-1 index.
df_train, df_val, df_test = [
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
]

# Pull the target out of each feature frame: `pop` returns the column and
# removes it from the frame in one step.
y_train = df_train.pop('y').values
y_val = df_val.pop('y').values
y_test = df_test.pop('y').values

In [10]:
# Row counts of the train / validation / test feature frames.
tuple(len(part) for part in (df_train, df_val, df_test))

(11983, 3995, 3995)

In [11]:
# Target vector lengths; they must match the frame sizes shown before.
tuple(map(len, (y_train, y_val, y_test)))

(11983, 3995, 3995)

Train the model
- Transform training data into a dictionary and then to a vector
- Train the model

In [12]:
# Turn every training row into a {feature: value} dict and let DictVectorizer
# build a dense feature matrix from those dicts.
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# Baseline: logistic regression with balanced class weights and strong
# regularization (small C).
model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    C=0.001,
    max_iter=100,
)
model.fit(X_train, y_train)

In [13]:
# Encode the validation rows with the vectorizer fitted on the training data.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Second column of predict_proba: probability of class 1 (dropout).
y_pred = model.predict_proba(X_val)[:, 1]

# Flag a dropout when the probability reaches 0.8 and display the share of
# validation rows where the flag matches the label.
dropout_prediction = y_pred >= 0.8
np.mean(dropout_prediction == y_val)

np.float64(0.9867334167709637)

In [14]:
y_val

array([0, 1, 0, ..., 0, 0, 0])

In [15]:
y_pred

array([0.22884574, 0.81869902, 0.12317139, ..., 0.31007385, 0.21973987,
       0.07474101])

In [16]:
from sklearn.metrics import classification_report

# Build the per-class precision / recall / F1 report for the baseline model
# on the validation split.
report = classification_report(y_true=y_val, y_pred=dropout_prediction)

# Print the report.
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3907
           1       0.75      0.59      0.66        88

    accuracy                           0.99      3995
   macro avg       0.87      0.79      0.83      3995
weighted avg       0.99      0.99      0.99      3995



In [17]:
from sklearn.feature_extraction import DictVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

# Step 1: vectorize the training rows.
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# Step 2: oversample the training data with SMOTE (seeded for reproducibility).
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Step 3: fit the model on the resampled data.
balanced_model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    C=0.001,
    max_iter=1000,
)
balanced_model.fit(X_train_balanced, y_train_balanced)


In [18]:
from collections import Counter

# Class counts of the training target before and after SMOTE resampling.
class_counts_before = Counter(y_train)
class_counts_after = Counter(y_train_balanced)
print("Distribución antes del SMOTE:", class_counts_before)
print("Distribución después del SMOTE:", class_counts_after)


Distribución antes del SMOTE: Counter({np.int64(0): 11747, np.int64(1): 236})
Distribución después del SMOTE: Counter({np.int64(0): 11747, np.int64(1): 11747})


In [19]:
# Re-encode the validation rows with the vectorizer refitted in the SMOTE cell.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Dropout probabilities from the SMOTE-trained model.
# Note: this overwrites the baseline model's `y_pred`.
y_pred = balanced_model.predict_proba(X_val)[:, 1]

# Same 0.8 cut-off as the baseline; display the share of matching labels.
balanced_dropout_prediction = y_pred >= 0.8
np.mean(balanced_dropout_prediction == y_val)

np.float64(0.9889862327909887)

In [20]:
from sklearn.metrics import classification_report

# Build the validation report for the SMOTE-trained model.
balanced_report = classification_report(y_true=y_val, y_pred=balanced_dropout_prediction)

# Print the baseline report first, then the SMOTE one, for comparison.
for report_text in (report, balanced_report):
    print(report_text)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3907
           1       0.75      0.59      0.66        88

    accuracy                           0.99      3995
   macro avg       0.87      0.79      0.83      3995
weighted avg       0.99      0.99      0.99      3995

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3907
           1       0.78      0.69      0.73        88

    accuracy                           0.99      3995
   macro avg       0.89      0.84      0.86      3995
weighted avg       0.99      0.99      0.99      3995



Validation on the test split

In [27]:
# Encode the held-out test rows with the already-fitted vectorizer.
test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

# Dropout probabilities from the SMOTE-trained model on the test split.
y_pred_balanced = balanced_model.predict_proba(X_test)[:, 1]

# NOTE(review): the cut-off here is 0.7 while validation used 0.8; the
# threshold should be chosen on validation data, not tuned on the test split.
# This also overwrites the validation-based `balanced_dropout_prediction`.
balanced_dropout_prediction = y_pred_balanced >= 0.7
np.mean(balanced_dropout_prediction == y_test)

np.float64(0.9832290362953692)

In [28]:
# Per-class precision / recall / F1 of the thresholded predictions on the test split.
balanced_report_test = classification_report(y_true=y_test, y_pred=balanced_dropout_prediction)

# Print the report.
print(balanced_report_test)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      3931
           1       0.49      0.83      0.61        64

    accuracy                           0.98      3995
   macro avg       0.74      0.91      0.80      3995
weighted avg       0.99      0.98      0.99      3995



In [26]:
# Same test-split evaluation as the 0.7 cell, repeated with a 0.3 cut-off.
# NOTE(review): this is a copy of that cell with one number changed; a helper
# taking the threshold as a parameter would avoid the duplication.
test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

y_pred_balanced = balanced_model.predict_proba(X_test)[:, 1]

# Overwrites `balanced_dropout_prediction` with the 0.3-threshold test flags.
balanced_dropout_prediction = y_pred_balanced >= 0.3
np.mean(balanced_dropout_prediction == y_test)

np.float64(0.8415519399249062)

In [None]:
# Test-split report for whichever threshold `balanced_dropout_prediction`
# was last computed with.
balanced_report_test = classification_report(y_true=y_test, y_pred=balanced_dropout_prediction)

# Print the report.
print(balanced_report_test)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC ranks examples by score, so it needs the predicted probabilities
# rather than the 0/1 flags produced by a fixed threshold. The probabilities
# are recomputed because `y_pred` has since been overwritten by the
# SMOTE-trained model.
y_val_score = model.predict_proba(X_val)[:, 1]

# Built-in round() works whether the metric returns a Python or a NumPy float.
round(roc_auc_score(y_val, y_val_score), 3)

In [None]:
# Validation ROC AUC of the SMOTE-trained model.
# `balanced_dropout_prediction` cannot be used here: the test-evaluation cells
# reassign it to thresholded predictions for the *test* split, so scoring it
# against `y_val` compares unrelated rows. Score the validation matrix
# directly and feed probabilities, not 0/1 flags, to the metric.
y_val_score_balanced = balanced_model.predict_proba(X_val)[:, 1]
round(roc_auc_score(y_val, y_val_score_balanced), 3)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of `model` on the training matrix.
# NOTE(review): accuracy is a weak signal with such a rare positive class;
# a ranking metric such as ROC AUC may be more informative.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=5)
print("Cross-Validation Accuracy:", np.mean(scores))


In [None]:
from sklearn.model_selection import KFold
from tqdm.auto import tqdm

In [None]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a class-weighted logistic regression.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature rows. It must not contain the target column, since every
        column is turned into a model feature.
    y_train : array-like
        Target labels aligned with the rows of ``df_train``.
    C : float, default 1.0
        Inverse regularization strength passed to ``LogisticRegression``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted vectorizer and the fitted classifier.
    """
    dicts = df_train.to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    # Forward the caller's C; it used to be hard-coded to 1, which silently
    # ignored the argument.
    model = LogisticRegression(solver='liblinear', class_weight='balanced', C=C, max_iter=100)
    model.fit(X_train, y_train)

    return dv, model

In [None]:
def predict(df, dv, model):
    """Return the positive-class probability for every row of ``df``.

    The rows are converted to record dicts, encoded with the already-fitted
    vectorizer ``dv``, and scored with ``model.predict_proba``; the second
    column of that result is returned.
    """
    records = df.to_dict(orient='records')
    features = dv.transform(records)
    return model.predict_proba(features)[:, 1]

In [None]:
# K-fold cross-validation of the train/predict pipeline, scored with ROC AUC.
n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    # Fold-local names are used so the notebook-level df_train / df_val /
    # y_train / y_val / dv / model are not overwritten by the loop.
    fold_train = df_full_train.iloc[train_idx]
    fold_val = df_full_train.iloc[val_idx]

    y_fold_train = fold_train.y.values
    y_fold_val = fold_val.y.values

    # df_full_train still contains the target column. It must be dropped
    # before vectorizing, otherwise `y` becomes a feature and the AUC is
    # inflated by target leakage.
    fold_dv, fold_model = train(fold_train.drop(columns='y'), y_fold_train)
    fold_pred = predict(fold_val.drop(columns='y'), fold_dv, fold_model)

    scores.append(roc_auc_score(y_fold_val, fold_pred))

# Per-fold AUCs and their spread across folds.
print(scores)
print(np.std(scores))

In [None]:
scores