In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from scipy import stats
from itertools import combinations
from xgboost import XGBClassifier
import pickle
import re
from pycaret.classification import setup, compare_models, create_model, tune_model, plot_model, evaluate_model, finalize_model, predict_model, save_model, load_model

from feature_engine.creation import MathFeatures
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
from feature_engine.imputation import MeanMedianImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from pycaret.classification import predict_model, finalize_model, get_config, create_model
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
path = "C:/Users/jesco/OneDrive - Universidad Santo Tomás/Documentos/Python/T3/"

In [3]:
%%time
df = pd.read_csv(path  + 'train.csv')
prueba = pd.read_csv(path + "test.csv")

CPU times: total: 188 ms
Wall time: 238 ms


In [4]:
df = df.rename(columns={"Mother's occupation": "Mothers occupation", "Father's occupation": "Fathers occupation"})
prueba = prueba.rename(columns={"Mother's occupation": "Mothers occupation", "Father's occupation": "Fathers occupation"})

In [5]:
# ct = ['Daytime/evening attendance', 'Displaced', 'Educational special needs', 'Debtor',
#       'Tuition fees up to date', 'Gender', 'Scholarship holder', 'International', 'Marital status',
#      'Application mode', 'Application order', 'Course', 'Previous qualification', 'Nacionality',
#      "Mother's qualification", "Father's qualification", "Mother's occupation",
#      "Father's occupation", ]

ct = ['Gender', 'Displaced', 'Educational special needs', 'Debtor', 'Scholarship holder', 
      'International', 'Marital status', 'Nacionality', 'Mothers occupation','Fathers occupation']

for k in ct:
  df[k] = df[k].astype("O")
  prueba[k] = prueba[k].astype("O")

In [6]:
df.dtypes

id                                                  int64
Marital status                                     object
Application mode                                    int64
Application order                                   int64
Course                                              int64
Daytime/evening attendance                          int64
Previous qualification                              int64
Previous qualification (grade)                    float64
Nacionality                                        object
Mother's qualification                              int64
Father's qualification                              int64
Mothers occupation                                 object
Fathers occupation                                 object
Admission grade                                   float64
Displaced                                          object
Educational special needs                          object
Debtor                                             object
Tuition fees u

## Base modelo

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["Target"] = le.fit_transform(df["Target"])

In [8]:
base_modelo = df.copy()
base_modelo["Target"] = df["Target"].copy()
base_modelo["Target"] = base_modelo["Target"].map(int)
base_modelo.head(3)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,2
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,0
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,0


## AUTOML

In [9]:
column_types = base_modelo.dtypes
print(column_types)

id                                                  int64
Marital status                                     object
Application mode                                    int64
Application order                                   int64
Course                                              int64
Daytime/evening attendance                          int64
Previous qualification                              int64
Previous qualification (grade)                    float64
Nacionality                                        object
Mother's qualification                              int64
Father's qualification                              int64
Mothers occupation                                 object
Fathers occupation                                 object
Admission grade                                   float64
Displaced                                          object
Educational special needs                          object
Debtor                                             object
Tuition fees u

In [10]:
formatos = pd.DataFrame(base_modelo.dtypes).reset_index()
formatos.columns = ["Variable", "Formato"]

# Identificar variables cuantitativas y categóricas
cuantitativas_bm = [x for x in formatos.loc[formatos["Formato"] != "object", "Variable"] if x not in ["id", "Target"]]
categoricas_bm = [x for x in formatos.loc[formatos["Formato"] == "object", "Variable"] if x not in ["id", "Target"]]

In [12]:
# Asumiendo que 'base_modelo' es tu DataFrame principal

# Separar características y la variable objetivo
X = base_modelo.drop(columns=['Target'])
y = base_modelo['Target']

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Codificación de variables categóricas
rare_encoder = RareLabelEncoder(tol=0.05, n_categories=2, variables=categoricas_bm)
X_train = rare_encoder.fit_transform(X_train)
X_test = rare_encoder.transform(X_test)

one_hot_encoder = OneHotEncoder(drop_last=True, variables=categoricas_bm)
X_train = one_hot_encoder.fit_transform(X_train)
X_test = one_hot_encoder.transform(X_test)

# Crear características matemáticas (suma, multiplicación)
math_transformer = MathFeatures(variables=cuantitativas_bm, func=['sum', 'prod'])
X_train = math_transformer.fit_transform(X_train)
X_test = math_transformer.transform(X_test)

# Crear características polinómicas (cuadráticas) y de interacción usando sklearn
poly_transformer = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
X_train_poly = poly_transformer.fit_transform(X_train[cuantitativas_bm])
X_test_poly = poly_transformer.transform(X_test[cuantitativas_bm])

# Concatenar las nuevas características polinómicas al conjunto de datos original
X_train = np.hstack([X_train, X_train_poly])
X_test = np.hstack([X_test, X_test_poly])

# Escalar las características
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convertir de vuelta a DataFrame y añadir la columna 'Target'
X_train = pd.DataFrame(X_train, columns=[f'feature_{i}' for i in range(X_train.shape[1])])
X_train['Target'] = y_train.reset_index(drop=True)

X_test = pd.DataFrame(X_test, columns=[f'feature_{i}' for i in range(X_test.shape[1])])
X_test['Target'] = y_test.reset_index(drop=True)

# Concatenar para usar en PyCaret
base_modelo = pd.concat([X_train, X_test], axis=0)

In [13]:
base_modelo.reset_index(drop=True, inplace=True)

exp_clf101 = setup(
    data=base_modelo,
    target='Target',
    session_id=123,
    train_size=0.7,
    numeric_features=[f'feature_{i}' for i in range(X_train.shape[1] - 1)],  # última columna es 'Target'
    fix_imbalance=True
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Target
2,Target type,Multiclass
3,Original data shape,"(76518, 429)"
4,Transformed data shape,"(99147, 429)"
5,Transformed train set shape,"(76191, 429)"
6,Transformed test set shape,"(22956, 429)"
7,Numeric features,428
8,Preprocess,True
9,Imputation type,simple


In [14]:
top4_models = compare_models(sort='Accuracy', n_select=1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8283,0.9424,0.8283,0.8278,0.8271,0.724,0.7251,14.721
xgboost,Extreme Gradient Boosting,0.827,0.9398,0.827,0.8257,0.8254,0.7218,0.7228,9.09
gbc,Gradient Boosting Classifier,0.8248,0.0,0.8248,0.827,0.8251,0.7198,0.7205,121.529
rf,Random Forest Classifier,0.8197,0.9347,0.8197,0.8223,0.8201,0.7116,0.7125,10.283
et,Extra Trees Classifier,0.8196,0.935,0.8196,0.8232,0.8203,0.7116,0.7126,10.275
ada,Ada Boost Classifier,0.8107,0.0,0.8107,0.8203,0.8141,0.7,0.7011,9.756
lr,Logistic Regression,0.808,0.0,0.808,0.8281,0.8147,0.6992,0.7022,21.389
ridge,Ridge Classifier,0.804,0.0,0.804,0.8289,0.8119,0.6941,0.6981,1.909
lda,Linear Discriminant Analysis,0.8021,0.0,0.8021,0.8304,0.811,0.6922,0.697,2.934
svm,SVM - Linear Kernel,0.7958,0.0,0.7958,0.8221,0.8041,0.6814,0.6858,4.001


In [15]:
# Entrenar el modelo base de LightGBM
trained_model = create_model('lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8333,0.9425,0.8333,0.8326,0.8324,0.7326,0.7332
1,0.8212,0.9386,0.8212,0.8199,0.8191,0.7116,0.7132
2,0.8245,0.9425,0.8245,0.8239,0.8233,0.7179,0.7189
3,0.8307,0.9448,0.8307,0.8309,0.8298,0.728,0.7291
4,0.8344,0.9445,0.8344,0.8337,0.8331,0.7337,0.7348
5,0.8265,0.9416,0.8265,0.8277,0.8264,0.7222,0.7229
6,0.8286,0.9458,0.8286,0.8294,0.8284,0.7256,0.7262
7,0.8338,0.9423,0.8338,0.8331,0.8324,0.7327,0.7339
8,0.8323,0.9435,0.8323,0.8299,0.8299,0.7297,0.731
9,0.8172,0.9375,0.8172,0.8173,0.816,0.706,0.7073


In [16]:
# Definir la cuadrícula de parámetros para la búsqueda de hiperparámetros
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],  # Tasa de aprendizaje más baja
    'n_estimators': [50, 100, 200],      # Limitar el número de árboles
    'max_depth': [3, 4, 5],              # Limitar la profundidad máxima de los árboles
    'num_leaves': [20, 31, 40],          # Número de hojas más bajo para evitar árboles complejos
    'min_child_samples': [10, 20, 30],   # Mayor número de muestras por hoja
    'subsample': [0.7, 0.8, 0.9],        # Usar muestreo para reducir la variabilidad
    'colsample_bytree': [0.7, 0.8],      # Usar una fracción de características
    'reg_alpha': [0.1, 0.5, 1.0],        # Incrementar la regularización L1
    'reg_lambda': [0.1, 0.5, 1.0]       # Incrementar la regularización L2
}

# Ajuste del modelo con optimización bayesiana
tuned_model = tune_model(
    estimator=trained_model, 
    custom_grid=param_grid, 
    search_library='scikit-optimize', 
    search_algorithm='bayesian', 
    fold=5,  # Validación cruzada para evaluar mejor el modelo
    early_stopping=True,  # Detener el entrenamiento temprano si no mejora
    verbose=False
)

In [18]:
# Definir el nombre del archivo con el nombre del modelo para guardarlo
model_name = "lightgbm_try1"
file_name = f'{path}{model_name}_model.pkl'

# Guardar el modelo entrenado en un archivo con pickle
with open(file_name, 'wb') as model_file:
    pickle.dump(tuned_model, model_file)

print(f'Model {model_name} saved as {file_name}')

# Evaluar el modelo en el conjunto de prueba
predictions_test = predict_model(tuned_model, data=X_test)
predictions_train = predict_model(tuned_model, data=get_config('X_train'))

y_train = get_config('y_train')
y_test = get_config('y_test')

# Error de entrenamiento
train_accuracy = accuracy_score(y_train, predictions_train["prediction_label"])
print(f'Accuracy on training set for {model_name}: {train_accuracy}')

# Error de test
test_accuracy = accuracy_score(y_test, predictions_test["prediction_label"])
print(f'Accuracy on test set for {model_name}: {test_accuracy}')

# Finalizar el modelo
final_dt = finalize_model(tuned_model)

Model lightgbm_try1 saved as C:/Users/jesco/OneDrive - Universidad Santo Tomás/Documentos/Python/T3/lightgbm_try1_model.pkl


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8418,0.9517,0.8418,0.8418,0.8408,0.7457,0.7468


Accuracy on training set for lightgbm_try1: 0.8503790000373399
Accuracy on test set for lightgbm_try1: 0.37476041122146714


In [19]:
# Suponiendo que el conjunto de datos 'prueba' tiene las mismas columnas que X_test, excepto 'Target'
# Aplica las mismas transformaciones que se hicieron en X_train y X_test

# Codificación de variables categóricas en 'prueba'
prueba = rare_encoder.transform(prueba)
prueba = one_hot_encoder.transform(prueba)

# Crear características matemáticas en 'prueba'
prueba = math_transformer.transform(prueba)

# Crear características polinómicas en 'prueba'
prueba_poly = poly_transformer.transform(prueba[cuantitativas_bm])

# Concatenar las características polinómicas a 'prueba'
prueba = np.hstack([prueba, prueba_poly])

# Escalar las características en 'prueba'
prueba = scaler.transform(prueba)

# Convertir 'prueba' de vuelta a un DataFrame (sin la columna 'Target')
prueba_df = pd.DataFrame(prueba, columns=[f'feature_{i}' for i in range(prueba.shape[1])])

In [26]:
# Realizar predicciones usando el modelo final
# final_dt = load_model(file_name)  # Asegúrate de cargar tu modelo entrenado
predictions = predict_model(final_dt, data=prueba_df)

# Mapea las etiquetas predichas a sus nombres correspondientes
label_mapping = {0: 'Graduate', 1: 'Dropout', 2: 'Enrolled'}
result = pd.DataFrame({
    'id': prueba["id"],  # Asegúrate de que la columna 'id' esté en 'prueba'
    'Target': predictions['prediction_label'].map(label_mapping)
})

# Guardar el resultado en un archivo CSV con el nombre del modelo
result_file_name = f'{path}t3_{model_name}.csv'
result.to_csv(result_file_name, index=False, sep=",")

print(f'Result saved as {result_file_name}')

Result saved as C:/Users/jesco/OneDrive - Universidad Santo Tomás/Documentos/Python/T3/t3_lightgbm_try1.csv


In [None]:
# import pickle
# with open(path + 'LGBMClassifier_model_1.pkl', 'wb') as model_file:
#     pickle.dump(trained_model, model_file)

In [30]:
predictions['prediction_label']

0        0
1        2
2        2
3        1
4        1
        ..
51007    0
51008    0
51009    0
51010    0
51011    0
Name: prediction_label, Length: 51012, dtype: int32