# *PROJECT TASK*

The goal of the practical project is to explain the assigned ML models, using all the
techniques covered in class.
Each team must explain at least 2 ML models (a.k.a. compulsory models). Additionally,
the team can build and explain a third model (a.k.a. extra model).

The Practical Project will consist of:

- Python 2 or 3 notebooks: ipynb file for each model.
- Dataset: csv file or online load hardcoded.

Each notebook must include:

- Markdown use:
    - Sections and subsections numbered (using #, ##, etc.)
    - Concise and clear descriptions of the decisions made
    - Concise and clear conclusions of the explanations obtained.

- The code must be clear and must include comments.
- The notebook must include 3 sections (you can add subsections):

1. Dataset load and preparation
2. ML model training
3. ML explanation

The ML explanation, section 3, is the most important part of the project. Here you should
cover:

- Use any useful explanation method that you’ve seen in class.
- Provide individual explanations of instances. Not just plots, but the conclusions
you can draw from them.
- Provide global explanations of the ML model. Not just plots, but the conclusions
you can draw from them.

# Machine Learning Explicable. Explicación de un Decision Tree.

## 0. Requerimientos y funciones auxiliares

In [3]:
# Importación de librerías
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import os
import sys
import datetime

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
#from skopt import BayesSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor

from sklearn.preprocessing import OneHotEncoder
import sklearn.impute as impute

from sklearn import metrics
from scipy import stats

# Regresión

# Preprocesamiento
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

from sklearn.decomposition import PCA

# Desbalanceo
#from imblearn.over_sampling import SVMSMOTE, RandomOverSampler, ADASYN
#from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour
#from imblearn.ensemble import RUSBoostClassifier
#from sklearn.svm import OneClassSVM

# Modelos de clasificación

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBRegressor
import joblib

# TODO: Explicabilidad: SHAP, LIME


In [10]:
# Clone the project repository into the current working directory.
!git clone https://github.com/JMGO-coding/MLX_academic_performance.git
# NOTE(review): each `!` command runs in its own subshell, so this `cd` does not
# change the notebook's working directory for later cells. `%cd` would be needed
# for that; confirm the intent first, because later cells use relative paths.
!cd MLX_academic_performance/

Cloning into 'MLX_academic_performance'...
remote: Enumerating objects: 33, done.
remote: Counting objects: 100% (33/33), done.
remote: Compressing objects: 100% (26/26), done.
remote: Total 33 (delta 7), reused 4 (delta 0), pack-reused 0 (from 0)
Receiving objects: 100% (33/33), 234.43 KiB | 1.17 MiB/s, done.
Resolving deltas: 100% (7/7), done.


In [12]:
import sys

# Make the cloned repository and its `src` folder importable from this notebook
sys.path.extend([
    '/content/MLX_academic_performance',
    '/content/MLX_academic_performance/src',
])

# Print the search path to confirm both entries are present
print(sys.path)

['/content', '/env/python', '/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '', '/usr/local/lib/python3.11/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.11/dist-packages/IPython/extensions', '/root/.ipython', '/content/k_brazos_GGM', '/content/k_brazos_GGM/src', '/content/MLX_academic_performance', '/content/MLX_academic_performance/src']


In [4]:
def save_model(model, model_path="models/best_model_DT.pkl"):
    """
    Save a model to a pkl file with joblib.

    Parameters
    ----------
    model : object
        Object to serialize (e.g. a fitted estimator).
    model_path : str, path-like or file object, optional
        Destination handed to ``joblib.dump``. When it is a filesystem path,
        its parent directory is created if it does not exist yet, so saving
        to the default ``models/`` folder works on a fresh checkout.
    """
    # Only filesystem paths have a parent directory to create; anything else
    # (e.g. an already opened file object) is passed to joblib untouched.
    if isinstance(model_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(model_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(model, model_path)

In [5]:
def load_model(model_path="models/best_model_DT.pkl"):
    """
    Load and return a model previously saved to a file.

    Parameters
    ----------
    model_path : str, optional
        Location of the saved model, handed to ``joblib.load``.

    Returns
    -------
    object
        Whatever object ``joblib.load`` reconstructs from the file.
    """
    # NOTE: joblib files are pickle-based, so only load files from a trusted source.
    restored_model = joblib.load(model_path)
    return restored_model

## 1. Carga y preprocesamiento del dataset

### 1.1. Primeras visualizaciones y estadísticas

In [8]:
# Random seed and project directories

SEED = 2024  # shared seed, reused for the train/test split and the model
directory = '../'  # base folder, relative to the current working directory
plots_directory = f'{directory}plots/'
data_directory = f'{directory}data/'

In [9]:
# Load the dataset and take a first look at it

data_file = 'data.csv'
data_path = f'{data_directory}{data_file}'
df = pd.read_csv(data_path, sep=';')  # fields are parsed with ';' as separator
print(df.shape)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/data.csv'

In [None]:
df.info()   # Non-null counts and data types of every column

In [None]:
df.describe()   # Basic statistics of the dataset's numeric features

El dataset no contiene valores nulos.

In [None]:
# Feature names grouped by type

categorical_features = [
    'Marital status',
    'Application mode',
    'Course',
    'Nacionality',
    "Mother's occupation",
    "Father's occupation",
]

# Second-semester columns use the ordinal "2nd" (the former "2st" spelling
# was a copy of the first-semester names with only the digit changed).
numerical_features = [
    'Application order',
    'Previous qualification',
    'Previous qualification (grade)',
    "Mother's qualification",
    "Father's qualification",
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

# NOTE(review): the trailing tab in the first name is deliberate in this list;
# presumably it mirrors the CSV header. Verify against df.columns.
binary_features = [
    'Daytime/evening attendance\t',
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
]

target_feature = 'Target'  # Categorical, 3 classes

In [None]:
# Count plot for the target and for every categorical and binary feature

countplot_features = [target_feature, *categorical_features, *binary_features]
for feature in countplot_features:
    # One horizontal bar chart per feature, shown as its own figure
    sns.countplot(data=df, y=feature)
    plt.show()

### 1.2. Partición de los datos en *Train* y *Test*

In [None]:
# Separate the predictors from the target and hold out a stratified test set
X = df.drop(columns='Target')
y = df['Target']
test_size = 0.1

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=SEED,
    stratify=y,  # keep the class proportions of the target in both splits
)

In [None]:
# Box plots showing the distribution of the numerical predictors (target excluded)
plt.figure(figsize=(20, 20))
# Plot the training predictors only, so the test split is not inspected
X_train.boxplot(column=list(numerical_features), rot=90)
plt.xticks(rotation=90)
plt.show()


## 2. Entrenamiento del modelo (Decision Tree)

In [None]:
# Run a grid search and keep the model with the best hyperparameters

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 10, 15, 20, 25, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
}

dt = DecisionTreeClassifier(random_state=SEED)
# The target has three classes, so the binary-only 'f1' scorer cannot be used;
# macro averaging gives every class the same weight regardless of its size.
grid_search = GridSearchCV(dt, param_grid, scoring='f1_macro', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# Same averaging as the search so the CV and test scores are comparable;
# f1_score is reached through the already imported `metrics` module.
f1 = metrics.f1_score(y_test, y_pred, average='macro')

print("Mejores parámetros:", grid_search.best_params_)
print("F1-score en test:", f1)

## 3. Explicación del modelo