# Setup

In [124]:
import pandas as pd
import os
import numpy as np
from scipy.io import arff
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Loading the data

In [125]:
#Get route of the path
current_path = os.getcwd()
aux_curr_path = current_path
project_path = aux_curr_path.replace('/notebooks', '')
dataset_path = "dataset/CEE_DATA.arff"
dataset_path = os.path.join(project_path, dataset_path)

data, meta = arff.loadarff(dataset_path)

df = pd.DataFrame(data)
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x) #Encoding from byte to string 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Performance           666 non-null    object
 1   Gender                666 non-null    object
 2   Caste                 666 non-null    object
 3   coaching              666 non-null    object
 4   time                  666 non-null    object
 5   Class_ten_education   666 non-null    object
 6   twelve_education      666 non-null    object
 7   medium                666 non-null    object
 8   Class_ X_Percentage   666 non-null    object
 9   Class_XII_Percentage  666 non-null    object
 10  Father_occupation     666 non-null    object
 11  Mother_occupation     666 non-null    object
dtypes: object(12)
memory usage: 62.6+ KB


            #0 : Average -  157
            #1 : Excellent - 101
            #2 : Good - 210
            #3 : Very Good - 198

# Preprocessing and Feature Engineering

In [126]:
columns_of_interest=["Performance",'Class_ X_Percentage', 'Class_XII_Percentage', 'medium', 'Caste']
updated_df=df[columns_of_interest]

In [127]:
X=updated_df.drop('Performance',axis=1)
y= updated_df[['Performance']]

In [119]:
X

Unnamed: 0,Class_ X_Percentage,Class_XII_Percentage,medium,Caste
0,Excellent,Excellent,ENGLISH,General
1,Excellent,Excellent,OTHERS,OBC
2,Excellent,Excellent,ENGLISH,OBC
3,Excellent,Excellent,OTHERS,General
4,Excellent,Excellent,ENGLISH,General
...,...,...,...,...
661,Good,Vg,ENGLISH,ST
662,Vg,Good,ENGLISH,ST
663,Good,Vg,ENGLISH,ST
664,Good,Good,ENGLISH,ST


In [120]:
y

Unnamed: 0,Performance
0,Excellent
1,Excellent
2,Excellent
3,Excellent
4,Excellent
...,...
661,Average
662,Average
663,Average
664,Average


In [121]:
#Create oneHot enconder object
enc_OneHot = OneHotEncoder(sparse_output=False)

#Columns to apply one hot enconder
col_X=['Class_ X_Percentage', 'Class_XII_Percentage', 'medium', 'Caste']


#Create the transformer
ct= ColumnTransformer(
    transformers=[
    ("OneHotInXColumns", enc_OneHot,col_X)
                      ]
)

#Create Label encoder object
ord_enc=LabelEncoder()


In [123]:
y

Unnamed: 0,Performance
0,Excellent
1,Excellent
2,Excellent
3,Excellent
4,Excellent
...,...
661,Average
662,Average
663,Average
664,Average


In [122]:
# Applying OneHot to X
X = ct.fit_transform(X)
#Applying OneHot
y_OneHot = enc_OneHot.fit_transform(y)

#Applying LabelEnconder to y
df["y_ord_enc"]=ord_enc.fit_transform(y)
y_Label = df["y_ord_enc"]
y_Label_array= np.array(y_Label)


  y = column_or_1d(y, warn=True)


# Splitting the dataset

In [9]:
#Split the dataset for y_OneHot
X_train, X_test, y_train_OneHot, y_test_OneHot = train_test_split(X, y_OneHot, test_size=0.2, random_state=42)

#Split the dataset for y_Label (Pandas Series)
X_train, X_test, y_train_Label, y_test_Label = train_test_split(X, y_Label, test_size=0.2, random_state=42)

#Split the dataset for y_Label (Numpy array)
X_train, X_test, y_train_Label_array, y_test_Label_array = train_test_split(X, y_Label_array, test_size=0.2, random_state=42)

In [10]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate

In [11]:
model = DecisionTreeClassifier()

# Define SMOTE con `sampling_strategy` como un diccionario
over = SMOTE(sampling_strategy={0: 178, 1: 178})  # Ajusta los valores para tus clases
under = RandomUnderSampler(sampling_strategy={2: 178, 3: 178})  # Ajusta los valores de undersampling

# Definir el pipeline con SMOTE y RandomUnderSampler
steps = [('oversampling', over), ('undersampling', under), ('classifier', model)]
pipeline = Pipeline(steps=steps)

# Definir validación cruzada
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluar pipeline
scores = cross_validate(pipeline, X, y_Label_array, scoring='accuracy', cv=cv, n_jobs=-1)

print(scores)


{'fit_time': array([0.02185392, 0.0173769 , 0.02888012, 0.01741004, 0.0235858 ,
       0.02088809, 0.0159409 , 0.012887  , 0.0205121 , 0.02207804,
       0.01799798, 0.0170567 , 0.01220107, 0.02069402, 0.02823997,
       0.01292682, 0.03099704, 0.02396703, 0.01617098, 0.01355076,
       0.02585125, 0.0162251 , 0.0170629 , 0.0155921 , 0.01043391,
       0.01446772, 0.020715  , 0.01673985, 0.01438403, 0.01128101]), 'score_time': array([0.00606585, 0.00291109, 0.0012691 , 0.00248981, 0.00097394,
       0.00160193, 0.00100493, 0.00103617, 0.00206995, 0.00127316,
       0.00144291, 0.00101805, 0.00265217, 0.007792  , 0.00066495,
       0.00281715, 0.00123906, 0.00111103, 0.00104189, 0.00662613,
       0.00104761, 0.00094891, 0.00334692, 0.00106525, 0.00191307,
       0.00100923, 0.00159287, 0.00098014, 0.00082397, 0.00066113]), 'test_score': array([0.40298507, 0.43283582, 0.46268657, 0.49253731, 0.37313433,
       0.41791045, 0.43939394, 0.51515152, 0.40909091, 0.45454545,
       0.52238806

In [115]:
y_Label.value_counts()

y_ord_enc
2    210
3    198
0    157
1    101
Name: count, dtype: int64

In [114]:
model = DecisionTreeClassifier()

# Definir la estrategia de oversampling para las clases minoritarias
over = SMOTE(sampling_strategy={2: 180})  # Sobremuestrear las clases 0 y 1 a 150 ejemplos

# Definir la estrategia de undersampling para las clases mayoritarias
under = RandomUnderSampler(sampling_strategy={1:200})  # Submuestrear las clases 2 y 3 a 150 ejemplos

#Maximos 
#0-126,
#1-74
#2-176
#3-156-200?

# Crear el pipeline
pipeline = Pipeline(steps=[('oversampling', over), ('undersampling', under), ('classifier', model)])

# Entrenar el modelo
pipeline.fit(X_train, y_train_Label_array)

# Hacer predicciones con el conjunto de prueba
y_pred = pipeline.predict(X_test)
print("Reporte de clasificación:\n", classification_report(y_test_Label_array, y_pred, target_names=["Clase 0", "Clase 1", "Clase 2", "Clase 3"]))

precision_macro = precision_score(y_test_Label_array, y_pred, average='macro')
precision_weighted = precision_score(y_test_Label_array, y_pred, average='weighted')

print(f"Precisión promedio (macro): {precision_macro:.4f}")
print(f"Precisión promedio (weighted): {precision_weighted:.4f}")

ValueError: With under-sampling methods, the number of samples in a class should be less or equal to the original number of samples. Originally, there is 74 samples and 200 samples are asked.

In [19]:
from sklearn.metrics import classification_report

# Suponiendo que ya tienes el modelo entrenado y las predicciones
y_pred = pipeline.predict(X_test)

# Generar el reporte de clasificación
report = classification_report(y_test, y_pred, target_names=["Clase 0", "Clase 1", "Clase 2", "Clase 3"])

print(report)

NameError: name 'X_test' is not defined

In [None]:
# Define el modelo y el pipeline con SMOTE y RandomUnderSampler
model = DecisionTreeClassifier()
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('oversampling', over), ('undersampling', under), ('MyClassifier', model)]
pipeline = Pipeline(steps=steps)

# Define el esquema de validación cruzada
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evalúa el pipeline usando roc_auc_ovr para problemas multiclase
scores = cross_validate(pipeline, X, y, scoring='roc_auc_ovr', cv=cv, n_jobs=-1)

print(scores)

In [None]:

Experiments = [
    (
        "Random Forest n_estimators=100", 
        RandomForestClassifier(class_weight="balanced"),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),

    (
        "XGBoost",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),
    
    (
         "Multinomial Logistic Regression",
        LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200),
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    ),
    
     (
        "K-Nearest Neighbors",
        KNeighborsClassifier(n_neighbors=5),
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    ),
    
    (
        "MLP",
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu', solver='adam'),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),
    
    (
        "Support Vector Classifier",
        SVC(kernel='linear', probability=True),  
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    )
    
    
]


In [None]:
results_per_model = []

for model_name, model, train_set, test_set in Experiments:
    X_train = train_set[0] #get Xtrain from the list models
    y_train = train_set[1] #get y_train from list models
    X_test = test_set[0]    #get x_test from list models 
    y_test = test_set[1]   #get y_test from list models
    
    model.fit(X_train, y_train)  #train the current model
    y_pred = model.predict(X_test) #make predictions 
    report = classification_report(y_test, y_pred, output_dict=True) #make a dict of the classification report
    
    
    results_per_model.append(report) #add the previus dict to a list

# Publish experiements to server 

In [None]:
"""""
mlflow.set_experiment("Student Performance Analysis Model")
mlflow.set_tracking_uri("http://3.84.228.208:5000")

for i, element in enumerate(Experiments):
    model_name = element[0]
    model = element[1]
    report = results_per_model[i]
    
    with mlflow.start_run(run_name=model_name):        
            mlflow.log_param("model", model_name)
            
            
            # -------------Class interpretation---------------- 
            #0 : Average
            #1 : Excellent
            #2 : Good
            #3 : Very Good

            #Metrics of class 0
            
            mlflow.log_metric('acurracy_class_0', report['0']['precision'])
            mlflow.log_metric('recall_class_0', report['0']['recall'])
            mlflow.log_metric('f1_class_0', report['0']['f1-score'])
            
            #Metrics of class 1
             
            mlflow.log_metric('acurracy_class_1', report['1']['precision'])
            mlflow.log_metric('recall_class_1', report['1']['recall'])
            mlflow.log_metric('f1_class_1', report['1']['f1-score'])
            
            #Metrics of class 2
            
            mlflow.log_metric('acurracy_class_2', report['2']['precision'])
            mlflow.log_metric('recall_class_2', report['2']['recall'])
            mlflow.log_metric('f1_class_2', report['2']['f1-score'])
            
            #Metrics of class 3
            
            mlflow.log_metric('acurracy_class_3', report['3']['precision'])
            mlflow.log_metric('recall_class_3', report['3']['recall'])
            mlflow.log_metric('f1_class_3', report['3']['f1-score'])
            
        
            if "XGB" in model_name:
                mlflow.xgboost.log_model(model, "model")
            else:
                mlflow.sklearn.log_model(model, "model") 
                
    """