# Setup

In [29]:
import pandas as pd
import os
import numpy as np
from scipy.io import arff
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Loading the data

In [30]:
#Get route of the path
current_path = os.getcwd()
aux_curr_path = current_path
project_path = aux_curr_path.replace('/notebooks', '')
dataset_path = "dataset/CEE_DATA.arff"
dataset_path = os.path.join(project_path, dataset_path)

data, meta = arff.loadarff(dataset_path)

df = pd.DataFrame(data)
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x) #Encoding from byte to string 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Performance           666 non-null    object
 1   Gender                666 non-null    object
 2   Caste                 666 non-null    object
 3   coaching              666 non-null    object
 4   time                  666 non-null    object
 5   Class_ten_education   666 non-null    object
 6   twelve_education      666 non-null    object
 7   medium                666 non-null    object
 8   Class_ X_Percentage   666 non-null    object
 9   Class_XII_Percentage  666 non-null    object
 10  Father_occupation     666 non-null    object
 11  Mother_occupation     666 non-null    object
dtypes: object(12)
memory usage: 62.6+ KB


            #0 : Average -  157
            #1 : Excellent - 101
            #2 : Good - 210
            #3 : Very Good - 198

# Preprocessing and Feature Engineering

In [31]:
columns_of_interest=["Performance",'Class_ X_Percentage', 'Class_XII_Percentage', 'medium', 'Caste']
updated_df=df[columns_of_interest]

In [32]:
y= updated_df[['Performance']]

In [33]:
#Create oneHot enconder object
enc_OneHot = OneHotEncoder(sparse_output=False)

#Applying OneHot
y_OneHot = enc_OneHot.fit_transform(y)

#Create Label encoder object
ord_enc=LabelEncoder()

#Applying LabelEnconder to y
df["y_ord_enc"]=ord_enc.fit_transform(y)
y_Label = df["y_ord_enc"]


  y = column_or_1d(y, warn=True)


In [34]:
updated_df = updated_df.assign(y_coding_col=y_Label.values)
updated_df

Unnamed: 0,Performance,Class_ X_Percentage,Class_XII_Percentage,medium,Caste,y_coding_col
0,Excellent,Excellent,Excellent,ENGLISH,General,1
1,Excellent,Excellent,Excellent,OTHERS,OBC,1
2,Excellent,Excellent,Excellent,ENGLISH,OBC,1
3,Excellent,Excellent,Excellent,OTHERS,General,1
4,Excellent,Excellent,Excellent,ENGLISH,General,1
...,...,...,...,...,...,...
661,Average,Good,Vg,ENGLISH,ST,0
662,Average,Vg,Good,ENGLISH,ST,0
663,Average,Good,Vg,ENGLISH,ST,0
664,Average,Good,Good,ENGLISH,ST,0


In [35]:
df_av_class_0 = updated_df[updated_df["y_coding_col"]==0]
df_ex_class_1 = updated_df[updated_df["y_coding_col"]==1]
df_gd_class_2 = updated_df[updated_df["y_coding_col"]==2]
df_vg_class_3 = updated_df[updated_df["y_coding_col"]==3]

In [36]:
updated_df.y_coding_col.value_counts()

y_coding_col
2    210
3    198
0    157
1    101
Name: count, dtype: int64

--------

In [37]:
Number_of_samples = 180

#Oversampling
df_av_class_0_over =  df_av_class_0.sample(Number_of_samples, replace=True)
df_ex_class_1_over =  df_ex_class_1.sample(Number_of_samples, replace=True)

#Undersampling
df_gd_class_2_under = df_gd_class_2.sample(Number_of_samples)
df_vg_class_3_under = df_vg_class_3.sample(Number_of_samples)

df_mod_samples = pd.concat([df_av_class_0_over,
                            df_ex_class_1_over,
                            df_gd_class_2_under,
                            df_vg_class_3_under],axis=0) 

df_mod_samples.Performance.value_counts()

Performance
Average      180
Excellent    180
Good         180
Vg           180
Name: count, dtype: int64

In [38]:
df_mod_samples.head()

Unnamed: 0,Performance,Class_ X_Percentage,Class_XII_Percentage,medium,Caste,y_coding_col
600,Average,Excellent,Vg,ENGLISH,ST,0
571,Average,Excellent,Excellent,ENGLISH,ST,0
581,Average,Excellent,Excellent,ENGLISH,ST,0
538,Average,Vg,Vg,ENGLISH,SC,0
567,Average,Excellent,Excellent,ENGLISH,ST,0


------

In [39]:
df_mod_samples.columns

Index(['Performance', 'Class_ X_Percentage', 'Class_XII_Percentage', 'medium',
       'Caste', 'y_coding_col'],
      dtype='object')

In [40]:
cols2drop = ["Performance", 'y_coding_col']
X = df_mod_samples.drop(cols2drop, axis=1)


y = df_mod_samples[["Performance"]]

In [41]:
X

Unnamed: 0,Class_ X_Percentage,Class_XII_Percentage,medium,Caste
600,Excellent,Vg,ENGLISH,ST
571,Excellent,Excellent,ENGLISH,ST
581,Excellent,Excellent,ENGLISH,ST
538,Vg,Vg,ENGLISH,SC
567,Excellent,Excellent,ENGLISH,ST
...,...,...,...,...
260,Excellent,Vg,ASSAMESE,General
132,Excellent,Excellent,ENGLISH,General
109,Excellent,Excellent,ENGLISH,General
284,Excellent,Vg,ENGLISH,General


In [42]:
y

Unnamed: 0,Performance
600,Average
571,Average
581,Average
538,Average
567,Average
...,...
260,Vg
132,Vg
109,Vg
284,Vg


In [43]:
#Columns to apply one hot enconder
col_X=['Class_ X_Percentage', 'Class_XII_Percentage', 'medium', 'Caste']

#Create the transformer
ct= ColumnTransformer(
    transformers=[
    ("OneHotInXColumns", enc_OneHot,col_X)
                      ]
)

In [44]:
# Applying OneHot to X
X = ct.fit_transform(X)

In [45]:
X

array([[0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [46]:
y

Unnamed: 0,Performance
600,Average
571,Average
581,Average
538,Average
567,Average
...,...
260,Vg
132,Vg
109,Vg
284,Vg


In [47]:
y_Label

0      1
1      1
2      1
3      1
4      1
      ..
661    0
662    0
663    0
664    0
665    0
Name: y_ord_enc, Length: 666, dtype: int64

In [48]:
#Applying OneHot
y_OneHot = enc_OneHot.fit_transform(y)

#Get Label enconder
y_Label = df_mod_samples["y_coding_col"]

In [49]:
y_OneHot

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [50]:
y_Label

600    0
571    0
581    0
538    0
567    0
      ..
260    3
132    3
109    3
284    3
185    3
Name: y_coding_col, Length: 720, dtype: int64

-------

# Splitting the dataset

In [51]:
#Split the dataset for y_OneHot
X_train, X_test, y_train_OneHot, y_test_OneHot = train_test_split(X, y_OneHot, test_size=0.2, random_state=42)

#Split the dataset for y_Label (Pandas Series)
X_train, X_test, y_train_Label, y_test_Label = train_test_split(X, y_Label, test_size=0.2, random_state=42)



In [52]:

Experiments = [
    (
        "RF -OverS", 
        RandomForestClassifier(class_weight="balanced"),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),

    (
        "XGBoost -OverS",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),
    
    (
         "Logistic Regression-OverS",
        LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200),
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    ),
    
     (
        "K-Nearest Neighbors-OverS",
        KNeighborsClassifier(n_neighbors=5),
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    ),
    
    (
        "MLP-OverS",
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu', solver='adam'),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),
    
    (
        "SVC-OS",
        SVC(kernel='linear', probability=True),  
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    )
    
    
]


In [53]:
results_per_model = []

for model_name, model, train_set, test_set in Experiments:
    X_train = train_set[0] #get Xtrain from the list models
    y_train = train_set[1] #get y_train from list models
    X_test = test_set[0]    #get x_test from list models 
    y_test = test_set[1]   #get y_test from list models
    
    model.fit(X_train, y_train)  #train the current model
    y_pred = model.predict(X_test) #make predictions 
    report = classification_report(y_test, y_pred, output_dict=True) #make a dict of the classification report
    
    
    results_per_model.append(report) #add the previus dict to a list

  _warn_prf(average, modifier, msg_start, len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
mlflow.set_experiment("Analysis Model Jesus")
mlflow.set_tracking_uri("http://3.84.228.208:5000")

for i, element in enumerate(Experiments):
    model_name = element[0]
    model = element[1]
    report = results_per_model[i]
    
    with mlflow.start_run(run_name=model_name):        
            mlflow.log_param("model", model_name)
            
            
            # -------------Class interpretation---------------- 
            #0 : Average
            #1 : Excellent
            #2 : Good
            #3 : Very Good

            #Metrics of class 0
            
            mlflow.log_metric('acurracy_class_0', report['0']['precision'])
            mlflow.log_metric('recall_class_0', report['0']['recall'])
            mlflow.log_metric('f1_class_0', report['0']['f1-score'])
            
            #Metrics of class 1
             
            mlflow.log_metric('acurracy_class_1', report['1']['precision'])
            mlflow.log_metric('recall_class_1', report['1']['recall'])
            mlflow.log_metric('f1_class_1', report['1']['f1-score'])
            
            #Metrics of class 2
            
            mlflow.log_metric('acurracy_class_2', report['2']['precision'])
            mlflow.log_metric('recall_class_2', report['2']['recall'])
            mlflow.log_metric('f1_class_2', report['2']['f1-score'])
            
            #Metrics of class 3
            
            mlflow.log_metric('acurracy_class_3', report['3']['precision'])
            mlflow.log_metric('recall_class_3', report['3']['recall'])
            mlflow.log_metric('f1_class_3', report['3']['f1-score'])
            
        
            if "XGB" in model_name:
                mlflow.xgboost.log_model(model, "model")
            else:
                mlflow.sklearn.log_model(model, "model") 

2024/10/20 08:45:27 INFO mlflow.tracking.fluent: Experiment with name 'Analysis Model Jesus' does not exist. Creating a new experiment.
2024/10/20 08:45:42 INFO mlflow.tracking._tracking_service.client: üèÉ View run RF -OverS at: http://3.84.228.208:5000/#/experiments/398844904523661981/runs/46fc3865c5994a6abbcb695bed5ffa74.
2024/10/20 08:45:42 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://3.84.228.208:5000/#/experiments/398844904523661981.
2024/10/20 08:45:52 INFO mlflow.tracking._tracking_service.client: üèÉ View run XGBoost -OverS at: http://3.84.228.208:5000/#/experiments/398844904523661981/runs/670b17505518427e997f61d4433d77a6.
2024/10/20 08:45:52 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://3.84.228.208:5000/#/experiments/398844904523661981.
2024/10/20 08:45:59 INFO mlflow.tracking._tracking_service.client: üèÉ View run Logistic Regression-OverS at: http://3.84.228.208:5000/#/experiments/398844904523661981/runs