# Setup

In [1]:
import pandas as pd
import os
import numpy as np
from scipy.io import arff
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Loading the data

In [2]:
# Locate the project root. The notebook may be started from the project root
# or from its `notebooks/` sub-directory; only a trailing `notebooks` path
# component is stripped. (A plain string replace would also mangle parent
# directories whose name merely starts with "notebooks" and would not work
# with Windows path separators.)
current_path = os.getcwd()
aux_curr_path = os.path.normpath(current_path)
if os.path.basename(aux_curr_path) == 'notebooks':
    project_path = os.path.dirname(aux_curr_path)
else:
    project_path = aux_curr_path
dataset_path = os.path.join(project_path, "dataset", "CEE_DATA.arff")

data, meta = arff.loadarff(dataset_path)

df = pd.DataFrame(data)
# The loaded values arrive as bytes; decode them to str and leave every other
# value untouched. `Series.map` per column is used instead of the deprecated
# `DataFrame.applymap`.
df = df.apply(
    lambda column: column.map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Performance           666 non-null    object
 1   Gender                666 non-null    object
 2   Caste                 666 non-null    object
 3   coaching              666 non-null    object
 4   time                  666 non-null    object
 5   Class_ten_education   666 non-null    object
 6   twelve_education      666 non-null    object
 7   medium                666 non-null    object
 8   Class_ X_Percentage   666 non-null    object
 9   Class_XII_Percentage  666 non-null    object
 10  Father_occupation     666 non-null    object
 11  Mother_occupation     666 non-null    object
dtypes: object(12)
memory usage: 62.6+ KB


            #0 : Average -  157
            #1 : Excellent - 101
            #2 : Good - 210
            #3 : Very Good - 198

# Preprocessing and Feature Engineering

In [3]:
# Target column first, followed by the four predictors kept for modelling.
columns_of_interest = [
    "Performance",
    "Class_ X_Percentage",
    "Class_XII_Percentage",
    "medium",
    "Caste",
]
updated_df = df[columns_of_interest]

In [4]:
# Target kept as a one-column DataFrame (2-D), the shape OneHotEncoder expects.
y = updated_df[["Performance"]]

### Coding y_label 

In [5]:
# One-hot representation of the target (dense array, one column per class).
enc_OneHot = OneHotEncoder(sparse_output=False)
y_OneHot = enc_OneHot.fit_transform(y)

# Integer representation of the target.
# Class codes: 0 = Average, 1 = Excellent, 2 = Good, 3 = Very Good.
ord_enc = LabelEncoder()

# LabelEncoder works on 1-D input. Passing the one-column DataFrame `y`
# triggers a DataConversionWarning (`column_or_1d`), so the underlying
# Series is passed instead; the resulting codes are the same.
df["y_ord_enc"] = ord_enc.fit_transform(y["Performance"])
y_Label = df["y_ord_enc"]


  y = column_or_1d(y, warn=True)


### Adding the encoded y_label to **updated_df**

In [6]:
# Attach the integer class codes positionally (raw array, no index alignment).
label_codes = y_Label.values
updated_df = updated_df.assign(y_coding_col=label_codes)
updated_df.columns

Index(['Performance', 'Class_ X_Percentage', 'Class_XII_Percentage', 'medium',
       'Caste', 'y_coding_col'],
      dtype='object')

### Defining the OneHot transformer

In [7]:
# Feature columns that get one-hot encoded.
col_X = [
    'Class_ X_Percentage',
    'Class_XII_Percentage',
    'medium',
    'Caste',
]

# Single-step transformer: one-hot encode `col_X`. Columns not listed are
# dropped, which is ColumnTransformer's default `remainder` behaviour.
ct = ColumnTransformer(transformers=[("OneHotInXColumns", enc_OneHot, col_X)])

## Strategy without treating the imbalance

In [8]:
# Features: everything except the raw target and its integer encoding.
cols2drop = ["Performance", 'y_coding_col']
X_no_balanced = updated_df.drop(columns=cols2drop)

# Target as a one-column DataFrame, ready for one-hot encoding.
y_no_balanced = updated_df[['Performance']]

### Coding X_no_balanced --OneHot--

In [9]:
# Fit the column transformer on the original (unbalanced) features, then
# encode those same features.
X_no_balanced_OneHot = ct.fit(X_no_balanced).transform(X_no_balanced)

### Coding y_no_balanced --OneHot--

In [10]:
# Integer-coded target: reuse the codes already stored on `updated_df`.
y_Label_no_balanced = updated_df["y_coding_col"]

# One-hot target: refit the shared encoder on the unbalanced target frame.
y_OneHot_no_balanced = enc_OneHot.fit_transform(y_no_balanced)

-----

## Strategy for the imbalanced multiclass problem

### OVER SAMPLING AND UNDERSAMPLING

### Creating new df per class 

In [11]:
# One sub-frame per encoded class, selected with a boolean mask on the
# integer class code.
class_code = updated_df["y_coding_col"]

df_av_class_0 = updated_df[class_code == 0]
df_ex_class_1 = updated_df[class_code == 1]
df_gd_class_2 = updated_df[class_code == 2]
df_vg_class_3 = updated_df[class_code == 3]

#### Oversampling Class 0 and Class 1

#### Undersampling Class 2 and Class 3


In [12]:
# Target number of rows per class after resampling.
Number_of_samples = 180

# Fixed seed so the resampled dataset (and every metric computed from it) is
# reproducible across kernel restarts; same value as the train/test splits.
SAMPLING_SEED = 42

# Oversampling: draw WITH replacement, because these classes have fewer rows
# than Number_of_samples.
df_av_class_0_over = df_av_class_0.sample(
    Number_of_samples, replace=True, random_state=SAMPLING_SEED
)
df_ex_class_1_over = df_ex_class_1.sample(
    Number_of_samples, replace=True, random_state=SAMPLING_SEED
)

# Undersampling: draw WITHOUT replacement from the larger classes.
df_gd_class_2_under = df_gd_class_2.sample(
    Number_of_samples, random_state=SAMPLING_SEED
)
df_vg_class_3_under = df_vg_class_3.sample(
    Number_of_samples, random_state=SAMPLING_SEED
)

# Stack the four resampled classes row-wise into one balanced frame.
df_mod_samples = pd.concat([df_av_class_0_over,
                            df_ex_class_1_over,
                            df_gd_class_2_under,
                            df_vg_class_3_under], axis=0)

# Sanity check: every class should now have Number_of_samples rows.
df_mod_samples.Performance.value_counts()

Performance
Average      180
Excellent    180
Good         180
Vg           180
Name: count, dtype: int64

### -X will be called X_balan (180 samples per class)

### -y will be called y_balan (180 samples per class)

In [13]:
# Features of the resampled frame: drop the raw target and its integer code.
cols2drop = ["Performance", 'y_coding_col']
X_balan = df_mod_samples.drop(columns=cols2drop)

# Target of the resampled frame, kept 2-D for the one-hot encoder.
y_balan = df_mod_samples[["Performance"]]

### Coding X_balan --OneHot--

In [14]:
# One-hot encode the balanced features.
# The input is rebuilt from `df_mod_samples` instead of reading `X_balan`, so
# this cell no longer consumes its own output: re-running it used to fail
# because `X_balan` had already been replaced by the encoded array.
X_balan = ct.fit_transform(
    df_mod_samples.drop(["Performance", 'y_coding_col'], axis=1)
)

### Coding y_balan --OneHot--

In [15]:
# Integer-coded target for the balanced frame (codes travel with the rows).
y_Label_balan = df_mod_samples["y_coding_col"]

# One-hot target for the balanced frame (refits the shared encoder).
y_OneHot_balan = enc_OneHot.fit_transform(y_balan)


----

### SMOTE

In [16]:
# Synthetic oversampling on the one-hot encoded, unbalanced data.
# `random_state` is fixed so the synthetic rows are reproducible.
# NOTE(review): sampling_strategy="minority" resamples only the single
# smallest class; the remaining classes keep their original counts, so the
# result is not fully balanced. Confirm this is intended ("not majority"
# would resample every class except the largest).
smote = SMOTE(sampling_strategy="minority", random_state=42)
X_sm, y_sm = smote.fit_resample(X_no_balanced_OneHot, y_Label_no_balanced)

### Coding y_sm --OneHot--

In [24]:
# Wrap the SMOTE labels in a one-column DataFrame so they can be passed to the
# one-hot encoder like the other targets.
y_sm_array = np.array(y_sm)
y_sm_df = pd.DataFrame(
    y_sm_array,
    columns=['label_smote_Performance'],
)

In [26]:
# Integer-coded SMOTE target, taken straight from the frame built above.
y_sm_label = y_sm_df["label_smote_Performance"]

# One-hot SMOTE target (refits the shared encoder on the integer codes).
y_sm_OneHot = enc_OneHot.fit_transform(y_sm_df)

# Splitting the dataset

In [None]:
# --- Balanced data: manual over/under-sampling (720 rows, 180 per class) ---
# One call splits the features and BOTH target encodings with the same
# shuffle, so the one-hot and integer-label targets stay row-aligned.
(
    X_train_balan, X_test_balan,
    y_train_OneHot_balan, y_test_OneHot_balan,
    y_train_Label_balan, y_test_Label_balan,
) = train_test_split(
    X_balan, y_OneHot_balan, y_Label_balan,
    test_size=0.2, random_state=42,
)

# --- Balanced data: SMOTE ---
# No split is produced for the SMOTE-resampled data in this cell.

# --- Original, unbalanced data (666 rows) ---
# Same pattern: features, one-hot target and integer-label target together.
(
    X_train_no_balanced, X_test_no_balanced,
    y_train_OneHot_no_balanced, y_test_OneHot_no_balanced,
    y_train_Label_no_balanced, y_test_Label_no_balanced,
) = train_test_split(
    X_no_balanced_OneHot, y_OneHot_no_balanced, y_Label_no_balanced,
    test_size=0.2, random_state=42,
)



In [None]:

# Select which split feeds the experiments.
# The entries below use the generic names X_train / y_train_OneHot / ...,
# which no earlier cell defines, so on a fresh kernel this cell raised
# NameError. They are bound explicitly here to the over/under-sampled
# ("_balan") split; swap the suffix to "_no_balanced" to run the same
# experiments on the original, unbalanced data.
X_train, X_test = X_train_balan, X_test_balan
y_train_OneHot, y_test_OneHot = y_train_OneHot_balan, y_test_OneHot_balan
y_train_Label, y_test_Label = y_train_Label_balan, y_test_Label_balan

# Seed shared by the stochastic estimators so results are reproducible.
MODEL_SEED = 42

# Each experiment is a tuple: (display name, estimator, (X_train, y_train),
# (X_test, y_test)). Some estimators are trained on the one-hot target,
# others on the integer-label target.
Experiments = [
    (
        "Random Forest n_estimators=100",
        RandomForestClassifier(class_weight="balanced", random_state=MODEL_SEED),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),

    (
        "XGBoost",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                      random_state=MODEL_SEED),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),

    (
        "Multinomial Logistic Regression",
        LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200),
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    ),

    (
        "K-Nearest Neighbors",
        KNeighborsClassifier(n_neighbors=5),
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    ),

    (
        "MLP",
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu',
                      solver='adam', random_state=MODEL_SEED),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),

    (
        "Support Vector Classifier",
        SVC(kernel='linear', probability=True, random_state=MODEL_SEED),
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    )
]


In [None]:
# Train every experiment and collect its classification report.
# `results_per_model[i]` is the report of `Experiments[i]`.
results_per_model = []

for model_name, model, train_set, test_set in Experiments:
    # Each set is an (X, y) pair.
    X_train, y_train = train_set
    X_test, y_test = test_set

    # Fit on the training pair, then predict the held-out features.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Report as a nested dict (per-class and averaged metrics).
    report = classification_report(y_test, y_pred, output_dict=True)
    results_per_model.append(report)

# Publish experiments to server

In [None]:
# Publishing to the MLflow server is opt-in. This code was previously disabled
# by wrapping it in a string literal; it is now real code behind a flag that
# defaults to off, so running the notebook still makes no network calls.
PUBLISH_TO_MLFLOW = False

# Tracking server address: taken from the MLFLOW_TRACKING_URI environment
# variable when set, otherwise the address this notebook used originally.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://3.84.228.208:5000")

if PUBLISH_TO_MLFLOW:
    # The tracking URI must be set BEFORE the experiment is selected;
    # otherwise the experiment is looked up/created on the default store.
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment("Student Performance Analysis Model")

    # `results_per_model` is parallel to `Experiments` (same order).
    for (model_name, model, _train_set, _test_set), report in zip(Experiments, results_per_model):
        with mlflow.start_run(run_name=model_name):
            mlflow.log_param("model", model_name)

            # Class interpretation:
            #   0 : Average, 1 : Excellent, 2 : Good, 3 : Very Good
            for class_id in ("0", "1", "2", "3"):
                class_report = report[class_id]
                # The disabled version logged this value under the key
                # 'acurracy_class_<n>'; it is the per-class precision, so
                # the metric is now named accordingly.
                mlflow.log_metric(f"precision_class_{class_id}", class_report["precision"])
                mlflow.log_metric(f"recall_class_{class_id}", class_report["recall"])
                mlflow.log_metric(f"f1_class_{class_id}", class_report["f1-score"])

            # XGBoost models use their dedicated MLflow flavour.
            if "XGB" in model_name:
                mlflow.xgboost.log_model(model, "model")
            else:
                mlflow.sklearn.log_model(model, "model")