# Setup

In [2]:
import pandas as pd
import os
import numpy as np
from scipy.io import arff
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Loading the data

In [3]:
#Get route of the path
current_path = os.getcwd()
aux_curr_path = current_path
project_path = aux_curr_path.replace('/notebooks', '')
dataset_path = "dataset/CEE_DATA.arff"
dataset_path = os.path.join(project_path, dataset_path)

data, meta = arff.loadarff(dataset_path)

df = pd.DataFrame(data)
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x) #Encoding from byte to string 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Performance           666 non-null    object
 1   Gender                666 non-null    object
 2   Caste                 666 non-null    object
 3   coaching              666 non-null    object
 4   time                  666 non-null    object
 5   Class_ten_education   666 non-null    object
 6   twelve_education      666 non-null    object
 7   medium                666 non-null    object
 8   Class_ X_Percentage   666 non-null    object
 9   Class_XII_Percentage  666 non-null    object
 10  Father_occupation     666 non-null    object
 11  Mother_occupation     666 non-null    object
dtypes: object(12)
memory usage: 62.6+ KB


            #0 : Average -  157
            #1 : Excellent - 101
            #2 : Good - 210
            #3 : Very Good - 198

# Preprocessing and Feature Engineering

In [4]:
columns_of_interest=["Performance",'Class_ X_Percentage', 'Class_XII_Percentage', 'medium', 'Caste']
updated_df=df[columns_of_interest]

In [5]:
y= updated_df[['Performance']]

In [6]:
#Create oneHot enconder object
enc_OneHot = OneHotEncoder(sparse_output=False)

#Applying OneHot
y_OneHot = enc_OneHot.fit_transform(y)

#Create Label encoder object
ord_enc=LabelEncoder()

#Applying LabelEnconder to y
df["y_ord_enc"]=ord_enc.fit_transform(y)
y_Label = df["y_ord_enc"]


  y = column_or_1d(y, warn=True)


In [7]:
updated_df = updated_df.assign(y_coding_col=y_Label.values)
updated_df

Unnamed: 0,Performance,Class_ X_Percentage,Class_XII_Percentage,medium,Caste,y_coding_col
0,Excellent,Excellent,Excellent,ENGLISH,General,1
1,Excellent,Excellent,Excellent,OTHERS,OBC,1
2,Excellent,Excellent,Excellent,ENGLISH,OBC,1
3,Excellent,Excellent,Excellent,OTHERS,General,1
4,Excellent,Excellent,Excellent,ENGLISH,General,1
...,...,...,...,...,...,...
661,Average,Good,Vg,ENGLISH,ST,0
662,Average,Vg,Good,ENGLISH,ST,0
663,Average,Good,Vg,ENGLISH,ST,0
664,Average,Good,Good,ENGLISH,ST,0


In [8]:
updated_df.y_coding_col.value_counts()

y_coding_col
2    210
3    198
0    157
1    101
Name: count, dtype: int64

--------

In [10]:
cols2drop = ["Performance", 'y_coding_col']
X = updated_df.drop(cols2drop, axis=1)


y = updated_df[["y_coding_col"]]

In [13]:
#Columns to apply one hot enconder
col_X=['Class_ X_Percentage', 'Class_XII_Percentage', 'medium', 'Caste']

#Create the transformer
ct= ColumnTransformer(
    transformers=[
    ("OneHotInXColumns", enc_OneHot,col_X)
                      ]
)

# Applying OneHot to X
X = ct.fit_transform(X)

In [14]:
X

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [12]:
y

Unnamed: 0,y_coding_col
0,1
1,1
2,1
3,1
4,1
...,...
661,0
662,0
663,0
664,0


In [15]:
y.value_counts()

y_coding_col
2               210
3               198
0               157
1               101
Name: count, dtype: int64

In [18]:
from imblearn.over_sampling import SMOTE

smote= SMOTE(sampling_strategy="minority")
X_sm, y_sm =  smote.fit_resample(X,y)

In [19]:
y_sm.value_counts()

y_coding_col
1               210
2               210
3               198
0               157
Name: count, dtype: int64

In [27]:
y_sm

Unnamed: 0,y_coding_col
0,1
1,1
2,1
3,1
4,1
...,...
770,1
771,1
772,1
773,1


In [28]:
#Applying OneHot
y_OneHot = enc_OneHot.fit_transform(y_sm)

#Get Label enconder
y_Label = y_sm

In [32]:
len(y_OneHot)

775

In [30]:
y_Label

Unnamed: 0,y_coding_col
0,1
1,1
2,1
3,1
4,1
...,...
770,1
771,1
772,1
773,1


-------

# Splitting the dataset

In [33]:
#Split the dataset for y_OneHot
X_train, X_test, y_train_OneHot, y_test_OneHot = train_test_split(X_sm, y_OneHot, test_size=0.2, random_state=42)

#Split the dataset for y_Label (Pandas Series)
X_train, X_test, y_train_Label, y_test_Label = train_test_split(X_sm, y_Label, test_size=0.2, random_state=42)



In [38]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train_OneHot)
y_pred = clf.predict(X_test)



print(classification_report(y_test_OneHot, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.62      0.74        32
           1       0.47      0.49      0.48        49
           2       0.25      0.10      0.14        41
           3       0.33      0.15      0.21        33

   micro avg       0.51      0.34      0.41       155
   macro avg       0.49      0.34      0.39       155
weighted avg       0.47      0.34      0.39       155
 samples avg       0.34      0.34      0.34       155



  _warn_prf(average, modifier, msg_start, len(result))


In [26]:

Experiments = [
    (
        "Random Forest n_estimators=100", 
        RandomForestClassifier(class_weight="balanced"),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),

    (
        "XGBoost",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),
    
    (
         "Multinomial Logistic Regression",
        LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200),
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    ),
    
     (
        "K-Nearest Neighbors",
        KNeighborsClassifier(n_neighbors=5),
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    ),
    
    (
        "MLP",
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu', solver='adam'),
        (X_train, y_train_OneHot),
        (X_test, y_test_OneHot)
    ),
    
    (
        "Support Vector Classifier",
        SVC(kernel='linear', probability=True),  
        (X_train, y_train_Label),
        (X_test, y_test_Label)
    )
    
    
]


In [27]:
results_per_model = []

for model_name, model, train_set, test_set in Experiments:
    X_train = train_set[0] #get Xtrain from the list models
    y_train = train_set[1] #get y_train from list models
    X_test = test_set[0]    #get x_test from list models 
    y_test = test_set[1]   #get y_test from list models
    
    model.fit(X_train, y_train)  #train the current model
    y_pred = model.predict(X_test) #make predictions 
    report = classification_report(y_test, y_pred, output_dict=True) #make a dict of the classification report
    
    
    results_per_model.append(report) #add the previus dict to a list

  _warn_prf(average, modifier, msg_start, len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
results_per_model

[{'0': {'precision': 0.9428571428571428,
   'recall': 0.825,
   'f1-score': 0.88,
   'support': 40},
  '1': {'precision': 0.6666666666666666,
   'recall': 0.09523809523809523,
   'f1-score': 0.16666666666666666,
   'support': 42},
  '2': {'precision': 0.6666666666666666,
   'recall': 0.06451612903225806,
   'f1-score': 0.1176470588235294,
   'support': 31},
  '3': {'precision': 0.3333333333333333,
   'recall': 0.03225806451612903,
   'f1-score': 0.0588235294117647,
   'support': 31},
  'micro avg': {'precision': 0.851063829787234,
   'recall': 0.2777777777777778,
   'f1-score': 0.418848167539267,
   'support': 144},
  'macro avg': {'precision': 0.6523809523809524,
   'recall': 0.2542530721966206,
   'f1-score': 0.3057843137254902,
   'support': 144},
  'weighted avg': {'precision': 0.6716269841269841,
   'recall': 0.2777777777777778,
   'f1-score': 0.3310457516339869,
   'support': 144},
  'samples avg': {'precision': 0.2777777777777778,
   'recall': 0.2777777777777778,
   'f1-score': 

In [29]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train_OneHot)
y_pred = clf.predict(X_test)



print(classification_report(y_test_OneHot, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.67      0.10      0.17        42
           2       0.67      0.06      0.12        31
           3       0.25      0.03      0.06        31

   micro avg       0.86      0.25      0.39       144
   macro avg       0.65      0.23      0.30       144
weighted avg       0.67      0.25      0.32       144
 samples avg       0.25      0.25      0.25       144



  _warn_prf(average, modifier, msg_start, len(result))
