In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell runs, so edits to local helper modules take effect without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [12]:
import os
import sys
sys.path.append("../scripts/")
import shap
import warnings
import numpy as np
import eval_metrics as evaluation
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# from sklearn.impute import SimpleImputer, KNNImputer
# from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, log_loss,confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
warnings.filterwarnings('ignore')

In [4]:
# --- Notebook configuration -------------------------------------------------
# CSV file read by preprocessing_split().
path_to_csv: str = '../dataset/dataset.csv'

# Hold-out settings forwarded to train_test_split: the fraction of rows kept
# for testing and the seed that makes the shuffle repeatable.
random_seed: int = 1
test_size: float = 0.3

In [5]:
def preprocessing_split(path_to_csv, split_data = True):
    """Load the dataset, binarise the target and optionally split it.

    The ``Target`` column is recoded to 1 for ``'Dropout'`` and 0 for every
    other value. Features are every column except ``Target``; the label is
    removed by name, so the result does not depend on where ``Target`` sits
    in the CSV.

    Parameters
    ----------
    path_to_csv : str
        Path of the CSV file to read with ``pandas.read_csv``.
    split_data : bool, default True
        When True, return a train/test split; otherwise return the full
        feature frame and label series.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, sens_train, sens_test)`` when
        ``split_data`` is True, where the ``sens_*`` frames hold the
        'Marital status', 'Nacionality' and 'Gender' columns for the same
        rows as the corresponding ``X_*`` frames. ``(X, y)`` otherwise.

    Raises
    ------
    KeyError
        If ``Target`` or one of the sensitive-attribute columns is missing.

    Notes
    -----
    The split branch reads the notebook-level ``test_size`` and
    ``random_seed`` variables. The sensitive attributes are also left inside
    ``X``; they are returned separately only as a convenience.
    """
    df_school = pd.read_csv(path_to_csv)

    # Binary label: 1 for 'Dropout', 0 for anything else (missing values too).
    df_school['Target'] = (df_school['Target'] == 'Dropout').astype('int64')

    # Reported in the frame's own column order so the message is stable from
    # run to run. 'Target' is included because it is numeric after recoding.
    numeric_columns = list(df_school.select_dtypes(include='number').columns)
    print(f"There are {len(numeric_columns)} numeric columns: {numeric_columns}")

    y = df_school['Target']
    # Remove the label by name: a positional slice would leak 'Target' into
    # the features (and drop a real feature) if it were not the last column.
    X = df_school.drop(columns='Target')
    sensitive_attribute = df_school[['Marital status', 'Nacionality', 'Gender']]

    if not split_data:
        return X, y

    # Splitting the three objects in one call keeps their rows aligned.
    X_train, X_test, y_train, y_test, sens_train, sens_test = train_test_split(
        X, y, sensitive_attribute, test_size=test_size, random_state=random_seed
    )
    return X_train, X_test, y_train, y_test, sens_train, sens_test

In [6]:
# Build the train/test partitions; the sens_* frames carry the sensitive
# attributes for the same rows as X_train / X_test.
(
    X_train,
    X_test,
    y_train,
    y_test,
    sens_train,
    sens_test,
) = preprocessing_split(path_to_csv, split_data=True)

There are 35 numeric columns: ['Application mode', 'Previous qualification', 'Target', 'Debtor', 'Scholarship holder', 'Nacionality', 'Age at enrollment', 'Curricular units 2nd sem (credited)', 'Application order', "Mother's qualification", 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Curricular units 1st sem (without evaluations)', 'Inflation rate', 'Curricular units 1st sem (evaluations)', 'Unemployment rate', 'Curricular units 2nd sem (evaluations)', 'Curricular units 1st sem (approved)', "Mother's occupation", 'Gender', 'Educational special needs', 'Curricular units 2nd sem (grade)', 'International', "Father's qualification", "Father's occupation", 'Course', 'GDP', 'Marital status', 'Curricular units 2nd sem (enrolled)', 'Curricular units 1st sem (credited)', 'Tuition fees up to date', 'Curricular units 2nd sem (approved)', 'Daytime/evening attendance', 'Displaced']


In [15]:
# sens_test.Gender

### Logistic Regression Model

In [16]:
# Baseline classifier with scikit-learn's default hyper-parameters,
# trained on the training partition.
logistic_model = LogisticRegression()
logistic_model.fit(X=X_train, y=y_train)

In [29]:

# Group mask: the Gender column of the held-out sensitive attributes
# (presumably 0/1 encoded, since later cells use `1 - mymask` -- confirm).
mymask = sens_test["Gender"]
evaluation.model_metrics(
    logistic_model,
    X_test,
    y_test,
    mask=mymask,
    fair_metrics=True,
)

Model accuracy 0.88
Model F1score 0.79
Model Recall 0.87 and precision 0.73
Model Test error is 0.12
DP is 0.40589569160997735 EO is 0.7609756097560976 fpr is 0.09745762497186661 accuracy_rate is 0.8367346938775511 recall rate is 0.7609756097560976


(0.40589569160997735,
 0.7609756097560976,
 0.097457625,
 0.8367346938775511,
 0.7609756097560976)

In [30]:
# Same evaluation with the mask inverted, i.e. for the complementary group.
evaluation.model_metrics(
    logistic_model,
    X_test,
    y_test,
    mask=1 - mymask,
    fair_metrics=True,
)

Model accuracy 0.88
Model F1score 0.79
Model Recall 0.87 and precision 0.73
Model Test error is 0.12
DP is 0.18038331454340473 EO is 0.6965174129353234 fpr is 0.029154518619179726 accuracy_rate is 0.9086809470124013 recall rate is 0.6965174129353234


(0.18038331454340473,
 0.6965174129353234,
 0.029154519,
 0.9086809470124013,
 0.6965174129353234)

### Random Forest Classifier

In [35]:
# Random forests are stochastic (bootstrap sampling and random feature
# selection). Seed the estimator with the notebook's random_seed so the
# metrics below can be reproduced on a re-run.
rf_classfier = RandomForestClassifier(random_state=random_seed)
rf_classfier.fit(X_train,y_train)

In [33]:
# Random forest metrics for the group selected by `mymask`.
evaluation.model_metrics(
    rf_classfier,
    X_test,
    y_test,
    mask=mymask,
    fair_metrics=True,
)

Model accuracy 0.88
Model F1score 0.78
Model Recall 0.86 and precision 0.72
Model Test error is 0.12
DP is 0.4036281179138322 EO is 0.7365853658536585 fpr is 0.1144067794084549 accuracy_rate is 0.8163265306122449 recall rate is 0.7365853658536585


(0.4036281179138322,
 0.7365853658536585,
 0.11440678,
 0.8163265306122449,
 0.7365853658536585)

In [34]:
# Random forest metrics for the complementary group (mask inverted).
evaluation.model_metrics(
    rf_classfier,
    X_test,
    y_test,
    mask=1 - mymask,
    fair_metrics=True,
)

Model accuracy 0.88
Model F1score 0.78
Model Recall 0.86 and precision 0.72
Model Test error is 0.12
DP is 0.18151071025930102 EO is 0.6965174129353234 fpr is 0.030612245202064514 accuracy_rate is 0.9075535512965051 recall rate is 0.6965174129353234


(0.18151071025930102,
 0.6965174129353234,
 0.030612245,
 0.9075535512965051,
 0.6965174129353234)