In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.metrics import geometric_mean_score
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

In [2]:
# Read the student exam records from the project-relative CSV into a DataFrame.
file_path = "Resources/student_exams.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first ten rows to sanity-check the columns that were loaded.
df.head(n=10)

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,59,70,78
1,2,male,group D,associate degree,standard,none,96,93,87
2,3,female,group D,some college,reduced,none,57,76,77
3,4,male,group B,some college,reduced,none,70,70,63
4,5,female,group D,associate degree,standard,none,83,85,86
5,6,male,group C,some high school,standard,none,68,57,54
6,7,female,group E,associate degree,standard,none,82,83,80
7,8,female,group B,some high school,standard,none,46,61,58
8,9,male,group C,some high school,standard,none,80,75,73
9,10,female,group C,bachelor degree,standard,completed,57,69,77


In [3]:
# Convert Math_Score into a binary pass/fail label:
#   1 -> score of PASS_MARK or higher, 0 -> score below PASS_MARK.
PASS_MARK = 70

math_score = df["Math_Score"]

# Guard against re-running this cell: once the column holds only 0/1 labels,
# thresholding it again would turn every label into 0 (because 1 < PASS_MARK).
if not math_score.isin([0, 1]).all():
    # A single vectorized comparison avoids the order-dependent two-step
    # in-place rewrite; int64 keeps the column's integer dtype.
    df["Math_Score"] = math_score.ge(PASS_MARK).astype("int64")

In [4]:
# Display the frame to confirm Math_Score now holds the 0/1 labels.
df

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,0,70,78
1,2,male,group D,associate degree,standard,none,1,93,87
2,3,female,group D,some college,reduced,none,0,76,77
3,4,male,group B,some college,reduced,none,1,70,63
4,5,female,group D,associate degree,standard,none,1,85,86
...,...,...,...,...,...,...,...,...,...
995,996,male,group C,some college,standard,none,1,77,71
996,997,male,group C,some college,standard,none,1,66,66
997,998,female,group A,high school,standard,completed,0,86,86
998,999,male,group E,high school,standard,none,1,72,62


In [5]:
# Target: the Math_Score column, copied so it is independent of df.
y = df["Math_Score"].copy()

# Features: every column except the target and the Student_ID identifier,
# with the categorical columns expanded into int64 indicator columns.
feature_frame = df.drop(columns=["Math_Score", "Student_ID"])
X = pd.get_dummies(feature_frame, dtype="int64")

In [6]:
# Preview the first five rows of the encoded feature matrix.
X.head(n=5)

Unnamed: 0,Reading_Score,Writing_Score,Sex_female,Sex_male,Ethnicity_group A,Ethnicity_group B,Ethnicity_group C,Ethnicity_group D,Ethnicity_group E,PLE_associate degree,PLE_bachelor degree,PLE_high school,PLE_master degree,PLE_some college,PLE_some high school,Lunch_reduced,Lunch_standard,Test_Prep_completed,Test_Prep_none
0,70,78,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0
1,93,87,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1
2,76,77,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1
3,70,63,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1
4,85,86,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1


In [7]:
# Display the target Series built from Math_Score.
y

0      0
1      1
2      0
3      1
4      1
      ..
995    1
996    1
997    0
998    1
999    0
Name: Math_Score, Length: 1000, dtype: int64

In [8]:
# Check the balance of our target values.
# `y` is the copy of Math_Score created alongside the features; it is only
# inspected here, not rebound, so it stays decoupled from later edits to df.
y.value_counts()

0    536
1    464
Name: Math_Score, dtype: int64

In [9]:
# Split features and target into training and hold-out sets.
# train_test_split comes from the notebook's import cell. No test_size is
# passed, so scikit-learn's default hold-out fraction applies, and
# random_state=1 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
# Fit a BalancedRandomForestClassifier (100 trees, fixed seed) on the training
# split; class re-balancing is handled inside the estimator itself.
brf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(
    X_train, y_train
)

# Predict labels for the held-out rows.
y_pred_brf = brf.predict(X_test)

In [11]:
# Summarise hold-out performance with two imbalance-aware metrics,
# each rounded to two decimals.
print("Balanced Random Forest classifier performance:")
bal_acc = balanced_accuracy_score(y_test, y_pred_brf)
geo_mean = geometric_mean_score(y_test, y_pred_brf)
print(f"Balanced accuracy: {bal_acc:.2f} - Geometric mean {geo_mean:.2f}")

Balanced Random Forest classifier performance:
Balanced accuracy: 0.86 - Geometric mean 0.86


In [12]:
# Calculate the balanced accuracy score and report it at full precision.
print("Balanced Random Forest classifier performance:")
balanced_acc_exact = balanced_accuracy_score(y_test, y_pred_brf)
print(f"Balanced accuracy: {balanced_acc_exact} ")

Balanced Random Forest classifier performance:
Balanced accuracy: 0.863921618852459 


In [13]:
# Build and display the confusion matrix for the hold-out predictions
# (rows are true classes, columns are predicted classes).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_brf)
cm

array([[111,  17],
       [ 17, 105]], dtype=int64)

In [14]:
# Build the imbalanced-learn classification report for the hold-out
# predictions, then print it.
imbalanced_report = classification_report_imbalanced(y_test, y_pred_brf)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.87      0.86      0.87      0.86      0.75       128
          1       0.86      0.86      0.87      0.86      0.86      0.75       122

avg / total       0.86      0.86      0.86      0.86      0.86      0.75       250



In [15]:
# List the features sorted in descending order by feature importance.
features_importance = brf.feature_importances_
feature_names = brf.feature_names_in_

# Pair each importance with its column name (names form the index).
features_pd = pd.Series(features_importance, index=feature_names)

# Sort once and reuse the result; to_string() prints every row untruncated.
features_sorted = features_pd.sort_values(ascending=False)
print(features_sorted.to_string())

Writing_Score           0.333293
Reading_Score           0.330007
Sex_female              0.050016
Sex_male                0.049627
Lunch_standard          0.040965
Lunch_reduced           0.025735
Ethnicity_group C       0.017027
Test_Prep_none          0.016786
Test_Prep_completed     0.016477
Ethnicity_group E       0.016447
PLE_associate degree    0.014595
PLE_some college        0.013685
Ethnicity_group D       0.012611
PLE_some high school    0.012247
PLE_bachelor degree     0.011427
PLE_high school         0.011178
Ethnicity_group B       0.010730
PLE_master degree       0.010240
Ethnicity_group A       0.006907
