In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides any solver/convergence warnings
# raised while fitting the model further down - confirm that is intended.
warnings.simplefilter("ignore")

In [2]:
# Read the student exam records from the CSV file and preview the first ten rows.
file_path = "Resources/student_exams.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=10)

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,59,70,78
1,2,male,group D,associate degree,standard,none,96,93,87
2,3,female,group D,some college,reduced,none,57,76,77
3,4,male,group B,some college,reduced,none,70,70,63
4,5,female,group D,associate degree,standard,none,83,85,86
5,6,male,group C,some high school,standard,none,68,57,54
6,7,female,group E,associate degree,standard,none,82,83,80
7,8,female,group B,some high school,standard,none,46,61,58
8,9,male,group C,some high school,standard,none,80,75,73
9,10,female,group C,bachelor degree,standard,completed,57,69,77


In [3]:
# Binarise the target in place: 1 = math score of PASS_MARK or more, 0 = below it.
PASS_MARK = 70

# The labels come from a single comparison against the raw scores rather than
# from successive in-place overwrites: the 0/1 labels are themselves below the
# pass mark, so any later threshold pass over the column would relabel every
# student as 0.
# The guard skips the conversion when the column already holds only 0/1
# labels, which makes re-running this cell a no-op.
if not df["Math_Score"].isin([0, 1]).all():
    df["Math_Score"] = (df["Math_Score"] >= PASS_MARK).astype("int64")

In [4]:
# Inspect the column dtypes after the target conversion; the object columns
# are the categorical fields that still need encoding.
df.dtypes

Student_ID        int64
Sex              object
Ethnicity        object
PLE              object
Lunch            object
Test_Prep        object
Math_Score        int64
Reading_Score     int64
Writing_Score     int64
dtype: object

In [5]:
# Feature matrix: every column except the target and the student identifier,
# with the categorical columns one-hot encoded as int64 indicator columns.
X = pd.get_dummies(
    df.drop(columns=["Math_Score", "Student_ID"]),
    dtype="int64",
)

# Target vector: an independent copy of the binarised math result.
y = df["Math_Score"].copy()

In [6]:
# Check that every feature column is numeric after one-hot encoding.
X.dtypes

Reading_Score           int64
Writing_Score           int64
Sex_female              int64
Sex_male                int64
Ethnicity_group A       int64
Ethnicity_group B       int64
Ethnicity_group C       int64
Ethnicity_group D       int64
Ethnicity_group E       int64
PLE_associate degree    int64
PLE_bachelor degree     int64
PLE_high school         int64
PLE_master degree       int64
PLE_some college        int64
PLE_some high school    int64
Lunch_reduced           int64
Lunch_standard          int64
Test_Prep_completed     int64
Test_Prep_none          int64
dtype: object

In [7]:
# Preview the first rows of the encoded feature matrix.
X.head()

Unnamed: 0,Reading_Score,Writing_Score,Sex_female,Sex_male,Ethnicity_group A,Ethnicity_group B,Ethnicity_group C,Ethnicity_group D,Ethnicity_group E,PLE_associate degree,PLE_bachelor degree,PLE_high school,PLE_master degree,PLE_some college,PLE_some high school,Lunch_reduced,Lunch_standard,Test_Prep_completed,Test_Prep_none
0,70,78,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0
1,93,87,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1
2,76,77,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1
3,70,63,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1
4,85,86,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1


In [8]:
# Check the class balance of the binary target before splitting.
# `y` is the copy of df["Math_Score"] taken when the features and target were
# created, so it is used as-is instead of being re-bound to the DataFrame column.
y.value_counts()

0    536
1    464
Name: Math_Score, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Hold out a test set using the splitter's default proportions; the fixed
# seed keeps the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
# Count the samples per class in the training split.
Counter(y_train)

Counter({0: 408, 1: 342})

In [11]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes with RandomOverSampler. Only the training split is
# resampled; the test split is left untouched for evaluation.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X=X_train, y=y_train)

In [12]:
# Count the samples per class after oversampling; both classes should now be the same size.
Counter(y_resampled)

Counter({0: 408, 1: 408})

In [13]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier on the oversampled training data.
# The seed is fixed for reproducibility.
model = LogisticRegression(solver="lbfgs", random_state=1)
model.fit(X=X_resampled, y=y_resampled)

In [14]:
from sklearn.metrics import balanced_accuracy_score

# Predict on the held-out test features, then calculate the balanced
# accuracy score of those predictions.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.8916495901639344

In [15]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test predictions:
# rows are the true classes (0, 1), columns are the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[116,  12],
       [ 15, 107]], dtype=int64)

In [16]:
from imblearn.metrics import classification_report_imbalanced

# Build the per-class report for the test predictions and print it, so the
# text table keeps its column alignment.
report = classification_report_imbalanced(y_test, y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.91      0.88      0.90      0.89      0.80       128
          1       0.90      0.88      0.91      0.89      0.89      0.79       122

avg / total       0.89      0.89      0.89      0.89      0.89      0.79       250



In [17]:
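# NOTE: scratch cell - every line below is commented out, so nothing runs here.
# These are earlier attempts at converting the target column to pass/fail;
# the conversion the notebook actually uses is in cell In [3].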

# df.loc[df["Math_Score"] > 70, "Math_Score"] = 1
# df.loc[df["Math_Score"] < 70, "Math_Score"] = 0
# df


# for x in df.Math_Score:
#     if x >= 70:
#     df = df.Math_score.append('Math_Score':'Pass"                         


# df.loc[len(df.Math_score)]
# print(df)
# scores = df['Math_Score']
# scores.loc[scores >= 70]


# x = {df.loc[df['Math_Score'] >=70]: 'Pass'}   
# df = df.replace(x)

# x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
# df = df.replace(x)
# df.reset_index(inplace=True, drop=True)
# df.head()