In [1]:
# Importing required libraries
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.metrics import geometric_mean_score
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
import sqlalchemy
import psycopg2
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, text
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, auc,roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
# # Load the student_exams.csv dataset.
# file_path = "Resources/student_exams.csv"
# df = pd.read_csv(file_path)
# df.head(10)

In [3]:
engine_cloud = sqlalchemy.create_engine('postgresql://postgres:Lola2022%21@localhost:5432/Education_Preformance')

In [4]:
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine_cloud, reflect=True)

In [5]:
# We can view all of the classes that automap found
Base.classes.keys()

['student_exams', 'exam_scores', 'demographics']

In [6]:
# Save reference for table
student_exams = Base.classes.student_exams

In [7]:
session = Session(engine_cloud)

In [8]:
# Write a query in SQLALchemy
query = session.query(student_exams)

In [9]:
df = pd.DataFrame(engine_cloud.connect().execute(text(str(query))))
df.columns = ['Student_ID', 'Sex', 'Ethnicity', 'PLE', 'Lunch', 'Test_Prep',  'Math_Score', 'Reading_Score', 'Writing_Score']
df.head(10)

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,59,70,78
1,2,male,group D,associate degree,standard,none,96,93,87
2,3,female,group D,some college,reduced,none,57,76,77
3,4,male,group B,some college,reduced,none,70,70,63
4,5,female,group D,associate degree,standard,none,83,85,86
5,6,male,group C,some high school,standard,none,68,57,54
6,7,female,group E,associate degree,standard,none,82,83,80
7,8,female,group B,some high school,standard,none,46,61,58
8,9,male,group C,some high school,standard,none,80,75,73
9,10,female,group C,bachelor degree,standard,completed,57,69,77


In [10]:
#Drop Math Score outliers based on EDA 
df.drop(df[df["Math_Score"] <= 24].index, inplace = True)

#Drop Reading Score outliers based on EDA 
df.drop(df[df["Reading_Score"] <= 32].index, inplace = True)

#Drop Writing Score outliers based on EDA 
df.drop(df[df["Writing_Score"] <= 27].index, inplace = True)

df

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,59,70,78
1,2,male,group D,associate degree,standard,none,96,93,87
2,3,female,group D,some college,reduced,none,57,76,77
3,4,male,group B,some college,reduced,none,70,70,63
4,5,female,group D,associate degree,standard,none,83,85,86
...,...,...,...,...,...,...,...,...,...
995,996,male,group C,some college,standard,none,77,77,71
996,997,male,group C,some college,standard,none,80,66,66
997,998,female,group A,high school,standard,completed,67,86,86
998,999,male,group E,high school,standard,none,80,72,62


In [11]:
##Convert math scores 70 and over to 1 and scores below 70 to 0
df.loc[df["Math_Score"] < 70, "Math_Score"] = 0
df.loc[df["Math_Score"] >= 70, "Math_Score"] = 1

In [12]:
df

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,0,70,78
1,2,male,group D,associate degree,standard,none,1,93,87
2,3,female,group D,some college,reduced,none,0,76,77
3,4,male,group B,some college,reduced,none,1,70,63
4,5,female,group D,associate degree,standard,none,1,85,86
...,...,...,...,...,...,...,...,...,...
995,996,male,group C,some college,standard,none,1,77,71
996,997,male,group C,some college,standard,none,1,66,66
997,998,female,group A,high school,standard,completed,0,86,86
998,999,male,group E,high school,standard,none,1,72,62


In [13]:
# Create our features
X = df.drop(['Math_Score','Student_ID', 'Writing_Score'], axis=1)
X = pd.get_dummies(X, dtype="int64")

# Create our target
y = df.loc[:, "Math_Score"].copy()

In [14]:
X.head()

Unnamed: 0,Reading_Score,Sex_female,Sex_male,Ethnicity_group A,Ethnicity_group B,Ethnicity_group C,Ethnicity_group D,Ethnicity_group E,PLE_associate degree,PLE_bachelor degree,PLE_high school,PLE_master degree,PLE_some college,PLE_some high school,Lunch_reduced,Lunch_standard,Test_Prep_completed,Test_Prep_none
0,70,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0
1,93,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1
2,76,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1
3,70,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1
4,85,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1


In [15]:
y

0      0
1      1
2      0
3      1
4      1
      ..
995    1
996    1
997    0
998    1
999    0
Name: Math_Score, Length: 991, dtype: int64

In [16]:
# Check the balance of our target values
y = df["Math_Score"]

y.value_counts()

0    527
1    464
Name: Math_Score, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)

In [18]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
print(GradientBoostingClassifier())
print(GradientBoostingRegressor())

GradientBoostingClassifier()
GradientBoostingRegressor()


In [19]:
gbc = GradientBoostingClassifier()
parameters = {
    "n_estimators":[5,50,250,500],
    'max_features':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}

In [20]:
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(gbc,parameters,cv=5)
cv.fit(X_train,y_train.values.ravel())

In [21]:
def display(results):
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    mean_score = results.cv_results_['mean_test_score']
    std_score = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    for mean,std,params in zip(mean_score,std_score,params):
        print(f'{round(mean,3)} + or -{round(std,3)} for the {params}')

In [22]:
display(cv)

Best parameters are: {'learning_rate': 0.1, 'max_depth': 1, 'max_features': 4, 'n_estimators': 250}


0.546 + or -0.003 for the {'learning_rate': 0.01, 'max_depth': 1, 'max_features': 1, 'n_estimators': 5}
0.546 + or -0.003 for the {'learning_rate': 0.01, 'max_depth': 1, 'max_features': 1, 'n_estimators': 50}
0.733 + or -0.023 for the {'learning_rate': 0.01, 'max_depth': 1, 'max_features': 1, 'n_estimators': 250}
0.8 + or -0.022 for the {'learning_rate': 0.01, 'max_depth': 1, 'max_features': 1, 'n_estimators': 500}
0.546 + or -0.003 for the {'learning_rate': 0.01, 'max_depth': 1, 'max_features': 2, 'n_estimators': 5}
0.572 + or -0.024 for the {'learning_rate': 0.01, 'max_depth': 1, 'max_features': 2, 'n_estimators': 50}
0.8 + or -0.031 for the {'learning_rate': 0.01, 'max_depth': 1, 'max_features': 2, 'n_estimators': 250}
0.816 + or -0.026 for the {'learning_rate': 0.01, 'max_depth': 1, 'max_features': 2, 'n_estimators': 500}
0.546 + or -0.003 for the {'learning_rate': 0.01, 'max_depth

In [23]:
# #Parameter learning rates
# learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
# for learning_rate in learning_rates:
#     classifier = GradientBoostingClassifier(n_estimators=20,
#                                             learning_rate=learning_rate,
#                                             max_features=5,
#                                             max_depth=3)

#     # Fit the model
#     classifier.fit(X_train, y_train)
#     print("Learning rate: ", learning_rate)

#     # Score the model
#     print("Accuracy score (training): {0:.3f}".format(
#         classifier.score(
#             X_train,
#             y_train)))
#     print("Accuracy score (validation): {0:.3f}".format(
#         classifier.score(
#             X_test,
#             y_test)))
#     print()

In [24]:
# # parameter n_estimators
# n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
# for estimator in n_estimators:
#     classifier2 = GradientBoostingClassifier(n_estimators=estimator,
#                                             learning_rate=0.5,
#                                             max_features=5,
#                                             max_depth=3)

#     # Fit the model
#     classifier2.fit(X_train, y_train)
#     print("n_estimators: ", estimator)

#     # Score the model
#     print("Accuracy score (training): {0:.3f}".format(
#         classifier2.score(
#             X_train,
#             y_train)))
#     print("Accuracy score (validation): {0:.3f}".format(
#         classifier2.score(
#             X_test,
#             y_test)))
#     print()

In [25]:
# # parameter max_features
# max_features = list(range(1,X.shape[1]))
# for max_feature in max_features:
#     classifier3 = GradientBoostingClassifier(n_estimators=16,
#                                             learning_rate=0.5,
#                                             max_features=max_feature,
#                                             max_depth=3)

#     # Fit the model
#     classifier2.fit(X_train, y_train)
#     print("max_features: ", max_feature)

#     # Score the model
#     print("Accuracy score (training): {0:.3f}".format(
#         classifier2.score(
#             X_train,
#             y_train)))
#     print("Accuracy score (validation): {0:.3f}".format(
#         classifier2.score(
#             X_test,
#             y_test)))
#     print()

In [26]:
# # parameter max_depth

# max_depths = np.linspace(1, 32, 32, endpoint=True)
# for max_depth in max_depths:
#     classifier3 = GradientBoostingClassifier(n_estimators=16,
#                                             learning_rate=0.5,
#                                             max_features=5,
#                                             max_depth=max_depth)

#     # Fit the model
#     classifier2.fit(X_train, y_train)
#     print("max_depths: ", max_depth)

#     # Score the model
#     print("Accuracy score (training): {0:.3f}".format(
#         classifier2.score(
#             X_train,
#             y_train)))
#     print("Accuracy score (validation): {0:.3f}".format(
#         classifier2.score(
#             X_test,
#             y_test)))
#     print()

In [31]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=250,
                                        learning_rate=0.1,
                                        max_features=4,
                                        max_depth=1)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)


# 'learning_rate': 0.1, 'max_depth': 1, 'max_features': 4, 'n_estimators': 250

Unnamed: 0,Prediction,Actual
664,1,1
733,0,0
337,0,0
684,1,1
95,1,1
303,1,1
686,1,1
238,1,1
109,1,1
812,1,1


In [33]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Calculating the accuracy score
acc_score = balanced_accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8842649834059999
