In [1]:
# Importing required libraries
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.metrics import geometric_mean_score
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
import sqlalchemy
import psycopg2
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, text
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
# # Load the student_exams.csv dataset.
# file_path = "Resources/student_exams.csv"
# df = pd.read_csv(file_path)
# df.head(10)

In [3]:
engine_cloud = sqlalchemy.create_engine('postgresql://postgres:Lola2022%21@localhost:5432/Education_Preformance')

In [4]:
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine_cloud, reflect=True)

In [5]:
# We can view all of the classes that automap found
Base.classes.keys()

['demographics', 'student_exams', 'exam_scores']

In [6]:
# Save reference for table
student_exams = Base.classes.student_exams

In [7]:
session = Session(engine_cloud)

In [8]:
# Write a query in SQLALchemy
query = session.query(student_exams)

In [9]:
df = pd.DataFrame(engine_cloud.connect().execute(text(str(query))))
df.columns = ['Student_ID', 'Sex', 'Ethnicity', 'PLE', 'Lunch', 'Test_Prep',  'Math_Score', 'Reading_Score', 'Writing_Score']
df.head(10)

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,59,70,78
1,2,male,group D,associate degree,standard,none,96,93,87
2,3,female,group D,some college,reduced,none,57,76,77
3,4,male,group B,some college,reduced,none,70,70,63
4,5,female,group D,associate degree,standard,none,83,85,86
5,6,male,group C,some high school,standard,none,68,57,54
6,7,female,group E,associate degree,standard,none,82,83,80
7,8,female,group B,some high school,standard,none,46,61,58
8,9,male,group C,some high school,standard,none,80,75,73
9,10,female,group C,bachelor degree,standard,completed,57,69,77


In [10]:
#Drop Math Score outliers based on EDA 
df.drop(df[df["Math_Score"] <= 24].index, inplace = True)
df

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,59,70,78
1,2,male,group D,associate degree,standard,none,96,93,87
2,3,female,group D,some college,reduced,none,57,76,77
3,4,male,group B,some college,reduced,none,70,70,63
4,5,female,group D,associate degree,standard,none,83,85,86
...,...,...,...,...,...,...,...,...,...
995,996,male,group C,some college,standard,none,77,77,71
996,997,male,group C,some college,standard,none,80,66,66
997,998,female,group A,high school,standard,completed,67,86,86
998,999,male,group E,high school,standard,none,80,72,62


In [11]:
#Drop Reading Score outliers based on EDA 
df.drop(df[df["Reading_Score"] <= 32].index, inplace = True)
df

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,59,70,78
1,2,male,group D,associate degree,standard,none,96,93,87
2,3,female,group D,some college,reduced,none,57,76,77
3,4,male,group B,some college,reduced,none,70,70,63
4,5,female,group D,associate degree,standard,none,83,85,86
...,...,...,...,...,...,...,...,...,...
995,996,male,group C,some college,standard,none,77,77,71
996,997,male,group C,some college,standard,none,80,66,66
997,998,female,group A,high school,standard,completed,67,86,86
998,999,male,group E,high school,standard,none,80,72,62


In [12]:
#Drop Math Score outliers based on EDA 
df.drop(df[df["Writing_Score"] <= 27].index, inplace = True)
df

Unnamed: 0,Student_ID,Sex,Ethnicity,PLE,Lunch,Test_Prep,Math_Score,Reading_Score,Writing_Score
0,1,female,group D,some college,standard,completed,59,70,78
1,2,male,group D,associate degree,standard,none,96,93,87
2,3,female,group D,some college,reduced,none,57,76,77
3,4,male,group B,some college,reduced,none,70,70,63
4,5,female,group D,associate degree,standard,none,83,85,86
...,...,...,...,...,...,...,...,...,...
995,996,male,group C,some college,standard,none,77,77,71
996,997,male,group C,some college,standard,none,80,66,66
997,998,female,group A,high school,standard,completed,67,86,86
998,999,male,group E,high school,standard,none,80,72,62


In [13]:
##Convert math scores 70 and over to 1 and scores below 70 to 0
df.loc[df["Math_Score"] < 70, "Math_Score"] = 0
df.loc[df["Math_Score"] >= 70, "Math_Score"] = 1

In [14]:
# Create our features
X = df.drop(['Math_Score','Student_ID', 'Writing_Score'], axis=1)
X = pd.get_dummies(X, dtype="int64")

# Create our target
y = df.loc[:, "Math_Score"].copy()

In [15]:
X.head()

Unnamed: 0,Reading_Score,Sex_female,Sex_male,Ethnicity_group A,Ethnicity_group B,Ethnicity_group C,Ethnicity_group D,Ethnicity_group E,PLE_associate degree,PLE_bachelor degree,PLE_high school,PLE_master degree,PLE_some college,PLE_some high school,Lunch_reduced,Lunch_standard,Test_Prep_completed,Test_Prep_none
0,70,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0
1,93,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1
2,76,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1
3,70,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1
4,85,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1


In [16]:
y

0      0
1      1
2      0
3      1
4      1
      ..
995    1
996    1
997    0
998    1
999    0
Name: Math_Score, Length: 991, dtype: int64

In [17]:
# Check the balance of our target values
y = df["Math_Score"]

y.value_counts()

0    527
1    464
Name: Math_Score, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    )

In [19]:
# # Creating StandardScaler instance
# scaler = StandardScaler()

# # Fitting Standard Scaler
# X_scaler = scaler.fit(X_train)

# # Scaling data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [20]:
#Fitting K-NN classifier to the training set  
from sklearn.neighbors import KNeighborsClassifier  
# define models and parameters
KNN_classifier = KNeighborsClassifier()  
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid search
grid_search = GridSearchCV(estimator=KNN_classifier, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train) 
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.860895 using {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
0.799856 (0.040287) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.799856 (0.040287) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.833087 (0.045518) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.832643 (0.039746) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.843838 (0.040692) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.841586 (0.040461) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.844709 (0.034840) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.841129 (0.031674) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.855063 (0.036228) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.851465 (0.032350) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.850078 