
A local school district has a goal to reach a 95% graduation rate by the end of the 
decade by identifying students who need intervention before they drop out of 
school. As a software engineer contacted by the school district, your task is to 
model the factors that predict how likely a student is to pass their high school final 
exam, by constructing an intervention system that leverages supervised learning 
techniques. The board of supervisors has asked that you find the most effective 
model with the lowest computation cost, to save on the budget. You 
will need to analyze the dataset on students' performance and develop a model 
that will predict whether a given student will pass, quantifying whether an intervention is 
necessary.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the student records from a CSV file in the working directory into a DataFrame.
data=pd.read_csv('student-data.csv')

In [None]:
# Show the loaded DataFrame (a notebook renders the last expression of a cell).
data

In [None]:
# Preview the first rows (pandas' default for head() is five).
data.head()

In [None]:
# Print each column's dtype and non-null count to spot columns needing encoding or cleaning.
data.info()

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# List the column labels of the dataset.
data.columns

In [None]:
# Summary statistics; with default arguments describe() reports on the numeric columns.
data.describe()

In [None]:
# Relative frequency of each value in the 'passed' target, to check class balance.
data['passed'].value_counts(normalize=True)

In [None]:
# Separate the 'passed' target from the predictor columns.
y = data['passed']
X = data.drop(columns=['passed'])
y.head()

In [None]:
#label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Start again from the raw frame so this cell can be re-run on its own.
X = data.drop(columns=['passed'])

# Encode the target labels as integers; the fitted encoder is kept in
# `label_en` so the mapping can be inverted later.
label_en = LabelEncoder()
y = label_en.fit_transform(data['passed'])

# Columns that are integer-encoded in place, each with its own encoder.
ordinal_features = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup',
    'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic',
]
label_encoders = {}
for column in ordinal_features:
    encoder = LabelEncoder()
    X[column] = encoder.fit_transform(X[column])
    label_encoders[column] = encoder
X.head()

In [None]:
#one hot encoding

In [None]:
# One-hot encode whatever non-numeric columns remain in X; by default
# get_dummies only expands object/string/category columns, so columns that
# are already numeric pass through unchanged.
X=pd.get_dummies(X)
X.head()

In [None]:
#logistic regression

In [None]:
# One empty score list per reported metric; each model's evaluation cell
# appends a single value to every list.
results_arr = {
    metric: []
    for metric in ('Accuracy', 'Precision', 'Recall', 'F1_Score', 'Failure%', 'Time_Taken')
}

In [None]:
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# identical across runs, and the later model cells reuse these same arrays.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

# Start the clock only after the split so Time_Taken covers model
# construction, fit and predict -- the same span that is timed in the
# decision tree, SVM and random forest cells.
start_time = time.time()
# NOTE(review): default solver settings are used; if a ConvergenceWarning
# appears on these unscaled features, raise max_iter or scale the inputs.
logit_reg=LogisticRegression()
model=logit_reg.fit(X_train,y_train)
predictions=model.predict(X_test)
end_time = time.time()

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

# Score the logistic regression predictions against the held-out labels.
accuracy1 = accuracy_score(y_test, predictions)
precision1 = precision_score(y_test, predictions)
recall1 = recall_score(y_test, predictions)
f11 = f1_score(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

# Share of all test samples that fall in row 1 / column 0 of the confusion
# matrix (true label 1, predicted label 0), expressed as a percentage.
# NOTE(review): the printed label calls this "students failing after
# intervention"; confirm that cell [1, 0] is the intended one once the
# integer encoding of 'passed' is verified.
fail_percent = cm[1, 0] / cm.sum() * 100
time_taken = end_time - start_time

# Append this model's scores to the shared comparison table.
for metric_name, metric_value in (
    ('Accuracy', accuracy1),
    ('Precision', precision1),
    ('Recall', recall1),
    ('F1_Score', f11),
    ('Failure%', fail_percent),
    ('Time_Taken', time_taken),
):
    results_arr[metric_name].append(metric_value)

# Report the same figures in the cell output.
for caption, score in (
    ('Accuracy is', accuracy1),
    ('precision is', precision1),
    ('recall is', recall1),
    ('f1_score is', f11),
):
    print(caption, score)
print(cm)
print('% of students failing after intervention = ', fail_percent)
print('Time taken for Logistic Regression = ', time_taken, ' seconds')

In [None]:
#KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep k = 3..14, recording test-set accuracy for each value, then plot
# accuracy against k.
# NOTE(review): k is being compared on the test split itself; a validation
# split or cross-validation would avoid tuning on the evaluation data.
start_time = time.time()
neighbors = np.arange(3, 15)
values = []
for k in neighbors:
    classifier = KNeighborsClassifier(n_neighbors=k, metric='minkowski')
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    values.append(accuracy_score(y_test, predictions))
plt.plot(neighbors, values, 'o-')

In [None]:
# Restart the clock here so the reported KNN time covers only the final
# construct + fit + predict. The timer started in the k-sweep cell would
# otherwise also count the twelve trial fits, the plot, and any idle time
# between running the two cells.
start_time = time.time()
# k=13 is hard-coded; presumably read off the accuracy-vs-k plot -- confirm.
classifier=KNeighborsClassifier(n_neighbors=13,metric='minkowski')
classifier.fit(X_train,y_train)
predictions=classifier.predict(X_test)
end_time = time.time()

In [None]:
# Score the KNN predictions against the held-out labels.
accuracy1 = accuracy_score(y_test, predictions)
precision1 = precision_score(y_test, predictions)
recall1 = recall_score(y_test, predictions)
f11 = f1_score(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

# Percentage of all test samples in confusion-matrix cell [1, 0]
# (true label 1, predicted label 0).
# NOTE(review): confirm this is the cell the "failing after intervention"
# label is meant to describe.
fail_percent = cm[1, 0] / cm.sum() * 100
time_taken = end_time - start_time

# Append this model's scores to the shared comparison table.
for metric_name, metric_value in (
    ('Accuracy', accuracy1),
    ('Precision', precision1),
    ('Recall', recall1),
    ('F1_Score', f11),
    ('Failure%', fail_percent),
    ('Time_Taken', time_taken),
):
    results_arr[metric_name].append(metric_value)

# Report the same figures in the cell output.
for caption, score in (
    ('Accuracy is', accuracy1),
    ('precision is', precision1),
    ('recall is', recall1),
    ('f1_score is', f11),
):
    print(caption, score)
print(cm)
print('% of students failing after intervention = ', fail_percent)
print('Time taken for KNN = ', time_taken, ' seconds')

In [None]:
#Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Time construction, training and test-set prediction of a decision tree.
start_time = time.time()
# Seed the tree: scikit-learn permutes candidate features at every split,
# so an unseeded tree can give different scores on each run. 42 matches the
# seed already used for the train/test split.
dt_model=DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train,y_train)
predictions=dt_model.predict(X_test)
end_time = time.time()

In [None]:
# Score the decision tree predictions against the held-out labels.
accuracy1 = accuracy_score(y_test, predictions)
precision1 = precision_score(y_test, predictions)
recall1 = recall_score(y_test, predictions)
f11 = f1_score(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

# Percentage of all test samples in confusion-matrix cell [1, 0]
# (true label 1, predicted label 0).
# NOTE(review): confirm this is the cell the "failing after intervention"
# label is meant to describe.
fail_percent = cm[1, 0] / cm.sum() * 100
time_taken = end_time - start_time

# Append this model's scores to the shared comparison table.
for metric_name, metric_value in (
    ('Accuracy', accuracy1),
    ('Precision', precision1),
    ('Recall', recall1),
    ('F1_Score', f11),
    ('Failure%', fail_percent),
    ('Time_Taken', time_taken),
):
    results_arr[metric_name].append(metric_value)

# Report the same figures in the cell output.
for caption, score in (
    ('Accuracy is', accuracy1),
    ('precision is', precision1),
    ('recall is', recall1),
    ('f1_score is', f11),
):
    print(caption, score)
print(cm)
print('% of students failing after intervention = ', fail_percent)
print('Time taken for Decision Tree = ', time_taken, ' seconds')

In [None]:
#SVM

In [None]:
from sklearn.svm import SVC

# Time construction, training and test-set prediction of a linear-kernel SVM.
start_time = time.time()
svm_linear = SVC(kernel='linear').fit(X_train, y_train)  # fit() returns the estimator itself
predictions = svm_linear.predict(X_test)
end_time = time.time()

In [None]:
# Score the SVM predictions against the held-out labels.
accuracy1 = accuracy_score(y_test, predictions)
precision1 = precision_score(y_test, predictions)
recall1 = recall_score(y_test, predictions)
f11 = f1_score(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

# Percentage of all test samples in confusion-matrix cell [1, 0]
# (true label 1, predicted label 0).
# NOTE(review): confirm this is the cell the "failing after intervention"
# label is meant to describe.
fail_percent = cm[1, 0] / cm.sum() * 100
time_taken = end_time - start_time

# Append this model's scores to the shared comparison table.
for metric_name, metric_value in (
    ('Accuracy', accuracy1),
    ('Precision', precision1),
    ('Recall', recall1),
    ('F1_Score', f11),
    ('Failure%', fail_percent),
    ('Time_Taken', time_taken),
):
    results_arr[metric_name].append(metric_value)

# Report the same figures in the cell output.
for caption, score in (
    ('Accuracy is', accuracy1),
    ('precision is', precision1),
    ('recall is', recall1),
    ('f1_score is', f11),
):
    print(caption, score)
print(cm)
print('% of students failing after intervention = ', fail_percent)
print('Time taken for SVM = ', time_taken, ' seconds')

In [None]:
#Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Time construction, training and test-set prediction of a random forest.
start_time = time.time()
# Seed the forest: bootstrap sampling and per-split feature sampling are
# random, so an unseeded forest gives different scores on each run. 42
# matches the seed already used for the train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)
predictions = rf.predict(X_test)
end_time = time.time()

In [None]:
# Score the random forest predictions against the held-out labels.
accuracy1 = accuracy_score(y_test, predictions)
precision1 = precision_score(y_test, predictions)
recall1 = recall_score(y_test, predictions)
f11 = f1_score(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

# Percentage of all test samples in confusion-matrix cell [1, 0]
# (true label 1, predicted label 0).
# NOTE(review): confirm this is the cell the "failing after intervention"
# label is meant to describe.
fail_percent = cm[1, 0] / cm.sum() * 100
time_taken = end_time - start_time

# Append this model's scores to the shared comparison table.
for metric_name, metric_value in (
    ('Accuracy', accuracy1),
    ('Precision', precision1),
    ('Recall', recall1),
    ('F1_Score', f11),
    ('Failure%', fail_percent),
    ('Time_Taken', time_taken),
):
    results_arr[metric_name].append(metric_value)

# Report the same figures in the cell output.
for caption, score in (
    ('Accuracy is', accuracy1),
    ('precision is', precision1),
    ('recall is', recall1),
    ('f1_score is', f11),
):
    print(caption, score)
print(cm)
print('% of students failing after intervention = ', fail_percent)
print('Time taken for Random Forest = ', time_taken, ' seconds')

In [None]:
# Print the comparison table: one row per metric, one column per model.
# Column order assumes each model cell above ran exactly once, in file order.
row_template = " | ".join(["{:<12}"] * 6)
print(row_template.format('Item', 'LR', 'KNN', 'DT', 'SVM', 'RF'))
print('-' * 80)

for key, value in results_arr.items():
    # Unpacking requires exactly five recorded scores per metric and raises
    # ValueError otherwise (e.g. after re-running a model cell).
    lr_score, knn_score, dt_score, svm_score, rf_score = (
        np.round(score, 3) for score in value
    )
    print(row_template.format(key, lr_score, knn_score, dt_score, svm_score, rf_score))

In [None]:
#Best F1 score is for Random Forest.
#Lowest failure % is achieved by KNN.
#KNN has the highest amount of computation and time, and this includes training time as well.
#So if the school is only worried about computation during prediction, we can choose KNN.
#Decision Tree takes the least time, but it has a high failure rate.