# **American Final Grade Predictor**

We collected data from almost 700 students at two Portuguese high schools to predict grades.

Portuguese students are given a grade from 0 to 20. This scale can be translated into the American grading system (A–F).

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import r2_score, f1_score, confusion_matrix, classification_report, accuracy_score, mean_squared_error

from sklearn.model_selection import cross_validate, cross_val_score

In [None]:
import os

DATA_FILE = 'student-por.csv'

# Only prompt for an upload when the CSV is not already on disk, so the
# notebook can be re-run (Restart & Run All) without manually commenting
# out the upload line, and can run outside Colab when the file is present.
if not os.path.exists(DATA_FILE):
    from google.colab import files
    uploaded = files.upload()

df = pd.read_csv(DATA_FILE)
df.head()

In [None]:
# Quick structural overview of the loaded frame.
print(df.shape)  # the author recorded (649, 33) for this dataset

print(df.columns)

# Per-column flag: True where a column holds at least one missing value
# (the author notes there are none).
print(df.isna().any())

In [None]:
# Descriptive statistics of the numeric columns
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Bar plots of the three grade periods (G1, G2 and the final grade G3).
# The series is passed as `x=` explicitly: the previous call mixed a
# positional series with x="final grade", which names no column.
fig, axes = plt.subplots(1, 3, figsize=(14, 6))
for ax, period in zip(axes, ["G1", "G2", "G3"]):
    sns.countplot(x=df[period], ax=ax)
    # Label each panel with the period it actually shows.
    ax.set_xlabel(period + " grade (0-20)")
plt.show()

In [None]:
# "current" = mean of the first two period grades, rounded to 2 decimals;
# the two source columns are then removed from the frame.
period_mean = (df["G1"] + df["G2"]) / 2
df["current"] = period_mean.round(2)
df = df.drop(columns=["G1", "G2"])
df.head()

In [None]:
# Distribution of the final grade (G3).
# sns.distplot is deprecated; histplot with a KDE overlay on a density
# scale is its replacement.
sns.histplot(x=df["G3"], bins=25, kde=True, stat="density")
# G3 is the final grade, not an average.
plt.xlabel("Final grade (G3)")

In [None]:
#Male and Female grade differences
male = df.query("sex=='M'")
female = df.query("sex=='F'")

plt.figure(figsize=(8,6))
# `shade=` is deprecated in seaborn's kdeplot; `fill=` is the replacement.
sns.kdeplot(x=male["G3"], color="blue", label="Male grades", fill=True)
sns.kdeplot(x=female["G3"], color="pink", label="Female grades", fill=True)

# These are kernel density estimates, not histograms.
plt.title("Density of final grades, by sex")
plt.xlabel("final grade")

plt.legend()
plt.show()

#Males and Females seem to have a similar grade distribution. This feature might have a slight effect.

In [None]:
# Final grade (G3) per number of failed classes, split by sex.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="failures", y="G3", hue="sex", ax=ax)
ax.set_xlabel("Number of classes failed")
ax.set_ylabel("final grade")
plt.show()

# Author's observation: students with no failed classes have a higher
# median and maximum than those who have failed a class.

In [None]:
#Check the final grade (G3) for each binary feature
boxplots = ['address','famsize','Pstatus','schoolsup','famsup','paid','activities','nursery','higher','internet','romantic']

plt.figure(figsize=(17,17))

# 11 features laid out on a 4x3 grid (the last cell stays empty).
for index, item in enumerate(boxplots):
    plt.subplot(4,3,index+1)
    sns.boxplot(x=item, y="G3", data=df)
    # G3 is the final grade; the averaged grade lives in "current".
    plt.ylabel("final grade (0-20)")
    plt.title(item+" vs final grade")

# tight_layout spaces the panels inside the figure. The previous
# subplots_adjust(bottom=1, top=2) placed the axes outside the 0-1 figure
# area and only rendered where the backend crops to a tight bounding box.
plt.tight_layout()

plt.show()
#There seems to be very little differences

In [None]:
# Map the 0-20 final grade onto letter grades. pd.cut uses right-closed
# intervals by default: (-1, 7.5] -> F, (7.5, 10] -> D, (10, 12.5] -> C,
# (12.5, 15] -> B, (15, 20] -> A. The -1 lower edge keeps a grade of 0 in "F".
bins = (-1, 7.5, 10, 12.5, 15, 20)
grade_names = ("F", "D", "C", "B", "A")
letter_grades = pd.cut(df["G3"], bins=bins, labels=grade_names)
df["American"] = letter_grades
df["American"].unique()

In [None]:
# Share of students per letter grade, in percent of all rows
grade_counts_desc = df["American"].value_counts().sort_index(ascending=False)
grade_counts_desc / df.shape[0] * 100

In [None]:
# Number of students per letter grade, index in descending order
letter_counts = df["American"].value_counts()
letter_counts.sort_index(ascending=False)

In [None]:
# Preview the frame after the "American" letter-grade column was added.
df.head()

In [None]:
# Letter-grade counts broken down by the value of the "reason" column.
# A single axes is used: the previous subplot(2,1,1) on a 10x10 figure
# left the lower half of the figure empty.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=df["American"], hue=df["reason"], ax=ax)
ax.set_xlabel("American letter grade")
plt.show()

In [None]:
# Mothers' and fathers' jobs: count per job, coloured by letter grade.
fig, (ax_mother, ax_father) = plt.subplots(2, 1, figsize=(10, 10))
sns.countplot(x=df["Mjob"], hue=df["American"], ax=ax_mother)
sns.countplot(x=df["Fjob"], hue=df["American"], ax=ax_father)
plt.show()

In [None]:
# Final grade (G3) per parent job: mother on top, father below.
fig, (ax_mother, ax_father) = plt.subplots(2, 1, figsize=(10, 20))
sns.boxplot(data=df, x="Mjob", y="G3", ax=ax_mother)
sns.boxplot(data=df, x="Fjob", y="G3", ax=ax_father)
plt.show()

In [None]:
# One-hot encode the mother's job and append the indicator columns.
mjob_dummies = pd.get_dummies(df["Mjob"], prefix="Mjob")
df = pd.concat([df, mjob_dummies], axis=1)

In [None]:
# Remove the original text column and one indicator ("teacher"), which
# then acts as the reference level for the remaining Mjob_* columns.
df = df.drop(columns=["Mjob", "Mjob_teacher"])

In [None]:
# One-hot encode the father's job, then drop the text column and the
# "teacher" indicator (reference level).
fjob_dummies = pd.get_dummies(df["Fjob"], prefix="Fjob")
df = pd.concat([df, fjob_dummies], axis=1)
df = df.drop(columns=["Fjob", "Fjob_teacher"])

In [None]:
# Preview the frame after one-hot encoding the parent job columns.
df.head()

In [5]:
# Binarize columns.
# yes/no columns: "yes" -> 1, anything else -> 0.
encoded_columns = ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

for column in encoded_columns:
    df[column] = df[column].eq("yes").astype("int64")

# Two-level categorical columns: explicit code per level. A value not
# listed in a mapping becomes NaN.
binary_codes = {
    'address': {"R": 0, "U": 1},
    'famsize': {"LE3": 0, "GT3": 1},
    'Pstatus': {"A": 0, "T": 1},
    'sex': {'M': 0, 'F': 1},
}
for name, codes in binary_codes.items():
    df[name] = df[name].map(codes)

In [None]:
# Preview the first 10 rows after the binary columns were encoded.
df.head(10)

In [None]:
#correlations and visualization
# Restrict to numeric/boolean columns before correlating: DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, and not every text
# column in the frame was encoded above (e.g. "reason").
# Booleans are kept so the one-hot job indicators stay in the matrix.
numeric_df = df.drop('American', axis=1).select_dtypes(include=["number", "bool"])
corr = numeric_df.corr()

plt.figure(figsize=(10,8))
sns.heatmap(corr,annot=True)
plt.show()
# Some correlation with address, Mother and father education, study time, higher education, access to internet, and family relationships

In [6]:
# Replace the letter grades in "American" with LabelEncoder integer codes.
# `le` keeps the fitted mapping so codes can be turned back into letters.
le = LabelEncoder()
le.fit(df["American"])
df["American"] = le.transform(df["American"])

In [None]:
# Preview the frame after "American" was label-encoded.
df.head()

## **Regression**

In [6]:
# Separate the data into feature and response variables (regression).
regression_features = [
    "sex", "address", "Medu", "Fedu", "studytime", "failures",
    "higher", "internet", "activities", "famrel", "absences", "current",
]
X = df[regression_features]
y = df["G3"]  # numeric final grade is the regression target

# Job indicator columns currently left out of the feature set:
#"Mjob_at_home","Mjob_health", "Mjob_other","Mjob_services","Fjob_at_home","Fjob_health","Fjob_other","Fjob_services"


In [10]:
# Hold out 20% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Fit the scaler on the training rows only, then apply it to both splits.
transfer = StandardScaler().fit(X_train)
X_train, X_test = transfer.transform(X_train), transfer.transform(X_test)

In [None]:
# Fit an ordinary least-squares model on the scaled training data.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
#cross validate
# 5-fold CV on the training split, scoring R^2 and negative MSE.
lm_cv = cross_validate(lin_reg, X_train, y_train, cv = 5, return_train_score=True, scoring=("r2","neg_mean_squared_error"))

# The scorer returns *negative* MSE; flip the sign and take the root so the
# value reads as an RMSE in grade points, matching the polynomial model's
# report (the previous output showed the raw negative MSE labeled "MRE").
lm_train_rmse = np.sqrt(-lm_cv['train_neg_mean_squared_error'].mean())
lm_test_rmse = np.sqrt(-lm_cv['test_neg_mean_squared_error'].mean())

print("r2 train accuracies:", lm_cv['train_r2'].mean(), "\nRMSE train accuracies:", lm_train_rmse)
print("r2 test accuracies:", lm_cv['test_r2'].mean(), "\nRMSE test accuracies:", lm_test_rmse)

In [None]:
# Degree-2 polynomial expansion of the training features, followed by a
# linear regression on the expanded matrix.
poly_reg = PolynomialFeatures(degree=2)
poly_reg.fit(X_train)
X_poly = poly_reg.transform(X_train)

new_reg = LinearRegression().fit(X_poly, y_train)
new_reg

In [None]:
#cross validate poly
# 5-fold CV of the polynomial model on the expanded training features.
poly_cv = cross_validate(new_reg, X_poly, y_train, cv = 5, return_train_score=True, scoring=("r2","neg_mean_squared_error"))

# Root of the mean (sign-flipped) negative MSE, i.e. an RMSE in grade
# points; previously labeled "MRE". numpy (imported at the top of the
# notebook) replaces the mid-notebook `import math`.
poly_train_rmse = np.sqrt(-poly_cv['train_neg_mean_squared_error'].mean())
poly_test_rmse = np.sqrt(-poly_cv['test_neg_mean_squared_error'].mean())

print("r2 train accuracies:", poly_cv['train_r2'].mean(), "\nRMSE train accuracies:", poly_train_rmse)
print("r2 test accuracies:", poly_cv['test_r2'].mean(), "\nRMSE test accuracies:", poly_test_rmse)

In [None]:
# Linear regression predictions on the held-out split
lin_pred = lin_reg.predict(X_test)

# Use transform(), not fit_transform(): the transformer was already fitted
# on the training data and must not be refitted on the test set.
X_poly_test = poly_reg.transform(X_test)
poly_pred = new_reg.predict(X_poly_test)

In [None]:
# Held-out R^2 and mean squared error for both regressors; the MSE is now
# labeled so the two numbers on each line cannot be confused.
print("r2_score for linear:", r2_score(y_test,lin_pred), "MSE:", mean_squared_error(y_test,lin_pred))
print("r2_score for polynomial of degree 2:", r2_score(y_test,poly_pred), "MSE:", mean_squared_error(y_test,poly_pred))

## **Classification**

In [None]:
# Separate the data into feature and response variables (classification).
classification_features = [
    "sex", "Medu", "Fedu", "studytime", "failures",
    "internet", "higher", "activities", "schoolsup", "current",
]
X = df[classification_features]
y = df["American"]  # label-encoded letter grade is the class target

In [None]:
# 80/20 train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)

### K Nearest Neighbors

In [None]:
# KNN classifier with k=11, fitted on the training split.
knn_real = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)
knn_real

In [None]:
# 10-fold cross-validated accuracy of the k=11 model on the training split.
knn_real_cv = cross_validate(
    knn_real, X_train, y_train,
    cv=10, return_train_score=True, scoring="accuracy",
)
knn_train_acc = knn_real_cv['train_score'].mean()
knn_valid_acc = knn_real_cv['test_score'].mean()
print("Training accuracies:", knn_train_acc, "\nTest accuracies:", knn_valid_acc)

In [None]:
# Mean 10-fold validation accuracy for every k from 1 to 40.
# cross_val_score returns only the validation scores, so the unused
# training scores are no longer computed for all 40 models.
k_range = range(1,41)
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train, y_train, cv=10, scoring="accuracy",
    ).mean()
    for k in k_range
]

In [None]:
# Validation accuracy as a function of k. The range is materialised as a
# list so seaborn receives a plain data vector, and the axes are labeled.
ax = sns.lineplot(x = list(k_range), y = k_scores) #11 n_neighbors remains the best
ax.set_xlabel("n_neighbors (k)")
ax.set_ylabel("mean 10-fold CV accuracy")
plt.show()

In [None]:
# Predict the encoded letter grade for the held-out rows with the fitted KNN model.
knn_pred = knn_real.predict(X_test)

In [None]:
# Held-out evaluation of the KNN model: confusion matrix, per-class
# report, then overall accuracy.
knn_reports = (
    confusion_matrix(y_test, knn_pred),
    classification_report(y_test, knn_pred),
    accuracy_score(y_test, knn_pred),
)
for knn_report in knn_reports:
    print(knn_report)

### Random Forest Classifier

In [7]:
# Feature subset and split used for the random forest.
rf_features = [
    "activities", "current", "Fedu", "higher", "internet",
    "Medu", "schoolsup", "sex", "studytime",
]
X = df[rf_features]
y = df["American"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

In [None]:
# Random forest: 400 trees, depth capped at 6, fixed seed.
rf_clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=6,
    random_state=0,
).fit(X_train, y_train)
rf_clf

In [None]:
# 5-fold cross-validated accuracy of the random forest on the training split.
rf_cv = cross_validate(
    rf_clf, X_train, y_train,
    cv=5, return_train_score=True, scoring="accuracy",
)
rf_train_acc = rf_cv['train_score'].mean()
rf_valid_acc = rf_cv['test_score'].mean()
print("Training accuracies:", rf_train_acc, "\nValidation accuracies:", rf_valid_acc)

In [None]:
# Feature importances of the fitted forest, in training-column order.
rf_feature_names = X_train.columns.tolist()
rf_importances = rf_clf.feature_importances_
print(rf_feature_names, "\nimportances", rf_importances)

In [None]:
# Held-out evaluation of the random forest: confusion matrix, per-class
# report, then overall accuracy.
rf_pred = rf_clf.predict(X_test)
rf_reports = (
    confusion_matrix(y_test, rf_pred),
    classification_report(y_test, rf_pred),
    accuracy_score(y_test, rf_pred),
)
for rf_report in rf_reports:
    print(rf_report)

### Support Vector Machines

In [13]:
# Feature subset and split used for the SVM.
svm_features = [
    "sex", "address", "Medu", "Fedu", "studytime",
    "internet", "higher", "activities", "current",
]
X = df[svm_features]
y = df["American"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

In [None]:
# RBF-kernel SVM (C=4, one-vs-one decision function) fitted on the training split.
sm_clf = SVC(
    kernel="rbf",
    decision_function_shape="ovo",
    C=4,
).fit(X_train, y_train)
sm_clf

In [None]:
# 5-fold cross-validated accuracy of the SVM on the training split.
sm_cv = cross_validate(
    sm_clf, X_train, y_train,
    cv=5, return_train_score=True, scoring="accuracy",
)
sm_train_acc = sm_cv['train_score'].mean()
sm_valid_acc = sm_cv['test_score'].mean()
print("Training accuracies:", sm_train_acc, "\nValidation accuracies:", sm_valid_acc)

In [None]:
# Held-out evaluation of the SVM: confusion matrix, per-class report,
# then overall accuracy.
sm_pred = sm_clf.predict(X_test)
sm_reports = (
    confusion_matrix(y_test, sm_pred),
    classification_report(y_test, sm_pred),
    accuracy_score(y_test, sm_pred),
)
for sm_report in sm_reports:
    print(sm_report)

## Random forest classifier seems to perform best

In [None]:
import pickle
# `sklearn.externals.joblib` was removed from scikit-learn; joblib is a
# standalone package (installed as a scikit-learn dependency) and is
# imported directly.
import joblib

# Persist the fitted random forest to disk.
filename = "random_forest_grade.joblib"
joblib.dump(rf_clf,filename)

# Offer the saved model as a browser download (Colab only).
from google.colab import files
files.download(filename)