In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, root_mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('../data/StudentPerformanceFactorsCleaned.csv')
y = data['Exam_Score']
X = data.drop(columns=['Exam_Score'], axis=1)
X.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Sleep_Hours,Previous_Scores,Motivation_Level,Tutoring_Sessions,Family_Income,Teacher_Quality,...,Learning_Disabilities_No,Learning_Disabilities_Yes,Parental_Education_Level_College,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate,Distance_from_Home_Far,Distance_from_Home_Moderate,Distance_from_Home_Near,Gender_Female,Gender_Male
0,0.6,0.6,0.5,0.0,0.5,0.46,0.5,0.0,0.5,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.466667,0.1,0.5,1.0,0.666667,0.18,0.5,0.25,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.633333,0.95,1.0,1.0,0.5,0.82,1.0,0.25,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.8,0.725,0.5,1.0,0.666667,0.96,1.0,0.125,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.466667,0.8,1.0,1.0,0.333333,0.3,1.0,0.375,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [3]:
pca = PCA(n_components=15)
data_transformed = pca.fit_transform(X)
data_transformed = pd.DataFrame(data_transformed)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# Random Forest
rf = RandomForestRegressor()
rfclf = GridSearchCV(rf, {'n_estimators': [100, 200, 300, 400, 500], 'max_depth': [5, 10, 20, 30, 40], 'min_samples_leaf':[1,2,3,4], 'max_features':['sqrt', 'log2', 1.0, 2, 0.5]}, verbose=1, scoring='neg_root_mean_squared_error')
rfclf.fit(X_train, y_train)
print('Random Forest')


Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Random Forest


In [6]:
df = pd.DataFrame(rfclf.cv_results_)
df = df.sort_values(by='rank_test_score')
df.to_csv('../results/RandomForestResults.csv')

In [7]:
# SVM
svm = SVR()
clf = GridSearchCV(svm, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel':[
    'rbf', 'linear', 'poly', 'sigmoid'
]}, verbose=1, scoring='neg_root_mean_squared_error')
clf.fit(X_train, y_train)
print('SVM')


Fitting 5 folds for each of 32 candidates, totalling 160 fits
SVM


In [8]:
df = pd.DataFrame(clf.cv_results_)
df = df.sort_values(by='rank_test_score')
df.to_csv('../results/SVM_Results.csv')

In [9]:
# Logistic Regression
linear = LogisticRegression(max_iter=1000)
lrclf = GridSearchCV(linear, {'C': [10, 100, 1000], 'solver':['liblinear', 'sag', 'saga', 'newton-cg']}, verbose=1, scoring='neg_root_mean_squared_error')
lrclf.fit(X_train, y_train)
print('Logistic Regression')
print('Best Parameters:', lrclf.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




Logistic Regression
Best Parameters: {'C': 10, 'solver': 'saga'}


In [10]:
df = pd.DataFrame(lrclf.cv_results_)
df = df.sort_values(by='rank_test_score')
df.to_csv('../results/LR_Results.csv')