In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('StudentsPerformance.DataMining.csv')

x = df.loc[:,['gender', 'race/ethnicity', 'parental level of education', 'test prep course', 'reading score', 'writing score']]

y = df['math score']

x['gender'] = x['gender'].astype(object)
x['race/ethnicity'] = x['race/ethnicity'].astype(object)
x['parental level of education'] = x['parental level of education'].astype(object)
x['test prep course'] = x['test prep course'].astype(object)
x = pd.get_dummies(x)

### assignment worked on with Neil Manderson

This data contains high school students' performances on 3 standardized tests.

It was obtained from kaggle.com.


The input variables are 'gender', 'race/ethnicity', 'parental level of education', and 'test prep course'.

There are 3 potential target variables, but for this assignment we will be using 'math score' as the target variable.

This dataset containes 1000 observations.

In [3]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

# Decision Tree

In [4]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

print('Training Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.98875
Testing Accuracy: 0.06


In [5]:
param_grid = {'max_depth': range(1,10), 'criterion':['gini', 'entropy']}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best Parameters Are:', model.best_params_)

print('Training Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best Parameters Are: {'criterion': 'entropy', 'max_depth': 4}
Training Accuracy: 0.1275
Testing Accuracy: 0.05


## Decision Tree Regression

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

model = DecisionTreeRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.9966144353869122
Rsquared on Testing 0.6615633344416878


In [7]:
param_grid = {'max_depth': np.arange(1,100,5)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best parameters are:', model.best_params_)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'max_depth': 6}
Rsquared on Training 0.883749043523264
Rsquared on Testing 0.7698552015205828


# Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_train, y_train)

print('Training Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.95375
Testing Accuracy: 0.035


In [9]:
param_grid = {'n_estimators': np.arange(2,50,2), 'max_features': np.arange(2,10)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best Params Are:', model.best_params_)

print('Training Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best Params Are: {'max_features': 7, 'n_estimators': 12}
Training Accuracy: 0.98125
Testing Accuracy: 0.045


## Random Forest Regression

In [10]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.9634572229423011
Rsquared on Testing 0.7700345747314148


In [11]:
param_grid = {'n_estimators': np.arange(2,50, 2), 'max_features':np.arange(2,10)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(RandomForestRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best parameters are:', model.best_params_)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'max_features': 6, 'n_estimators': 34}
Rsquared on Training 0.9724922206283496
Rsquared on Testing 0.7907882255044669


# Gradient Boosting

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
model.fit(x_train, y_train)

print('Training Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.9925
Testing Accuracy: 0.045


In [None]:
param_grid = {'n_estimators': np.arange(1,50,2), 'learning_rate': np.linspace(0.001,1,10)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best Params Are:', model.best_params_)

print('Training Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

# this code block would not successfully run for me, often going 20+ minutes without error/completion

## Gradient Boosting Regression

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.8947582536132821
Rsquared on Testing 0.8120288557453774


In [13]:
param_grid = {'n_estimators': np.arange(1,20, 2), 'learning_rate':np.linspace(0.001,1,10)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(GradientBoostingRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best parameters are:', model.best_params_)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'learning_rate': 0.223, 'n_estimators': 17}
Rsquared on Training 0.8732248933387149
Rsquared on Testing 0.8078253562452823


# Adaboost

In [14]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier()
model.fit(x_train, y_train)

print('Testing Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Training Accuracy:', metrics.accuracy_score(y_pred, y_test))

Testing Accuracy: 0.0525
Training Accuracy: 0.055


In [15]:
param_grid = {'n_estimators': np.arange(1,100, 2), 'learning_rate':np.linspace(0.001,1,10)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(AdaBoostClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best parameters are:', model.best_params_)

print('Training Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'learning_rate': 0.334, 'n_estimators': 97}
Training Accuracy: 0.0525
Testing Accuracy: 0.025


## AdaBoost Regression

In [16]:
from sklearn.ensemble import AdaBoostRegressor

model = AdaBoostRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.8378437978759534
Rsquared on Testing 0.7853746857028561


In [17]:
param_grid = {'n_estimators': np.arange(1,100, 2), 'learning_rate':np.linspace(0.001,1,10)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(AdaBoostRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best parameters are:', model.best_params_)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'learning_rate': 0.667, 'n_estimators': 73}
Rsquared on Training 0.8400692792654855
Rsquared on Testing 0.7824529217896361


# K-Nearest Neighbors

In [18]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(x_train, y_train)

print('Training Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.26875
Testing Accuracy: 0.03


In [19]:
param_grid = {'n_neighbors': np.arange(2,20, 2)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(KNeighborsClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best parameters are:', model.best_params_)

print('Training Accuracy:', model.score(x_train, y_train))

y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'n_neighbors': 12}
Training Accuracy: 0.21
Testing Accuracy: 0.04


## KNN Regression

In [20]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.8206274061215696
Rsquared on Testing 0.69044702550327


In [21]:
param_grid = {'n_neighbors':np.arange(1,10)}
model = GridSearchCV(KNeighborsRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best parameters are:', model.best_params_)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'n_neighbors': 7}
Rsquared on Training 0.8010525922007985
Rsquared on Testing 0.7103930999380541


# Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.115
Testing Accuracy: 0.04


# ElasticNet Regression

In [23]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
model = ElasticNet()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.7856829516788898
Rsquared on Testing 0.7564418395303305


In [24]:
param_grid = {'alpha':np.linspace(0,10, 10), 'l1_ratio':np.linspace(0,1,10)}
model = GridSearchCV(ElasticNet(), param_grid, cv = 5)
model.fit(x_train, y_train)

print('Best parameters are:', model.best_params_)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'alpha': 0.0, 'l1_ratio': 0.0}
Rsquared on Training 0.8600344102186028
Rsquared on Testing 0.8266219128471293


# Results

Had some issues running different tuning models for some of the different tree-based models as the tuning code blocks would take extended periods of time to finish running, if they ever completed. 

The best model was the ElasticNet Regression model

From performing the first model with ElasticNet, the RSquared was low on training and testing compared to others such as the Random Forest Regression model. However, once it was tuned, the model was not overfitted and did a reliable job of predicting the testing data. 

The tuning parameters are alpha = 0.0, l1_ratio = 0.0

This resulted in:

Training RSquared = 0.8552 

Testing RSquared = 0.8489