In [None]:
# Notebook setup: data/plotting libraries, inline matplotlib figures, seaborn's
# default theme, and the base URL that the data files are read from.

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()

# Base URL; file names such as "schools.csv" are appended to it when loading.
path = "https://raw.githubusercontent.com/LennardVaarten/ML-Workshops/main/data/"

Data on 359 K–8 schools in California, USA. We use the other features to predict the average reading score of the students in a given school.
* students: Total enrollment.
* teachers: Number of teachers.
* calworks: Percent qualifying for CalWorks (income assistance).
* lunch: Percent qualifying for reduced-price lunch.
* computer: Number of computers.
* expenditure: Expenditure per student.
* income: District average income (in thousands of USD).
* english: Percent of English learners.
* reading_scores: Average score on reading test.

In [None]:
# Read the schools table from the workshop data location into a DataFrame.

schools = pd.read_csv(f"{path}schools.csv")

In [None]:
# Display the freshly loaded table (rich DataFrame output) to check its columns.

schools

In [None]:
# Derive two ratio features from the raw counts:
#   students_per_teacher  = students / teachers
#   computers_per_student = computer / students
# Both are appended as new columns at the end of the table.

schools = schools.assign(
    students_per_teacher=lambda df: df["students"] / df["teachers"],
    computers_per_student=lambda df: df["computer"] / df["students"],
)

In [None]:
# Display the table again, now including the two derived ratio columns.
schools

# **Scaling the Features**

In [None]:
# Min-max scale every feature column, i.e. all columns except column 0, which
# the train/test split uses as the target.
#
# The scaler is fitted on the training rows only, so the minima/maxima of the
# held-out rows do not leak into preprocessing. train_test_split picks rows
# purely from the number of rows and random_state, so repeating the split's
# settings here (default test size, random_state=99) selects exactly the rows
# that later become features_train. Held-out rows are transformed with the
# training statistics and may therefore fall slightly outside [0, 1].

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

feature_cols = schools.columns[1:]

train_rows, _ = train_test_split(np.arange(len(schools)), random_state=99)

scaler = MinMaxScaler().fit(schools.iloc[train_rows][feature_cols])

# Assign column-wise rather than through .iloc: scaling yields floats, and
# replacing whole columns avoids writing floats in place into columns that were
# read as integers (recent pandas warns about that in-place upcast).
schools[feature_cols] = scaler.transform(schools[feature_cols])

schools

In [None]:
from sklearn.model_selection import train_test_split

# Column 0 is the target; every remaining column is a feature.
# test_size is left at the scikit-learn default; random_state fixes the shuffle.
all_features = schools.iloc[:, 1:]
all_targets = schools.iloc[:, 0]

features_train, features_test, target_train, target_test = train_test_split(
    all_features,
    all_targets,
    random_state=99,
)

# **Plotting**

In [None]:
# Scatter every feature against reading_scores, using the training rows only.

train = pd.concat([target_train, features_train], axis=1)

# Column 0 of `train` is the target; everything after it is a feature.
feature_names = train.columns[1:]

# Size the grid from the number of features instead of hard-coding 5x2, so the
# cell keeps working if features are added or removed.
n_cols = 2
n_rows = -(-len(feature_names) // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)

for ax, feature in zip(axes.flat, feature_names):
    sns.scatterplot(ax=ax, x=train[feature], y=train["reading_scores"])

# With an odd number of features the last panel is unused; hide it.
for ax in axes.flat[len(feature_names):]:
    ax.set_visible(False)

# Lay out after drawing so the axis labels are taken into account.
fig.tight_layout(pad=3)

In [None]:
# Display the training feature matrix.
features_train

In [None]:
# Drop the raw count columns that the ratio features (students_per_teacher,
# computers_per_student) make redundant.
#
# The frames are reassigned instead of modified with inplace=True, and
# errors="ignore" skips columns that are already gone, so this cell can be
# re-run without raising a KeyError.

redundant_cols = ["teachers", "computer", "students"]

features_train = features_train.drop(columns=redundant_cols, errors="ignore")
features_test = features_test.drop(columns=redundant_cols, errors="ignore")

# **Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split; score() reports R2.
lr = LinearRegression()
lr.fit(features_train, target_train)

for split_name, X, y in (
    ("Training", features_train, target_train),
    ("Test", features_test, target_test),
):
    print(f"{split_name} set R2 score: {lr.score(X, y):.3f}")

In [None]:
# Inspect the fitted linear model
#   y = w[0]*x[0] + w[1]*x[1] + ... + b
# One line per feature weight w[i], a blank line, then the intercept b.

weights = pd.Series(lr.coef_, index=features_train.columns)

for feature, weight in weights.items():
    print(f"{feature:25}{weight:.2f}")

print()

print(f"{'intercept':25}{lr.intercept_:.2f}")

# **Regularization: Ridge (L2) and Lasso (L1)**

In [None]:
# Ridge (L2-regularised) regression with penalty strength alpha=10,
# fitted on the training split and scored (R2) on both splits.

from sklearn.linear_model import Ridge

ridge = Ridge(alpha=10)
ridge.fit(features_train, target_train)

print(f"Training set R2 score: {ridge.score(features_train, target_train):.3f}")
print(f"Test set R2 score: {ridge.score(features_test, target_test):.3f}")

In [None]:
# Lasso (L1-regularised) regression with penalty strength alpha=1,
# fitted on the training split and scored (R2) on both splits.

from sklearn.linear_model import Lasso

lasso = Lasso(alpha=1)
lasso.fit(features_train, target_train)

print(f"Training set R2 score: {lasso.score(features_train, target_train):.3f}")
print(f"Test set R2 score: {lasso.score(features_test, target_test):.3f}")

In [None]:
# Side-by-side table of the weights learned by the three linear models
# (plain, Ridge, Lasso), one row per feature, rounded to two decimals.

print(f"{'FEATURE':<25}{'LR':<8}{'RIDGE':<8}{'LASSO':<8}")

coef_rows = zip(features_train.columns, lr.coef_, ridge.coef_, lasso.coef_)
for col, w_lr, w_ridge, w_lasso in coef_rows:
    print(f"{col:<25}{round(w_lr, 2):<8}{round(w_ridge, 2):<8}{round(w_lasso, 2):<8} ")

In [None]:
# Draw a reproducible random subsample of 100 rows from the (already scaled)
# schools table; random_state fixes which rows are drawn.

schools_sample = schools.sample(n=100, random_state=99)

# Display the subsample.
schools_sample

In [None]:
# Split the 100-row subsample into training (65%) and test (35%) sets.
#
# Uses the same column convention as the full-data split: column 0 is the
# target and every other column is a feature. Selecting the LAST column as the
# target would instead pick computers_per_student (appended during feature
# engineering) and would leave the real target among the features.

features_train_sample, features_test_sample, target_train_sample, target_test_sample = train_test_split(
    schools_sample.iloc[:, 1:],
    schools_sample.iloc[:, 0],
    test_size=0.35,
    random_state=99,
)

In [None]:
# Linear regression refitted on the subsample's training split.
# NOTE: this rebinds `lr`, replacing the model fitted on the full training set.

lr = LinearRegression()
lr.fit(features_train_sample, target_train_sample)

for split_name, X, y in (
    ("Training", features_train_sample, target_train_sample),
    ("Test", features_test_sample, target_test_sample),
):
    print(f"{split_name} set R2 score: {lr.score(X, y):.3f}")

In [None]:
# Ridge regression (alpha=1) refitted on the subsample's training split.
# NOTE: this rebinds `ridge`, replacing the model fitted on the full training set.

ridge = Ridge(alpha=1)
ridge.fit(features_train_sample, target_train_sample)

print(f"Training set R2 score: {ridge.score(features_train_sample, target_train_sample):.3f}")
print(f"Test set R2 score: {ridge.score(features_test_sample, target_test_sample):.3f}")

In [None]:
# Lasso regression (alpha=0.001) refitted on the subsample's training split.
# NOTE: this rebinds `lasso`, replacing the model fitted on the full training set.

lasso = Lasso(alpha=0.001)
lasso.fit(features_train_sample, target_train_sample)

print(f"Training set R2 score: {lasso.score(features_train_sample, target_train_sample):.3f}")
print(f"Test set R2 score: {lasso.score(features_test_sample, target_test_sample):.3f}")

# **Decision Tree Regressor**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree limited to depth 15, splitting only nodes with >= 20 samples.
# random_state is fixed so that repeated runs grow the same tree.
tree = DecisionTreeRegressor(
    min_samples_split=20,
    max_depth=15,
    random_state=99,
).fit(features_train, target_train)

# score() on a regressor returns R2, not classification accuracy, so the
# labels match the linear-model cells.
print("Training set R2 score: {:.3f}".format(tree.score(features_train, target_train)))
print("Test set R2 score: {:.3f}".format(tree.score(features_test, target_test)))

In [None]:
# Print the fitted tree's importance score for every feature, one per line.

importances = pd.Series(tree.feature_importances_, index=features_test.columns)

for feat, imp in importances.items():
    print(f"{feat:25} {imp:.3f}")

# **Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 500 trees; random_state makes the fit reproducible.
rfr = RandomForestRegressor(n_estimators=500, random_state=99)
rfr.fit(features_train, target_train)

# score() on a regressor returns R2, not classification accuracy, so the
# labels match the linear-model cells.
print("Training set R2 score: {:.3f}".format(rfr.score(features_train, target_train)))
print("Test set R2 score: {:.3f}".format(rfr.score(features_test, target_test)))

# **Gradient Boosting Regressor**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# 500 boosting stages of depth-4 trees with a small learning rate; each stage
# is fitted on a random 20% of the training rows (subsample=0.2), which is why
# random_state is fixed.
gbr = GradientBoostingRegressor(
    n_estimators=500,
    max_depth=4,
    min_samples_split=4,
    subsample=0.2,
    learning_rate=0.01,
    random_state=99,
)
gbr.fit(features_train, target_train)

# score() on a regressor returns R2, not classification accuracy, so the
# labels match the linear-model cells.
print("Training set R2 score: {:.3f}".format(gbr.score(features_train, target_train)))
print("Test set R2 score: {:.3f}".format(gbr.score(features_test, target_test)))


# **Grid Search**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Coarse grid: tree depth x learning rate, with the number of stages and the
# row subsampling fraction held fixed.
params = {
    "n_estimators": [500],
    "subsample": [0.4],
    "max_depth": [4, 6, 8, 10],
    "learning_rate": [0.0025, 0.005, 0.01, 0.1, 0.2],
}

# With subsample < 1 every boosting stage draws a random subset of rows, so the
# estimator needs a fixed random_state; otherwise the CV scores (and the
# selected parameters) change from run to run.
# NOTE: this rebinds `gbr` from a fitted regressor to the GridSearchCV object.
gbr = GridSearchCV(estimator=GradientBoostingRegressor(random_state=99),
                   param_grid=params, n_jobs=-1, cv=5)

gbr.fit(features_train, target_train)

print("Best CV score: {:.4f}".format(gbr.best_score_))

In [None]:
# Parameter combination with the best mean cross-validation score.
print(gbr.best_params_)

## Visualizing the Results of Grid Search + CV

In [None]:
# Heatmap of the mean cross-validation score for every
# (max_depth, learning_rate) combination tried by the grid search.
results = pd.DataFrame(gbr.cv_results_)

pvt = results.pivot_table(
    values="mean_test_score",
    index="param_max_depth",
    columns="param_learning_rate",
)

ax = sns.heatmap(pvt)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Finer grid over a narrower range of tree depths and learning rates.
params = {
    "n_estimators": [500],
    "subsample": [0.4],
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.004, 0.005, 0.00625, 0.0075],
}

# With subsample < 1 every boosting stage draws a random subset of rows, so the
# estimator needs a fixed random_state; otherwise the CV scores (and the
# selected parameters) change from run to run.
gbr = GridSearchCV(estimator=GradientBoostingRegressor(random_state=99),
                   param_grid=params, n_jobs=-1, cv=5)

gbr.fit(features_train, target_train)

print(gbr.best_params_)
print("Best CV score: {:.4f}".format(gbr.best_score_))
# GridSearchCV.score evaluates the best estimator, refitted on the whole training set.
print("Score on test set: {:.4f}".format(gbr.score(features_test, target_test)))

In [None]:
# Heatmap of the mean cross-validation score for every
# (max_depth, learning_rate) combination tried by the finer grid search.
results = pd.DataFrame(gbr.cv_results_)

pvt = results.pivot_table(
    values="mean_test_score",
    index="param_max_depth",
    columns="param_learning_rate",
)

ax = sns.heatmap(pvt)