### Part 1 Introduction

This is a demonstration of supervised machine learning methods for a continous target variable. The data used is the California Test Score Data Set extracted from the link below. It is a subset of data from the California Department of Education from 1998 to 1999. The target variable is "testscr", which is the average test score for a student. This section explores what variables have the highest predictive power in determining the student's test score using the various supervised learning models available.

Data Source Link: "https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/Caschool.csv"

In [1]:
# importing the data
import pandas as pd
df = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/Caschool.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,distcod,county,district,grspan,enrltot,teachers,calwpct,mealpct,computer,testscr,compstu,expnstu,str,avginc,elpct,readscr,mathscr
0,1,75119,Alameda,Sunol Glen Unified,KK-08,195,10.9,0.5102,2.0408,67,690.799988,0.34359,6384.911133,17.88991,22.690001,0.0,691.599976,690.0
1,2,61499,Butte,Manzanita Elementary,KK-08,240,11.15,15.4167,47.916698,101,661.200012,0.420833,5099.380859,21.524664,9.824,4.583333,660.5,661.900024
2,3,61549,Butte,Thermalito Union Elementary,KK-08,1550,82.900002,55.032299,76.322601,169,643.599976,0.109032,5501.95459,18.697226,8.978,30.000002,636.299988,650.900024
3,4,61457,Butte,Golden Feather Union Elementary,KK-08,243,14.0,36.475399,77.049202,85,647.700012,0.349794,7101.831055,17.357143,8.978,0.0,651.900024,643.5
4,5,61523,Butte,Palermo Union Elementary,KK-08,1335,71.5,33.108601,78.427002,171,640.849976,0.12809,5235.987793,18.671329,9.080333,13.857677,641.799988,639.900024
5,6,62042,Fresno,Burrel Union Elementary,KK-08,137,6.4,12.3188,86.956497,25,605.550049,0.182482,5580.146973,21.40625,10.415,12.408759,605.700012,605.400024
6,7,68536,San Joaquin,Holt Union Elementary,KK-08,195,10.0,12.9032,94.623703,28,606.75,0.14359,5253.331055,19.5,6.577,68.717949,604.5,609.0
7,8,63834,Kern,Vineland Elementary,KK-08,888,42.5,18.806299,100.0,66,609.0,0.074324,4565.746094,20.894117,8.174,46.959461,605.5,612.5
8,9,62331,Fresno,Orange Center Elementary,KK-08,379,19.0,32.189999,93.139801,35,612.5,0.092348,5355.54834,19.947369,7.385,30.079157,608.900024,616.099976
9,10,67306,Sacramento,Del Paso Heights Elementary,KK-06,2247,108.0,78.994202,87.316399,0,612.650024,0.0,5036.211426,20.805555,11.613333,40.275921,611.900024,613.400024


In [12]:
# preparing the data

df = df.iloc[:, 5:] # deletes the first five columns, as the first coumn just denotes number assignment and the next four variables are categorical/discrete in nature
y = df['testscr'] # since average test score is our target variable
X = df.loc[:, (df.columns != 'testscr') & (df.columns != 'readscr') & (df.columns != 'mathscr')] #we are testing for all variables that are not testscr, readscr, mathscr

X.head(10)

Unnamed: 0,enrltot,teachers,calwpct,mealpct,computer,compstu,expnstu,str,avginc,elpct
0,195,10.9,0.5102,2.0408,67,0.34359,6384.911133,17.88991,22.690001,0.0
1,240,11.15,15.4167,47.916698,101,0.420833,5099.380859,21.524664,9.824,4.583333
2,1550,82.900002,55.032299,76.322601,169,0.109032,5501.95459,18.697226,8.978,30.000002
3,243,14.0,36.475399,77.049202,85,0.349794,7101.831055,17.357143,8.978,0.0
4,1335,71.5,33.108601,78.427002,171,0.12809,5235.987793,18.671329,9.080333,13.857677
5,137,6.4,12.3188,86.956497,25,0.182482,5580.146973,21.40625,10.415,12.408759
6,195,10.0,12.9032,94.623703,28,0.14359,5253.331055,19.5,6.577,68.717949
7,888,42.5,18.806299,100.0,66,0.074324,4565.746094,20.894117,8.174,46.959461
8,379,19.0,32.189999,93.139801,35,0.092348,5355.54834,19.947369,7.385,30.079157
9,2247,108.0,78.994202,87.316399,0,0.0,5036.211426,20.805555,11.613333,40.275921


In [13]:
# split the data into train and test sets
from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) #splitting the data between test and train sets

In [14]:
# KNN Regressor Code Block
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

print("Training set score: {:.3f}".format(knn.score(X_train, y_train)))
print("Test set score: {:.3f}".format(knn.score(X_test, y_test)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn, X_train, y_train, cv = 10, scoring="r2"))))


Training set score: 0.303
Test set score: 0.085
R squared: -0.074


In [15]:
# Linear Regression Code Block
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)

print("Training set score: {:.3f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.3f}".format(lr.score(X_test, y_test)))
print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(lr, X_train, y_train, cv=10, scoring="r2"))))


Training set score: 0.805
Test set score: 0.807
lr.coef_: [ 2.37299343e-04 -9.68465212e-03 -1.05122232e-01 -3.63632211e-01
  2.23981150e-03 -1.50110373e+00  1.87297506e-03 -3.68530746e-01
  5.17180694e-01 -1.97462843e-01]
lr.intercept_: 664.1994881014358
R squared: 0.779


In [16]:
# Ridge Regression Code Block
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=0.01).fit(X_train, y_train)
print("Training set score: {:.3f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.3f}".format(ridge.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(ridge.coef_ != 0)))
print("ridge.coef_: {}".format(ridge.coef_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(ridge, X_train, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.807
Number of features used: 10
ridge.coef_: [ 2.36884983e-04 -9.66951839e-03 -1.05109406e-01 -3.63639497e-01
  2.23703012e-03 -1.48447285e+00  1.87274445e-03 -3.68401059e-01
  5.17171949e-01 -1.97451767e-01]
R squared: 0.779


In [17]:
# Lasso Regression Code Block
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.3f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.3f}".format(lasso.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))
print("lasso.coef_: {}".format(lasso.coef_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(lasso, X_train, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.808
Number of features used: 9
lasso.coef_: [ 1.84493637e-04 -7.99498408e-03 -1.03868379e-01 -3.64418034e-01
  1.98214887e-03 -0.00000000e+00  1.86123551e-03 -3.50988030e-01
  5.16083603e-01 -1.96387846e-01]
R squared: 0.780


### Scaling Data using StandardScaler

When comparing the set scores (test and training) and the overall r^2 between the non-scaled and scaled results, some showed improvements, while others didn't. This makes sense because standard scaler conceptually transforms that data, where it has a mean of 0 and standard deviation of 1. In other words, it rearranges the data to a normal distribution. As such, it proves to be more effective for KNN than regression fitting, as demonstrated from the results below. In fact, it is better to normalize than using a standard scaler when dealing with regression. This is primarily because it normalizes the data to decrease large range and values. In this way, it can reduce numerical instabilities.

1) KNN Regressor
<br>
Trainining set score: 0.303 to 0.809
<br>
Test set score: 0.085 to 0.769
<br>
R sqaured: -0.074 to 0.709

2) Linear Regression
<br>
Training set score: 0.805 to 0.805
<br>
Test set score: 0.807 to 0.807
<br>
R squared: 0.779 to 0.779

3) Ridge Regression
<br>
Training set score: 0.805 to 0.805
<br>
Test set score: 0.807 to 0.807
<br>
R squared: 0.779 to 0.779

4) Lasso Regression
<br>
Training set score: 0.805 to 0.805
<br>
Test set score: 0.808 to 0.807
<br>
R squared: 0.780 to 0.782

In [18]:
# scaling the data using Standard Scalar
from sklearn import preprocessing
import numpy as np

scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train.head(10)

Unnamed: 0,enrltot,teachers,calwpct,mealpct,computer,compstu,expnstu,str,avginc,elpct
3,243,14.0,36.475399,77.049202,85,0.349794,7101.831055,17.357143,8.978,0.0
18,6880,303.029999,21.2824,94.971199,960,0.139535,5064.615723,22.704023,7.022,77.005814
393,141,6.91,10.0719,9.3525,44,0.312057,6060.256836,20.40521,20.089001,0.0
60,723,37.119999,25.921101,83.157898,45,0.062241,4692.493652,19.477371,8.279,36.929462
203,307,15.85,5.5375,28.664499,36,0.117264,4718.163086,19.369085,14.578,7.491857
154,2019,102.779999,10.2526,40.564602,167,0.082714,5193.692383,19.6439,11.238,5.943536
63,3017,138.5,14.9768,56.063599,496,0.164402,4675.674805,21.783394,11.081,17.003647
110,205,11.2,18.5366,80.975601,24,0.117073,4895.439453,18.303572,8.258,41.463413
311,8432,360.5,6.3923,22.9839,0,0.0,5397.689453,23.389736,14.097667,4.969165
312,244,11.0,15.5738,40.163898,15,0.061475,5118.373047,22.181818,12.64,6.967213


In [19]:
# KNN Regressor Scaled Fit Code Block
knn_scaled = KNeighborsRegressor(n_neighbors=5)
knn_scaled.fit(X_train_scaled, y_train)

print("Training set score: {:.3f}".format(knn_scaled.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(knn_scaled.score(X_test_scaled, y_test)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn_scaled, X_train_scaled, y_train, cv=10, scoring="r2"))))

Training set score: 0.809
Test set score: 0.769
R squared: 0.709


In [21]:
# Linear Regression Scaled Fit Code Block
from sklearn.linear_model import LinearRegression

lr_scaled = LinearRegression().fit(X_train_scaled, y_train)

print("Training set score: {:.3f}".format(lr_scaled.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(lr_scaled.score(X_test_scaled, y_test)))
print("lr.coef_: {}".format(lr_scaled.coef_))
print("lr.intercept_: {}".format(lr_scaled.intercept_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(lr_scaled, X_train_scaled, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.807
lr.coef_: [ 0.99068803 -1.93639614 -1.17333052 -9.77646224  1.04183669 -0.09712146
  1.15380605 -0.67884963  3.71960199 -3.4784803 ]
lr.intercept_: 654.2720646933902
R squared: 0.779


In [22]:
# Ridge Regression Scaled Fit Code Block
ridge_scaled = Ridge(alpha=0.01).fit(X_train_scaled, y_train)

print("Training set score: {:.3f}".format(ridge_scaled.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(ridge_scaled.score(X_test_scaled, y_test)))
print("Number of features used: {}".format(np.sum(ridge_scaled.coef_ != 0)))
print("ridge.coef_: {}".format(ridge_scaled.coef_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(ridge_scaled, X_train_scaled, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.807
Number of features used: 10
ridge.coef_: [ 0.96617956 -1.90993807 -1.17423865 -9.77520551  1.03962552 -0.09676499
  1.15398941 -0.67778551  3.71966446 -3.47893586]
R squared: 0.779


In [23]:
# Lasso Regression Scaled Fit Code Block
lasso_scaled = Lasso(alpha=0.01, max_iter=100000).fit(X_train_scaled, y_train)

print("Training set score: {:.3f}".format(lasso_scaled.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(lasso_scaled.score(X_test_scaled, y_test)))
print("Number of features used: {}".format(np.sum(lasso_scaled.coef_ != 0)))
print("lasso.coef_: {}".format(lasso_scaled.coef_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(lasso_scaled, X_train_scaled, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.807
Number of features used: 9
lasso.coef_: [-0.         -0.68154738 -1.16302274 -9.79903359  0.76196085 -0.03307188
  1.14805701 -0.62654296  3.70821439 -3.46159671]
R squared: 0.782


### Hyper-tuning parameters using GridSearchCV

I have compared the results from the model when applying the GridSearch CV for both scaled and unscaled variables. Even though it proves to be more work, the comparison between both scaled and unscaled variables for different models will provide a more complete analysis. In terms of interpreting the brief results below, the output number is always compared to the initial unaltered original model results.

In regards to the actual results, KNN classification had a drastic improvement in its fit for the scaled output, again. However, the tuning of its parameters did not have much of an impact on the regression models in general, as lasso and ridge regressions already regularized its parameters, and have pushed its coefficients toward zero due to the mathematical algorithim of how regression and lasso regressions are computed.
<br>
<br>
1) KNN Regressor (Unnscaled, Optimized Parameters)
<br>
Trainining set score: 0.273 to 0.157
<br>
Test set score: 0.038 to 0.034
<br>
R squared: -0.271 to 0.022

KNN Regressor (Scaled, Optimized Parameters)
<br>
Trainining set score: 0.273 to 0.773
<br>
Test set score: 0.038 to 0.760
<br>
R squared: -0.271 to 0.728


2) Linear Regression (Unnscaled, Optimized Parameters)
<br>
Training set score: 0.805 to 0.805
<br>
Test set score: 0.807 to 0.807
<br>
R squared: 0.779 to 0.779

Linear Regression (Scaled, Optimized Parameters)
<br>
Training set score: 0.805 to 0.805
<br>
Test set score: 0.807 to 0.807
<br>
R squared: 0.779 to 0.779

3) Ridge Regression (Unnscaled, Optimized Parameters)
<br>
Training set score: 0.805 to 0.805
<br>
Test set score: 0.807 to 0.807
<br>
R squared: 0.779 to 0.780

Ridge Regression (Scaled, Optimized Parameters)
<br>
Training set score: 0.805 to 0.805
<br>
Test set score: 0.807 to 0.807
<br>
R squared: 0.779 to 0.782

4) Lasso Regression (Unnscaled, Optimized Parameters)
<br>
Training set score: 0.805 to 0.805
<br>
Test set score: 0.808 to 0.807
<br>
R squared: 0.780 to 0.783

Lasso Regression (Scaled, Optimized Parameters)
<br>
Training set score: 0.805 to 0.797
<br>
Test set score: 0.808 to 0.786
<br>
R squared: 0.780 to 0.780

In [25]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': np.arange(1, 15, 2)}
grid = GridSearchCV(KNeighborsRegressor(), param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)

print("Best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_params_))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))

Best mean cross-validation score: 0.021
Best parameters: {'n_neighbors': 13}
Test set score: 0.034




In [26]:
knn = KNeighborsRegressor(n_neighbors=13)
knn.fit(X_train, y_train)

print("Training set score: {:.3f}".format(knn.score(X_train, y_train)))
print("Test set score: {:.3f}".format(knn.score(X_test, y_test)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn, X_train, y_train, cv=10, scoring="r2"))))

Training set score: 0.157
Test set score: 0.034
R squared: 0.022


In [27]:
param_grid = {'n_neighbors': np.arange(1, 15, 2)}
grid = GridSearchCV(KNeighborsRegressor(), param_grid=param_grid, cv=10)
grid.fit(X_train_scaled, y_train)

print("Best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_params_))
print("Test set score: {:.3f}".format(grid.score(X_test_scaled, y_test)))

Best mean cross-validation score: 0.728
Best parameters: {'n_neighbors': 13}
Test set score: 0.760




In [28]:
knn_scaled = KNeighborsRegressor(n_neighbors=13)
knn_scaled.fit(X_train_scaled, y_train)

print("Training set score: {:.3f}".format(knn_scaled.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(knn_scaled.score(X_test_scaled, y_test)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn_scaled, X_train_scaled, y_train, cv=10, scoring="r2"))))

Training set score: 0.773
Test set score: 0.760
R squared: 0.728


In [29]:
parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid = GridSearchCV(LinearRegression(),parameters, cv=10)
grid.fit(X_train, y_train)
print("Best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_params_))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))

Best mean cross-validation score: 0.779
Best parameters: {'copy_X': True, 'fit_intercept': True, 'normalize': True}
Test set score: 0.807




In [30]:
lr = LinearRegression(fit_intercept = True, normalize = True, copy_X = True).fit(X_train, y_train)

print("Training set score: {:.3f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.3f}".format(lr.score(X_test, y_test)))
print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(lr, X_train, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.807
lr.coef_: [ 2.37299343e-04 -9.68465212e-03 -1.05122232e-01 -3.63632211e-01
  2.23981150e-03 -1.50110373e+00  1.87297506e-03 -3.68530746e-01
  5.17180694e-01 -1.97462843e-01]
lr.intercept_: 664.1994881014356
R squared: 0.779


In [31]:
parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid = GridSearchCV(LinearRegression(),parameters, cv=10)
grid.fit(X_train_scaled, y_train)
print("Best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_params_))
print("Test set score: {:.3f}".format(grid.score(X_test_scaled, y_test)))

Best mean cross-validation score: 0.779
Best parameters: {'copy_X': True, 'fit_intercept': True, 'normalize': False}
Test set score: 0.807




In [32]:
lr_scaled = LinearRegression(fit_intercept = True, normalize = False, copy_X = True).fit(X_train_scaled, y_train)

print("Training set score: {:.3f}".format(lr_scaled.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(lr_scaled.score(X_test_scaled, y_test)))
print("lr.coef_: {}".format(lr_scaled.coef_))
print("lr.intercept_: {}".format(lr_scaled.intercept_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(lr_scaled, X_train_scaled, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.807
lr.coef_: [ 0.99068803 -1.93639614 -1.17333052 -9.77646224  1.04183669 -0.09712146
  1.15380605 -0.67884963  3.71960199 -3.4784803 ]
lr.intercept_: 654.2720646933902
R squared: 0.779


In [33]:
alphas = np.array([1,0.1,0.01,0.001,0.0001,0]) #list of alpha values to test

grid = GridSearchCV(estimator=Ridge(max_iter=100000), param_grid=dict(alpha=alphas))
grid.fit(X_train, y_train)
print("Best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_estimator_.alpha))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))

Best mean cross-validation score: 0.778
Best parameters: 1.0
Test set score: 0.807




In [34]:
ridge = Ridge(alpha=1.0, max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.3f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.3f}".format(ridge.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(ridge.coef_ != 0)))
print("ridge.coef_: {}".format(ridge.coef_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(ridge, X_train, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.807
Number of features used: 10
ridge.coef_: [ 2.16651495e-04 -8.94419019e-03 -1.04514474e-01 -3.63985541e-01
  2.10678551e-03 -7.07149871e-01  1.86261011e-03 -3.61969722e-01
  5.16732038e-01 -1.96932369e-01]
R squared: 0.780


In [35]:
grid = GridSearchCV(estimator=Ridge(max_iter=100000), param_grid=dict(alpha=alphas))
grid.fit(X_train_scaled, y_train)
print("Best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_estimator_.alpha))
print("Test set score: {:.3f}".format(grid.score(X_test_scaled, y_test)))

Best mean cross-validation score: 0.779
Best parameters: 1.0
Test set score: 0.807




In [36]:
ridge_scaled = Ridge(alpha=1.0, max_iter=100000).fit(X_train_scaled, y_train)
print("Training set score: {:.3f}".format(ridge_scaled.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(ridge_scaled.score(X_test_scaled, y_test)))
print("Number of features used: {}".format(np.sum(ridge_scaled.coef_ != 0)))
print("ridge.coef_: {}".format(ridge_scaled.coef_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(ridge_scaled, X_train_scaled, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.807
Number of features used: 10
ridge.coef_: [ 0.07269491 -0.92810378 -1.24237127 -9.65603891  0.94468825 -0.08099605
  1.15706149 -0.63971526  3.7388452  -3.52255519]
R squared: 0.782


In [37]:
grid = GridSearchCV(estimator=Lasso(max_iter=100000), param_grid=dict(alpha=alphas))
grid.fit(X_train, y_train)
print("Best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_estimator_.alpha))
print("Test set score: {:.3f}".format(grid.score(X_test, y_test)))

  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)


Best mean cross-validation score: 0.780
Best parameters: 1.0
Test set score: 0.807


  positive)


In [38]:
lasso = Lasso(alpha=1.0, max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.3f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.3f}".format(lasso.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))
print("lasso.coef_: {}".format(lasso.coef_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(lasso, X_train, y_train, cv=10, scoring="r2"))))

Training set score: 0.805
Test set score: 0.807
Number of features used: 8
lasso.coef_: [-3.49967757e-04  2.49295965e-03 -8.87575499e-02 -3.79258543e-01
  1.92082688e-03  0.00000000e+00  2.48221832e-03 -0.00000000e+00
  4.81687904e-01 -1.86583703e-01]
R squared: 0.783


In [39]:
grid = GridSearchCV(estimator=Lasso(max_iter=100000), param_grid=dict(alpha=alphas))
grid.fit(X_train_scaled, y_train)
print("Best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("Best parameters: {}".format(grid.best_estimator_.alpha))
print("Test set score: {:.3f}".format(grid.score(X_test_scaled, y_test)))

  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)


Best mean cross-validation score: 0.780
Best parameters: 1.0
Test set score: 0.786


  positive)


In [40]:
lasso_scaled = Lasso(alpha=1.0, max_iter=100000).fit(X_train_scaled, y_train)
print("Training set score: {:.3f}".format(lasso_scaled.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(lasso_scaled.score(X_test_scaled, y_test)))
print("Number of features used: {}".format(np.sum(lasso_scaled.coef_ != 0)))
print("lasso.coef_: {}".format(lasso_scaled.coef_))
print("R squared: {:.3f}".format(np.mean(cross_val_score(lasso_scaled, X_train_scaled, y_train, cv=10, scoring="r2"))))

Training set score: 0.797
Test set score: 0.786
Number of features used: 5
lasso.coef_: [ -0.          -0.          -0.         -10.63996226  -0.
   0.           0.45852359  -0.22704534   3.32363204  -2.55679942]
R squared: 0.780


### Part 2 Introduction

This is a demonstration of supervised machine learning methods for a binary/categorical target variable. The data used for this section is imported from the following link, which features various variables measured for red and white wines. The target variable is "winetype", which is a new column that was created that labels the observation with either a 0 for white wine and 1 for red wine.

Data Source Link: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/Links to an external site.

In [44]:
df_red = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=';')
df_red['winetype'] = 1 # adding a new column and assigning 1 for all values

df_white = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white", sep=';')
df_white['winetype'] = 0 #adding a new column and assigning 0 for all values

df_combined = pd.concat([df_red, df_white], ignore_index=True) #combining both dataframes
df_combined.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,winetype
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,1
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5,1
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7,1
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5,1


In [50]:
# splitting the data into train and test sets
y2 = df_combined['winetype']
X2 = df_combined.loc[:, df_combined.columns != 'winetype']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=42) 

X2.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [51]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1e90).fit(X_train2, y_train2)

print("logreg .coef_: {}".format(logreg.coef_))
print("Training set score: {:.3f}".format(logreg.score(X_train2, y_train2)))
print("Test set score: {:.3f}".format(logreg.score(X_test2, y_test2)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(logreg, X_train2, y_train2, cv=10, scoring="r2"))))



logreg .coef_: [[  1.43096916  13.17085779  -0.32841584  -0.12852306  36.51920195
    0.06606175  -0.07055161 -19.1782016    8.62450495   8.95890106
   -0.47389951   0.07426586]]
Training set score: 0.990
Test set score: 0.987




R squared: 0.938


In [52]:
logreg_l1 = LogisticRegression(C=100, penalty='l1', tol=0.01, solver='saga')
logreg_l2 = LogisticRegression(C=100, penalty='l2', tol=0.01, solver='saga')
logreg_el = LogisticRegression(C=100, penalty='elasticnet', solver='saga',
                                   l1_ratio=0.5, tol=0.01)

logreg_l1.fit(X_train2, y_train2)
logreg_l2.fit(X_train2, y_train2)
logreg_el.fit(X_train2, y_train2)

print("logreg .coef_: {}".format(logreg_l1.coef_))
print("Training set score: {:.3f}".format(logreg_l1.score(X_train2, y_train2)))
print("Test set score: {:.3f}".format(logreg_l1.score(X_test2, y_test2)))
print("R squared: {:.3f}\n".format(np.mean(cross_val_score(logreg_l1, X_train2, y_train2, cv=10, scoring="r2"))))

print("logreg .coef_: {}".format(logreg_l2.coef_))
print("Training set score: {:.3f}".format(logreg_l2.score(X_train2, y_train2)))
print("Test set score: {:.3f}".format(logreg_l2.score(X_test2, y_test2)))
print("R squared: {:.3f}\n".format(np.mean(cross_val_score(logreg_l2, X_train2, y_train2, cv=10, scoring="r2"))))

print("logreg .coef_: {}".format(logreg_el.coef_))
print("Training set score: {:.3f}".format(logreg_el.score(X_train2, y_train2)))
print("Test set score: {:.3f}".format(logreg_el.score(X_test2, y_test2)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(logreg_el, X_train2, y_train2, cv=10, scoring="r2"))))

logreg .coef_: [[ 0.42350109  0.08148562 -0.008986   -0.09347198  0.01388506  0.02725462
  -0.06039775  0.02776264  0.12456495  0.06096316  0.01292353  0.01470244]]
Training set score: 0.942
Test set score: 0.930
R squared: 0.674

logreg .coef_: [[ 0.42359309  0.08151318 -0.00909452 -0.09352192  0.01388649  0.02724537
  -0.06037324  0.02772096  0.12430436  0.06088905  0.01295755  0.01472626]]
Training set score: 0.942
Test set score: 0.930
R squared: 0.674

logreg .coef_: [[ 0.42005078  0.08018139 -0.00875093 -0.0925929   0.01362327  0.02722884
  -0.0604296   0.0275276   0.12305487  0.05972416  0.01569108  0.01510141]]
Training set score: 0.942
Test set score: 0.930
R squared: 0.675


In [53]:
from sklearn.neighbors import KNeighborsClassifier
knn2 = KNeighborsClassifier(n_neighbors=3)
knn2.fit(X_train2, y_train2)

print("Training set score: {:.3f}".format(knn2.score(X_train2, y_train2)))
print("Test set score: {:.3f}".format(knn2.score(X_test2, y_test2)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn2, X_train2, y_train2, cv=10, scoring="r2"))))

Training set score: 0.969
Test set score: 0.936
R squared: 0.656


### Scaling Data using StandardScaler

When comparing the set scores (test and training) and the overall r^2 between the non-scaled and scaled results, all show improvements to varying extent. This makes sense because standard scaler conceptually transforms that data, where it has a mean of 0 and standard deviation of 1. In other words, it rearranges the data to a normal distribution. As such, it proves to be more effective for reuglarized models in this case (more signifciant improvement for the r squared value for penalized models and KNN classification), considering the target variable is binominal in natural (binary: 0 or 1).

1) Logistic Regression
<br>
Training set score: 0.990 to 0.994
<br>
Test set score: 0.987 to 0.991
<br>
R squared: 0.938 to 0.965

2) Penalized Logistic Regression L1
<br>
Training set score: 0.942 to 0.993
<br>
Test set score: 0.930 to 0.988
<br>
R squared: 0.675 to 0.959

3) Penalized Logistic Regression L2
<br>
Training set score: 0.942 to 0.993
<br>
Test set score: 0.930 to 0.989
<br>
R squared: 0.674 to 0.959

4) Penalized Logistic Regression Elasticnet
<br>
Training set score: 0.942 to 0.993
<br>
Test set score: 0.930 to 0.998
<br>
R squared: 0.675 to 0.959

5) KNN Classification
<br>
Trainining set score: 0.969 to 0.997
<br>
Test set score: 0.936 to 0.990
<br>
R squared: 0.656 to 0.964

In [54]:
scaler = preprocessing.StandardScaler().fit(X_train2)
X_train_scaled2 = scaler.transform(X_train2)
X_test_scaled2 = scaler.transform(X_test2)

In [55]:
logreg_scaled = LogisticRegression(C=1e90).fit(X_train_scaled2, y_train2)

print("logreg .coef_: {}".format(logreg_scaled.coef_))
print("Training set score: {:.3f}".format(logreg_scaled.score(X_train_scaled2, y_train2)))
print("Test set score: {:.3f}".format(logreg_scaled.score(X_test_scaled2, y_test2)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(logreg_scaled, X_train_scaled2, y_train2, cv=10, scoring="r2"))))

logreg .coef_: [[ 0.23559078  1.58962069 -0.28637813 -4.93750147  0.79218326  1.29778593
  -3.18115237  4.53143041  0.02047138  0.62833527  1.51163703  0.34671209]]
Training set score: 0.994
Test set score: 0.991




R squared: 0.965




In [56]:
logreg_l1_scaled = LogisticRegression(C=100, penalty='l1', tol=0.01, solver='saga')
logreg_l2_scaled = LogisticRegression(C=100, penalty='l2', tol=0.01, solver='saga')
logreg_el_scaled = LogisticRegression(C=100, penalty='elasticnet', solver='saga', l1_ratio=0.5, tol=0.01)

logreg_l1_scaled.fit(X_train_scaled2, y_train2)
logreg_l2_scaled.fit(X_train_scaled2, y_train2)
logreg_el_scaled.fit(X_train_scaled2, y_train2)

print("logreg .coef_: {}".format(logreg_l1_scaled.coef_))
print("Training set score: {:.3f}".format(logreg_l1_scaled.score(X_train_scaled2, y_train2)))
print("Test set score: {:.3f}".format(logreg_l1_scaled.score(X_test_scaled2, y_test2)))
print("R squared: {:.3f}\n".format(np.mean(cross_val_score(logreg_l1_scaled, X_train_scaled2, y_train2, cv=10, scoring="r2"))))

print("logreg .coef_: {}".format(logreg_l2_scaled.coef_))
print("Training set score: {:.3f}".format(logreg_l2_scaled.score(X_train_scaled2, y_train2)))
print("Test set score: {:.3f}".format(logreg_l2_scaled.score(X_test_scaled2, y_test2)))
print("R squared: {:.3f}\n".format(np.mean(cross_val_score(logreg_l2_scaled, X_train_scaled2, y_train2, cv=10, scoring="r2"))))

print("logreg .coef_: {}".format(logreg_el_scaled.coef_))
print("Training set score: {:.3f}".format(logreg_el_scaled.score(X_train_scaled2, y_train2)))
print("Test set score: {:.3f}".format(logreg_el_scaled.score(X_test_scaled2, y_test2)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(logreg_el_scaled, X_train_scaled2, y_train2, cv=10, scoring="r2"))))

logreg .coef_: [[ 1.07199337  1.56941883 -0.29799633 -2.28650117  0.9274505   0.80283647
  -2.94765746  2.11590939  0.71593491  0.86009786  0.41863605  0.24125075]]
Training set score: 0.993
Test set score: 0.989
R squared: 0.959

logreg .coef_: [[ 1.07307554  1.56911038 -0.30016438 -2.2972451   0.92716902  0.79109616
  -2.9362025   2.11685509  0.7166074   0.85891695  0.41915654  0.24339478]]
Training set score: 0.993
Test set score: 0.988
R squared: 0.959

logreg .coef_: [[ 1.06848513  1.5673305  -0.30069483 -2.30953016  0.92618916  0.7938456
  -2.9362368   2.1304068   0.71314783  0.857098    0.42663898  0.24396544]]
Training set score: 0.993
Test set score: 0.988
R squared: 0.959


In [57]:
knn2_scaled = KNeighborsClassifier(n_neighbors=3)
knn2_scaled.fit(X_train_scaled2, y_train2)

print("Training set score: {:.3f}".format(knn2_scaled.score(X_train_scaled2, y_train2)))
print("Test set score: {:.3f}".format(knn2_scaled.score(X_test_scaled2, y_test2)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn2_scaled, X_train_scaled2, y_train2, cv=10, scoring="r2"))))

Training set score: 0.997
Test set score: 0.990
R squared: 0.964


### Hypertuning parameteres using GridSearchCV

Even though it was more work, I have applied the GridSearchCV to both scaled and unscaled variables for KNN classification and logistic regression. The results was then compared against the unaltered and unscaled original models. The code and the full process is shown below.

In terms of the result, even though the scaled model of KNN classification improved its r squared score by a significant amount, it is not a feasible model as the knn neighbor has only been optimized to be 1 as the best fit. The best logistic regression model proved to be the penalized model with a penalized term of L1 (Lasso) and a C value of 1. There was only a slight improvement in the fit of the model for unscaled variables when the GridSearchCv was applied.

1) KNN Classification (Unscaled, Optimized)
<br>
Trainining set score: 0.969 to 1.000
<br>
Test set score: 0.936 to 0.945
<br>
R squared: 0.656 to 0.695

KNN Classification (Scaled, Optimized)
<br>
Trainining set score: 0.969 to 0.999
<br>
Test set score: 0.936 to 0.991
<br>
R squared: 0.656 to 0.967

2) Penalized Logistic Regression (Unscaled, Optimized)
<br>
Training set score: 0.942 to 0.943
<br>
Test set score: 0.930 to 0.930
<br>
R squared: 0.675 to 0.683

Penalized Logistic Regression (Scaled, Optimized)
<br>
Training set score: 0.942 to 0.993
<br>
Test set score: 0.930 to 0.989
<br>
R squared: 0.675 to 0.959

In [59]:
param_grid = {'n_neighbors': np.arange(1, 15, 2)} #np.arange creates sequence of numbers for each k value

grid2 = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)

#use meta model methods to fit score and predict model:
grid2.fit(X_train2, y_train2)

#extract best score and parameter by calling objects "best_score_" and "best_params_"
print("best mean cross-validation score: {:.3f}".format(grid2.best_score_))
print("best parameters: {}".format(grid2.best_params_))
print("test-set score: {:.3f}".format(grid2.score(X_test2, y_test2)))

best mean cross-validation score: 0.945
best parameters: {'n_neighbors': 1}
test-set score: 0.945


In [60]:
knn2 = KNeighborsClassifier(n_neighbors=1)
knn2.fit(X_train2, y_train2)

print("Training set score:{:.3f}".format(knn2.score(X_train2, y_train2)))
print("Test set score: {:.3f}".format(knn2.score(X_test2, y_test2)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn2, X_train2, y_train2, cv=10, scoring="r2"))))

Training set score:1.000
Test set score: 0.945
R squared: 0.695


In [61]:
param_grid = {'n_neighbors': np.arange(1, 15, 2)} #np.arange creates sequence of numbers for each k value

grid2 = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)

#use meta model methods to fit score and predict model:
grid2.fit(X_train_scaled2, y_train2)

#extract best score and parameter by calling objects "best_score_" and "best_params_"
print("best mean cross-validation score: {:.3f}".format(grid2.best_score_))
print("best parameters: {}".format(grid2.best_params_))
print("test-set score: {:.3f}".format(grid2.score(X_test_scaled2, y_test2)))

best mean cross-validation score: 0.994
best parameters: {'n_neighbors': 1}
test-set score: 0.991


In [62]:
knn2_scaled = KNeighborsClassifier(n_neighbors=1)
knn2_scaled.fit(X_train_scaled2, y_train2)

print("Training set score: {:.3f}".format(knn2_scaled.score(X_train_scaled2, y_train2)))
print("Test set score: {:.3f}".format(knn2_scaled.score(X_test_scaled2, y_test2)))
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn2_scaled, X_train_scaled2, y_train2, cv=10, scoring="r2"))))

Training set score: 0.999
Test set score: 0.991
R squared: 0.967


In [63]:
grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100, 1000], "penalty":["l1","l2"]}
logreg = LogisticRegression(tol=0.01, solver = 'saga')
logreg2 = GridSearchCV(logreg,grid,cv=10)
logreg2.fit(X_train2, y_train2)

print("Best paramters:",logreg2.best_params_)
print("R Squared:",logreg2.best_score_)

Best paramters: {'C': 0.01, 'penalty': 'l1'}
R Squared: 0.9423234811165846


In [64]:
logreg_l1 = LogisticRegression(C=0.01, penalty='l1', tol=0.01, solver='saga')

logreg_l1.fit(X_train2, y_train2)

print("logreg .coef_: {}".format(logreg_l1.coef_))
print("Training set score: {:.3f}".format(logreg_l1.score(X_train2, y_train2)))
print("Test set score: {:.3f}".format(logreg_l1.score(X_test2, y_test2)))
print("R squared: {:.3f}\n".format(np.mean(cross_val_score(logreg_l1, X_train2, y_train2, cv=10, scoring="r2"))))

logreg .coef_: [[ 0.40564202  0.00506072  0.         -0.05206556  0.          0.02506152
  -0.05808551  0.          0.07222948  0.          0.02874361  0.        ]]
Training set score: 0.943
Test set score: 0.930
R squared: 0.683



In [65]:
grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100, 1000], "penalty":["l1","l2"]}
logreg = LogisticRegression(tol=0.01, solver = 'saga')
logreg2 = GridSearchCV(logreg,grid,cv=10)
logreg2.fit(X_train_scaled2, y_train2)

print("Best paramters:",logreg2.best_params_)
print("R Squared:",logreg2.best_score_)

Best paramters: {'C': 1, 'penalty': 'l1'}
R Squared: 0.9926108374384236


In [66]:
logreg_l1_scaled = LogisticRegression(C=1, penalty='l1', tol=0.01, solver='saga')

logreg_l1_scaled.fit(X_train_scaled2, y_train2)

print("logreg .coef_: {}".format(logreg_l1_scaled.coef_))
print("Training set score: {:.3f}".format(logreg_l1_scaled.score(X_train_scaled2, y_train2)))
print("Test set score: {:.3f}".format(logreg_l1_scaled.score(X_test_scaled2, y_test2)))
print("R squared: {:.3f}\n".format(np.mean(cross_val_score(logreg_l1_scaled, X_train_scaled2, y_train2, cv=10, scoring="r2"))))

logreg .coef_: [[ 1.02841031  1.53760509 -0.26618522 -2.17365116  0.89247742  0.70690414
  -2.86530892  2.01951825  0.71118153  0.85540343  0.35822099  0.22243465]]
Training set score: 0.993
Test set score: 0.988
R squared: 0.959



### Assessing impact of cross-validation strategy in GridSearchCV from ‘stratified k-fold’ to ‘kfold’ with shuffling

In [68]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
# Set up function parameters for diff't cross validation strategies
kfold1 = KFold(n_splits=5, random_state=None, shuffle=False)
print("R squared: {:.3f}\n".format(np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=5), X2, y2, cv=kfold1))))
#This improves the fit from 0.909 to a stratified k fold fit of 0.674

kfold2 = KFold(n_splits=5, random_state=None, shuffle=True)
print("R squared: {:.3f}\n".format(np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=5), X2, y2, cv=kfold2))))
#By implementing the shuffle, the fit improved from 0.909 to 0.941

kfold3 = KFold(n_splits=5, random_state=3, shuffle=True)
print("R squared: {:.3f}\n".format(np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=5), X2, y2, cv=kfold3))))
#By adding a random state of the shuffle, the fit improved in comparsion to without a shuffle, but stayed the same in relation to without a random state

X_train3, X_test3, y_train3, y_test3 = train_test_split(X2, y2, random_state=0)
X_train4, X_test4, y_train4, y_test4 = train_test_split(X2, y2, random_state=1) 

knn2 = KNeighborsClassifier(n_neighbors=5)
knn2.fit(X_train2, y_train2)
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn2, X_train2, y_train2, cv=10, scoring="r2"))))

knn3 = KNeighborsClassifier(n_neighbors=5)
knn3.fit(X_train3, y_train3)
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn3, X_train3, y_train3, cv=10, scoring="r2"))))
#By changing the random state from 42 to 0, it decreased the fit from 0.674 to 0.664

knn4 = KNeighborsClassifier(n_neighbors=5)
knn4.fit(X_train4, y_train4)
print("R squared: {:.3f}".format(np.mean(cross_val_score(knn4, X_train4, y_train4, cv=10, scoring="r2"))))
#By changing the random state from 42 to 1, it improved the fit from 0.674 to 0.683

R squared: 0.909

R squared: 0.941

R squared: 0.941

R squared: 0.674
R squared: 0.664
R squared: 0.683
