<a href="https://colab.research.google.com/github/MichaelMcCarey/SteelProject/blob/main/SteelProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# import useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load the steel dataset straight from the project's GitHub repository (raw CSV).
steel = 'https://raw.githubusercontent.com/MichaelMcCarey/SteelProject/main/steel.csv'
steel_data = pd.read_csv(filepath_or_buffer=steel)

In [3]:
# Sanity-check the import: preview the first five rows, then show the
# (rows, columns) shape of the loaded frame.
print(steel_data.head(), steel_data.shape, sep='\n')

   normalising_temperature  tempering_temperature  percent_silicon  \
0                  178.500                    275           0.1530   
1                  178.500                    950           0.1530   
2                  178.500                    375           0.1530   
3                  178.500                    900           0.1530   
4                  189.525                    900           0.1624   

   percent_chromium  percent_copper  percent_nickel  percent_sulphur  \
0          0.970575           0.942          0.8870              0.0   
1          1.212726           0.942          0.8870              0.0   
2          1.621165           0.942          0.8870              0.0   
3          0.809989           0.942          0.8870              0.0   
4          1.036229           0.849          0.9382              0.0   

   percent_carbon  percent_manganese  tensile_strength  
0           1.920                0.0         25.107613  
1           1.920               

In [5]:
# import k-fold cross validation, grid search, the two algorithms (KNN and decision tree) and the scaling/pipeline helpers
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [6]:
# Separate the dependent variable (the target to predict) from the
# independent variables (the features used to predict it).
dependent_cols = ['tensile_strength']
independent_cols = [
    'normalising_temperature',
    'tempering_temperature',
    'percent_silicon',
    'percent_chromium',
    'percent_copper',
    'percent_nickel',
    'percent_sulphur',
    'percent_carbon',
    'percent_manganese',
]

In [7]:
# Build the feature matrix (x) and the target (y) from the chosen columns.
# y is selected with a list of column names, so it stays a one-column
# DataFrame rather than a Series.
x, y = steel_data[independent_cols], steel_data[dependent_cols]
print(x.shape, y.shape, sep='\n')

(553, 9)
(553, 1)


In [8]:
# 10-fold cross validation. Rows are shuffled before splitting, and the fixed
# seed makes the fold assignment reproducible between runs.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [9]:
# Two error measures are tracked for every model:
#   - r2   : the domain-independent measure of error
#   - RMSE : the domain-specific measure of error, requested through the
#            'neg_root_mean_squared_error' scorer (values come back negated)
scoring = ('r2', 'neg_root_mean_squared_error')

In [75]:
# KNN model with default hyperparameters.
# KNN is much more susceptible to the relative scale of the features, so the
# regressor is wrapped in a pipeline that standard-scales the inputs first.
knn_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('knn', KNeighborsRegressor()),
    ]
)

In [76]:
# Cross validate the default KNN pipeline on the k-fold splitter set up
# previously, keeping the train scores as well as the test scores.
knn_scores = cross_validate(
    estimator=knn_pipeline,
    X=x,
    y=y,
    scoring=scoring,
    cv=kfold,
    return_train_score=True,
)

In [77]:
# Average each measure of error over the folds, for training and testing.
knn_train_r2_score = np.mean(knn_scores['train_r2'])
knn_test_r2_score = np.mean(knn_scores['test_r2'])
# The RMSE scorer returns negative numbers, so flip the sign to report a
# conventional (positive) RMSE.
knn_train_rmse_score = -np.mean(knn_scores['train_neg_root_mean_squared_error'])
knn_test_rmse_score = -np.mean(knn_scores['test_neg_root_mean_squared_error'])

In [12]:
# Default KNN results, one per line: train r2, test r2, train RMSE, test RMSE.
print(
    knn_train_r2_score,
    knn_test_r2_score,
    knn_train_rmse_score,
    knn_test_rmse_score,
    sep='\n',
)

0.857842272435416
0.7658132198063687
34.3076604870604
42.65590640074778


In [78]:
# Hyperparameter grid for the KNN grid search.
# The keys use the '<step name>__<parameter>' form because the estimator being
# searched is a Pipeline whose regressor step is named 'knn'; plain parameter
# names raise an error.
knn_param_grid = {
    'knn__n_neighbors': range(1, 20),  # k = 1 .. 19
    'knn__weights': ['uniform', 'distance'],
}

In [86]:
# Set up the grid search over the KNN pipeline.
# Both metrics in `scoring` are recorded; with more than one metric the search
# needs to be told which one selects the best model, hence refit='r2'.
knn_gridsearch = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=knn_param_grid,
    scoring=scoring,
    refit='r2',
    cv=kfold,
    return_train_score=True,
)

In [88]:
# Run the KNN grid search: every parameter combination is cross validated.
knn_gridsearch.fit(X=x, y=y)

In [89]:
# Show the best pipeline (with its selected hyperparameters) found by the grid search.
print(knn_gridsearch.best_estimator_)

Pipeline(steps=[('scaler', StandardScaler()),
                ('knn',
                 KNeighborsRegressor(n_neighbors=7, weights='distance'))])


In [90]:
# Collect the mean cross-validated train and test scores (r2 and RMSE) of the
# best hyperparameter combination, i.e. the row of cv_results_ at best_index_.
knn_best = knn_gridsearch.best_index_
# best_score_ is the mean test score of the refit metric, which is r2 here.
knn_best_test_r2 = knn_gridsearch.best_score_
knn_best_train_r2 = knn_gridsearch.cv_results_['mean_train_r2'][knn_best]
# The RMSE scorer stores negated values (always <= 0). abs() restores the
# conventional positive RMSE and, unlike a unary minus, cannot produce "-0.0"
# when the stored score is exactly zero.
knn_best_test_rmse = abs(knn_gridsearch.cv_results_['mean_test_neg_root_mean_squared_error'][knn_best])
knn_best_train_rmse = abs(knn_gridsearch.cv_results_['mean_train_neg_root_mean_squared_error'][knn_best])
# NOTE: if the selected model uses weights='distance', a perfect train score
# (r2 of 1.0, RMSE of 0) is expected, presumably because every training point
# is its own zero-distance neighbour; judge such a model on its test scores.

In [91]:
# Tuned KNN results, one per line: train r2, test r2, train RMSE, test RMSE.
print(
    knn_best_train_r2,
    knn_best_test_r2,
    knn_best_train_rmse,
    knn_best_test_rmse,
    sep='\n',
)

1.0
0.8030817784218238
-0.0
39.203329591482074


In [92]:
# Decision tree model with default hyperparameters.
# random_state is pinned because, without it, the grid search was giving a
# different value for max_depth on every run.
model = DecisionTreeRegressor(random_state=42)

In [93]:
# Cross validate the default decision tree on the k-fold splitter set up
# previously, keeping the train scores as well as the test scores.
tree_scores = cross_validate(
    estimator=model,
    X=x,
    y=y,
    scoring=scoring,
    cv=kfold,
    return_train_score=True,
)

In [94]:
# Average each measure of error over the folds, for training and testing.
tree_train_r2_score = np.mean(tree_scores['train_r2'])
tree_test_r2_score = np.mean(tree_scores['test_r2'])
# The RMSE scorer returns negative numbers, so flip the sign to report a
# conventional (positive) RMSE.
tree_train_rmse_score = -np.mean(tree_scores['train_neg_root_mean_squared_error'])
tree_test_rmse_score = -np.mean(tree_scores['test_neg_root_mean_squared_error'])

In [95]:
# Default decision tree results, one per line: train r2, test r2, train RMSE,
# test RMSE.
print(
    tree_train_r2_score,
    tree_test_r2_score,
    tree_train_rmse_score,
    tree_test_rmse_score,
    sep='\n',
)

1.0
0.778725028935884
1.7652056078443961e-15
40.6944843021363


In [96]:
# Hyperparameter grid for the decision tree grid search.
# max_depth mixes the integers 1..15 with None, so it has to be a real list;
# a bare range cannot hold the None entry.
tree_param_grid = {
    'max_depth': [*range(1, 16), None],
    'min_samples_leaf': range(1, 11),  # 1 .. 10
}

In [97]:
# Set up the grid search over the decision tree.
# Both metrics in `scoring` are recorded; refit='r2' makes r2 the metric that
# selects the best model. To select on RMSE instead, pass
# refit='neg_root_mean_squared_error' and leave the other arguments unchanged.
tree_gridsearch = GridSearchCV(
    estimator=model,
    param_grid=tree_param_grid,
    scoring=scoring,
    refit='r2',
    cv=kfold,
    return_train_score=True,
)

In [98]:

# Run the decision tree grid search: every parameter combination is cross validated.
tree_gridsearch.fit(X=x, y=y)

In [105]:
# Show the best decision tree (with its selected hyperparameters) found by the grid search.
print(tree_gridsearch.best_estimator_)

DecisionTreeRegressor(max_depth=13, min_samples_leaf=2, random_state=42)


In [103]:
# Collect the mean cross-validated train and test scores (r2 and RMSE) of the
# best hyperparameter combination, i.e. the row of cv_results_ at best_index_.
tree_best = tree_gridsearch.best_index_

# r2: best_score_ is the mean test score of the refit metric, which is r2 here.
tree_best_train_r2 = tree_gridsearch.cv_results_['mean_train_r2'][tree_best]
tree_best_test_r2 = tree_gridsearch.best_score_

# RMSE: stored negated by the scorer, so flip the sign back.
tree_best_train_rmse = -(
    tree_gridsearch.cv_results_['mean_train_neg_root_mean_squared_error'][tree_best]
)
tree_best_test_rmse = -(
    tree_gridsearch.cv_results_['mean_test_neg_root_mean_squared_error'][tree_best]
)

In [104]:
# Tuned decision tree results, one per line: train r2, test r2, train RMSE,
# test RMSE.
print(
    tree_best_train_r2,
    tree_best_test_r2,
    tree_best_train_rmse,
    tree_best_test_rmse,
    sep='\n',
)

0.9828991475806441
0.8279012982226763
11.867159683420342
36.370976549347986
