<a href="https://colab.research.google.com/github/MichaelMcCarey/SteelProject/blob/main/SteelProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# import useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# load the steel dataset straight from the project's GitHub repository
steel = (
    'https://raw.githubusercontent.com/'
    'MichaelMcCarey/SteelProject/main/steel.csv'
)
steel_data = pd.read_csv(steel)

In [4]:
# sanity check: show the first rows followed by the (rows, columns) shape
print(steel_data.head(), steel_data.shape, sep='\n')

   normalising_temperature  tempering_temperature  percent_silicon  \
0                  178.500                    275           0.1530   
1                  178.500                    950           0.1530   
2                  178.500                    375           0.1530   
3                  178.500                    900           0.1530   
4                  189.525                    900           0.1624   

   percent_chromium  percent_copper  percent_nickel  percent_sulphur  \
0          0.970575           0.942          0.8870              0.0   
1          1.212726           0.942          0.8870              0.0   
2          1.621165           0.942          0.8870              0.0   
3          0.809989           0.942          0.8870              0.0   
4          1.036229           0.849          0.9382              0.0   

   percent_carbon  percent_manganese  tensile_strength  
0           1.920                0.0         25.107613  
1           1.920               

In [5]:
# import k-fold cross validation, grid search, and the two algorithms
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [6]:
# separate the dependent (target) column from the independent (feature) columns
dependent_cols = ['tensile_strength']
independent_cols = [
    'normalising_temperature',
    'tempering_temperature',
    'percent_silicon',
    'percent_chromium',
    'percent_copper',
    'percent_nickel',
    'percent_sulphur',
    'percent_carbon',
    'percent_manganese',
]

In [7]:
# build the feature matrix (x) and the target matrix (y) from the column lists,
# then show both shapes to confirm the split
x = steel_data[independent_cols]
y = steel_data[dependent_cols]
print(x.shape, y.shape, sep='\n')

(553, 9)
(553, 1)


In [8]:
# 10-fold cross validation; shuffling with a fixed seed keeps the folds
# identical between runs
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [9]:
# two metrics are tracked throughout:
#   r2                          - domain independent measure of error
#   neg_root_mean_squared_error - domain specific measure of error (RMSE, negated)
scoring = ('r2', 'neg_root_mean_squared_error')

In [56]:
# KNN model with default hyperparameters (no arguments are passed to the constructor)
# NOTE: the name `model` is rebound to a DecisionTreeRegressor further down the
# notebook, so cells that read `model` depend on execution order
model = KNeighborsRegressor()

In [57]:
# score the KNN model on the shared kfold splits, keeping the training-fold
# scores alongside the test-fold scores
knn_scores = cross_validate(
    model,
    x,
    y,
    cv=kfold,
    scoring=scoring,
    return_train_score=True,
)

In [58]:
# average each metric over the folds, for the training and the testing splits
knn_train_r2_score, knn_test_r2_score = (
    knn_scores[key].mean() for key in ('train_r2', 'test_r2')
)
# the rmse scorer is negated, so flip the sign to report a positive error
knn_train_rmse_score, knn_test_rmse_score = (
    -knn_scores[key].mean()
    for key in (
        'train_neg_root_mean_squared_error',
        'test_neg_root_mean_squared_error',
    )
)

In [13]:
# report order: train r2, test r2, train rmse, test rmse (one value per line)
print(
    knn_train_r2_score,
    knn_test_r2_score,
    knn_train_rmse_score,
    knn_test_rmse_score,
    sep='\n',
)

0.7688815173859286
0.6176167029764584
43.75255009263816
54.25712410871235


In [93]:
# hyperparameter grid for the KNN grid search: neighbour counts 1 through 49
# crossed with both weighting schemes
knn_param_grid = {
    'n_neighbors': range(1, 50),
    'weights': ['uniform', 'distance'],
}

In [98]:
# set up the grid search over knn_param_grid on the shared kfold splits.
# A fresh KNeighborsRegressor is passed explicitly instead of the shared `model`
# variable: `model` is rebound to a DecisionTreeRegressor later in the notebook,
# so relying on it makes this cell depend on execution order.
# Both metrics in `scoring` are recorded; refit='r2' selects (and refits) the
# candidate with the best mean test r2.
knn_gridsearch = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=knn_param_grid,
    cv=kfold,
    scoring=scoring,
    refit='r2',
    return_train_score=True,
)

In [99]:
# run the grid search on the full feature matrix and target
knn_gridsearch.fit(x, y)

In [103]:
#knn_best = knn_gridsearch.best_estimator_

In [104]:
#knn_best_scores = cross_validate(knn_best, x, y, cv=kfold, scoring=scoring, return_train_score=True)

In [105]:
#knn_best_train_r2_score = knn_best_scores['train_r2'].mean()
#knn_best_test_r2_score = knn_best_scores['test_r2'].mean()
#knn_best_train_rmse_score = -knn_best_scores['train_neg_root_mean_squared_error'].mean()
#knn_best_test_rmse_score = -knn_best_scores['test_neg_root_mean_squared_error'].mean()

In [106]:
# report the scores of the best KNN candidate found by the grid search.
# The search was configured with return_train_score=True and both metrics, so
# the mean train/test scores for the winning candidate can be read directly
# from cv_results_ at best_index_ without running cross validation again.
knn_best_index = knn_gridsearch.best_index_
knn_cv_results = knn_gridsearch.cv_results_
knn_best_train_r2_score = knn_cv_results['mean_train_r2'][knn_best_index]
knn_best_test_r2_score = knn_cv_results['mean_test_r2'][knn_best_index]
# the rmse scorer is negated, so flip the sign to report a positive error
knn_best_train_rmse_score = -knn_cv_results['mean_train_neg_root_mean_squared_error'][knn_best_index]
knn_best_test_rmse_score = -knn_cv_results['mean_test_neg_root_mean_squared_error'][knn_best_index]
print(knn_best_train_r2_score)
print(knn_best_test_r2_score)
print(knn_best_train_rmse_score)
print(knn_best_test_rmse_score)

1.0
0.6693172719771089
-0.0
50.44878313950988


In [51]:
# Decision tree model with default hyperparameters.
# random_state is fixed (same seed as the KFold splitter) so that the tree's
# internal randomness does not change the reported scores between runs.
model = DecisionTreeRegressor(random_state=42)

In [52]:
# score the decision tree on the shared kfold splits, keeping the training-fold
# scores alongside the test-fold scores
tree_scores = cross_validate(
    model,
    x,
    y,
    cv=kfold,
    scoring=scoring,
    return_train_score=True,
)

In [54]:
# average each metric over the folds, for the training and the testing splits
tree_train_r2_score, tree_test_r2_score = (
    tree_scores[key].mean() for key in ('train_r2', 'test_r2')
)
# the rmse scorer is negated, so flip the sign to report a positive error
tree_train_rmse_score, tree_test_rmse_score = (
    -tree_scores[key].mean()
    for key in (
        'train_neg_root_mean_squared_error',
        'test_neg_root_mean_squared_error',
    )
)

In [55]:
# report order: train r2, test r2, train rmse, test rmse (one value per line)
print(
    tree_train_r2_score,
    tree_test_r2_score,
    tree_train_rmse_score,
    tree_test_rmse_score,
    sep='\n',
)

1.0
0.7976560393527121
4.416341248915534e-16
39.1592030701794


In [None]:
# hyperparameter grid for the decision tree grid search.
# min_samples_leaf takes numeric values (minimum samples per leaf); the previous
# 'uniform'/'distance' entries were KNN `weights` options and are not valid here.
# NOTE(review): the leaf sizes below are a reasonable starting range - tune as needed.
# max_depth=99 presumably stands in for "effectively unlimited" depth.
tree_param_grid = {
    'max_depth': [3, 5, 7, 9, 11, 99],
    'min_samples_leaf': [1, 2, 5, 10],
}