<a href="https://colab.research.google.com/github/HunterVinic/Machine-Learning-with-Python/blob/main/GridSearchCV_Hyperparameter_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Mount Google Drive at /content/drive so later cells can read files stored there.
# The google.colab package is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation and fit-failure warnings.
warnings.filterwarnings('ignore')

# Read the startups dataset from the mounted Drive folder and preview it.
csv_path = "/content/drive/MyDrive/datamining/50_Startups.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,NewYork,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,NewYork,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Count the missing values in each column before imputation.
data.isna().sum()

R&D Spend          3
Administration     0
Marketing Spend    4
State              4
Profit             0
dtype: int64

In [4]:
# Impute missing values column by column.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained form is deprecated
# in pandas 2.x and does not update `data` once Copy-on-Write is enabled.

# Numeric columns: replace missing entries with the column mean.
data['R&D Spend'] = data['R&D Spend'].fillna(data['R&D Spend'].mean())
data['Marketing Spend'] = data['Marketing Spend'].fillna(data['Marketing Spend'].mean())

# Categorical column: replace missing entries with the most frequent value (mode).
data['State'] = data['State'].fillna(data['State'].mode()[0])

In [5]:
# Re-count the missing values per column to confirm the imputation left none.
data.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [6]:
# One-hot encode the categorical column; drop_first=True drops the first
# category level so only k-1 indicator columns are created.
categorical_cols = ['State']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_NewYork
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [10]:
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is 'Profit'.
X = data.drop(columns='Profit')
y = data['Profit']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(40, 5)
(10, 5)


In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Create one instance of every regressor to compare (seeded where a seed is passed).
lr = LinearRegression()
dtr = DecisionTreeRegressor(random_state=42)
rfr = RandomForestRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)
svm = SVR()
knn = KNeighborsRegressor()

# Estimators and their display labels, kept in the same order.
models = [lr, dtr, rfr, gbr, svm, knn]
model_names = [
    'Linear Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting',
    'SVM',
    'KNN',
]
errors = []
r2_scores = []

# Fit each estimator on the training split and score it on the test split.
for estimator in models:
    try:
        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)
        # R2 expressed as a percentage, rounded to two decimals.
        score = round(r2_score(y_test, y_pred) * 100, 2)
    except Exception as exc:
        # Keep the comparison going: record the message instead of a score.
        score, failure = None, str(exc)
    else:
        failure = None
    r2_scores.append(score)
    errors.append(failure)

# Collect the outcome per model and show the best-scoring models first.
results = pd.DataFrame({
    "model": model_names,
    "R2_Score": r2_scores,
    "Errors": errors,
})
results.sort_values(by='R2_Score', ascending=False)

Unnamed: 0,model,R2_Score,Errors
0,Linear Regression,69.06,
5,KNN,43.07,
2,Random Forest,42.29,
3,Gradient Boosting,10.69,
4,SVM,-18.0,
1,Decision Tree,-68.75,


In [24]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the decision tree.
# The criterion names 'mse' and 'mae' were renamed to 'squared_error' and
# 'absolute_error' in scikit-learn 1.0 and the old spellings were removed in 1.2.
# On releases that reject the old names, every candidate using them fails to
# fit and is scored as NaN by GridSearchCV; with warnings globally suppressed
# that failure is silent, so only 'friedman_mse' would really be searched.
param_grid_dtr = {
    'max_depth': [None, 3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3],
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
}

# Exhaustive 5-fold cross-validated search over the grid, scored by R2,
# using all available CPU cores. Only the training split is used.
grid_dtr = GridSearchCV(dtr, param_grid_dtr, cv=5, scoring='r2', n_jobs=-1)
grid_dtr.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated R2.
best_params_tree = grid_dtr.best_params_
best_score_tree = grid_dtr.best_score_

best_params_tree, best_score_tree

({'criterion': 'friedman_mse',
  'max_depth': None,
  'min_samples_leaf': 3,
  'min_samples_split': 2},
 0.7471780462562109)

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize all the models.
# The decision tree is built from the parameters selected by the grid search
# (best_params_tree) rather than hand-copied values, so it cannot drift from
# the search result. random_state=42 is kept, matching the tree that was
# passed to the grid search, so the reported score is reproducible.
lr = LinearRegression()
dtr = DecisionTreeRegressor(random_state=42, **best_params_tree)
rfr = RandomForestRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)
svm = SVR()
knn = KNeighborsRegressor()

# List of all models and their display names (same order).
models = [lr, dtr, rfr, gbr, svm, knn]
model_names = ['Linear Regression', 'Decision Tree', 'Random Forest',
               'Gradient Boosting', 'SVM', 'KNN']
errors = []
r2_scores = []

# Fit each model on the training split and score it on the held-out test split.
for model in models:
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # R2 expressed as a percentage, rounded to two decimals.
        r2 = round(r2_score(y_test, y_pred) * 100, 2)
        r2_scores.append(r2)
        errors.append(None)
    except Exception as e:
        # Best-effort comparison: record the error text instead of stopping.
        r2_scores.append(None)
        errors.append(str(e))

# One row per model, shown with the highest R2 first.
results = pd.DataFrame({
    "model": model_names,
    "R2_Score": r2_scores,
    "Errors": errors,
})
results.sort_values(by='R2_Score', ascending=False)

Unnamed: 0,model,R2_Score,Errors
0,Linear Regression,69.06,
5,KNN,43.07,
2,Random Forest,42.29,
1,Decision Tree,29.45,
3,Gradient Boosting,10.69,
4,SVM,-18.0,
