In [20]:
# Stacking Regression Using scikit-learn
from sklearn.datasets import load_diabetes
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle
from sklearn import metrics

In [21]:
# Load the scikit-learn diabetes dataset; the returned object exposes the
# .data, .feature_names and .target attributes that the next cell reads.
diabetes = load_diabetes()

In [22]:
# Wrap the raw arrays in labelled dataframes: one frame for the predictors
# and one single-column frame for the response.
df_features = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df_target = pd.DataFrame(diabetes.target, columns=['target'])


In [23]:
# Join side by side so 'target' becomes the right-most column.
final = pd.concat([df_features, df_target], axis='columns')

final

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


### Save the data frame into csv file
# index=False keeps the row index out of the file, so reading it back does
# not introduce an extra unnamed index column.
final.to_csv('diabetes_new.csv', index = False)

# Reload from disk so the rest of the notebook works on the saved copy.
final = pd.read_csv('diabetes_new.csv')

# Summarise columns, dtypes and non-null counts. A bare `final` expression
# previously sat directly above this call; since it was not the last
# expression it displayed nothing, so it has been dropped.
final.info()

In [24]:
# Predictors: the first ten columns of `final`, as a plain NumPy array.
predictor_columns = final.iloc[:, :10]
X = np.array(predictor_columns)

# Target: the 'target' column, as a plain NumPy array.
target_column = final['target']
y = np.array(target_column)

In [25]:
# Hold out a test split; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [26]:
# Base estimators: (name, estimator) pairs for the first layer of the stack.
ridge_base = RidgeCV()
svr_base = LinearSVR(random_state=42)

estimators = [("lr", ridge_base), ("svr", svr_base)]

In [27]:
# Meta model: a small, seeded random forest used as the final estimator
# stacked on top of the base estimators.
meta_model = RandomForestRegressor(n_estimators=10, random_state=42)

reg = StackingRegressor(estimators=estimators, final_estimator=meta_model)

In [28]:
# Train the stack on the training split. fit() returns the estimator itself,
# so `stacking_reg` refers to the same fitted object as `reg`.
reg.fit(X_train, y_train)
stacking_reg = reg
stacking_reg

StackingRegressor(estimators=[('lr', RidgeCV(alphas=array([ 0.1,  1. , 10. ]))),
                              ('svr', LinearSVR(random_state=42))],
                  final_estimator=RandomForestRegressor(n_estimators=10,
                                                        random_state=42))

In [29]:
# Save the ML model
# The context manager closes the file handle as soon as the dump finishes,
# so the pickle is fully written before it is read back below.
with open('stacking_reg_diabetes.pkl', 'wb') as model_file:
    pickle.dump(stacking_reg, model_file)

# Load the saved model
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source (here, the file written just above).
with open('stacking_reg_diabetes.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [30]:
# Predict on the held-out split using the model reloaded from the pickle.
pred = model.predict(X_test)

# Bare name as the last expression so the notebook displays the array.
pred

array([136.7, 129.4, 141.2, 254.2, 174.8,  91.6, 271. , 185.6,  67.4,
       164.3,  99.7, 156.7,  52.9, 212.7,  93.8, 146.8, 198.2, 248.5,
       262.1, 221.9, 225.8,  74.3,  60.8, 189. , 128.8, 151.5, 231.9,
       113.7,  52.2, 147.6, 136.4, 111. , 186. , 123.3, 151.1, 189. ,
       174.8, 163.6, 136.7,  50.7,  60.4, 157.7, 144.5, 142.5, 151.1,
        52.9,  64.6,  93.8,  50.7, 199.1, 128.8,  52.9, 163.6,  97.3,
       195.7, 177.7,  84.7, 187.3, 147.6,  48.9, 123.3, 262.1, 141.2,
       166.9, 168. , 234.7, 140. , 144.5, 165.8, 136.7, 160.8, 262.1,
       212.5, 129.5,  67.4, 195.6, 245.1, 234.7, 162.4, 236.1, 147.6,
       138.6,  52.2,  52.2, 157.7,  67.4,  64.6,  47.9, 161.3, 185.6,
       127.8, 218.9,  93.8,  47.9,  50.7, 215.1, 276.1, 127. ,  92.6,
        52.9, 245.1, 157.7, 254.2,  99.7, 128.8,  89.2, 213.2, 169.1,
       151.5, 123.3, 147.6])

In [31]:
# Evaluate the reloaded model on the held-out split. For scikit-learn
# regressors, score() reports the coefficient of determination (R^2).
r2_score = model.score(X_test, y_test)

In [32]:
# Write the held-out score to the cell's standard output.
print(r2_score)

0.3642619780615395


In [33]:
# Location of the extra test file. NOTE(review): this is an absolute path on
# one specific machine; point it at your own copy before re-running.
test_csv_path = r'C:\Users\Bharani Kumar\Desktop\Data Science using Python & R\Version 2 slides\stacking_regression_flask_new\stacking_regression_flask_new\diabetes_test.csv'
test = pd.read_csv(test_csv_path)

In [34]:
# Convert to a plain array before predicting: the model was fitted on NumPy
# arrays (no column names), so passing a DataFrame makes recent scikit-learn
# releases warn about feature names that were not seen during fit.
# Assumes `test` holds only the predictor columns, in training order.
test_pred = model.predict(np.array(test))



In [35]:
# Display the predictions made for the rows of the external test file.
test_pred

array([225.8,  91.8, 163.6, 164.3, 156.7,  93.8, 195.7, 262.1, 185.6,
       129.5, 147.6, 164.3, 111. , 147.6, 254.9, 161.3, 136.7,  74.3,
       151.1, 174.8, 183.5])