In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes

In [3]:
data = load_diabetes()
dir(data)

['DESCR',
 'data',
 'data_filename',
 'data_module',
 'feature_names',
 'frame',
 'target',
 'target_filename']

In [15]:
print(data.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [5]:
df = pd.DataFrame(data.data , columns = data.feature_names)
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [6]:
df["Output"] = data.target
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Output
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [16]:
X = df.drop("Output",axis=1)
y = df["Output"]

In [17]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=10)


In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score , mean_squared_error , mean_absolute_error 

In [20]:
Linear  = LinearRegression()
Linear.fit(X_train , y_train)

In [23]:
DT = DecisionTreeRegressor()
DT.fit(X_train , y_train)

In [24]:
RF = RandomForestRegressor()
RF.fit(X_train , y_train)

In [25]:
SVM = SVC()
SVM.fit(X_train , y_train)

In [26]:
KNN = KNeighborsRegressor()
KNN.fit(X_train , y_train)

In [36]:
y_pred_train = Linear.predict(X_train)
y_pred_test = Linear.predict(X_test)

In [37]:
y_pred_train_DT = DT.predict(X_train)
y_pred_test_DT = DT.predict(X_test)

In [38]:
y_pred_train_RF = RF.predict(X_train)
y_pred_test_RF = RF.predict(X_test)

In [39]:
y_pred_train_SVM = SVM.predict(X_train)
y_pred_test_SVM = SVM.predict(X_test)

In [40]:
y_pred_train_KNN = KNN.predict(X_train)
y_pred_test_KNN = KNN.predict(X_test)

In [32]:
def model_eval(actual,pred):
#     pred = model.predict(actual)
    mse = mean_squared_error(actual,pred)
    mae = mean_absolute_error(actual,pred)
    r2 = r2_score(actual,pred)
    print("MSE",mse)
    print("MAE",mae)
    print("R2",r2)
    return "Succesful"
    

 ## Evaluation

In [35]:
model_eval(y_train,y_pred_train)

MSE 2839.1212261693076
MAE 43.218399782544374
R2 0.5112345828164674


'Succesful'

In [45]:
model_eval(y_test,y_pred_test)

MSE 2962.6982797601713
MAE 43.717605985870485
R2 0.5282320385429604


'Succesful'

In [41]:
model_eval(y_test,y_pred_train_DT)

MSE 0.0
MAE 0.0
R2 1.0


'Succesful'

In [46]:
model_eval(y_test,y_pred_test_DT)

MSE 6419.954954954955
MAE 60.207207207207205
R2 -0.022287379864510903


'Succesful'

In [42]:
model_eval(y_train,y_pred_train_RF)

MSE 486.73754501510575
MAE 17.767915407854986
R2 0.9162062975489128


'Succesful'

In [47]:
model_eval(y_test,y_pred_test_RF)

MSE 3242.2093603603603
MAE 45.935135135135134
R2 0.4837238368133939


'Succesful'

In [43]:
model_eval(y_train,y_pred_train_KNN)

MSE 2275.757945619335
MAE 36.960120845921445
R2 0.6082196943734723


'Succesful'

In [48]:
model_eval(y_test,y_pred_test_KNN)

MSE 3868.1351351351345
MAE 48.672072072072076
R2 0.3840539754555181


'Succesful'

In [44]:
model_eval(y_train,y_pred_train_SVM)

MSE 4233.042296072507
MAE 43.91238670694864
R2 0.2712658181958033


'Succesful'

In [49]:
model_eval(y_test,y_pred_test_SVM)

MSE 6463.72972972973
MAE 61.78378378378378
R2 -0.029257896032093278


'Succesful'

### Hypertunning to RF

In [56]:
from sklearn.model_selection import GridSearchCV ,RandomizedSearchCV
RF = RandomForestRegressor(random_state=10)
RF.fit(X_train , y_train)
hyp = {'n_estimators':np.arange(30,100,10),
       'criterion':['squared_error','absolute_error'],
    'max_depth': np.arange(5,10,2),
    'min_samples_split':np.arange(2,10,2),
    'min_samples_leaf':np.arange(2,10,2)}

In [57]:
hyp_model = RandomizedSearchCV(RF , hyp ,cv=5)
hyp_model

In [54]:
hyp_model.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=10),
             param_grid={'criterion': ['squared_error', 'absolute_error'],
                         'max_depth': array([5, 6, 7, 8, 9]),
                         'min_samples_leaf': array([2, 3, 4, 5, 6, 7, 8, 9]),
                         'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9]),
                         'n_estimators': array([  1,  11,  21,  31,  41,  51,  61,  71,  81,  91, 101, 111, 121,
       131, 141, 151, 161, 171, 181, 191, 201, 211, 221, 231, 241])})>

In [58]:
hyp_model.fit(X_train , y_train)


In [59]:
hyp_model.best_estimator_

In [94]:
last_model = hyp_model.best_estimator_
last_model.fit(X_train , y_train)

In [65]:
y_pred_train_last = last_model.predict(X_train)
y_pred_test_last = last_model.predict(X_test)

In [66]:
model_eval(y_train,y_pred_train_last)

MSE 2055.0449445971017
MAE 37.276236542328604
R2 0.6462162691685593


'Succesful'

In [68]:
model_eval(y_test,y_pred_test_last)

MSE 3227.170746226628
MAE 46.73542871523484
R2 0.4861185242446041


'Succesful'

In [88]:
y.head(1)

0    151.0
Name: Output, dtype: float64

### User Define Function

In [87]:
user_input = X.head()
pred = last_model.predict(user_input)
pred





array = np.zeros(X.shape[1])
array.shape
array

array[0] = 0.038076
array[1] = 0.05068
array[2] = 0.061696
array[3] = 0.021872
array[4] = -0.044223
array[5] = -0.034821
array[6] = -0.043401
array[7] = -0.002592
array[8] = 0.019907
array[9] = -0.017646

# columns_list = X.columns
# brand_value = 'brand_name_' + brand_name
# brand_index = np.where(columns_list==brand_value)[0][0]
# array[brand_index] = 1

# seller_value = 'seller_type_' + seller
# seller_type_index = np.where(columns_list==seller_value)[0][0]
# array[seller_type_index]=1

# fuel_value = 'fuel_type_' + fuel 
# fuel_type_index = np.where(columns_list==fuel_value)[0][0]
# array[fuel_type_index] = 1

# transmission_value = 'transmission_type_' + transmission 
# transmission_index = np.where(columns_list==transmission_value)[0][0]
# array[transmission_index] = 1

# array


pred = last_model.predict([array])
pred



array([194.78416255])

In [73]:
array = np.zeros(X.shape[1])
array

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [81]:
columns_list = X.columns
columns_list = list(columns_list)
columns_list
col = {'columns':columns_list}
col

{'columns': ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']}

In [89]:
import json 
with open('columns_list.json','w') as file:
    json.dump(col,file)

In [92]:
import pickle
with open("model1.pkl","wb") as file:
    file.last_model()
    

AttributeError: '_io.BufferedWriter' object has no attribute 'last_model'

In [98]:
import pickle
with open("model2.pkl","wb") as file:
    pickle.dump(last_model,file)
    
     

In [99]:
import pickle
with open("model2.pkl","rb") as file:
    data  = pickle.load(file)
data