In [69]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [70]:
# Load the insurance training data.
# The path is a raw string because it contains backslashes ("\A", "\D", "\i") that are
# not valid escape sequences in an ordinary string literal.
insurance = pd.read_csv(r"C:\Aalesh and Mandar\Datasets\insurance.csv")

In [71]:
# Preview the first five rows of the frame as loaded.
insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [72]:
# Replace the text columns with indicator columns; drop_first=True omits the first
# level of each encoded column.
insurance = pd.get_dummies(data=insurance, drop_first=True)

In [73]:
# Preview the first five rows after encoding.
insurance.head(n=5)

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [74]:
# `charges` is the regression target; every other column is a feature.
y = insurance["charges"]
x = insurance.drop(columns="charges")

In [75]:
# Estimator objects: a feature standardiser and a linear-regression model.
scalar = StandardScaler()
lr = LinearRegression()

In [76]:
# Five shuffled folds; the fixed random_state makes the split repeatable.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=2022,
)

In [77]:
# R^2 of the linear model on each fold produced by `kfold`.
result = cross_val_score(
    estimator=lr,
    X=x,
    y=y,
    scoring="r2",
    cv=kfold,
)

In [78]:
# Average the per-fold R^2 scores.
print(np.mean(result))

0.7440038506879969


## KNN pipeline

In [79]:
# Two-step pipeline: standardise the features ('STD'), then k-nearest-neighbours
# regression ('KNN'). The step names are what the grid-search parameter keys refer to.
scalar = StandardScaler()
knn = KNeighborsRegressor()
pipe = Pipeline(steps=[("STD", scalar), ("KNN", knn)])

In [80]:
# Same splitter settings as the linear-regression run, so both models are scored on
# identically configured folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=2022)
# Candidate neighbour counts 1..15 for the pipeline's 'KNN' step.
params = {'KNN__n_neighbors': np.arange(1, 16)}
# No new KNeighborsRegressor is created here: `pipe` already holds its own regressor,
# and rebinding `knn` would not change what the grid search tunes.

In [81]:
# Search every candidate in `params`, scoring each by cross-validated R^2 on `kfold`.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="r2",
    cv=kfold,
).fit(x, y)
# Report the winning parameter setting and its mean cross-validated score.
print(gcv.best_params_)
print(gcv.best_score_)

{'KNN__n_neighbors': 7}
0.7902611200269143


In [84]:
# Rebuild the pipeline with the neighbour count chosen by the grid search and fit it
# on all of the data. The count is read from `gcv.best_params_` instead of being typed
# in by hand, so it cannot drift if the grid or the data changes.
best_k = int(gcv.best_params_['KNN__n_neighbors'])  # plain int keeps the repr tidy
knn = KNeighborsRegressor(n_neighbors=best_k)
pipe = Pipeline([('STD', scalar), ('KNN', knn)])
pipe.fit(x, y)

Pipeline(steps=[('STD', StandardScaler()),
                ('KNN', KNeighborsRegressor(n_neighbors=7))])

In [97]:
# Load the unseen rows to score.
tst_insure = pd.read_csv(r"C:\Aalesh and Mandar\Datasets\tst_insure.csv")
# Encode every category level, then keep exactly the training columns in training
# order. Encoding the test file on its own with drop_first=True only matches `x` when
# the test file contains the same category levels as the training data; a missing
# level would shift or remove columns. Indicator columns absent from the test file
# are filled with 0.
dum_tst = pd.get_dummies(tst_insure).reindex(columns=x.columns, fill_value=0)
# Side-by-side dtype listing to confirm the two frames line up.
print(x.dtypes)
print(dum_tst.dtypes)
predictions = pipe.predict(dum_tst)
predictions

age                   int64
bmi                 float64
children              int64
sex_male              uint8
smoker_yes            uint8
region_northwest      uint8
region_southeast      uint8
region_southwest      uint8
dtype: object
age                   int64
bmi                 float64
children              int64
sex_male              uint8
smoker_yes            uint8
region_northwest      uint8
region_southeast      uint8
region_southwest      uint8
dtype: object


array([24411.94929   ,  5270.08306429,  4680.98034286,  6808.30289429,
        6808.34575   ,  7670.527     ,  1902.37384286,  2453.48372857,
       34845.44309286, 10229.51452143,  3275.05149286, 24411.94929   ,
       10304.84671429, 22843.14585714,  6653.04271429, 12392.16091429,
       12966.40537857, 15364.97829   ,  1921.51835   ,  5741.935917  ,
        1930.68848571,  4496.99419   ,  1921.51835   , 15434.37549143,
        3363.35190429, 12532.92425857,  6900.08065714, 22101.33436714,
        6251.48125   , 28107.11708429,  5386.91035714, 39300.07997286,
       11911.19909429, 12579.67512714,  1760.93657143,  5274.64907143,
       12669.75224143, 11614.55771143, 20496.56267857,  8816.08189143,
       11979.46763714, 29567.18479429, 21407.89491143, 34833.36007143,
       14687.10969143, 27513.17474286, 34469.17022857, 10360.09185714,
        7323.58244714,  4683.26036429,  9444.90332143,  6725.23610429,
        7400.81874286, 11273.45092857, 15676.58758   ,  2374.16768571,
      

In [104]:
# Per-candidate cross-validation results as a table.
pd_cv = pd.DataFrame(gcv.cv_results_)
# The estimator the grid search refit with its best parameter setting.
best_model = gcv.best_estimator_
tst_insure = pd.read_csv(r"C:\Aalesh and Mandar\Datasets\tst_insure.csv")
# Encode every category level, then keep exactly the training columns in training
# order. Encoding the test file on its own with drop_first=True only matches `x` when
# the test file contains the same category levels as the training data. Indicator
# columns absent from the test file are filled with 0.
dum_tst = pd.get_dummies(tst_insure).reindex(columns=x.columns, fill_value=0)
predictions = best_model.predict(dum_tst)
predictions

array([24411.94929   ,  5270.08306429,  4680.98034286,  6808.30289429,
        6808.34575   ,  7670.527     ,  1902.37384286,  2453.48372857,
       34845.44309286, 10229.51452143,  3275.05149286, 24411.94929   ,
       10304.84671429, 22843.14585714,  6653.04271429, 12392.16091429,
       12966.40537857, 15364.97829   ,  1921.51835   ,  5741.935917  ,
        1930.68848571,  4496.99419   ,  1921.51835   , 15434.37549143,
        3363.35190429, 12532.92425857,  6900.08065714, 22101.33436714,
        6251.48125   , 28107.11708429,  5386.91035714, 39300.07997286,
       11911.19909429, 12579.67512714,  1760.93657143,  5274.64907143,
       12669.75224143, 11614.55771143, 20496.56267857,  8816.08189143,
       11979.46763714, 29567.18479429, 21407.89491143, 34833.36007143,
       14687.10969143, 27513.17474286, 34469.17022857, 10360.09185714,
        7323.58244714,  4683.26036429,  9444.90332143,  6725.23610429,
        7400.81874286, 11273.45092857, 15676.58758   ,  2374.16768571,
      