In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [7]:
region_map = {'southwest':1, 'southeast':2, 'northwest':3, 'northeast':4}
df['region'] = df['region'].map(region_map)
df['region'].unique()

array([1, 2, 3, 4])

In [8]:
gender = {'female':0, 'male':1}
df['sex'] = df['sex'].map(gender)
df['sex'].unique()

array([0, 1])

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   int64  
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 73.3+ KB


In [10]:
smoker_people = {'yes':1, 'no':0}
df['smoker'] = df['smoker'].map(smoker_people)
df['smoker'].unique()

array([1, 0])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   int64  
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 73.3 KB


In [12]:
X = df[['age', 'sex', 'bmi', 'children', 'smoker', 'region']]
y = df[['charges']]

In [13]:
print(X)

      age  sex     bmi  children  smoker  region
0      19    0  27.900         0       1       1
1      18    1  33.770         1       0       2
2      28    1  33.000         3       0       2
3      33    1  22.705         0       0       3
4      32    1  28.880         0       0       3
...   ...  ...     ...       ...     ...     ...
1333   50    1  30.970         3       0       3
1334   18    0  31.920         0       0       4
1335   18    0  36.850         0       0       2
1336   21    0  25.800         0       0       1
1337   61    0  29.070         0       1       3

[1338 rows x 6 columns]


In [14]:
print(y)

          charges
0     16884.92400
1      1725.55230
2      4449.46200
3     21984.47061
4      3866.85520
...           ...
1333  10600.54830
1334   2205.98080
1335   1629.83350
1336   2007.94500
1337  29141.36030

[1338 rows x 1 columns]


In [15]:
## Splitting Data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42)

In [16]:
## Lets preprocess the data to prevent the leakage. Also the test data will remain unseen during training
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
num_features = X.columns
preprocessor = ColumnTransformer([("num", StandardScaler(), num_features)])

In [17]:
##fit the model. In this case we will use SGDRegressor
from sklearn.linear_model import SGDRegressor
model = SGDRegressor(random_state=42)

In [18]:
##Train the data within a pipeline.
from sklearn.pipeline import Pipeline
pipeline = Pipeline([("preprocessing", preprocessor), ("model", model)])
pipeline.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [19]:
##Evaluate - check metrics, accuracy score etc.
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE:", rmse)
print("R2 Score:", r2_score(y_test, y_pred))

RMSE: 5799.681546519509
R2 Score: 0.7833392536104471


In [20]:
#hyperparameters tuning and utilizing Grid Search CV to find out the best parameter
from sklearn.model_selection import GridSearchCV
param_grid = {"model__alpha":[0.0001, 0.001, 0.01], "model__max_iter":[1000, 2000], "model__learning_rate":["constant", "adaptive", "optimal"],
              "model__eta0":[0.01, 0.1]}

In [21]:
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring = "neg_root_mean_squared_error", n_jobs = -1)
grid.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [22]:
##finding the best parameter after tuning
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print(y_pred)

[ 8929.793147    7127.76957808 36879.2925266   9506.55724465
 26991.98427655 10796.7882235    242.18615323 16942.20682035
  1069.933132   11265.90032805 28018.13838308  9426.43771055
  5337.79343751 38426.14732506 40270.78094041 37114.41624756
 15292.63868551 35938.61592703  9185.8057921  31487.00068446
  3798.38302784 10072.06511839  2323.96094976  7090.46269219
 11354.54867165 12914.50849177 14453.45890155  6214.39478219
  9921.90072394  2251.33966677  9063.4594225  13126.23231925
  4633.43200492  3475.31172008  4410.39089465 12973.01120758
  1940.82330483  8765.29158423 33296.10552021 32610.55205751
  3859.41828534  4379.59649264 14084.7666526  11478.9389801
  8833.63766484 12044.42047853  5335.1360101   3109.34779574
 35513.23247535  9206.67396857 15898.25080587  2416.08235903
 12406.65140153  1445.92721332 13449.52549308 12519.34962784
  4303.91071669 32172.78341642 13268.24979168 12849.15507893
 14115.5514229  10561.66309868 16292.76102178  7824.18828314
 11794.78598871  4125.033

In [23]:
print("RMSE:", rmse)
print("R2 Score:", r2_score(y_test, y_pred))

RMSE: 5799.681546519509
R2 Score: 0.7832881085151819


In [24]:
print("Best Parameters:", grid.best_params_)

Best Parameters: {'model__alpha': 0.001, 'model__eta0': 0.01, 'model__learning_rate': 'adaptive', 'model__max_iter': 1000}
