### LINEAR REGRESSION

In [1]:
# import the libraries

import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [27]:
# load the dataset

# The CSV location was previously fixed to one machine's absolute path, which
# makes the notebook fail everywhere else. Set the INSURANCE_CSV environment
# variable to point at the file; when it is unset the original location is
# used, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'INSURANCE_CSV',
    r'C:\Users\hp\Desktop\New folder\machine-learning-notes\data\insurance.csv',
)

data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# (number of rows, number of columns) of the loaded frame
data.shape

(1338, 7)

In [5]:
# separate the regression target from the feature columns

# `charges` is the value to predict; every other column stays in X.
# `drop` returns a new frame, so `data` itself is left untouched.
y = data['charges']
X = data.drop('charges', axis=1)

In [28]:
# encode the categorical variables

# Every object-dtype column of X is replaced, in place, by integer codes.
# One encoder is fitted per column and kept in `label_encoders` under the
# column's name.
# NOTE(review): integer codes give a multi-category column such as `region`
# an arbitrary numeric order that a linear model will treat as meaningful --
# consider one-hot encoding for such columns.
cat_cols = list(X.select_dtypes(include="object").columns)

# fit first (LabelEncoder.fit returns the fitted encoder itself) ...
label_encoders = {col: LabelEncoder().fit(X[col]) for col in cat_cols}

# ... then overwrite each column with its encoded values
for col, encoder in label_encoders.items():
    X[col] = encoder.transform(X[col])

In [29]:
# preview the first rows of X to check how the categorical columns were encoded
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.77,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.88,0,0,1


In [30]:
# fitted LabelEncoder objects, keyed by the column each one was fitted on
label_encoders

{'sex': LabelEncoder(), 'smoker': LabelEncoder(), 'region': LabelEncoder()}

In [42]:
# split X and y into training and test sets

# 15% of the rows are held out for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=23
)

# print the shape of each piece as a quick sanity check
for output in (X_train, X_test, y_train, y_test):
    print(output.shape)

(1137, 6)
(201, 6)
(1137,)
(201,)


In [48]:
# scale the dataset

# The scaler is fitted on the training rows only and then applied to both
# splits, so no test-set statistics are used when fitting it.
#
# NOTE: this cell overwrites X_train / X_test. Re-running it without first
# re-running the split cell would refit `scaler` on already-scaled data, and
# the stored scaler would no longer hold the raw-data mean and variance.

# init the scaler
scaler = StandardScaler()
column_names = list(X.columns)
scaler.fit(X_train)

# `transform` returns a plain array, so the frames are rebuilt afterwards.
# The original row index is passed through explicitly: without it the frames
# would get a fresh 0..n-1 index and stop lining up with y_train / y_test,
# which still carry the original (shuffled) row labels.
X_train = pd.DataFrame(
    data=scaler.transform(X_train),
    columns=column_names,
    index=X_train.index,
)
X_test = pd.DataFrame(
    data=scaler.transform(X_test),
    columns=column_names,
    index=X_test.index,
)


### Train the model

In [50]:
# fit the linear regression and predict on both splits

# `fit` returns the fitted estimator, so creation and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# predictions on the rows the model was fitted on, and on the held-out rows
train_preds, test_preds = model.predict(X_train), model.predict(X_test)


### Model Evaluation

In [51]:
# root-mean-squared error of the predictions on each split
train_rmse = root_mean_squared_error(y_train, train_preds)
test_rmse = root_mean_squared_error(y_test, test_preds)

# one line per split, training error first
print(f'train rmse: {train_rmse}', f'test rmse: {test_rmse}', sep='\n')

train rmse: 6047.789065106405
test rmse: 6051.915976205498
