### LINEAR REGRESSION

In [36]:
# import the libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

# customize the chat template
sns.set_style('darkgrid')
sns.set_palette('bright')

In [5]:
# load the dataset
data = pd.read_csv(r'C:\Users\hp\Desktop\New folder\machine-learning-notes\data\insurance.csv')
data.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [17]:
# encode the categorical variables

cat_cols = list(data.select_dtypes(include="object").columns)
for column in cat_cols:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])

data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [22]:
# split the dataset into train set and test set

X = data.drop(columns=['charges'])
y = data['charges']

# do train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.25,
                                                    random_state=23)


In [23]:
for split in [X_train, X_test, y_train, y_test]:
    print(split.shape)

(1003, 6)
(335, 6)
(1003,)
(335,)


In [28]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
326,27,0,23.21,1,0,2
1021,22,0,31.02,3,1,2
1057,45,0,31.79,0,0,2
197,45,0,28.6,2,0,2
203,27,0,36.08,0,1,2


In [32]:
# scale the data
scaler = StandardScaler() # init the scaler
columns = X_train.columns
scaler.fit(X_train) #scale the train set
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train = pd.DataFrame(X_train, columns=columns)
X_test = pd.DataFrame(X_test, columns=columns)

In [33]:
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
0,-0.903261,-1.019126,-1.231924,-0.070193,-0.516153,0.464780
1,-1.257294,-1.019126,0.023446,1.567091,1.937409,0.464780
2,0.371258,-1.019126,0.147215,-0.888834,-0.516153,0.464780
3,0.371258,-1.019126,-0.365542,0.748449,-0.516153,0.464780
4,-0.903261,-1.019126,0.836784,-0.888834,1.937409,0.464780
...,...,...,...,...,...,...
998,-1.540521,-1.019126,-0.732830,-0.888834,-0.516153,-1.352663
999,1.220938,0.981233,-2.015526,-0.888834,-0.516153,-1.352663
1000,-0.761648,-1.019126,-0.847758,2.385732,-0.516153,1.373501
1001,0.937711,0.981233,0.519325,-0.888834,1.937409,-1.352663


In [34]:
# train the model

model = LinearRegression() #init the model
model.fit(X_train, y_train) #fit the model to the data
train_prediction = model.predict(X_train) # get train prediction
test_prediction = model.predict(X_test) # get test prediction

In [38]:
# evaluate the model

train_rmse = root_mean_squared_error(y_train, train_prediction)
test_rmse = root_mean_squared_error(y_test, test_prediction)

print(f'test rmse: {test_rmse}')
print(f'train rmse: {train_rmse}')

test rmse: 4915.255650254305
train rmse: 1849.9012174009172
