# Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

# Load data

In [2]:
df = pd.read_csv('Data.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16.885
1,18,male,33.77,1,no,southeast,1.726
2,28,male,33.0,3,no,southeast,4.449
3,33,male,22.705,0,no,northwest,21.984
4,32,male,28.88,0,no,northwest,3.867


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


There is no null data in the dataset.

# A. Averaging

In [4]:
y = df['charges']
X = df.drop('charges', axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
y_pred = np.mean(y_train)
round(y_pred,2)

13.38

In [7]:
mse = mean_squared_error(y_test, y_pred*np.ones(len(y_test)))
print('MSE of test data for mean estimator: {0:.2f}'.format(mse))

MSE of test data for mean estimator: 146.75


# B. Linear regression

In [8]:
num_cols = [i for i in df.columns if df.dtypes[i] != "object"]
cat_cols = [i for i in df.columns if i not in num_cols]
cat_cols

['sex', 'smoker', 'region']

These three columns have categorical attributes. To encode them, we can use LabelEncoder since they don't have ordinal data. Additionally, OneHotEncoder is suitable for such categorical data to transform them into binary vectors.

In [9]:
LE = LabelEncoder()

In [10]:
for i in cat_cols:
    label = LabelEncoder()
    df[i] = label.fit_transform(df[i])

In [11]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16.885
1,18,1,33.77,1,0,2,1.726
2,28,1,33.0,3,0,2,4.449
3,33,1,22.705,0,0,1,21.984
4,32,1,28.88,0,0,1,3.867


The encoding of these three columns using LabelEncoder is as follows:

1. "female" = 0, "male" = 1 
    
2. "yes" = 1, "no" = 0 for "smoker or not?"
    
3. "northeast" = 0, "northwest" = 1, "southeast" = 2, "southwest" = 3

In [12]:
y_encoded = df['charges']
X_encoded = df.drop('charges', axis=1)

In [13]:
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(X_encoded, y_encoded, test_size=0.3, random_state=42)

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.fit_transform(X_test_encoded)

In [15]:
LinReg = LinearRegression()
LinReg.fit(X_train_scaled, y_train_encoded)
y_pred_encoded = LinReg.predict(X_test_scaled)

In [16]:
coefs = np.round(LinReg.coef_, decimals=2)
columns = df.columns[:-1]
intercept = round(LinReg.intercept_, 2)

In [17]:
print('Columns:',list(columns))
print('Coefficients:',list(coefs))
print('Intercept:',intercept)

Columns: ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
Coefficients: [3.69, 0.05, 2.06, 0.51, 9.59, -0.36]
Intercept: 13.38


In [18]:
y_train_pred = LinReg.predict(X_train_scaled)
y_test_pred = LinReg.predict(X_test_scaled)

train_mse = np.round(mean_squared_error(y_train, y_train_pred), decimals = 2)
test_mse = np.round(mean_squared_error(y_test, y_test_pred), decimals = 2)

print("Train set MSE:", train_mse)
print("Test set MSE:", test_mse)

Train set MSE: 37.75
Test set MSE: 33.82


Result:

In both linear regression and mean estimation methods, we have reached a prediction of 13.38. However, in the regression method, we observe that the mean squared error of the test data is much lower (33.82) compared to the mean estimation method (146.75).