## Libraries

In [78]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import plotly.express as px


## Linear Regression


Linear regression is a statistical method for modeling the relationship between a dependent variable and one or more independent variables by fitting a linear equation to observed data. The goal is to find the straight line that best predicts the dependent variable from the independent variables. With a single predictor, this line is written $Y = a + bX + \varepsilon$, where $Y$ is the dependent variable, $X$ is the independent variable, $a$ is the intercept, $b$ is the slope of the line, and $\varepsilon$ is the error term, which captures the variation in $Y$ not explained by $X$. Simple linear regression has only one independent variable; with several predictors, as in this notebook, the equation extends to $Y = a + b_1 X_1 + \dots + b_k X_k + \varepsilon$. In both cases the coefficients are chosen to minimize the squared differences (errors) between the observed and predicted values of the dependent variable. This technique is widely used in forecasting, risk management, and trend analysis, making it a fundamental tool in fields ranging from economics to machine learning.



## 1. Load Data

In [111]:
# Load the medical insurance dataset. read_csv already returns a DataFrame,
# so no additional pd.DataFrame(...) wrapping is required.
df = pd.read_csv('../data/medical_insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Check for Null Values

In [112]:
# Count the missing (NaN) entries in each column
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

### Visualize Data

In [114]:
# AGE
# Count the rows per age once and reuse the result for the data and both axes
age_counts = df.groupby('age').size()
px.bar(age_counts, x=age_counts.index, y=age_counts.values, title='Age distribution')


In [124]:
# Gender
# Tally the rows per category, then lay the tally out as a two-column frame
# (category label, number of rows) for plotting
sex_counts = df['sex'].value_counts()
df_sex = pd.DataFrame({'sex': sex_counts.index, 'count': sex_counts.values})
px.bar(df_sex, x='sex', y='count', color='sex', title='Gender')

In [125]:
# Region
# Tally the rows per region, then lay the tally out as a two-column frame
# (region label, number of rows) for plotting
region_counts = df['region'].value_counts()
df_region = pd.DataFrame({'region': region_counts.index, 'count': region_counts.values})
px.bar(df_region, x='region', y='count', color='region', title='Region')

In [107]:
# Define the target variable and explore it
TARGET = 'charges'
# Predictor columns, each listed exactly once. 'sex' must be included here:
# it is one-hot encoded later (sex_male), and listing 'age' twice would create
# a duplicated column that breaks the numeric preprocessing.
FEATURES = df[['age', 'sex', 'bmi', 'children', 'smoker', 'region']]
# Rebuild df with the target first, followed by the predictors
df = pd.concat([df[TARGET], FEATURES], axis=1)
df[TARGET].describe()

count     2772.000000
mean     13261.369959
std      12151.768945
min       1121.873900
25%       4687.797000
50%       9333.014350
75%      16577.779500
max      63770.428010
Name: charges, dtype: float64

## 2. Preprocessing

### 2.1 Categorical Columns

In [92]:
# One-hot encode the text (object-dtype) columns. drop_first=True drops the
# first level of each categorical so the dummies are not perfectly collinear
# with the intercept column that is added before fitting.
df_categorical = df.select_dtypes(include=['object'])
df_dummies = pd.get_dummies(df_categorical, drop_first=True)
# Bare last expression -> rich notebook table instead of plain-text print
df_dummies.head()

   sex_male  smoker_yes  region_northwest  region_southeast  region_southwest
0     False        True             False             False              True
1      True       False             False              True             False
2      True       False             False              True             False
3      True       False              True             False             False
4      True       False              True             False             False


### 2.2 Numerical Columns

In [93]:
# Keep only the 64-bit float / integer columns (this still includes the target)
df_numerical = df.select_dtypes(include=['float64', 'int64'])
df_numerical.head()

Unnamed: 0,age,bmi,children,charges
0,19,27.9,0,16884.924
1,18,33.77,1,1725.5523
2,28,33.0,3,4449.462
3,33,22.705,0,21984.47061
4,32,28.88,0,3866.8552


In [94]:
# Separate the numeric predictors from the target before scaling
X_num = df_numerical.drop(columns=TARGET)

# NOTE(review): both scalers are fit on the full dataset, i.e. before the
# train/test split performed later -- consider fitting on training rows only.

# Standardize the numeric predictors. The 2-D array is passed as-is, so no
# reshape with a hard-coded column count is needed; keeping the original row
# index lets the concat below align rows with df_dummies.
X_scaler = StandardScaler()
X_num_scaled = pd.DataFrame(
    X_scaler.fit_transform(X_num.to_numpy()),
    columns=X_num.columns,
    index=X_num.index,
)

# Standardize the target with its own scaler so that predictions can be mapped
# back to the original units with y_scaler.inverse_transform
y_scaler = StandardScaler()
y_scaled = pd.DataFrame(
    y_scaler.fit_transform(df[[TARGET]].to_numpy()),
    columns=[TARGET],
    index=df.index,
)

# Design matrix: scaled numeric predictors followed by the dummy columns
X = pd.concat([X_num_scaled, df_dummies], axis=1)
y = y_scaled

## 3. Model

In [95]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add the intercept column and cast BOTH splits to float, so the boolean dummy
# columns are numeric when fitting and when predicting alike
X_train = sm.add_constant(X_train).astype(float)
X_test = sm.add_constant(X_test).astype(float)

model = sm.OLS(y_train, X_train).fit()

y_pred = model.predict(X_test)

# R^2 is computed on the standardized target
r2 = r2_score(y_test, y_pred)

# Map actuals and predictions back to the original charge units for MSE / MAE
y_test_inv = y_scaler.inverse_transform(y_test)

# to_frame names the column explicitly instead of relying on how the DataFrame
# constructor treats an unnamed Series
pred_scaled = y_pred.to_frame(name=TARGET)
y_pred_original = y_scaler.inverse_transform(pred_scaled)

mse = mean_squared_error(y_test_inv, y_pred_original)
mae = mean_absolute_error(y_test_inv, y_pred_original)

r2, mse, mae

(0.7398166177564302, 39933194.548051454, 4160.247974763)

In [96]:
# Display the fitted OLS model's summary tables
model.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,844.2
Date:,"Fri, 22 Mar 2024",Prob (F-statistic):,0.0
Time:,12:49:19,Log-Likelihood:,-1581.4
No. Observations:,2217,AIC:,3181.0
Df Residuals:,2208,BIC:,3232.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.3486,0.025,-14.088,0.000,-0.397,-0.300
age,0.2949,0.011,27.512,0.000,0.274,0.316
bmi,0.1646,0.011,14.829,0.000,0.143,0.186
children,0.0454,0.011,4.300,0.000,0.025,0.066
sex_male,-0.0059,0.021,-0.280,0.779,-0.047,0.036
smoker_yes,1.9655,0.026,75.508,0.000,1.914,2.017
region_northwest,-0.0335,0.030,-1.106,0.269,-0.093,0.026
region_southeast,-0.0947,0.030,-3.107,0.002,-0.155,-0.035
region_southwest,-0.0874,0.030,-2.881,0.004,-0.147,-0.028

0,1,2,3
Omnibus:,479.153,Durbin-Watson:,1.96
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1143.465
Skew:,1.185,Prob(JB):,5.01e-249
Kurtosis:,5.6,Cond. No.,5.69


## 4. Predict on all data

In [99]:
# Score every row (training and test rows alike) with the fitted model.
# The design matrix is cast to float so the boolean dummy columns are numeric,
# matching the dtype used when the model was fitted.
X = sm.add_constant(X)
pred = model.predict(X.astype(float))

# Convert the standardized predictions back to the original charge scale,
# keeping the row index so the values stay aligned with df
pred_scaled = pred.to_frame(name=TARGET)
y_pred_original = pd.DataFrame(
    y_scaler.inverse_transform(pred_scaled),
    columns=[TARGET],
    index=pred_scaled.index,
)

# Attach the predictions as a single column (a Series, aligned on the index)
df_new = df.copy()
df_new['pred'] = y_pred_original[TARGET]
df_new

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,pred
0,19,female,27.900,0,yes,southwest,16884.92400,25311.754227
1,18,male,33.770,1,no,southeast,1725.55230,3386.735539
2,28,male,33.000,3,no,southeast,4449.46200,6588.501136
3,33,male,22.705,0,no,northwest,21984.47061,3881.763207
4,32,male,28.880,0,no,northwest,3866.85520,5642.930831
...,...,...,...,...,...,...,...,...
2767,47,female,45.320,1,no,southeast,8569.86180,14608.843715
2768,21,female,34.600,0,no,southwest,2020.17700,4128.235965
2769,19,male,26.030,1,yes,northwest,16450.89470,25738.005372
2770,23,male,18.715,0,no,northwest,21595.38229,34.518292


## Actual vs. Predicted

In [102]:
# Observed vs. predicted charges, with a dashed diagonal running from the
# smallest to the largest observed charge as the "perfect prediction" reference
charge_min, charge_max = df[TARGET].min(), df[TARGET].max()
fig = px.scatter(df_new, x=TARGET, y='pred', title='Actual vs Predicted Charges')
fig.add_shape(
    type='line',
    line={'dash': 'dash'},
    x0=charge_min,
    y0=charge_min,
    x1=charge_max,
    y1=charge_max,
)
fig.show()


## Residual Plot

In [104]:
# Residual = observed charge minus predicted charge; plot the residuals against
# the predictions with a dashed zero line for reference
df_new['Residuals'] = df_new[TARGET].sub(df_new['pred'])
fig = px.scatter(
    df_new,
    x='pred',
    y='Residuals',
    title='Residuals vs Predicted Charges',
)
fig.add_hline(y=0, line_dash="dash")
fig.show()
