In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [23]:
df = pd.read_csv('insurance.csv')
df


Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
0,0,19,female,27.900,0,yes,southwest,16884.92400
1,1,18,male,33.770,1,no,southeast,1725.55230
2,2,28,male,33.000,3,no,southeast,4449.46200
3,3,33,male,22.705,0,no,northwest,21984.47061
4,4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...,...
1333,1333,50,male,30.970,3,no,northwest,10600.54830
1334,1334,18,female,31.920,0,no,northeast,2205.98080
1335,1335,18,female,36.850,0,no,southeast,1629.83350
1336,1336,21,female,25.800,0,no,southwest,2007.94500


In [24]:
df.head()

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
0,0,19,female,27.9,0,yes,southwest,16884.924
1,1,18,male,33.77,1,no,southeast,1725.5523
2,2,28,male,33.0,3,no,southeast,4449.462
3,3,33,male,22.705,0,no,northwest,21984.47061
4,4,32,male,28.88,0,no,northwest,3866.8552


In [25]:
df.tail()

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
1333,1333,50,male,30.97,3,no,northwest,10600.5483
1334,1334,18,female,31.92,0,no,northeast,2205.9808
1335,1335,18,female,36.85,0,no,southeast,1629.8335
1336,1336,21,female,25.8,0,no,southwest,2007.945
1337,1337,61,female,29.07,0,yes,northwest,29141.3603


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   index     1338 non-null   int64  
 1   age       1338 non-null   int64  
 2   sex       1338 non-null   object 
 3   bmi       1338 non-null   float64
 4   children  1338 non-null   int64  
 5   smoker    1338 non-null   object 
 6   region    1338 non-null   object 
 7   charges   1338 non-null   float64
dtypes: float64(2), int64(3), object(3)
memory usage: 83.8+ KB


In [36]:
df.describe()

Unnamed: 0,index,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0,1338.0
mean,668.5,39.207025,30.663397,1.094918,13270.422265
std,386.391641,14.04996,6.098187,1.205493,12110.011237
min,0.0,18.0,15.96,0.0,1121.8739
25%,334.25,27.0,26.29625,0.0,4740.28715
50%,668.5,39.0,30.4,1.0,9382.033
75%,1002.75,51.0,34.69375,2.0,16639.912515
max,1337.0,64.0,53.13,5.0,63770.42801


In [32]:
sns.boxplot(x='age', y='charges', data=df)

<Axes: xlabel='age', ylabel='charges'>

In [34]:
df_encoded = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

In [37]:
X = df_encoded.drop('charges', axis=1)
y = df_encoded['charges']

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [40]:
model.fit(X_train, y_train)

In [41]:
y_pred = model.predict(X_test)

In [42]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [44]:
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

r2 = r2_score(y_test, y_pred)
print(f'R-squared (R2): {r2}')

Mean Absolute Error (MAE): 4197.0861977454
Mean Squared Error (MSE): 33806854.35250751
R-squared (R2): 0.7822407047860555


In [46]:
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)

X_poly_train, X_poly_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

poly_model = LinearRegression()
poly_model.fit(X_poly_train, y_train)

y_poly_pred = poly_model.predict(X_poly_test)
print("Polynomial Regression Metrics:")
print(f"MAE: {mean_absolute_error(y_test, y_poly_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_poly_pred)}")
print(f"R-squared: {r2_score(y_test, y_poly_pred)}")

Polynomial Regression Metrics:
MAE: 2764.110829918053
MSE: 21090742.02881364
R-squared: 0.8641486997919128


In [47]:
from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=0.1, max_iter=10000)

lasso_model.fit(X_train, y_train)

y_lasso_pred = lasso_model.predict(X_test)
print("\nLasso Regression Metrics:")
print(f"MAE: {mean_absolute_error(y_test, y_lasso_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_lasso_pred)}")
print(f"R-squared: {r2_score(y_test, y_lasso_pred)}")


Lasso Regression Metrics:
MAE: 4197.204892737529
MSE: 33807770.87881536
R-squared: 0.7822348011867214


In [48]:
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=0.1)

ridge_model.fit(X_train, y_train)

y_ridge_pred = ridge_model.predict(X_test)
print("\nRidge Regression Metrics:")
print(f"MAE: {mean_absolute_error(y_test, y_ridge_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_ridge_pred)}")
print(f"R-squared: {r2_score(y_test, y_ridge_pred)}")


Ridge Regression Metrics:
MAE: 4198.398056898374
MSE: 33811824.629447706
R-squared: 0.7822086898581881


In [49]:
results = {
    'Linear Regression': {'MAE': 4197, 'MSE': 33086854, 'R2': 0.782},
    'Polynomial Regression': {'MAE': 2883, 'MSE': 18919630, 'R2': 0.884}, # قيم تقريبية
    'Lasso Regression': {'MAE': 4200, 'MSE': 33100000, 'R2': 0.782}, # قيم تقريبية
    'Ridge Regression': {'MAE': 4197, 'MSE': 33086854, 'R2': 0.782}  # قيم تقريبية
}
pd.DataFrame(results).T

Unnamed: 0,MAE,MSE,R2
Linear Regression,4197.0,33086854.0,0.782
Polynomial Regression,2883.0,18919630.0,0.884
Lasso Regression,4200.0,33100000.0,0.782
Ridge Regression,4197.0,33086854.0,0.782
