In [1]:
import numpy as np # linear algebra
import pandas as pd


In [2]:
# Read the insurance dataset from the working directory and preview its first three rows.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
df.head(n=3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [3]:
# Tally how many rows fall under each distinct value of the `sex` column.
df.loc[:, 'sex'].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [4]:
# One-hot encode the three categorical columns into integer 0/1 indicator
# columns, then preview the widened frame.
categorical_cols = ['sex', 'region', 'smoker']
df = pd.get_dummies(df, columns=categorical_cols, dtype='int')
df.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_no,smoker_yes
0,19,27.9,0,16884.924,1,0,0,0,0,1,0,1
1,18,33.77,1,1725.5523,0,1,0,0,1,0,1,0
2,28,33.0,3,4449.462,0,1,0,0,1,0,1,0
3,33,22.705,0,21984.47061,0,1,0,1,0,0,1,0
4,32,28.88,0,3866.8552,0,1,0,1,0,0,1,0


In [5]:
from sklearn.preprocessing import StandardScaler

# Standardise the first 12 columns of `df` (zero mean, unit variance per column).
#
# The scaler returns floats, but several of these columns are integer-typed
# (the one-hot indicators were created with dtype='int'). Writing floats into
# them through `df.iloc[:, 0:12] = ...` is an incompatible-dtype assignment that
# recent pandas versions warn about and plan to reject. Casting to float64 first
# and assigning by column label keeps the same values without relying on an
# implicit upcast.
#
# NOTE(review): the scaled columns include the target `charges`, so error
# metrics computed later are in standardised units rather than currency, and
# the scaler is fitted on all rows before any train/test split — confirm both
# are intended.
scaler = StandardScaler()
scaled_cols = df.columns[0:12]
df = df.astype({col: 'float64' for col in scaled_cols})
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

In [6]:
from sklearn.decomposition import PCA

# Project the predictor columns onto two principal components for plotting.
#
# The target `charges` is dropped by name before fitting. The previous
# positional slice `df.iloc[:, 0:12]` still contained `charges`, so the target
# leaked into the components that are later used to predict it. Selecting by
# name also keeps this cell correct regardless of where `charges` sits in the
# column order.
pca = PCA(n_components=2)  # Reduce to 2 dimensions for visualization
X_pca = pca.fit_transform(df.drop(columns=['charges']))

In [7]:
 # Create a mesh grid of input values for the principal components
# Ten evenly spaced coordinates spanning the observed range of each component.
pc1_scores, pc2_scores = X_pca[:, 0], X_pca[:, 1]
x1_range = np.linspace(pc1_scores.min(), pc1_scores.max(), 10)
x2_range = np.linspace(pc2_scores.min(), pc2_scores.max(), 10)
x1_mesh, x2_mesh = np.meshgrid(x1_range, x2_range)
# Flatten both coordinate grids and pair them up as rows of (PC1, PC2).
X_mesh = np.stack((x1_mesh.ravel(), x2_mesh.ravel()), axis=1)



In [8]:
# Move the `charges` column to position 0: remove it from the frame, then
# re-insert it at the front.
first_column = df.pop(item='charges')
df.insert(loc=0, column='charges', value=first_column)

In [9]:
# Predictors are column positions 1 through 11; the target is column position 0
# (`charges`, which was moved to the front of the frame earlier).
x, y = df.iloc[:, 1:12], df.iloc[:, 0]

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2,
)

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
# Create an unfitted linear regression estimator with default settings.
lr = LinearRegression()

In [13]:
# Fit the regression on the training split (the fitted estimator is the cell output).
lr.fit(X=X_train, y=y_train)

In [14]:
# Predict the target for the held-out test rows.
y_pred = lr.predict(X=X_test)

In [15]:
# `LinearRegression.score` returns the coefficient of determination (R^2) on
# the given data, not a classification accuracy. The result was previously
# stored as `accuracy_score`, which mislabels the metric and reuses the name of
# scikit-learn's classification metric; it is now named for what it holds.
test_r2 = lr.score(X_test, y_test)
# Display R^2 as a percentage.
test_r2 * 100

76.39172148807954

In [16]:
from sklearn.datasets import make_regression

In [17]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [18]:
# Report each regression metric on the test split, one "label value" line per metric.
for label, metric in (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("R2 score", r2_score),
):
    print(label, metric(y_test, y_pred))

MAE 0.35499079718054793
MSE 0.2604023126466698
R2 score 0.7639172148807953


In [19]:
from sklearn.linear_model import LinearRegression

# Fit a second linear regression, this time on the two PCA components, so the
# fitted plane can be drawn in 3D. `fit` returns the estimator itself.
model = LinearRegression().fit(X_pca, y)
model

In [20]:
# Evaluate the PCA-space model at every grid point, then fold the flat
# prediction vector back into the grid's 2D shape.
mesh_predictions = model.predict(X_mesh)
y_pred_mesh = mesh_predictions.reshape(x1_mesh.shape)

In [21]:
import plotly.graph_objects as go

In [22]:

# Observations plotted in PCA space, with the target on the vertical axis.
data_points = go.Scatter3d(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    z=y,
    mode='markers',
    marker={'size': 5, 'color': 'blue'},
    name='Original Data (PCA)',
)

# Fitted regression surface evaluated over the mesh grid.
regression_plane = go.Surface(
    x=x1_mesh,
    y=x2_mesh,
    z=y_pred_mesh,
    opacity=0.8,
    colorscale='Viridis',
    name='Regression Plane (PCA)',
)

# Combine both traces into one figure: points first, surface second.
fig = go.Figure(data=[data_points, regression_plane])

# Label the three axes of the 3D scene.
fig.update_layout(scene={
    'xaxis_title': 'Principal Component 1',
    'yaxis_title': 'Principal Component 2',
    'zaxis_title': 'Y',
})

# Render the interactive figure.
fig.show()