In [None]:
import pandas as pd

# Read the insurance dataset from its CSV file into a DataFrame
csv_path = '/content/insurance.csv'
df = pd.read_csv(csv_path)

# Preview the first few rows as the cell's output
df.head()

In [None]:
# Import
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the raw frame `df` stays untouched
df_encoded = df.copy()

# Label encode the binary columns with ONE encoder per column.
# Refitting a single shared encoder would discard the 'sex' mapping as soon as
# 'smoker' is fitted; keeping both fitted encoders lets raw values be encoded or
# decoded later via encoders[col].transform / encoders[col].inverse_transform.
# LabelEncoder assigns codes in sorted class order, so with the expected
# categories this gives female=0, male=1 and no=0, yes=1 — inspect
# encoders[col].classes_ to confirm for the loaded data.
encoders = {}
for col in ('sex', 'smoker'):
    encoders[col] = LabelEncoder()
    df_encoded[col] = encoders[col].fit_transform(df_encoded[col])

# Backward compatibility: `le` previously ended up fitted on 'smoker'
le = encoders['smoker']

# One-hot encode 'region'; drop_first=True omits the first category's column
df_encoded = pd.get_dummies(df_encoded, columns=['region'], drop_first=True)

# Display the first few rows
df_encoded.head()

In [None]:
# --- Correlation Matrix ---
# The plotting libraries are imported here because no visible earlier cell
# imports them; without these lines the cell fails with NameError on a
# fresh kernel (Restart & Run All).
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the pairwise correlations between all encoded columns
plt.figure(figsize=(10, 8))
sns.heatmap(df_encoded.corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Target: the 'charges' column; features: every remaining column
y = df_encoded['charges']
X = df_encoded.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [1]:
from sklearn.linear_model import LinearRegression

# Create an unfitted LinearRegression estimator with default hyperparameters.
# It is trained later via model.fit(X_train, y_train), and this same `model`
# object is the one that gets saved with joblib further down.
model = LinearRegression()

In [None]:
# Fit the linear regression on the training split
model.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score the fitted linear model on the held-out test split
y_pred = model.predict(X_test)

# Regression metrics; RMSE is the square root of the MSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics, one per line
for label, value in (("R² Score", r2), ("MAE", mae), ("MSE", mse), ("RMSE", rmse)):
    print(f"{label}: {value}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# 1. Initialize the model (fixed random_state for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# 2. Fit on the training split
rf.fit(X_train, y_train)

# 3. Predict on the test split
y_pred_rf = rf.predict(X_test)

# 4. Evaluate. Results are stored under model-specific names (matching the
#    `_dt` suffix used for the decision tree) so the linear regression's
#    `r2` / `rmse` are not silently overwritten by this cell.
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("Random Forest - R² Score:", r2_rf)
print("Random Forest - RMSE:", rmse_rf)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Depth-limited decision tree (max_depth=4), seeded for reproducibility,
# built and fitted in one step — fit() returns the estimator itself
dt_model = DecisionTreeRegressor(max_depth=4, random_state=42).fit(X_train, y_train)

# Predictions for the test split
y_pred_dt = dt_model.predict(X_test)

# Test-set metrics: R² and RMSE (square root of the MSE)
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
r2_dt = r2_score(y_test, y_pred_dt)

# 📊 Decision Tree Performance
print("R² Score:", r2_dt)
print("RMSE:", rmse_dt)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Create the model (fixed random_state for reproducibility)
xgb = XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)

# Fit on the training split
xgb.fit(X_train, y_train)

# Predict on the test split. Model-specific names are used so this cell does
# not overwrite the linear regression's `y_pred`, `r2` and `rmse`.
y_pred_xgb = xgb.predict(X_test)

# Evaluate: R² and RMSE on the test split
r2_xgb = r2_score(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

# Label the output with the model name, in the same format as the random
# forest cell, so these numbers cannot be confused with another model's.
print("XGBoost - R² Score:", r2_xgb)
print("XGBoost - RMSE:", rmse_xgb)

In [None]:
import joblib

# Persist `model` (the linear regression estimator) to disk with joblib
model_path = 'insurance_model.pkl'
joblib.dump(model, model_path)
print("Model saved successfully!")

In [None]:
import pandas as pd

# Hand-encoded feature values for one example customer
# (reference value from the data — Actual Charge: 5272.17580)
age = 36
sex = 0  # female
bmi = 30.020
children = 0
smoker = 0  # No
region_northwest = 1
region_southeast = 0
region_southwest = 0

# Column name -> value, in the order the columns should appear in the frame
feature_values = {
    'age': age,
    'sex': sex,
    'bmi': bmi,
    'children': children,
    'smoker': smoker,
    'region_northwest': region_northwest,
    'region_southeast': region_southeast,
    'region_southwest': region_southwest,
}

# Single-row DataFrame with named feature columns
input_df = pd.DataFrame({name: [value] for name, value in feature_values.items()})

In [None]:
# Run the linear regression on the single-row input frame
prediction = model.predict(input_df)
linear_cost = prediction[0]  # first (and only) predicted value
print("Predicted Insurance Cost by Linear Regressor:", linear_cost)

In [None]:
# Score the same input row with the XGBoost regressor
prediction = xgb.predict(input_df)
xgb_cost = prediction[0]  # first (and only) predicted value
print("Predicted Insurance Cost by XGBoost:", xgb_cost)