### <span style="color:GoldenRod ">**MediBuddy Predictive Model**</span>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#### <span style="color:Aquamarine ">**Load and merge datasets**</span>

In [2]:
df1 = pd.read_excel(r"F:\Internships\Labmentix\Project 2\Python\drive-download-20250126T150528Z-001\Medibuddy personal details.xlsx")
df2 = pd.read_excel(r"F:\Internships\Labmentix\Project 2\Python\drive-download-20250126T150528Z-001\Medibuddy Price data.xlsx")

In [3]:
df = pd.merge(df1, df2, on='Policy no.', how='inner')
df

Unnamed: 0,Policy no.,children,smoker,region,age,sex,bmi,charges in INR
0,PLC157006,0,no,southwest,23,male,34.400,1826.84300
1,PLC157033,1,no,southwest,19,male,24.600,1837.23700
2,PLC157060,0,no,southwest,56,male,40.300,10602.38500
3,PLC157087,1,no,southwest,30,female,32.400,4149.73600
4,PLC157186,5,no,southwest,19,female,28.600,4687.79700
...,...,...,...,...,...,...,...,...
1333,PLC168400,1,yes,northeast,39,male,29.925,22462.04375
1334,PLC168436,0,yes,northeast,18,female,21.660,14283.45940
1335,PLC168634,2,yes,northeast,42,male,24.605,21259.37795
1336,PLC168652,0,yes,northeast,29,female,21.850,16115.30450


#### <span style="color:Aquamarine ">**Data Preprocessing**</span>

In [4]:
df.isnull().sum()

Policy no.        0
children          0
smoker            0
region            0
age               0
sex               0
bmi               0
charges in INR    0
dtype: int64

In [5]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1333    False
1334    False
1335    False
1336    False
1337    False
Length: 1338, dtype: bool

In [6]:
df.shape

(1338, 8)

In [None]:
df.describe()

Unnamed: 0,children,age,bmi,charges in INR
count,1338.0,1338.0,1338.0,1338.0
mean,1.094918,39.207025,30.663397,13270.422265
std,1.205493,14.04996,6.098187,12110.011237
min,0.0,18.0,15.96,1121.8739
25%,0.0,27.0,26.29625,4740.28715
50%,1.0,39.0,30.4,9382.033
75%,2.0,51.0,34.69375,16639.912515
max,5.0,64.0,53.13,63770.42801


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Policy no.      1338 non-null   object 
 1   children        1338 non-null   int64  
 2   smoker          1338 non-null   object 
 3   region          1338 non-null   object 
 4   age             1338 non-null   int64  
 5   sex             1338 non-null   object 
 6   bmi             1338 non-null   float64
 7   charges in INR  1338 non-null   float64
dtypes: float64(2), int64(2), object(4)
memory usage: 83.8+ KB


In [9]:
# Separate features and target
X = df.drop(columns=['Policy no.', 'charges in INR'])
y = df['charges in INR']

#### <span style="color:Aquamarine ">**Data Pipeline**</span>

In [10]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for Categorical and numerical feature
categorical_features = ['smoker', 'region', 'sex']
numerical_features = ['children', 'age', 'bmi']

preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Combine preprocessor and model into a pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

#### <span style="color:Aquamarine ">**Train the Model**</span>

In [11]:
# Training the model

model.fit(X_train, y_train)

#### <span style="color:Aquamarine ">**Model Evaluation**</span>

In [12]:
# Model Evaluation 
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

RMSE: 4765.086915037067
MAE: 2601.025481578017
R2: 0.8402077055805866


#### <span style="color:Aquamarine ">**Hyperparameter Tuning**</span>

In [13]:
# Hyperparamete Tuning

param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5 ,10]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

In [14]:
best_model

#### <span style="color:Aquamarine ">**Model Validation**</span>

In [15]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = (-cv_scores)**0.5
print(f'Cross Validation RMSE: {cv_rmse.mean()}')

Cross Validation RMSE: 6433.101784879111


#### <span style="color:Aquamarine ">**Final Model**</span>

In [16]:
final_model = best_model.fit(X, y)

In [17]:
final_model.score(X, y)

0.9299168293185374

In [18]:
# import joblib

# joblib.dump(best_model, 'F:\Internships\Labmentix\Project 2\Python\medibuddy_spending_predictor.pkl')

#### <span style="color:Aquamarine ">**Predicting the Insurance Charges**</span>

In [24]:
def predict_insurance(final_model):
    try:
        print("\n 🔍 Enter your details to predict the insurance charges:")

        # Collecting User Input
        age = float(input("Enter age (e.g., 30): "))
        bmi = float(input("Enter BMI (e.g., 23.5): "))
        children = int(input("Enter number of dependents (e.g., 2): "))

        # Input validation for categorical data
        while True:
            sex = input("Enter gender (male/female): ").strip().lower()
            if sex in ['male', 'female']:
                break
            print("⚠️ Please enter 'male' or 'female'.")

        while True:
            smoker = input("Are you a smoker? (yes/no): ").strip().lower()
            if smoker in ['yes', 'no']:
                break
            print("⚠️ Please enter 'yes' or 'no'.")

        valid_regions = ['northeast', 'northwest', 'southeast', 'southwest']
        while True:
            region = input(f"Enter region ({'/'.join(valid_regions)})")
            if region in valid_regions:
                break
            print(f"⚠️ Please enter a valid regiong: {','.join(valid_regions)}")


        # Preparing data for Predictions
        new_data = pd.DataFrame({
            'age': [age],
            'bmi': [bmi],
            'children': [children],
            'sex': [sex],
            'smoker': [smoker],
            'region': [region]
        })

        # Making Predictions
        prediction = final_model.predict(new_data)[0]
        print(f"\n✅ Estimated Insurance Charges: ₹{prediction:,.2f}")
    except ValueError:
        print("❌ Invalid Input. Please enter valid inputs")
    except Exception as e:
        print(f"❌ An unexpected error occured: {e}")

In [25]:
predict_insurance(final_model)


 🔍 Enter your details to predict the insurance charges:

✅ Estimated Insurance Charges: ₹19,822.79
