In [4]:
import numpy as np
import pandas as pd
# Google Drive id of the CSV file; the direct-download URL is built from it.
file_id = '1hczD_xlMIzTDUOqHBGtuYdE4SwhabRtF'
url = f'https://drive.google.com/uc?id={file_id}'

# Download the CSV and parse it into a DataFrame.
data = pd.read_csv(url)

# Preview the first five rows (rendered as this cell's output).
data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
#Data PreProcessing
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
# Column groups: numeric features get scaled, categorical features get one-hot encoded.
numerical_columns = ["age", "bmi", "children"]
non_numerical_columns = ["smoker", "region", "sex"]

# StandardScaler centres each numeric column on its mean and rescales it to
# unit standard deviation.
numerical_data = StandardScaler()

# OneHotEncoder creates one binary column per category; drop="first" omits the
# first category of every feature so the encoded columns are not redundant.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current name
# first and fall back to the legacy one on older releases.
try:
    categorical_data = OneHotEncoder(sparse_output=False, drop="first")
except TypeError:
    categorical_data = OneHotEncoder(sparse=False, drop="first")

# Send each column group through its own transformer. Columns that are not
# listed (e.g. the target) are dropped by ColumnTransformer's default remainder.
preproc = ColumnTransformer(
    transformers=[
        ('numerical', numerical_data, numerical_columns),
        ('categorical', categorical_data, non_numerical_columns),
    ]
)

# Fit on the whole frame and return the transformed features as an array.
# NOTE: the Pipeline built in a later cell fits `preproc` again on the training
# split only, so the statistics learned here are not what the model ends up using.
preproc_data = preproc.fit_transform(data)



In [8]:
#Data Testing
from sklearn.model_selection import train_test_split
# Separate the target the model should predict from the predictor columns.
y = data['charges']
X = data.drop(columns='charges')

# Hold out 20% of the rows for testing (scikit-learn's own default would be 25%);
# the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print the first five rows of every split as a quick look at how features
# and targets were divided between training and test sets.
for heading, split in (
    ("Training features:\n", X_train),
    ("Test features:\n", X_test),
    ("Training target:\n", y_train),
    ("Test target:\n", y_test),
):
    print(heading, split.head(5))

Training features:
      age     sex     bmi  children smoker     region
29    31    male  36.300         2    yes  southwest
535   38    male  28.025         1     no  northeast
695   26  female  40.185         0     no  northwest
557   34    male  34.210         0     no  southeast
836   36    male  31.500         0     no  southwest
Test features:
      age     sex     bmi  children smoker     region
521   32  female  44.220         0     no  southeast
737   26    male  23.700         2     no  southwest
740   45    male  24.035         2     no  northeast
660   37    male  46.530         3     no  southeast
411   44  female  20.235         1    yes  northeast
Training target:
 29     38711.00000
535     6067.12675
695     3201.24515
557     3935.17990
836     4402.23300
Name: charges, dtype: float64
Test target:
 521     3994.17780
737     3484.33100
740     8604.48365
660     6435.62370
411    19594.80965
Name: charges, dtype: float64


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
# Model: a random forest with 100 trees; random_state pins the result so
# reruns give the same forest.
# Design note: a neural network would be a possible alternative, but a
# tree-based regressor was chosen because it is simpler to implement.
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Chain preprocessing and the regressor so that both are fitted together on
# the training split and applied consistently at prediction time.
pipeline = Pipeline([
    ('preprocessor', preproc),
    ('model', model),
])

# fit() returns the fitted pipeline, so the test-set predictions can be
# requested in the same expression.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)



In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Model evaluation: compare the held-out targets with the pipeline's predictions.
mse = mean_squared_error(y_test, y_pred)    # mean of the squared prediction errors
rmse = np.sqrt(mse)                         # root of the MSE, in the same units as the target
mae = mean_absolute_error(y_test, y_pred)   # mean of the absolute prediction errors
r2 = r2_score(y_test, y_pred)               # share of target variance explained: 1.0 is a perfect fit, and it can be negative for a poor model

# The value reported first is the square root of the MSE, so it is labelled as
# RMSE (it was previously mislabelled "Mean Squared Error").
print("Root Mean Squared Error:", rmse, "\n")
print("Mean Absolute Error:", mae, "\n")
print("R-Squared Coefficient:", r2, "\n")

Mean Squared Error: 4788.98660572483 

Mean Absolute Error: 2480.9344630115743 

R-Squared Coefficient: 0.8825686128456455 



In [11]:
import joblib  # saves the fitted pipeline to disk and restores it for later predictions
# The standard-library pickle module could store the model as well; joblib was
# chosen because the model carries large numerical arrays.

# Persist the trained pipeline, then read it back to make predictions with the
# stored copy. Only load pickle files from a source you trust.
model_path = 'med_expenses.pkl'
joblib.dump(pipeline, model_path)
pipeline = joblib.load(model_path)

# One-row frame describing the new customer, using the training feature names.
new_customer = pd.DataFrame([{
    'age': 25,
    'sex': 'male',
    'bmi': 23.2,
    'children': 2,
    'smoker': 'no',
    'region': 'northwest',
}])

# Premium = predicted yearly expenditure times a loading factor.
# The factor is a sample value taken from an online source.
premium_factor = 1.2
estimated_expenditure = pipeline.predict(new_customer)[0]
annual_premium = estimated_expenditure * premium_factor
monthly_premium = annual_premium / 12  # spread the annual premium over 12 months

print(f'Estimated Annual Medical Expenditure: ${estimated_expenditure:.2f}')
print(f'Annual Premium: ${annual_premium:.2f}')
print(f'Monthly Premium: ${monthly_premium:.2f}')

Estimated Annual Medical Expenditure: $6552.24
Annual Premium: $7862.69
Monthly Premium: $655.22


In [12]:
# Second example customer, described as a one-row frame with the training
# feature names.
new_customer2 = pd.DataFrame([{
    'age': 41,
    'sex': 'female',
    'bmi': 26.2,
    'children': 1,
    'smoker': 'no',
    'region': 'northeast',
}])

# Premium = predicted yearly expenditure times the loading factor,
# then divided over 12 months for the monthly figure.
premium_factor = 1.2
estimated_expenditure = pipeline.predict(new_customer2)[0]
annual_premium = estimated_expenditure * premium_factor
monthly_premium = annual_premium / 12

print(f'Estimated Annual Medical Expenditure: ${estimated_expenditure:.2f}')
print(f'Annual Premium: ${annual_premium:.2f}')
print(f'Monthly Premium: ${monthly_premium:.2f}')

Estimated Annual Medical Expenditure: $6968.54
Annual Premium: $8362.24
Monthly Premium: $696.85
