In [None]:
import pandas as pd

# Load the dataset into a DataFrame.
# NOTE(review): the previous comment called the source file "insurance.csv",
# but the path actually read is ./medical.csv — confirm which name is correct.
csv_path = './medical.csv'
medical_ins_df = pd.read_csv(csv_path)

medical_ins_df.info()      # prints each column's dtype and non-null count
medical_ins_df.describe()  # summary statistics of the numeric columns (shown as the cell output)


In [None]:
# importing required libraries for plotting
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline


# Global matplotlib defaults applied to every figure drawn after this cell.
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    # 8-digit hex is RGBA; the trailing 00 alpha makes the figure background
    # transparent. The previous opaque black ('#000000') hid matplotlib's
    # default black titles, axis labels and tick labels, which are drawn on
    # the figure background outside the axes area.
    'figure.facecolor': '#00000000',
})


In [None]:
# Distribution of one column: an interactive histogram with a marginal box
# plot, followed by the column's summary statistics.

age_fig = px.histogram(
    medical_ins_df,
    x='age',
    marginal='box',
    nbins=47,  # presumably chosen to give about one bin per distinct age — TODO confirm
    title='Distribution of age',
)
age_fig.update_layout(bargap=0.1)  # small gap between neighbouring bars
age_fig.show()

medical_ins_df['age'].describe()

In [None]:
# Histogram (plus marginal box plot) of BMI, then its summary statistics.
bmi_fig = px.histogram(
    medical_ins_df,
    x='bmi',
    marginal='box',
    color_discrete_sequence=['red'],
    title='Distribution of BMI',
)
bmi_fig.update_layout(bargap=0.1)  # small gap between neighbouring bars
bmi_fig.show()

medical_ins_df['bmi'].describe()

In [None]:
# Histogram (plus marginal box plot) of the number of children.
children_fig = px.histogram(
    medical_ins_df,
    x='children',
    marginal='box',
    color_discrete_sequence=['black'],
    nbins=6,  # presumably one bin per distinct child count — TODO confirm
    title='Number of children',
)
children_fig.update_layout(bargap=0.1)  # small gap between neighbouring bars
children_fig.show()

medical_ins_df['children'].describe()

In [None]:
# Histogram (plus marginal box plot) of the charges column, then its summary
# statistics.
charges_fig = px.histogram(
    medical_ins_df,
    x='charges',
    marginal='box',
    color_discrete_sequence=['orange'],
    title='Charges paid',
)
charges_fig.update_layout(bargap=0.1)  # small gap between neighbouring bars
charges_fig.show()

medical_ins_df['charges'].describe()

In [None]:
# Analysing the string-typed columns.
# value_counts() returns how many rows carry each distinct value of the column.
medical_ins_df['smoker'].value_counts()

In [None]:
# Row counts per smoker value, with bars coloured by the 'sex' column.
px.histogram(
    medical_ins_df,
    x='smoker',
    color='sex',
    title='Smoker',
)

In [None]:
# Relation between age and charges as a scatter plot, coloured by smoker status.

age_charge_rel = px.scatter(
    medical_ins_df,
    x='age',
    y='charges',
    color='smoker',
    opacity=0.8,
    hover_data=['sex'],  # extra column shown in the hover tooltip
    title='Age vs Charges relation',
)
age_charge_rel.update_traces(marker_size=5)
age_charge_rel.show()

In [None]:
# Relation between BMI and charges as a scatter plot, coloured by smoker status.
bmi_charge_rel = px.scatter(
    medical_ins_df,
    x='bmi',
    y='charges',
    color='smoker',
    # Pass a list, matching the age-vs-charges plot. A bare string is only
    # tolerated by some plotly versions (others may treat it as an iterable of
    # characters) — NOTE(review): confirm against the installed version.
    hover_data=['sex'],
    # Title added so the figure stands alone like its age-vs-charges sibling.
    title='BMI vs Charges relation',
)
bmi_charge_rel.update_traces(marker_size=5)
bmi_charge_rel.show()

In [None]:
# # for any further analysis, we have to convert the strings into numerical datatypes

# smoker_val = {'yes':1,'no':0}   # assign them from 0
# medical_ins_df['smoker'] = medical_ins_df['smoker'].map(smoker_val)

# medical_ins_df['sex'] = medical_ins_df.sex.map({
#     'female':1,
#     'male':0
# })

# region_val = {
#     'southwest':0,
#     'southeast':1,
#     'northwest':2,
#     'northeast':3
# }
# medical_ins_df.region = medical_ins_df.region.map(region_val)

# # medical_ins_df.charges.corr(medical_ins_df.age)

In [None]:
# Correlation coefficients between charges and two numeric columns.
# Only the last expression of a cell is displayed, so two bare .corr() calls
# would silently drop the first result; collect both into one Series instead.
pd.Series(
    {
        'age': medical_ins_df.charges.corr(medical_ins_df.age),
        'bmi': medical_ins_df.charges.corr(medical_ins_df.bmi),
    },
    name='correlation with charges',
)


In [None]:
# import numpy as np

# sns.heatmap(medical_ins_df.corr(), cmap='Reds', annot=True)
# plt.title('Correlation Matrix')

# medical_ins_df.corr()

In [None]:
# Linear regression: work only with the rows whose smoker value is 'no'.
non_smoker_df = medical_ins_df.loc[medical_ins_df['smoker'] == 'no']

fig = px.scatter(
    non_smoker_df,
    x='age',
    y='charges',
    title='Age vs Charges',
)
fig.show()

In [None]:
# charges = w * age + b     (w: slope, b: intercept)
# y = w * x + b             general formula for a linear regression model

def estimate_charges(age,w,b):
    """Evaluate the straight-line model ``w * age + b``.

    Args:
        age: A single age or any object supporting element-wise arithmetic
            (the notebook passes a pandas Series).
        w: Slope multiplied with ``age``.
        b: Intercept added to the product.

    Returns:
        The estimate(s), with the same shape/type arithmetic on ``age`` yields.
    """
    scaled_age = w * age
    return scaled_age + b

# First guess for the line's parameters (slope w, intercept b).
w, b = 50, 100

# Ages of all non-smokers, and the charges the guessed line gives for them.
ages = non_smoker_df['age']
estimated_charges = estimate_charges(ages, w, b)

line_fig = px.line(
    non_smoker_df,
    x=ages,
    y=estimated_charges,
    title="Age vs Estimated Charges",
)
line_fig.show()

In [None]:
# checking if the model fits with the current data
targets = non_smoker_df.charges

# plt.plot(ages, estimated_charges, 'r', alpha=0.9);
# plt.scatter(ages, targets, s=8,alpha=0.8);
# plt.xlabel('Age');
# plt.ylabel('Charges')
# plt.legend(['Estimate', 'Actual']);

# target_rel_fig = px.scatter(non_smoker_df,x=ages,y=targets,opacity=0.5,title='Distribution of ages and target charges')
# target_rel_fig.update(non_smoker_df,x=ages,y=estimated_charges)
# target_rel_fig.show()

target = non_smoker_df.charges

plt.plot(ages, estimated_charges, 'r', alpha=0.9);
plt.scatter(ages, target, s=8,alpha=0.8);
plt.xlabel('Age');
plt.ylabel('Charges')
plt.legend(['Estimate', 'Actual']);


In [None]:
def try_parameter(w,b):
    """Plot the line ``w * age + b`` over the non-smokers' actual charges.

    Reads the module-level ``non_smoker_df`` and uses ``estimate_charges`` to
    compute the estimates; draws on the current matplotlib axes.

    Args:
        w: Slope of the candidate line.
        b: Intercept of the candidate line.
    """
    ages = non_smoker_df['age']
    target = non_smoker_df['charges']
    estimated_charges = estimate_charges(ages, w, b)

    ax = plt.gca()
    ax.plot(ages, estimated_charges, 'r', alpha=0.9, label='Estimate')
    ax.scatter(ages, target, alpha=0.8, label='Actual')
    ax.set_xlabel('Ages')
    ax.set_ylabel('Charges')
    ax.legend()

# Try a candidate line with slope 400 and intercept 5000.
try_parameter(w=400, b=5000);


In [None]:
import numpy as np

def rmse(targets,predictions):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        targets: Actual values (list, tuple, NumPy array or pandas Series).
        predictions: Estimated values of matching length.

    Returns:
        The square root of the mean of the squared differences.

    Plain lists/tuples do not support element-wise subtraction, so they are
    promoted to NumPy arrays first. Other inputs (e.g. pandas Series) are left
    untouched, so their own subtraction semantics still apply.
    """
    if isinstance(targets, (list, tuple)):
        targets = np.asarray(targets)
    if isinstance(predictions, (list, tuple)):
        predictions = np.asarray(predictions)
    return np.sqrt(np.mean(np.square(targets - predictions)))

# Next candidate parameters: slope w and intercept b.
w, b = 1500, 500
