In [15]:
# dependecies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier

PREPROCESSING/EDA

In [16]:
# reading in the data
data = pd.read_csv('insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [17]:
# understanding categorical vs numerical (can also use df.dtypes)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [18]:
# checking null values
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [19]:
# checking duplicated values
data.duplicated().sum()

np.int64(1)

In [20]:
duplicated_rows = data[data.duplicated()]
duplicated_rows

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.5631


In [21]:
# handling duplicates
data = data.drop_duplicates()

In [22]:
data.shape

(1337, 7)

don't do label encoding which is assigning a number to each feature; it messes up a neural network; changing to one hot encoding which makes dummy variables for each feature (sql distinct comes in clutch here) and then uses binary

In [23]:
ohe_region = pd.get_dummies(data.region)
ohe_sex = pd.get_dummies(data.sex)
ohe_smoker = pd.get_dummies(data.smoker)

In [24]:
# numerical => categorical for "imbalance"/skew purposes; THIS IS CALLED BINNING
data['age_category'] = pd.cut(data['age'], bins=[0, 25, 35, 45, 55, 65], labels=['18-25', '26-35', '36-45', '46-55', '56-65'])
data['bmi_category'] = pd.cut(data['bmi'], bins=[0, 18.5, 25, 30, np.inf], labels=['Underweight', 'Normal', 'Overweight', 'Obese'])
data['charges_category'] = pd.cut(data['charges'], bins=[0, 10000, 20000, 30000, 40000, 50000, 60000, np.inf], labels=['$0-10,000', '$10,001-20,000', '$20,001-30,000', '$30,001-40,000', '$40,001-50,000', '$50,001-60,000', '$60,000+'])

In [25]:
data = pd.concat([data, ohe_region, ohe_sex, ohe_smoker], axis='columns')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,age_category,bmi_category,charges_category,northeast,northwest,southeast,southwest,female,male,no,yes
0,19,female,27.9,0,yes,southwest,16884.924,18-25,Overweight,"$10,001-20,000",False,False,False,True,True,False,False,True
1,18,male,33.77,1,no,southeast,1725.5523,18-25,Obese,"$0-10,000",False,False,True,False,False,True,True,False
2,28,male,33.0,3,no,southeast,4449.462,26-35,Obese,"$0-10,000",False,False,True,False,False,True,True,False
3,33,male,22.705,0,no,northwest,21984.47061,26-35,Normal,"$20,001-30,000",False,True,False,False,False,True,True,False
4,32,male,28.88,0,no,northwest,3866.8552,26-35,Overweight,"$0-10,000",False,True,False,False,False,True,True,False


In [26]:
data.to_csv('cleaned_data.csv', index=False)

BEGINNING THE VISUALIZATIONS

In [27]:
# for all my plots
palette = [ '#C66F80', '#F4C7D0','#FCEBF1', '#4A6644', '#9FAA74', '#D7DAB3', '#ECE3D2']
customCmap = ListedColormap(palette)

In [28]:
corr = data.corr()

plt.figure(figsize=[12, 8])
sns.heatmap(corr, annot=True, fmt='.4f', cmap=customCmap)
plt.title('Correlation Matrix of Variables')
plt.show()


ValueError: could not convert string to float: 'female'

nothing really of note; strong negative correlation between smoking & insurance charges

In [None]:
# histograms
data.hist(bins=15, figsize=[15,10])
plt.show()

age, children, and charges are skewed to the right. bmi is reminscent of a normal curve. There is evidence of imbalance in age, children, and smoker features.

In [None]:
# due to charge feature skew, most outliers are on the higher end. 
sns.boxplot(y='charges', x='region', hue='smoker', data=data)

multiple experiments with boxplots; can make numerous conclusions about them. 

In [None]:
# pairplot video experimentation
sns.pairplot(data, kind='reg', palette=palette)

- detecting numerical features
- histograms on the diagonals; scatter plots everywhere else; can be changed
- use hue for categorical features
- boolean types are treated as numeric
- can specify which exact variables you want to see & which axis they're on 

In [None]:
sns.pairplot(data, hue='smoker', palette=palette, vars=['charges', 'bmi'])

In [None]:
# to understand a misconception
sns.kdeplot(data, x='age', hue='smoker', palette=palette)

learned quite a lot from that; despite the normal-looking curve for bmi, the results showed heavy skew for overweight & obese individuals

In [None]:
sns.boxplot(y='charges', x='sex', hue='bmi_category', data=data)

In [None]:
data.head()

In [None]:
# let's create a contingency table to see sex & bmi broken down
crosstab01 = pd.crosstab(data['sex'], data['bmi_category'])
crosstab01

was checking a duplicate issue; there's an extra row somewhere; i don't care to find it

In [None]:
data.duplicated().sum()

In [None]:
data.shape

In [None]:
crosstab01.plot(kind='bar', stacked = True)

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
# checking feature importance (i think??); seeing if i can see any distinct noticeable patterns
columns = ['age_category', 'sex', 'bmi_category', 'smoker', 'children', 'region']

for col in columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=col, hue='charges_category', data=data, palette=palette)
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.title('Insurance Charges', loc='right')
    plt.show()

In [None]:
# interesting but knew from sql queries as well as really only helpful for visualizations & classification problems
data['charges_category'].value_counts().plot(kind='bar')

In [None]:
# SO SKEWED
data['charges_category'].value_counts()

In [None]:
# checking feature importance (i think??); seeing if i can see any distinct noticeable patterns
columns = ['age_category', 'sex', 'bmi_category', 'smoker', 'children', 'region']

for col in columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=col, hue='charges_category', data=data, palette=palette)
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.title('Insurance Charges', loc='right')
    plt.show()

In [None]:
# finding average features for each charge category
numeric_columns = ['age', 'children', 'bmi']

for col in numeric_columns:
    mean_values = data.groupby('charges_category')[col].mean()
    plt.figure(figsize=(12, 6))
    mean_values.plot(kind='bar', color=customCmap.colors)
    plt.title(f'Average {col.capitalize()} per Insurance Charge Category')
    plt.xlabel('Charge Category')
    plt.xticks(rotation=0)
    plt.ylabel(col.capitalize())
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()

MODELING