In [1]:
# dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from scipy import stats
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from scikeras.wrappers import KerasClassifier

: 

: 

In [None]:
# reading in the data
# Loads 'insurance.csv' from the current working directory into `data`,
# then previews the first rows as the cell output.
data = pd.read_csv('insurance.csv')
data.head()

In [None]:
# understanding categorical vs numerical (can also use df.dtypes)
# info() prints each column's dtype together with its non-null count.
data.info()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# checking duplicated values
# duplicated() flags rows that repeat an earlier row; the sum is how many such rows exist.
data.duplicated().sum()

In [None]:
# Show the rows flagged as repeats of an earlier row.
duplicated_rows = data.loc[data.duplicated()]
duplicated_rows

In [None]:
# handling duplicates
# Rebinds `data` to a frame without the repeated rows. The index is not reset,
# so the original row labels are kept.
data = data.drop_duplicates()

In [None]:
# Re-print the dtype / non-null summary now that duplicates have been dropped.
data.info()

In [None]:
# Re-count duplicated rows after drop_duplicates() to confirm none remain.
data.duplicated().sum()

In [None]:
# NOTE(review): same summary as printed two cells earlier; `data` has not changed in between.
data.info()

In [None]:
# categorical => numerical
# Integer codes for each text column, keyed by the name of the numeric column
# they produce. The codes are arbitrary labels, not an ordering.
# NOTE: smoker is coded yes=1 / no=2, so a HIGHER smoker_numeric means
# NON-smoker; correlations involving it carry the opposite sign to "is a smoker".
dict_sex = {'sex_numeric': {'female': 1, 'male': 2}}
dict_region = {'region_numeric': {'northeast': 1, 'northwest': 2, 'southeast': 3, 'southwest': 4}}
dict_smoker = {'smoker_numeric': {'yes': 1, 'no': 2}}

# Build every *_numeric column with Series.map instead of copying the text
# column and calling DataFrame.replace(..., inplace=True). replace() only
# yields an integer column through implicit downcasting, which newer pandas
# deprecates; map() returns the integer codes directly.
# A value missing from a mapping becomes NaN rather than staying as text.
encodings = {'sex': dict_sex, 'region': dict_region, 'smoker': dict_smoker}
for source_col, encoding in encodings.items():
    for numeric_col, codes in encoding.items():
        data[numeric_col] = data[source_col].map(codes)

In [None]:
# Keep both representations available as separate frames:
#   data_numeric - integer-coded columns, without the original text columns
#   data         - original text columns, without the integer-coded ones
data_numeric = data.drop(columns=['sex', 'region', 'smoker'])
data.drop(columns=['sex_numeric', 'region_numeric', 'smoker_numeric'], inplace=True)

In [None]:
# Preview `data` now that the *_numeric columns have been dropped from it.
data.head()

In [None]:
# Preview the numeric frame (text columns sex/region/smoker removed).
data_numeric.head()

In [None]:
# Dtype / non-null summary of the numeric frame, to check how the *_numeric columns were typed.
data_numeric.info()

In [None]:
# get means, standard deviations, min, max, etc.
# Summary statistics are computed per column of the numeric frame.
data_numeric.describe()

BEGINNING THE VISUALIZATIONS

In [None]:
# Pairwise correlations between the columns of the numeric frame,
# drawn as an annotated heatmap.
corr = data_numeric.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, fmt='.4f', ax=ax)
ax.set_title('Correlation Matrix of Variables')
plt.show()


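Nothing really of note, apart from a strong negative correlation between `smoker_numeric` and insurance charges. The negative sign comes from the encoding (yes = 1, no = 2): it means smokers have the higher charges.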

In [None]:
# One histogram per column of the numeric frame, 15 bins each.
data_numeric.hist(figsize=(15, 10), bins=15)
plt.show()

Age, children, and charges are skewed to the right. BMI is reminiscent of a normal curve. There is evidence of imbalance in the age, children, and smoker features.

In [None]:
# get mean, standard deviation, basic statistics
# NOTE(review): same describe() call as made on `data_numeric` before the visualizations.
data_numeric.describe()

In [None]:
# Charges per region code, split by smoker code. Since the charges feature is
# skewed, most outliers are on the higher end.
sns.boxplot(data=data_numeric, x='region_numeric', y='charges', hue='smoker_numeric')

Multiple experiments with boxplots; numerous conclusions can be drawn from them.

In [None]:
# Pair plot experiment (following a video): kind='reg' requests regression
# plots for the pairwise panels.
sns.pairplot(data=data, kind='reg')

- detecting numerical features
- histograms on the diagonals; scatter plots everywhere else; can be changed
- use hue for categorical features
- boolean types are treated as numeric
- can specify which exact variables you want to see & which axis they're on 

In [None]:
# Custom 7-colour palette shared by the plots below. Every entry must be a
# '#RRGGBB' hex string; matplotlib rejects a spec without the leading '#'.
palette = ['#C66F80', '#F4C7D0', '#FCEBF1', '#4A6644', '#9FAA74', '#D7DAB3', '#ECE3D2']
# Pair plot restricted to charges and bmi, coloured by smoker status using
# the 'plasma' palette.
sns.pairplot(data=data, vars=['charges', 'bmi'], hue='smoker', palette='plasma')

In [None]:
# to understand a misconception
# Age density, one curve per smoker status.
# `palette` is a 7-colour list but `smoker` has fewer levels; seaborn warns
# (older releases raise) when a list palette's length does not match the
# number of hue levels, so pass only as many colours as are needed.
# `data` is passed by keyword so the call does not depend on kdeplot's
# positional-argument order.
sns.kdeplot(data=data, x='age', hue='smoker', palette=palette[:data['smoker'].nunique()])

In [None]:
# numerical => categorical for "imbalance"/skew purposes
# pd.cut bins are right-closed by default, so a value equal to an edge falls
# in the lower bin (e.g. age 25 -> '18-25').
age_bins = [0, 25, 35, 45, 55, 65, 100]
age_labels = ['18-25', '26-35', '36-45', '46-55', '56-65', '65+']
data['age_category'] = pd.cut(data['age'], bins=age_bins, labels=age_labels)

bmi_bins = [0, 18.5, 25, 30, np.inf]
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
data['bmi_category'] = pd.cut(data['bmi'], bins=bmi_bins, labels=bmi_labels)

charges_bins = [0, 10000, 20000, 30000, 40000, 50000, 60000, np.inf]
charges_labels = [
    '$0-10,000',
    '$10,001-20,000',
    '$20,001-30,000',
    '$30,001-40,000',
    '$40,001-50,000',
    '$50,001-60,000',
    '$60,000+',
]
data['charges_category'] = pd.cut(data['charges'], bins=charges_bins, labels=charges_labels)

In [None]:
# Preview `data` with the new age/bmi/charges *_category columns.
data.head()

In [None]:
# Save the frame, including the derived *_category columns, to
# 'data_categorical.csv' in the working directory; the index is not written.
data.to_csv('data_categorical.csv', index=False)

In [None]:
# Dtype / non-null summary, now including the *_category columns created by pd.cut.
data.info()

Learned quite a lot from that: despite the normal-looking curve for BMI, the results showed a heavy skew toward overweight and obese individuals.

In [None]:
# Charges by sex, with one box per BMI category.
sns.boxplot(data=data, x='sex', y='charges', hue='bmi_category')

In [None]:
# NOTE(review): repeat preview; `data` has not been modified since the *_category columns were added.
data.head()

In [None]:
# let's create a contingency table to see sex & bmi broken down
crosstab01 = pd.crosstab(data['sex'], data['bmi_category'])
crosstab01

I was checking a duplicate issue: there is an extra row somewhere, but I don't care to find it.

In [None]:
# Count rows of `data` (now including the *_category columns) that repeat an earlier row.
data.duplicated().sum()

In [None]:
# (number of rows, number of columns) of `data`.
data.shape

In [None]:
# Stacked bar chart of the sex x BMI-category contingency table.
crosstab01.plot.bar(stacked=True)

In [None]:
# NOTE(review): repeat preview; `data` has not been modified since the previous head() cell.
data.head()

In [None]:
# checking feature importance (i think??); seeing if i can see any distinct noticeable patterns
columns = ['age_category', 'sex', 'bmi_category', 'smoker', 'children', 'region']

for col in columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=col, hue='charges_category', data=data, palette=palette)
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.title('Insurance Charges', loc='right')
    plt.show()