In [1]:
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pyarrow
import altair as alt

# Import Data

In [2]:
df = pl.read_csv("insurance.csv")

df.head(50)

age,sex,bmi,children,smoker,region,charges
f64,str,f64,f64,str,str,str
19.0,"""female""",27.9,0.0,"""yes""","""southwest""","""16884.924"""
18.0,"""male""",33.77,1.0,"""no""","""Southeast""","""1725.5523"""
28.0,"""male""",33.0,3.0,"""no""","""southeast""","""$4449.462"""
33.0,"""male""",22.705,0.0,"""no""","""northwest""","""$21984.47061"""
32.0,"""male""",28.88,0.0,"""no""","""northwest""","""$3866.8552"""
…,…,…,…,…,…,…
55.0,"""male""",37.3,0.0,"""no""","""Southwest""","""20630.28351"""
18.0,"""female""",38.665,2.0,"""no""","""northeast""","""3393.35635"""
28.0,"""F""",34.77,0.0,"""no""","""Northwest""","""3556.9223"""
60.0,"""female""",24.53,0.0,"""no""","""Southeast""","""12629.8967"""


In [3]:
df.glimpse()

Rows: 1338
Columns: 7
$ age      <f64> 19.0, 18.0, 28.0, 33.0, 32.0, -31.0, 46.0, 37.0, 37.0, 60.0
$ sex      <str> 'female', 'male', 'male', 'male', 'male', 'woman', 'woman', 'female', 'male', 'female'
$ bmi      <f64> 27.9, 33.77, 33.0, 22.705, 28.88, 25.74, 33.44, 27.74, 29.83, 25.84
$ children <f64> 0.0, 1.0, 3.0, 0.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0
$ smoker   <str> 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'
$ region   <str> 'southwest', 'Southeast', 'southeast', 'northwest', 'northwest', 'southeast', 'southeast', 'Northwest', 'Northeast', 'northwest'
$ charges  <str> '16884.924', '1725.5523', '$4449.462', '$21984.47061', '$3866.8552', '$3756.6216', '8240.5896', '7281.5056', '$6406.4107', '28923.13692'



In [4]:
df.describe()

statistic,age,sex,bmi,children,smoker,region,charges
str,f64,str,f64,f64,str,str,str
"""count""",1272.0,"""1272""",1272.0,1272.0,"""1272""","""1272""","""1284"""
"""null_count""",66.0,"""66""",66.0,66.0,"""66""","""66""","""54"""
"""mean""",35.214623,,30.56055,0.948899,,,
"""std""",22.478251,,6.095573,1.303532,,,
"""min""",-64.0,"""F""",15.96,-4.0,"""no""","""Northeast""","""$10065.413"""
"""25%""",25.0,,26.18,0.0,,,
"""50%""",38.0,,30.21,1.0,,,
"""75%""",51.0,,34.485,2.0,,,
"""max""",64.0,"""woman""",53.13,5.0,"""yes""","""southwest""","""9991.03765"""


In [5]:
df.schema

Schema([('age', Float64),
        ('sex', String),
        ('bmi', Float64),
        ('children', Float64),
        ('smoker', String),
        ('region', String),
        ('charges', String)])

# Data Cleaning

## Check null values

In [6]:
df.null_count()

age,sex,bmi,children,smoker,region,charges
u32,u32,u32,u32,u32,u32,u32
66,66,66,66,66,66,54


## Drop null values

In [7]:
df_nonull = df.drop_nulls()

In [8]:
df_nonull.describe()

statistic,age,sex,bmi,children,smoker,region,charges
str,f64,str,f64,f64,str,str,str
"""count""",1208.0,"""1208""",1208.0,1208.0,"""1208""","""1208""","""1208"""
"""null_count""",0.0,"""0""",0.0,0.0,"""0""","""0""","""0"""
"""mean""",35.35596,,30.574971,0.942881,,,
"""std""",22.061241,,6.117562,1.311809,,,
"""min""",-64.0,"""F""",15.96,-4.0,"""no""","""Northeast""","""$10065.413"""
"""25%""",25.0,,26.2,-0.0,,,
"""50%""",38.0,,30.25,1.0,,,
"""75%""",51.0,,34.58,2.0,,,
"""max""",64.0,"""woman""",53.13,5.0,"""yes""","""southwest""","""9991.03765"""


## Check unique values

In [9]:
num_columns = df_nonull["age", "bmi", "charges"]

for num_column in num_columns:
    print(num_column.unique_counts())

shape: (80,)
Series: 'age' [u32]
[
	60
	61
	23
	23
	23
	…
	1
	2
	1
	1
	1
]
shape: (528,)
Series: 'bmi' [u32]
[
	1
	2
	3
	3
	8
	…
	1
	1
	1
	1
	1
]
shape: (1_207,)
Series: 'charges' [u32]
[
	1
	1
	1
	1
	1
	…
	1
	1
	1
	1
	1
]


In [10]:
cat_columns = df_nonull["sex", "children", "smoker", "region"]

for cat_column in cat_columns:
    print(cat_column.unique())

shape: (6,)
Series: 'sex' [str]
[
	"male"
	"female"
	"F"
	"man"
	"M"
	"woman"
]
shape: (10,)
Series: 'children' [f64]
[
	-4.0
	-3.0
	-2.0
	-1.0
	0.0
	1.0
	2.0
	3.0
	4.0
	5.0
]
shape: (2,)
Series: 'smoker' [str]
[
	"no"
	"yes"
]
shape: (8,)
Series: 'region' [str]
[
	"southeast"
	"Northeast"
	"Northwest"
	"southwest"
	"northeast"
	"northwest"
	"Southwest"
	"Southeast"
]


## Clean age values so it doesn't contain negative value

In [11]:
df_clean_age = df_nonull.with_columns(pl.col("age").abs())

## Clean sex values to only male and female

In [12]:
df_clean_sex = df_clean_age.with_columns(
    pl.when(pl.col("sex").str.strip_chars().is_in(["F", "woman"]))
    .then(pl.lit("female"))
    .when(pl.col("sex").str.strip_chars().is_in(["M", "man"]))
    .then(pl.lit("male"))
    .otherwise(pl.col("sex"))
    .alias("sex")
    .cast(pl.Enum(["male", "female"]))
)

## Clean bmi values so it only contain 1 number after comma

In [13]:
df_clean_bmi = df_clean_sex.with_columns(pl.col("bmi").round(1))

## Clean children values so it doesn't contain negative value

In [14]:
df_clean_children = df_clean_bmi.with_columns(pl.col("children").abs().cast(pl.Enum(["0", "1", "2", "3", "4", "5"])))

In [15]:
df_clean_smoker = df_clean_children.with_columns(pl.col("smoker").cast(pl.Enum(["yes", "no"])))

## Clean region values to only (southwest, southeast, northwest, northeast) values

In [16]:
df_clean_region = df_clean_smoker.with_columns(pl.col("region").str.to_lowercase()
    .cast(pl.Enum(["southwest", "southeast", "northwest", "northeast"])))

## Clean charges values to remove $ symbol

In [17]:
df_clean_charges = df_clean_region.with_columns(
    pl.col("charges").str.replace_all("$", "", literal=True).cast(pl.Float64))

In [18]:
df_clean_charges

age,sex,bmi,children,smoker,region,charges
f64,enum,f64,enum,enum,enum,f64
19.0,"""female""",27.9,"""0""","""yes""","""southwest""",16884.924
18.0,"""male""",33.8,"""1""","""no""","""southeast""",1725.5523
28.0,"""male""",33.0,"""3""","""no""","""southeast""",4449.462
33.0,"""male""",22.7,"""0""","""no""","""northwest""",21984.47061
32.0,"""male""",28.9,"""0""","""no""","""northwest""",3866.8552
…,…,…,…,…,…,…
50.0,"""male""",31.0,"""3""","""no""","""northwest""",10600.5483
18.0,"""female""",31.9,"""0""","""no""","""northeast""",2205.9808
18.0,"""female""",36.8,"""0""","""no""","""southeast""",1629.8335
21.0,"""female""",25.8,"""0""","""no""","""southwest""",2007.945


## Filter df with no null values in charges column

In [19]:
df_clean_charges2 = df_clean_charges.filter(
    pl.col("charges").is_not_nan()
)

In [20]:
numeric_cols = df_clean_charges2.select(cs.numeric()).columns

is_not_outlier_exprs = []
for col_out in numeric_cols:
    q1 = pl.col(col_out).quantile(0.25)
    q3 = pl.col(col_out).quantile(0.75)
    iqr = q3 - q1
    low_fence = q1 - 1.5 * iqr
    high_fence = q3 + 1.5 * iqr
    outliers = df_clean_charges2.filter( (pl.col(col_out) < low_fence) | (pl.col(col_out) > high_fence) )
    is_not_outlier_exprs.append(pl.col(col_out).is_between(low_fence, high_fence))

df_cleaned = df_clean_charges2.filter(
    pl.all_horizontal(is_not_outlier_exprs)
)
print(f"Outliers: {outliers} \n")
print(df_cleaned)

Outliers: shape: (129, 7)
┌──────┬────────┬──────┬──────────┬────────┬───────────┬─────────────┐
│ age  ┆ sex    ┆ bmi  ┆ children ┆ smoker ┆ region    ┆ charges     │
│ ---  ┆ ---    ┆ ---  ┆ ---      ┆ ---    ┆ ---       ┆ ---         │
│ f64  ┆ enum   ┆ f64  ┆ enum     ┆ enum   ┆ enum      ┆ f64         │
╞══════╪════════╪══════╪══════════╪════════╪═══════════╪═════════════╡
│ 27.0 ┆ male   ┆ 42.1 ┆ 0        ┆ yes    ┆ southeast ┆ 39611.7577  │
│ 30.0 ┆ male   ┆ 35.3 ┆ 0        ┆ yes    ┆ southwest ┆ 36837.467   │
│ 31.0 ┆ male   ┆ 36.3 ┆ 2        ┆ yes    ┆ southwest ┆ 38711.0     │
│ 22.0 ┆ male   ┆ 35.6 ┆ 0        ┆ yes    ┆ southwest ┆ 35585.576   │
│ 28.0 ┆ male   ┆ 36.4 ┆ 1        ┆ yes    ┆ southwest ┆ 51194.55914 │
│ …    ┆ …      ┆ …    ┆ …        ┆ …      ┆ …         ┆ …           │
│ 45.0 ┆ male   ┆ 30.4 ┆ 0        ┆ yes    ┆ southeast ┆ 62592.87309 │
│ 62.0 ┆ male   ┆ 30.9 ┆ 3        ┆ yes    ┆ northwest ┆ 46718.16325 │
│ 43.0 ┆ male   ┆ 27.8 ┆ 0        ┆ yes    ┆ southw

In [21]:
df_cleaned.describe()

statistic,age,sex,bmi,children,smoker,region,charges
str,f64,str,f64,str,str,str,f64
"""count""",1072.0,"""1072""",1072.0,"""1072""","""1072""","""1072""",1072.0
"""null_count""",0.0,"""0""",0.0,"""0""","""0""","""0""",0.0
"""mean""",39.017724,,29.897575,,,,9892.509093
"""std""",14.055069,,5.903647,,,,7115.551374
"""min""",18.0,,16.0,,,,1121.8739
"""25%""",26.0,,25.7,,,,4415.1588
"""50%""",39.0,,29.6,,,,8515.7587
"""75%""",51.0,,33.7,,,,12979.358
"""max""",64.0,,46.8,,,,34303.1672


#Exploratory Data Analysis (EDA)

In [22]:
num_cols = df_cleaned.select(cs.numeric()).columns
cat_cols = df_cleaned.select(cs.enum()).columns

## Univariate Analysis

### Numerical Columns

In [23]:
df_long = df_cleaned.unpivot(
    on=num_cols,
    variable_name="column",
    value_name="value"
)

boxplot = alt.Chart(df_long).mark_boxplot().encode(
    y=alt.Y('value:Q', title=""),
    column=alt.Column('column:N', title="Boxplot"),
    tooltip=alt.Tooltip('value:Q')
).properties(
    width=200,
    height=300
).resolve_scale(
    y='independent'
)

boxplot

In [24]:

num_col_histogram = []
for col_name in num_cols:
    num_chart = alt.Chart(df_cleaned).mark_bar().encode(
        x=alt.X(col_name, type='quantitative', bin=alt.Bin(maxbins=20), title=col_name),
        y=alt.Y('count()', title='Count'),
        tooltip=[
            alt.Tooltip('count()', title='Total Count'),
            alt.Tooltip(col_name, type='quantitative', bin=True, title='Value Range')
        ]

    ).properties(
        width=250,
        height=300
    )
    num_col_histogram.append(num_chart)

combined_chart = alt.hconcat(*num_col_histogram)

combined_chart

### Categorical Columns

In [25]:
#
cat_histogram = []
for cat_name in cat_cols:
    cat_chart = alt.Chart(df_cleaned).mark_bar().encode(
        x=alt.X(cat_name, type='nominal', title=cat_name, sort='-y'),
        y=alt.Y('count()', title='Count'),
        tooltip=[
            alt.Tooltip(cat_name, type='nominal'),
            alt.Tooltip('count()', title='Count')
        ]
    ).properties(
        width=200,
        height=250
    )
    cat_histogram.append(cat_chart)

combined_cat_hist = alt.hconcat(*cat_histogram)

combined_cat_hist

## Multivariate Analysis

### Pairplot

In [26]:
base_chart = alt.Chart(df_cleaned).mark_point().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
    tooltip=[alt.Tooltip(col) for col in num_cols]
).properties(
    width=200,
    height=200
)

scatter_matrix = base_chart.repeat(
    row=num_cols,
    column=num_cols
)

diagonal_histograms = alt.Chart(df_cleaned).mark_bar().encode(
    x=alt.X(alt.repeat("row"), type='quantitative', bin=True),
    y='count()'
).properties(
    width=200,
    height=200
).repeat(
    row=num_cols
)

pair_plot = scatter_matrix | diagonal_histograms

pair_plot

### Heatmap Correlation

In [27]:
corr_df_wide = df_cleaned.select(num_cols).corr()

corr_df_with_names = corr_df_wide.with_columns(
    pl.Series(name="variable_1", values=corr_df_wide.columns)
)

corr_df_long = corr_df_with_names.unpivot(
    index="variable_1",
    variable_name="variable_2",
    value_name="correlation"
)

heatmap = alt.Chart(corr_df_long).mark_rect().encode(
    x=alt.X('variable_1:N', title=None, sort=num_cols),
    y=alt.Y('variable_2:N', title=None, sort=num_cols),
    color=alt.Color('correlation:Q', scale=alt.Scale(scheme='redblue', domain=(-1, 1)))
)

text_labels = heatmap.mark_text(size=12).encode(
    text=alt.Text('correlation:Q', format='.2f'),
    color=alt.condition(
        abs(alt.datum.correlation) > 0.5,
        alt.value('white'),
        alt.value('black')
    )
)

correlation_heatmap = (heatmap + text_labels).properties(
    width=400,
    height=400,
    title="Correlation Matrix of Numeric Features"
)

correlation_heatmap

# Data Preprocessing

In [28]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def create_and_evaluate_regression_model_pl(df_cleaned):
    """
    Prepares data from a Polars DF, fits a linear regression model, and evaluates it.

    Parameters:
    - insurance: polars.DataFrame, the cleaned insurance dataset.

    Returns:
    - A tuple containing the fitted sklearn Pipeline object, mean MSE, and mean R2 scores.
    """
    
    X = df_cleaned.drop('charges')
    y = df_cleaned.select('charges').to_series()

    categorical_features = ['sex', 'smoker', 'region']
    numerical_features = ['age', 'bmi', 'children']

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    mse_scores = -cross_val_score(model_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring='r2')
    mean_mse = np.mean(mse_scores)
    mean_r2 = np.mean(r2_scores)

    model_pipeline.fit(X, y)

    return model_pipeline, mean_mse, mean_r2

In [29]:
fitted_model, mse, r2 = create_and_evaluate_regression_model_pl(df_cleaned)
print(f"Mean MSE: {mse}")
print(f"Mean R-squared: {r2}")

Mean MSE: 20694593.917985547
Mean R-squared: 0.5904672941934862


In [30]:
validation_data = pl.read_csv("validation_dataset.csv")

validation_predictions = fitted_model.predict(validation_data)

validation_data_with_predictions = validation_data.with_columns(
    predicted_charges = pl.Series(validation_predictions)
).with_columns(
     pl.col("predicted_charges").clip(lower_bound=1000).alias("predicted_charges")
)

print(validation_data_with_predictions)

shape: (50, 7)
┌──────┬────────┬───────────┬──────────┬────────┬───────────┬───────────────────┐
│ age  ┆ sex    ┆ bmi       ┆ children ┆ smoker ┆ region    ┆ predicted_charges │
│ ---  ┆ ---    ┆ ---       ┆ ---      ┆ ---    ┆ ---       ┆ ---               │
│ f64  ┆ str    ┆ f64       ┆ f64      ┆ str    ┆ str       ┆ f64               │
╞══════╪════════╪═══════════╪══════════╪════════╪═══════════╪═══════════════════╡
│ 18.0 ┆ female ┆ 24.09     ┆ 1.0      ┆ no     ┆ southeast ┆ 2756.149113       │
│ 39.0 ┆ male   ┆ 26.41     ┆ 0.0      ┆ yes    ┆ northeast ┆ 22359.792171      │
│ 27.0 ┆ male   ┆ 29.15     ┆ 0.0      ┆ yes    ┆ southeast ┆ 18734.175622      │
│ 71.0 ┆ male   ┆ 65.502135 ┆ 13.0     ┆ yes    ┆ southeast ┆ 37235.958991      │
│ 28.0 ┆ male   ┆ 38.06     ┆ 0.0      ┆ no     ┆ southeast ┆ 5188.958414       │
│ …    ┆ …      ┆ …         ┆ …        ┆ …      ┆ …         ┆ …                 │
│ 58.0 ┆ male   ┆ 36.955    ┆ 2.0      ┆ yes    ┆ northwest ┆ 28175.380254      │
│

## Streamlit App Preparation

In [31]:
import joblib

joblib.dump(fitted_model, 'insurance_charge_model.pkl')
print("Model saved successfully!")

Model saved successfully!
