## Setup

In [None]:
import sys

# Append the parent directory to the module search path.
# NOTE(review): presumably this is what lets the `lib` package imported in the
# next cell resolve; the path is relative, so it depends on the working
# directory the notebook is run from — confirm.
sys.path.append("../")

In [None]:
import pandas as pd

from lib.data_loading import read_data
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

## Load data

In [None]:
# Location of the training CSV, relative to the notebook's working directory.
TRAINING_CSV = "../data/pg15training.csv"

# Load the dataset through the project's reader and preview the first rows.
df = read_data(TRAINING_CSV)
df.head()

## EDA

### Info

In [None]:
# Structural overview of the frame (columns, non-null counts, dtypes).
print("Data Overview")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Descriptive statistics as produced by DataFrame.describe().
print("\nSummary Statistics")
print(df.describe())

### Missing values

In [None]:
# Per-column count of missing entries, printed under a heading.
# `isna` is the canonical name for `isnull`; both return the same mask.
print("\nMissing Values", df.isna().sum(), sep="\n")

### Distribution of numerical features

In [None]:
# Columns plotted as histograms here and reused for the correlation matrix.
numerical_features = [
    "Age",
    "Group1",
    "Bonus",
    "Poldur",
    "Value",
    "Adind",
    "Density",
    "Exppdays",
    "Numtppd",
    "Numtpbi",
    "Indtppd",
    "Indtpbi",
]

# Draw one histogram per listed column.
for col in numerical_features:
    hist_title = f"Distribution of {col}"
    fig = px.histogram(df, x=col, title=hist_title)
    fig.show()

### Count of each category for categorical features

In [None]:
# Columns whose per-category counts are plotted.
categorical_features = [
    "Gender",
    "Type",
    "Category",
    "Occupation",
    "SubGroup2",
    "Group2",
]

# One count plot per column; bars are coloured by the column's own values.
for col in categorical_features:
    count_title = f"Count of {col}"
    fig = px.histogram(df, x=col, color=col, title=count_title)
    fig.show()

### Relationship between Age and Numtppd by Gender and Type

In [None]:
# Scatter of Age against Numtppd: marker colour encodes Gender and marker
# symbol encodes Type.
fig = px.scatter(
    df,
    x="Age",
    y="Numtppd",
    color="Gender",
    symbol="Type",
    title="Age vs. Numtppd by Gender and Type",
)
fig.show()

### Distribution of Numtppd by Group1

In [None]:
# Box plot of Numtppd, with one box per Group1 value.
fig = px.box(
    df,
    x="Group1",
    y="Numtppd",
    title="Distribution of Numtppd by Group1",
)
fig.show()

### Density Distribution by Occupation

In [None]:
# Box plot of Density, with one box per Occupation value.
fig = px.box(
    df,
    x="Occupation",
    y="Density",
    title="Density Distribution by Occupation",
)
fig.show()

### Correlation Heatmap for Numerical Features

In [None]:
# Pairwise correlation matrix of the columns listed in `numerical_features`.
corr = df[numerical_features].corr()

# Correlations are signed values in [-1, 1], so a diverging colorscale pinned
# to that range is used: zero sits at the neutral midpoint and positive and
# negative relationships get visibly different hues. A sequential scale
# auto-ranged to the data would hide the sign.
fig = go.Figure(
    data=go.Heatmap(
        z=corr.values,
        x=corr.columns,
        y=corr.index,  # row labels; same names as the columns for a corr matrix
        colorscale="RdBu",
        zmin=-1,
        zmax=1,
    )
)
fig.update_layout(title="Correlation Heatmap for Numerical Features")
fig.show()

### Relationship between Exposure Days and Number of TPPD by Group2

In [None]:
# Scatter of Exppdays against Numtppd: marker colour encodes Group2 and
# marker size is driven by the Value column.
fig = px.scatter(
    df,
    x="Exppdays",
    y="Numtppd",
    color="Group2",
    size="Value",
    title="Exposure Days vs. Numtppd by Group2",
)
fig.show()