# Exploratory data analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from the CSV file; index_col=0 makes the first CSV
# column the row index instead of a regular data column.
data = pd.read_csv('heart_attack_youth_vs_adult.csv', index_col=0)
# Bare trailing expression so the notebook renders a preview of the frame.
data

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
data.info()

In [None]:
import pandas as pd

def create_info_table(df):
    """Summarise a DataFrame's schema as a two-column table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be listed.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``: the column label under ``'Column'``
        and its dtype under ``'Dtype'``, on a fresh positional index.
    """
    schema = pd.DataFrame({'Column': df.columns, 'Dtype': df.dtypes})
    # ``df.dtypes`` is indexed by column label; drop that index so the labels
    # appear only in the 'Column' field and rows are numbered from 0.
    return schema.reset_index(drop=True)

# Build the column/dtype summary for the loaded dataset and render it
# (the bare trailing expression is what the notebook displays).
info_table = create_info_table(data)
info_table

In [None]:
# Split the column labels by dtype family: object-typed columns are treated
# as categorical, every numeric dtype as numeric.
cols_categorical = data.select_dtypes(include='object').columns
cols_numeric = data.select_dtypes(include='number').columns

# Report the size and members of each group, categorical first.
for column_group in (cols_categorical, cols_numeric):
    print(len(column_group), column_group)

In [None]:
# Show floats with thousands separators and two decimals. This is a global
# pandas display option, so it affects every frame rendered afterwards.
pd.set_option('display.float_format', '{:,.2f}'.format)
data.describe()

In [None]:
# Draw one pie chart per categorical column, one wedge per distinct value
for column in cols_categorical:
    plt.figure(figsize=(10, 7))

    values = data[column]
    # Take as many pastel colours as the column has distinct values
    n_distinct = len(values.unique())
    colors = sns.color_palette('pastel')[:n_distinct]

    category_counts = values.value_counts()
    category_counts.plot.pie(
        autopct='%1.1f%%',
        startangle=90,
        colors=colors,
        wedgeprops={'edgecolor': 'black'},
        textprops={'fontsize': 12},
    )

    plt.title(f'Pie chart for {column}', fontsize=16)
    plt.ylabel('')  # suppress the y-axis label pandas adds to the pie
    plt.axis('equal')  # equal aspect ratio keeps the pie circular
    plt.show()

## Numerical Distribution

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn's 'pastel' palette kept as a notebook-level constant.
# NOTE(review): no cell visible here reads PALETTE — confirm it is used later.
PALETTE = sns.color_palette('pastel')

import plotly.express as px
import plotly.graph_objects as go

In [None]:
from palmerpenguins import load_penguins

# Example dataset used to try out the plotly histogram API
penguins = load_penguins()


def plot():
    """Return a plotly histogram of penguin body mass.

    Reads the notebook-level ``penguins`` frame, bins its ``body_mass_g``
    column with ``nbins=30``, and applies a centred title and axis labels.
    """
    histogram = px.histogram(
        data_frame=penguins,
        x="body_mass_g",
        nbins=30,
    )
    # update_layout modifies the figure in place, so the same object is returned
    histogram.update_layout(
        title={"text": "Penguin Mass", "x": 0.5},
        yaxis_title="Count",
        xaxis_title="Body Mass (g)",
    )
    return histogram


plot()

In [None]:
# Histogram of the BMI column with plotly's default binning.
# (The variable keeps its original name even though the figure is a histogram.)
scatterplot = px.histogram(data_frame=data, x="BMI")

# Centre the title and label both axes; update_layout edits the figure in place
scatterplot.update_layout(
    title={"text": "BMI", "x": 0.5},
    yaxis_title="Count",
    xaxis_title="BMI",
)

scatterplot

In [None]:
# Notebook-level aliases for the frame and the column being plotted
df = data
x_var = 'BMI'

# Density-normalised histogram of the whole column; drawn half transparent
# so that additional traces can be overlaid on top of it.
overall_trace = go.Histogram(
    x=df[x_var],
    name="Overall",
    opacity=0.5,
    histnorm='probability density',
    marker={'color': 'gray'},
)

fig = go.Figure()
fig.add_trace(overall_trace)

# Disabled sketch: one extra histogram per category of a hue variable.
# It relies on a `hue_var` name that is not defined in this cell.
# for hue_value in df[hue_var].unique():
#     filtered_df = df[df[hue_var] == hue_value]
#     fig.add_trace(go.Histogram(
#         x=filtered_df[x_var],
#         name=f"{hue_var}: {hue_value}",
#         opacity=0.5,
#         histnorm='probability density'
#     ))

# Titles, overlay mode (traces share the same bars area) and a light template
layout_settings = {
    'title': f"Distribution of {x_var}",
    'xaxis_title': x_var,
    'yaxis_title': "Density",
    'barmode': "overlay",
    'template': "plotly_white",
}
fig.update_layout(**layout_settings)

fig