# Exploratory data analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# data = pd.read_csv('heart_attack_youth_vs_adult.csv', index_col=0)
# data

In [3]:
# Load the Palmer Penguins sample data shipped with the palmerpenguins package;
# every later cell reads this notebook-level `data` frame.
from palmerpenguins import load_penguins
data = load_penguins()

In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB
None


In [5]:
# Show floats with thousands separators and two decimals. This is a global
# pandas display option, so it also affects every table rendered afterwards.
pd.set_option('display.float_format', '{:,.2f}'.format)
data.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,342.0,342.0,342.0,342.0,344.0
mean,43.92,17.15,200.92,4201.75,2008.03
std,5.46,1.97,14.06,801.95,0.82
min,32.1,13.1,172.0,2700.0,2007.0
25%,39.23,15.6,190.0,3550.0,2007.0
50%,44.45,17.3,197.0,4050.0,2008.0
75%,48.5,18.7,213.0,4750.0,2009.0
max,59.6,21.5,231.0,6300.0,2009.0


## Create info table

In [6]:
import pandas as pd

def create_info_table(df):
    """Summarise the columns of a DataFrame as a small two-column table.

    Args:
        df (pd.DataFrame): Frame whose columns are described.

    Returns:
        pd.DataFrame: One row per column of ``df``, holding the column label
        in ``'Column'`` and its dtype in ``'Dtype'``, indexed 0..n-1.
    """
    # `df.dtypes` is keyed by column label, so the intermediate frame inherits
    # those labels as its index; drop them in favour of a plain integer index.
    return pd.DataFrame(
        {
            'Column': df.columns,
            'Dtype': df.dtypes,
        }
    ).reset_index(drop=True)

# Build the column/dtype summary for `data`; the bare name on the last line
# makes the notebook render the resulting table.
info_table = create_info_table(data)
info_table

Unnamed: 0,Column,Dtype
0,species,object
1,island,object
2,bill_length_mm,float64
3,bill_depth_mm,float64
4,flipper_length_mm,float64
5,body_mass_g,float64
6,sex,object
7,year,int64


## List of Categorical and Numerical Columns

In [7]:
# Split the column labels by dtype: object-typed columns are treated as
# categorical, anything numeric (ints and floats) as numerical.
cols_categorical = data.select_dtypes(include='object').columns
cols_numeric = data.select_dtypes(include='number').columns

# Report how many columns landed in each group, followed by their labels.
for column_group in (cols_categorical, cols_numeric):
    print(len(column_group), column_group)

3 Index(['species', 'island', 'sex'], dtype='object')
5 Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g',
       'year'],
      dtype='object')


## Categorical Plots

### Pie charts

In [8]:
import plotly.express as px
import plotly.graph_objects as go

def pie_chart(data, input):
    """Build a Plotly pie chart of how often each value of one column occurs.

    Args:
        data (pd.DataFrame): Source table.
        input (str): Name of the column whose values become the slices.
            The name shadows the ``input`` builtin; it is kept so existing
            keyword callers keep working.

    Returns:
        plotly.graph_objects.Figure: Pie chart titled with the column name.
    """
    fig = px.pie(
        data,
        names=input,
        title=f'{input}',
        color_discrete_sequence=px.colors.qualitative.Pastel
    )
    fig.update_traces(
        textinfo='percent+label',
        hoverinfo='label+percent',
        # A scalar pull offsets every slice by the same fraction. A list sized
        # by the number of unique values does not line up with the trace here,
        # because px.pie hands Plotly one label per row rather than one per
        # slice (and unique() would also count NaN).
        pull=0.05
    )
    fig.update_layout(title_font_size=20)
    return fig

# Share of each species across the whole dataset.
pie_chart(data, 'species')

In [9]:
from plotly.subplots import make_subplots

def pie_split(data, input, target):
    """Draw one pie chart of ``input`` for every distinct value of ``target``.

    The pies are laid out side by side in a single row, each titled
    ``'<target>: <value>'``. A given ``input`` value gets the same colour in
    every pie.

    Args:
        data (pd.DataFrame): Source table.
        input (str): Column whose value counts form the slices of each pie.
        target (str): Column used to split the rows into one pie per value.
            Rows where it is missing are ignored.

    Returns:
        plotly.graph_objects.Figure: Figure with one pie subplot per target value.

    Raises:
        ValueError: If ``target`` contains no non-missing values.
    """
    # NaN never compares equal to itself, so a missing target value would only
    # produce an empty pie titled "<target>: nan"; leave it out.
    unique_targets = data[target].dropna().unique()
    n_targets = len(unique_targets)
    if n_targets == 0:
        raise ValueError(f"Column '{target}' has no non-missing values to split on")

    # Fix one colour per input value up front so it is shared across subplots.
    colors = px.colors.qualitative.Pastel
    color_map = {
        val: colors[j % len(colors)]
        for j, val in enumerate(data[input].unique())
    }

    # make_subplots centres each title over its own pie, so no manual
    # annotations are needed (evenly spaced x positions would ignore the gap
    # between subplots and drift off-centre).
    fig = make_subplots(
        rows=1,
        cols=n_targets,
        specs=[[{'type': 'domain'}] * n_targets],
        subplot_titles=[f'{target}: {val}' for val in unique_targets]
    )

    for col, value in enumerate(unique_targets, start=1):
        # Count once per subset and reuse for labels, values and colours.
        counts = data.loc[data[target] == value, input].value_counts()
        fig.add_trace(
            go.Pie(
                labels=counts.index,
                values=counts.values,
                name=f'{target}: {value}',
                pull=0.05,  # scalar: offset every slice equally
                marker=dict(colors=[color_map[val] for val in counts.index])
            ),
            row=1,
            col=col
        )

    fig.update_layout(
        title_text=f'{input}',
        title_font_size=20
    )
    return fig

# Species mix on each island, one pie per island.
pie_split(data, 'species', 'island')

### Bar charts

In [10]:
def bar_chart(data, input):
    """Plot the number of rows in each category of one column as bars.

    Args:
        data (pd.DataFrame): Source table.
        input (str): Name of the column to tabulate; each distinct value
            becomes one bar with its own colour.

    Returns:
        plotly.graph_objects.Figure: Bar chart titled with the column name.
    """
    # Tabulate the frequencies as a two-column frame: category, count.
    frequencies = (
        data[input]
        .value_counts()
        .reset_index()
        .set_axis([input, 'count'], axis=1)
    )

    column_label = f'{input}'
    figure = px.bar(
        frequencies,
        x=input,
        y='count',
        labels={input: input, 'count': 'Count'},
        title=column_label,
        color=input,  # one colour per category
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    figure.update_layout(
        title_font_size=20,
        xaxis_title=column_label,
        yaxis_title='Count',
    )
    return figure

# Row count per species; the bare `fig` on the last line renders the chart.
fig = bar_chart(data, 'species')
fig

In [11]:
def bar_split(data, input, target):
    """Plot counts of ``input`` as grouped bars, one bar colour per ``target`` value.

    Args:
        data (pd.DataFrame): Source table.
        input (str): Column placed on the x-axis.
        target (str): Column used to colour and group the bars.

    Returns:
        plotly.graph_objects.Figure: Grouped histogram titled with ``input``.
    """
    layout_options = dict(
        title_font_size=20,
        xaxis_title=f'{input}',
        yaxis_title='Count',
    )
    grouped_bars = px.histogram(
        data,
        x=input,
        color=target,
        barmode='group',
        title=f'{input}',
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    grouped_bars.update_layout(**layout_options)
    return grouped_bars

# Species counts broken down by island, as grouped bars.
bar_split(data, 'species', 'island')

## Numerical Plots

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn's built-in 'pastel' colour palette.
# NOTE(review): none of the cells visible here read PALETTE — confirm it is still needed.
PALETTE = sns.color_palette('pastel')

import plotly.express as px
import plotly.graph_objects as go

In [13]:

def plot():
    """Build a histogram of penguin body mass.

    Reads the notebook-level ``data`` frame rather than taking an argument.

    Returns:
        plotly.graph_objects.Figure: Histogram of ``body_mass_g`` (``nbins=30``)
        with a centred "Penguin Mass" title and labelled axes.
    """
    histogram = px.histogram(data_frame=data, x="body_mass_g", nbins=30)
    histogram.update_layout(
        title={"text": "Penguin Mass", "x": 0.5},  # x=0.5 centres the title
        yaxis_title="Count",
        xaxis_title="Body Mass (g)",
    )
    return histogram
# Render the body-mass histogram.
plot()

In [14]:
def create_distribution_plot(df, x_var, hue_var):
    """
    Creates a Plotly distribution plot for a numerical variable,
    overlaid with the distribution for each category of a hue variable.

    All histograms are normalised to probability density and drawn
    semi-transparently on top of each other. Rows whose hue value is missing
    still contribute to the "Overall" histogram but get no trace of their own.

    Args:
        df (pd.DataFrame): The input DataFrame.
        x_var (str): The name of the numerical variable for the x-axis.
        hue_var (str): The name of the categorical variable for grouping.

    Returns:
        plotly.graph_objects.Figure: The resulting Plotly figure.
    """
    # Initialize the figure
    fig = go.Figure()

    # Add the overall distribution
    fig.add_trace(go.Histogram(
        x=df[x_var],
        name="Overall",
        opacity=0.5,
        histnorm='probability density',  # Normalize to make densities comparable
        marker=dict(color='gray')
    ))

    # Add distributions for each category in the hue variable.
    # NaN is dropped first: `== NaN` never matches, so it would only add an
    # empty "<hue_var>: nan" entry to the legend.
    for hue_value in df[hue_var].dropna().unique():
        filtered_df = df[df[hue_var] == hue_value]
        fig.add_trace(go.Histogram(
            x=filtered_df[x_var],
            name=f"{hue_var}: {hue_value}",
            opacity=0.5,
            histnorm='probability density'
        ))

    # Update layout for better readability
    fig.update_layout(
        title=f"Distribution of {x_var} by {hue_var}",
        xaxis_title=x_var,
        yaxis_title="Density",
        barmode="overlay",  # Overlay the histograms
        template="plotly_white"
    )

    return fig