In [1]:
# Import libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

In [2]:
# Load dataset
df = pd.read_csv('heart_statlog_cleveland_hungary_final.csv')
df

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1185,45,1,1,110,264,0,0,132,0,1.2,2,1
1186,68,1,4,144,193,1,0,141,0,3.4,2,1
1187,57,1,4,130,131,0,0,115,1,1.2,2,1
1188,57,0,2,130,236,0,2,174,0,0.0,2,1


In [3]:
# Drop duplicates
df.drop_duplicates(inplace=True)
# Recode sex columns
df['sex'] = df['sex'].replace({
    1: 'male',
    0: 'female'
})
df.sex.value_counts()
# Recode chest pain type columns
df['chest pain type'] = df['chest pain type'].replace({
    1: 'typical angina',
    2: 'atypical angina',
    3: 'non-anginal pain',
    4: 'asymptomatic'
})
df['chest pain type'].value_counts()
# Recode fasting blood sugar columns
df['fasting blood sugar'] = df['fasting blood sugar'].replace({
    1: '> 120 mg/dl',
    0: '< 120 mg/dl'
})
df['fasting blood sugar'].value_counts()
# Recode resting ecg olumns
df['resting ecg'] = df['resting ecg'].replace({
    0: 'normal',
    1: 'ST-T wave abnormality',
    2: 'left ventricular hypertrophy'
})
df['resting ecg'].value_counts()
# Recode exercise angina columns
df['exercise angina'] = df['exercise angina'].replace({
    1: 'yes',
    0: 'no'
})
df['exercise angina'].value_counts()
# Recode st slope columns
df['ST slope'] = df['ST slope'].replace({
    1: 'upsloping',
    2: 'flat',
    3: 'downsloping',
    0: 'unknown'
})
df['ST slope'].value_counts()
# Recode target column
df['target'] = df['target'].replace({
    0: 'no disease',
    1: 'disease'
})
df['target'].value_counts()

target
disease       508
no disease    410
Name: count, dtype: int64

In [11]:
df = df.rename(columns={'target': "disease status"})
# Create age group column
df['age group'] = pd.cut(df['age'], bins=[0, 30, 40, 50, 60, 70, 80], labels=['0-29', '30-39', '40-49', '50-59', '60-69', '70-79'])
df['age group'].value_counts()
# Create cholesterol group column
df['cholesterol group'] = pd.cut(df['cholesterol'], bins=[0, 150, 200, 240, 300, 1000], labels=['<150 mg/dL', '150-199 mg/dL', '200-239 mg/dL', '240-299 mg/dL', '≥300 mg/dL'], right=False)
df['cholesterol group'].value_counts()

cholesterol group
240-299 mg/dL    258
200-239 mg/dL    237
<150 mg/dL       192
150-199 mg/dL    126
≥300 mg/dL       105
Name: count, dtype: int64

In [6]:
df

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,male,atypical angina,140,289,< 120 mg/dl,normal,172,no,0.0,upsloping,no disease
1,49,female,non-anginal pain,160,180,< 120 mg/dl,normal,156,no,1.0,flat,disease
2,37,male,atypical angina,130,283,< 120 mg/dl,ST-T wave abnormality,98,no,0.0,upsloping,no disease
3,48,female,asymptomatic,138,214,< 120 mg/dl,normal,108,yes,1.5,flat,disease
4,54,male,non-anginal pain,150,195,< 120 mg/dl,normal,122,no,0.0,upsloping,no disease
...,...,...,...,...,...,...,...,...,...,...,...,...
1185,45,male,typical angina,110,264,< 120 mg/dl,normal,132,no,1.2,flat,disease
1186,68,male,asymptomatic,144,193,> 120 mg/dl,normal,141,no,3.4,flat,disease
1187,57,male,asymptomatic,130,131,< 120 mg/dl,normal,115,yes,1.2,flat,disease
1188,57,female,atypical angina,130,236,< 120 mg/dl,left ventricular hypertrophy,174,no,0.0,flat,disease


In [9]:
max_hr = str(df[df['target'] == 'disease']['max heart rate'].max())
max_hr

'195'

In [11]:

num_exerc_angina = str(df[df['target'] == 'disease']['exercise angina'].value_counts()[0])
num_exerc_angina

  num_exerc_angina = str(df[df['target'] == 'disease']['exercise angina'].value_counts()[0])


'316'

In [13]:
df = df.rename(columns={'target': "disease status"})

In [14]:
def get_metrics_card_info(df: pd.DataFrame) -> int:
    """
    Get info for metrics card
    Params:
        df: DataFrame
    Returns:
        str
    """
    no_participants = str(df.shape[0])
    num_disease = str(df['disease status'].value_counts().values[0])
    max_hr = str(df[df['disease status'] == 'disease']['max heart rate'].max())
    num_exerc_angina = str(df[df['disease status'] == 'disease']['exercise angina'].value_counts()[0])
    return no_participants, num_disease, max_hr, num_exerc_angina

get_metrics_card_info(df)

  num_exerc_angina = str(df[df['disease status'] == 'disease']['exercise angina'].value_counts()[0])


('918', '508', '195', '316')

In [30]:
# Group by mean
df = df.groupby(['sex', 'disease status']).size().reset_index(name='count')
fig = px.bar(data_frame=df, x='sex', y='count', color='disease status',
                 title='Gender vs Disease Status', text_auto=True, barmode='group')

fig.update_layout(
        width=900,
        height=500,
        xaxis_title='Gender',
        title_x=0.5,
    )

fig.update_traces(
        textposition='outside',
        )

In [None]:
def plot_bar():
    pass

In [None]:
def plot_max_hr():
    max_heart_rate_counts = df['max heart rate'].value_counts().sort_index()
    fig_max_heart_rate = px.bar(
    x=max_heart_rate_counts.index, y=max_heart_rate_counts.values,
    labels={'x': 'Max Heart Rate', 'y': 'Number of Individuals'},
    title="Max Heart Rate",
    color_discrete_sequence=['orange'] # adjust this color and dark theme
    )
    return fig_max_heart_rate
plot_max_hr()

In [31]:
df

Unnamed: 0,sex,disease status,count
0,female,disease,50
1,female,no disease,143
2,male,disease,458
3,male,no disease,267


In [139]:
def plot_hist(df: str, xcol: str, title: str, x_title: str, bins: int = 20) -> None:
    """
    Plots a histogram
    Params:
        df: DataFrame
        xcol: str
        title: str
        x_title: str
    Returns:
        None
    """
    fig  = px.histogram(df, x=xcol, title=title, nbins=bins)
    fig.update_layout(
        bargap=0.01,
        width=800,
        height=500,
        yaxis_title='Frequency',
        xaxis_title=x_title)
    
    return fig

In [170]:
def plot_pie_chart(df: pd.DataFrame, xcol: str, title: str) -> None:
    """
    Plots a pie chart
    Params:
        df: DataFrame
        xcol: str
        title: str
    Returns:
        None
    """
    fig = px.pie(data_frame=df, names=xcol, title=title,  hole=0.6, width=600, height=400)
    fig.update_traces(
    textinfo='label + percent',  # Show percentage and label
    textposition='outside',     # Place text inside the slices
    )

    fig.update_layout(
    showlegend=False,
    title_x=0.5,
    )
    
    return fig

# bar_count = df['sex'].value_counts().reset_index()
# bar_count
fig = px.pie(data_frame=df, names='sex', title='Disease Status', hole=0.6, width=600, height=400)
# Customize annotations
fig.update_traces(
    textinfo='label + percent',  # Show percentage and label
    textposition='outside',     # Place text inside the slices
)

fig.update_layout(
    showlegend=False,
    title_x=0.5,
)
fig.show()

In [166]:
def plot_multivar_bar(df: pd.DataFrame, xcol: str, color: str, title: str, x_title: str) -> None:
    """
    Plots a bar chart
    Params:
        df: DataFrame
        xcol: str
        title: str
        x_title: str
    Returns:
        None
    """
    # Group by mean
    df = df.groupby([xcol, color]).size().reset_index(name='count')
    fig = px.bar(data_frame=df, x=xcol, y='count', color=color,
                 title=title, text_auto=True, barmode='group')

    fig.update_layout(
        width=900,
        height=500,
        xaxis_title=x_title,
        title_x=0.5,
    )

    fig.update_traces(
        textposition='outside',
        )

    return fig


# plot_bar(df, 'age group', 'Age Group Distribution', 'Age Group')

In [169]:
plot_multivar_bar(df, 'age group', 'fasting blood sugar', 'Age Group By Sex', 'Age Group')
# # mm = df.groupby(['age group', 'sex']).size().reset_index(name='count')
# mm = df.groupby(['age group', 'sex']).size().reset_index(name='count')
# mm





In [143]:
px.bar(data_frame=mm, x='age group', y='count', color='sex', barmode='group', text_auto=True)

In [144]:
# plot_hist(df, 'max heart rate', 'max heart rate Distribution', 'max heart rate')

In [145]:
# fig = px.histogram(df, x='age', title='Age Distribution', nbins=20)
# fig.update_layout(bargap=0.01,
#                   width=800,
#                   height=500,
#                   xaxis_title='Age',
#                   yaxis_title='Frequency')
# fig.show()

In [146]:
# fig = px.histogram(df, x='age', color='target', marginal='box', title='Distribution of Age by Target')
# fig.show()

In [147]:
df = df.rename(columns={'resting bp s': 'resting blood pressure'})

In [148]:
def aggregate_columns(df: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """
    Aggregates columns
    Params:
        df: DataFrame
        group_col: str
    Returns:
        DataFrame
    """
    df = df.groupby(group_col).agg(
            MinRBP=('resting blood pressure', 'min'),
            MaxRBP=('resting blood pressure', 'max'),
            MeanRBP=('resting blood pressure', 'mean'),
            MinMHR=('max heart rate', 'min'),
            MaxMHR=('max heart rate', 'max'),
            MeanMHR=('max heart rate', 'mean'),
            MinChol=('cholesterol', 'min'),
            MaxChol=('cholesterol', 'max'),
            MeanChol=('cholesterol', 'mean')).reset_index().round(2)
    return df

aggregate_columns(df, 'age group')





Unnamed: 0,age group,MinRBP,MaxRBP,MeanRBP,MinMHR,MaxMHR,MeanMHR,MinChol,MaxChol,MeanChol
0,0-29,120,170,138.0,160,202,177.4,132,263,215.8
1,30-39,92,190,123.74,80,192,155.06,0,529,206.6
2,40-49,100,180,128.16,90,194,144.76,0,491,215.99
3,50-59,0,200,133.57,60,195,134.29,0,603,197.11
4,60-69,95,200,137.77,67,179,125.61,0,564,179.97
5,70-79,104,170,139.62,94,162,119.5,0,310,188.29


In [149]:
df.head()

Unnamed: 0,age,sex,chest pain type,resting blood pressure,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target,age group,cholesterol group
0,40,male,atypical angina,140,289,< 120 mg/dl,normal,172,no,0.0,upsloping,no disease,30-39,240-299 mg/dL
1,49,female,non-anginal pain,160,180,< 120 mg/dl,normal,156,no,1.0,flat,disease,40-49,150-199 mg/dL
2,37,male,atypical angina,130,283,< 120 mg/dl,ST-T wave abnormality,98,no,0.0,upsloping,no disease,30-39,240-299 mg/dL
3,48,female,asymptomatic,138,214,< 120 mg/dl,normal,108,yes,1.5,flat,disease,40-49,200-239 mg/dL
4,54,male,non-anginal pain,150,195,< 120 mg/dl,normal,122,no,0.0,upsloping,no disease,50-59,150-199 mg/dL


In [150]:
# Aggregate num cols
df.groupby('target').agg(
    MakDD=('max heart rate', 'mean')
)

Unnamed: 0_level_0,MakDD
target,Unnamed: 1_level_1
disease,127.655512
no disease,148.15122


In [151]:
# General sum stats
sumt = df.describe().round(2).T
sumt.drop(['count', '25%', '75%'], axis=1, inplace=True)
sumt.rename(columns={'50%': 'median'}, inplace=True)
sumt

Unnamed: 0,mean,std,min,median,max
age,53.51,9.43,28.0,54.0,77.0
resting blood pressure,132.4,18.51,0.0,130.0,200.0
cholesterol,198.8,109.38,0.0,223.0,603.0
max heart rate,136.81,25.46,60.0,138.0,202.0
oldpeak,0.89,1.07,-2.6,0.6,6.2


In [152]:
df.shape

(918, 14)

In [161]:
df['exercise angina'].value_counts().values[1]

371

In [162]:
df['target'].value_counts()

target
disease       508
no disease    410
Name: count, dtype: int64