In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### Import data

In [2]:
df = pd.read_csv('medical_examination.csv')

### Add 'overweight' column

In [3]:
# Add an "overweight" column based on BMI calculation
df['overweight'] = ((df['weight'] / ((df['height'] / 100) ** 2)) > 25).astype(int)

# Display the head of updated DataFrame
df.head

<bound method NDFrame.head of           id    age  sex  height  weight  ap_hi  ap_lo  cholesterol  gluc  \
0          0  18393    2     168    62.0    110     80            1     1   
1          1  20228    1     156    85.0    140     90            3     1   
2          2  18857    1     165    64.0    130     70            3     1   
3          3  17623    2     169    82.0    150    100            1     1   
4          4  17474    1     156    56.0    100     60            1     1   
...      ...    ...  ...     ...     ...    ...    ...          ...   ...   
69995  99993  19240    2     168    76.0    120     80            1     1   
69996  99995  22601    1     158   126.0    140     90            2     2   
69997  99996  19066    2     183   105.0    180     90            3     1   
69998  99998  22431    1     163    72.0    135     80            1     2   
69999  99999  20540    1     170    72.0    120     80            2     1   

       smoke  alco  active  cardio  overweigh

### Normalize data by making 0 always good and 1 always bad. 

In [4]:
# If the value of 'cholesterol' or 'gluc' is 1, make the value 0. 
# If the value is more than 1, make the value 1.
df['cholesterol'] = df['cholesterol'].apply(lambda x: 0 if x == 1 else 1)
df['gluc'] = df['gluc'].apply(lambda x: 0 if x == 1 else 1)
df[['cholesterol', 'gluc']]

Unnamed: 0,cholesterol,gluc
0,0,0
1,1,0
2,1,0
3,0,0
4,0,0
...,...,...
69995,0,0
69996,1,1
69997,1,0
69998,0,1


Convert the data into long format and create a chart that shows the value counts of the categorical features using seaborn's catplot(). 

The dataset should be split by 'Cardio' so there is one chart for each cardio value. 

The chart should look like examples/Figure_1.png.

In [5]:
df.iloc[:, 0:14]

Unnamed: 0,id,age,sex,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,overweight
0,0,18393,2,168,62.0,110,80,0,0,0,0,1,0,0
1,1,20228,1,156,85.0,140,90,1,0,0,0,1,1,1
2,2,18857,1,165,64.0,130,70,1,0,0,0,0,1,0
3,3,17623,2,169,82.0,150,100,0,0,0,0,1,1,1
4,4,17474,1,156,56.0,100,60,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,0,0,1,0,1,0,1
69996,99995,22601,1,158,126.0,140,90,1,1,0,0,1,1,1
69997,99996,19066,2,183,105.0,180,90,1,0,0,1,0,1,1
69998,99998,22431,1,163,72.0,135,80,0,1,0,0,0,1,1


In [6]:
df_cat = df[['cholesterol', 'gluc', 'smoke', 'alco', 'active', 'overweight']]
df_cat.iloc[:, 0:6]

Unnamed: 0,cholesterol,gluc,smoke,alco,active,overweight
0,0,0,0,0,1,0
1,1,0,0,0,1,1
2,1,0,0,0,0,0
3,0,0,0,0,1,1
4,0,0,0,0,0,0
...,...,...,...,...,...,...
69995,0,0,1,0,1,1
69996,1,1,0,0,1,1
69997,1,0,0,1,0,1
69998,0,1,0,0,0,1


In [7]:
# Draw Categorical Plot
def draw_cat_plot():
    # Create DataFrame for cat plot using `pd.melt` using just the values from 'cholesterol', 'gluc', 'smoke', 'alco', 'active', and 'overweight'.
    df_cat = None


    # Group and reformat the data to split it by 'cardio'. Show the counts of each feature. You will have to rename one of the columns for the catplot to work correctly.
    df_cat = None
    

    # Draw the catplot with 'sns.catplot()'



    # Get the figure for the output
    fig = None


    # Do not modify the next two lines
    fig.savefig('catplot.png')
    return fig

In [None]:
def draw_cat_plot():
    # Create DataFrame for cat plot using `pd.melt` using just the values from 'cholesterol', 'gluc', 'smoke', 'alco', 'active', and 'overweight'.
    df_cat = pd.melt(df, id_vars='cardio', value_vars=['cholesterol', 'gluc', 'smoke', 'alco', 'active', 'overweight'])

    # Group and reformat the data to split it by 'cardio'. Show the counts of each feature.
    df_cat = df_cat.groupby(['cardio', 'variable', 'value']).size().reset_index(name='total')

    # Rename one of the columns for the catplot to work correctly
    df_cat['variable'] = df_cat['variable'].map({
        'cholesterol': 'Cholesterol',
        'gluc': 'Glucose',
        'smoke': 'Smoking',
        'alco': 'Alcohol intake',
        'active': 'Physical activity',
        'overweight': 'Overweight'
    })

    # Draw the catplot with 'sns.catplot()'
    g = sns.catplot(x='variable', y='total', hue='value', data=df_cat, kind='bar', col='cardio')
    
    # Customize the plot
    g.set_axis_labels("Variable", "Total")
    g.set_titles("Cardio={col_name}")
    g.set(yscale='log')
    
    # Get the figure for the output
    fig = g.fig
    
    # Save the figure
    fig.savefig('catplot.png')
    
    return fig

# Call the function to generate the plot
draw_cat_plot()
