In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns
import wrangle as w

# Acquire

* Data has been acquired from Kaggle at https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
* The data set contained 445,132 observations and 40 features before cleaning
* Each observation represents one person in the study
* Each feature represents an aspect of each person's health

In [2]:
# Read the survey data from the local CSV file into a DataFrame.
df = pd.read_csv('heart_2022.csv')

# Prepare

* Dropped rows with null values (~200,000 rows)
* 246,022 rows remain after dropping the null values
* Data was split into train, validate, and test data sets

In [3]:
# Keep only complete observations: drop every row that has a null in any column.
df = df.dropna()

# Split the cleaned frame into train / validate / test sets with the wrangle helper.
train, validate, test = w.split_my_data(df)

In [15]:
# Display the training split.
train

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Colorado,Male,Good,3.0,7.0,Within past year (anytime less than 12 months ...,Yes,6.0,"6 or more, but not all",No,...,1.88,95.25,26.96,Yes,Yes,No,No,"Yes, received tetanus shot, but not Tdap",No,No
1,Connecticut,Female,Good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,8.0,None of them,No,...,1.52,55.34,23.83,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
2,Florida,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,8.0,None of them,No,...,1.47,68.04,31.35,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Minnesota,Female,Good,2.0,0.0,Within past year (anytime less than 12 months ...,No,6.0,1 to 5,No,...,1.57,65.77,26.52,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Ohio,Male,Good,0.0,25.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.88,136.08,38.52,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157448,Maine,Female,Good,0.0,1.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.68,121.11,43.09,No,No,No,Yes,"Yes, received tetanus shot but not sure what type",No,No
157449,Alaska,Male,Good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,9.0,None of them,No,...,1.75,63.50,20.67,Yes,No,No,No,"Yes, received tetanus shot, but not Tdap",No,No
157450,Ohio,Female,Fair,30.0,15.0,Within past year (anytime less than 12 months ...,No,5.0,"6 or more, but not all",No,...,1.65,81.65,29.95,No,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
157451,Texas,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,15.0,1 to 5,No,...,1.70,80.00,27.68,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No


In [4]:
# List every column name, one per line.
for column_name in df.columns:
    print(column_name)

State
Sex
GeneralHealth
PhysicalHealthDays
MentalHealthDays
LastCheckupTime
PhysicalActivities
SleepHours
RemovedTeeth
HadHeartAttack
HadAngina
HadStroke
HadAsthma
HadSkinCancer
HadCOPD
HadDepressiveDisorder
HadKidneyDisease
HadArthritis
HadDiabetes
DeafOrHardOfHearing
BlindOrVisionDifficulty
DifficultyConcentrating
DifficultyWalking
DifficultyDressingBathing
DifficultyErrands
SmokerStatus
ECigaretteUsage
ChestScan
RaceEthnicityCategory
AgeCategory
HeightInMeters
WeightInKilograms
BMI
AlcoholDrinkers
HIVTesting
FluVaxLast12
PneumoVaxEver
TetanusLast10Tdap
HighRiskLastYear
CovidPos


In [5]:
# (rows, columns) of the frame.
df.shape

(246022, 40)

In [6]:
# Rows removed by dropping nulls: raw observation count minus rows remaining.
445_132 - 246_022

199110

In [7]:
# Number of null values in each column.
df.isnull().sum()

State                        0
Sex                          0
GeneralHealth                0
PhysicalHealthDays           0
MentalHealthDays             0
LastCheckupTime              0
PhysicalActivities           0
SleepHours                   0
RemovedTeeth                 0
HadHeartAttack               0
HadAngina                    0
HadStroke                    0
HadAsthma                    0
HadSkinCancer                0
HadCOPD                      0
HadDepressiveDisorder        0
HadKidneyDisease             0
HadArthritis                 0
HadDiabetes                  0
DeafOrHardOfHearing          0
BlindOrVisionDifficulty      0
DifficultyConcentrating      0
DifficultyWalking            0
DifficultyDressingBathing    0
DifficultyErrands            0
SmokerStatus                 0
ECigaretteUsage              0
ChestScan                    0
RaceEthnicityCategory        0
AgeCategory                  0
HeightInMeters               0
WeightInKilograms            0
BMI     

In [8]:
# Frequency of each self-reported general-health level.
general_health_counts = df['GeneralHealth'].value_counts()
print(general_health_counts)

Very good    86999
Good         77409
Excellent    41525
Fair         30659
Poor          9430
Name: GeneralHealth, dtype: int64


In [9]:
# Per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246022 entries, 342 to 445130
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246022 non-null  object 
 1   Sex                        246022 non-null  object 
 2   GeneralHealth              246022 non-null  object 
 3   PhysicalHealthDays         246022 non-null  float64
 4   MentalHealthDays           246022 non-null  float64
 5   LastCheckupTime            246022 non-null  object 
 6   PhysicalActivities         246022 non-null  object 
 7   SleepHours                 246022 non-null  float64
 8   RemovedTeeth               246022 non-null  object 
 9   HadHeartAttack             246022 non-null  object 
 10  HadAngina                  246022 non-null  object 
 11  HadStroke                  246022 non-null  object 
 12  HadAsthma                  246022 non-null  object 
 13  HadSkinCancer              

In [10]:
# Frequency of each HighRiskLastYear answer.
high_risk_counts = df['HighRiskLastYear'].value_counts()
print(high_risk_counts)

No     235446
Yes     10576
Name: HighRiskLastYear, dtype: int64


In [11]:
# List every column name, one per line.
for column_name in df.columns:
    print(column_name)

State
Sex
GeneralHealth
PhysicalHealthDays
MentalHealthDays
LastCheckupTime
PhysicalActivities
SleepHours
RemovedTeeth
HadHeartAttack
HadAngina
HadStroke
HadAsthma
HadSkinCancer
HadCOPD
HadDepressiveDisorder
HadKidneyDisease
HadArthritis
HadDiabetes
DeafOrHardOfHearing
BlindOrVisionDifficulty
DifficultyConcentrating
DifficultyWalking
DifficultyDressingBathing
DifficultyErrands
SmokerStatus
ECigaretteUsage
ChestScan
RaceEthnicityCategory
AgeCategory
HeightInMeters
WeightInKilograms
BMI
AlcoholDrinkers
HIVTesting
FluVaxLast12
PneumoVaxEver
TetanusLast10Tdap
HighRiskLastYear
CovidPos


In [12]:
# Frequency of each HadKidneyDisease answer.
df['HadKidneyDisease'].value_counts()

No     234738
Yes     11284
Name: HadKidneyDisease, dtype: int64

In [13]:
# Frequency of each HadStroke answer.
df['HadStroke'].value_counts()

No     235910
Yes     10112
Name: HadStroke, dtype: int64

In [14]:
# FIXME(review): the recorded run of this cell failed with FileNotFoundError for
# 'heart_disease_2020.csv', while the data loaded earlier in this notebook comes
# from 'heart_2022.csv'. The call also rebinds train/validate/test. Presumably
# w.get_my_data still targets the older data set - confirm in wrangle.
train, validate, test = w.get_my_data()

FileNotFoundError: [Errno 2] No such file or directory: 'heart_disease_2020.csv'

# Prepare

* There were no null values in this data set
* Features were renamed for convention and readability
* Dummy variables were created for categorical features for use in modeling
* Data was split into train, validate, and test dataframes (approximately 64/16/20), stratified on heart disease
* Scaled versions of numeric variables were added to the dataframe for use in modeling

In [None]:
# Frequency of each BMI category in the training split.
train['bmi_cat'].value_counts()

In [None]:
# Numeric features: summarized by their mean.
num_cols = [
    "bmi",
    "poor_physical_health_days",
    "poor_mental_health_days",
    "sleep_hours",
]

# Categorical features: summarized level by level.
cat_cols = [
    "heart_disease",
    "smoker",
    "heavy_drinker",
    "difficulty_walking",
    "diabetic",
    "physical_activity",
    "kidney_disease",
    "skin_cancer",
    "age_category",
    "general_health",
    "stroke",
    "sex",
    "race",
    "asthma",
    "bmi_cat",
]

In [None]:
# For each numeric feature, chart its mean over the whole training split next
# to its mean among rows with and without heart disease.
bar_labels = ['Overall', 'Heart-attack', 'Non-heart-attack']

for col in num_cols:
    feature = train[col]
    bar_heights = [
        feature.mean(),
        feature[train.heart_disease == 'Yes'].mean(),
        feature[train.heart_disease == 'No'].mean(),
    ]

    plt.bar(bar_labels, bar_heights)
    plt.title(f'avg {col}')
    plt.show()

In [None]:


# For each categorical feature, chart the share of rows (whole percent) with
# heart disease within every level of that feature.
for col in cat_cols:

    # One groupby per feature yields both counts for every level, and its
    # sorted keys give the bars a stable left-to-right order (iterating a set
    # of strings can change order between kernel sessions).
    # observed=True skips categorical levels that never occur, which would
    # otherwise divide 0 by 0.
    is_positive = (train['heart_disease'] == "Yes").astype(int)
    counts = is_positive.groupby(train[col], observed=True).agg(
        num_val_heart='sum',
        num_val='count',
    )

    for val, row in counts.iterrows():
        print(col, val, row.num_val_heart, row.num_val)

    # Scale to percent BEFORE rounding: round(ratio, 2) * 100 leaves float
    # artifacts such as 28.999999999999996.
    vals = counts.index.to_list()
    hights = (counts.num_val_heart / counts.num_val * 100).round().to_list()

    plt.bar(vals, hights)
    plt.title(f'heart attack percentage for {col}')
    plt.show()

In [None]:
    # Chart the share of rows (whole percent) with heart disease per BMI category.
    # The feature is pinned explicitly so this cell does not depend on whatever
    # value `col` was left holding by an earlier loop.
    col = 'bmi_cat'

    # Sorted group keys give a stable bar order; observed=True skips
    # categorical levels with no rows.
    is_positive = train['heart_disease'] == "Yes"
    rate_by_level = is_positive.groupby(train[col], observed=True).mean()

    vals = rate_by_level.index.to_list()
    # Scale to percent before rounding: int(round(ratio, 2) * 100) truncates,
    # e.g. 0.29 * 100 == 28.999999999999996 -> 28.
    hights = [int(round(rate * 100)) for rate in rate_by_level]

    plt.bar(vals, hights)
    plt.title(f'heart attack percentage for {col}')
    plt.show()

In [None]:
# Scratch check: zero divided by a non-zero number evaluates to 0.0.
0/12

In [None]:
import pandas as pd
import re

# Sample DataFrame whose column names end in numbers. It is named sample_df so
# this demo does not overwrite the heart-disease `df` used by earlier cells.
sample_data = {'col_1': [1, 2, 3],
               'col_10': [4, 5, 6],
               'col_2': [7, 8, 9]}
sample_df = pd.DataFrame(sample_data)


def extract_number(col_name):
    """Return the first run of digits in ``col_name`` as an int.

    Args:
        col_name: Column name to inspect.

    Returns:
        The first digit run converted to ``int``, or ``float('inf')`` when the
        name holds no digits so that such columns sort after numbered ones.
    """
    match = re.search(r'\d+', col_name)
    return int(match.group()) if match else float('inf')


# Order columns by their numeric part (col_1, col_2, col_10) instead of the
# lexicographic order (col_1, col_10, col_2).
sorted_df = sample_df.sort_index(axis=1, key=lambda names: names.map(extract_number))

print(sorted_df)