In [103]:
import pandas as pd

# Load the dataset
heart_2020 = "C:/Users/Meriya/Downloads/Indicators of heart disease/2020/heart_2020_data.csv"
heart_2022 = "C:/Users/Meriya/Downloads/Indicators of heart disease/2022/heart_2022_data.csv"

heart_2020 = pd.read_csv(heart_2020)
heart_2022 = pd.read_csv(heart_2022)

In [104]:
# Standardizing the 2022 data based on the 2020 structure
relevant_columns_mapping = {
    'HadHeartAttack': 'HeartDisease',
    'BMI': 'BMI',
    'SmokerStatus': 'Smoking',
    'AlcoholDrinkers': 'AlcoholDrinking',
    'HadStroke': 'Stroke',
    'DifficultyWalking': 'DiffWalking',
    'Sex': 'Sex',
    'AgeCategory': 'AgeCategory',
    'RaceEthnicityCategory': 'Race',
    'HadDiabetes': 'Diabetic',
    'PhysicalActivities': 'PhysicalActivity',
    'GeneralHealth': 'GenHealth',
    'SleepHours': 'SleepTime',
    'HadAsthma': 'Asthma',
    'HadKidneyDisease': 'KidneyDisease',
    'HadSkinCancer': 'SkinCancer',
    'PhysicalHealthDays': 'PhysicalHealth',
    'MentalHealthDays': 'MentalHealth'
}

# Subset and rename columns for the 2022 dataset
heart_2022_standardized = heart_2022[list(relevant_columns_mapping.keys())]
heart_2022_standardized.rename(columns=relevant_columns_mapping, inplace=True)

# Display the first few rows of the standardized 2022 data
heart_2022_standardized.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_2022_standardized.rename(columns=relevant_columns_mapping, inplace=True)


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,PhysicalHealth,MentalHealth
0,No,,Never smoked,No,No,No,Female,Age 80 or older,"White only, Non-Hispanic",Yes,No,Very good,8.0,No,No,No,0.0,0.0
1,No,26.57,Never smoked,No,No,No,Female,Age 80 or older,"White only, Non-Hispanic",No,No,Excellent,6.0,No,No,Yes,0.0,0.0
2,No,25.61,Never smoked,No,No,No,Female,Age 55 to 59,"White only, Non-Hispanic",No,Yes,Very good,5.0,No,No,Yes,2.0,3.0
3,No,23.3,Current smoker - now smokes some days,No,No,No,Female,,"White only, Non-Hispanic",No,Yes,Excellent,7.0,Yes,No,No,0.0,0.0
4,No,21.77,Never smoked,Yes,No,No,Female,Age 40 to 44,"White only, Non-Hispanic",No,Yes,Fair,9.0,No,No,No,2.0,0.0


In [105]:
# Concatenate the datasets row-wise
heart_combined = pd.concat([heart_2020, heart_2022_standardized], axis=0, ignore_index=True)

# Display the shape and head of the combined dataset
print("Combined dataset shape:", heart_combined.shape)
print(heart_combined.head())

Combined dataset shape: (764927, 18)
  HeartDisease    BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0           No  16.60     Yes              No     No             3.0   
1           No  20.34      No              No    Yes             0.0   
2           No  26.58     Yes              No     No            20.0   
3           No  24.21      No              No     No             0.0   
4           No  23.71      No              No     No            28.0   

   MentalHealth DiffWalking     Sex  AgeCategory   Race Diabetic  \
0          30.0          No  Female        55-59  White      Yes   
1           0.0          No  Female  80 or older  White       No   
2          30.0          No    Male        65-69  White      Yes   
3           0.0          No  Female        75-79  White       No   
4           0.0         Yes  Female        40-44  White       No   

  PhysicalActivity  GenHealth  SleepTime Asthma KidneyDisease SkinCancer  
0              Yes  Very good        5.0    Ye

In [106]:
# Calculate the percentage of missing values for each row
missing_percentage = heart_combined.isnull().mean(axis=1)

# Set the threshold (50%)
threshold = 0.5

# Filter out rows where the percentage of missing values is greater than the threshold
heart_combined = heart_combined[missing_percentage <= threshold]

# Display the shape and head of the filtered dataset
print("Filtered dataset shape:", heart_combined.shape)
print(heart_combined.head())


Filtered dataset shape: (764741, 18)
  HeartDisease    BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0           No  16.60     Yes              No     No             3.0   
1           No  20.34      No              No    Yes             0.0   
2           No  26.58     Yes              No     No            20.0   
3           No  24.21      No              No     No             0.0   
4           No  23.71      No              No     No            28.0   

   MentalHealth DiffWalking     Sex  AgeCategory   Race Diabetic  \
0          30.0          No  Female        55-59  White      Yes   
1           0.0          No  Female  80 or older  White       No   
2          30.0          No    Male        65-69  White      Yes   
3           0.0          No  Female        75-79  White       No   
4           0.0         Yes  Female        40-44  White       No   

  PhysicalActivity  GenHealth  SleepTime Asthma KidneyDisease SkinCancer  
0              Yes  Very good        5.0    Ye

In [107]:
from sklearn.impute import SimpleImputer
import numpy as np

# Separate numerical and categorical columns
numerical_cols = heart_combined.select_dtypes(include=np.number).columns
categorical_cols = heart_combined.select_dtypes(exclude=np.number).columns

# Impute numerical columns with the median to preserve integer-like values
num_imputer = SimpleImputer(strategy='median')
heart_combined[numerical_cols] = num_imputer.fit_transform(heart_combined[numerical_cols])

# Ensure numerical columns are integers if originally integer-like
for col in numerical_cols:
    if (heart_combined[col] % 1 == 0).all():  # Check if all values are whole numbers
        heart_combined[col] = heart_combined[col].astype(int)

# Impute categorical columns with the mode
cat_imputer = SimpleImputer(strategy='most_frequent')
heart_combined[categorical_cols] = cat_imputer.fit_transform(heart_combined[categorical_cols])

# Display the head of the imputed dataset
print("Imputed dataset shape:", heart_combined.shape)
print(heart_combined.head())


Imputed dataset shape: (764741, 18)
  HeartDisease    BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0           No  16.60     Yes              No     No               3   
1           No  20.34      No              No    Yes               0   
2           No  26.58     Yes              No     No              20   
3           No  24.21      No              No     No               0   
4           No  23.71      No              No     No              28   

   MentalHealth DiffWalking     Sex  AgeCategory   Race Diabetic  \
0            30          No  Female        55-59  White      Yes   
1             0          No  Female  80 or older  White       No   
2            30          No    Male        65-69  White      Yes   
3             0          No  Female        75-79  White       No   
4             0         Yes  Female        40-44  White       No   

  PhysicalActivity  GenHealth  SleepTime Asthma KidneyDisease SkinCancer  
0              Yes  Very good          5    Yes

In [108]:
heart_combined.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [109]:
def encode_heart_disease(data):
    # Map "Yes" to 1 and "No" to 0
    heart_disease_mapping = {
        "Yes": 1,
        "No": 0
    }
    data['HeartDisease'] = data['HeartDisease'].map(heart_disease_mapping)
    return data

# Apply the function to the combined dataset
heart_combined = encode_heart_disease(heart_combined)

# Check the updated HeartDisease column
print(heart_combined['HeartDisease'].value_counts())


HeartDisease
0    712260
1     52481
Name: count, dtype: int64


In [110]:
def encode_smoking_column_with_clarity(data):
    """
    Encodes the Smoking column with severity levels:
    0 - Non-smoker (Never smoked, No)
    1 - Former smoker
    2 - Occasional smoker
    3 - Frequent smoker (or active smoker)
    """
    # Define a mapping for the Smoking column
    smoking_mapping = {
        "Never smoked": 0,                           # Non-smoker
        "No": 0,                                     # Non-smoker
        "Former smoker": 1,                          # Used to smoke
        "Current smoker - now smokes some days": 2,  # Occasional smoker
        "Current smoker - now smokes every day": 3,  # Frequent smoker
        "Yes": 3                                     # Active smoker (assumed frequent)
    }
    
    # Track unexpected values
    unexpected_values = data[~data['Smoking'].isin(smoking_mapping.keys())]['Smoking'].unique()

    # Apply the mapping and handle missing/unexpected values
    data['Smoking'] = data['Smoking'].map(smoking_mapping).fillna(-1)  # Assign -1 for unexpected/missing values

    # Optionally, log unexpected values
    if len(unexpected_values) > 0:
        print(f"Warning: Found unexpected values in Smoking column: {unexpected_values}")
    
    return data

# Apply the function to the dataset
heart_combined = encode_smoking_column_with_clarity(heart_combined)

# Display the updated Smoking column
print(heart_combined['Smoking'].value_counts())


Smoking
0    469125
3    167909
1    113770
2     13937
Name: count, dtype: int64


In [111]:
import re

def standardize_age_category(data):
    def format_age(age_str):
        # Extract numbers using regular expressions
        match = re.findall(r'\d+', age_str)
        if len(match) == 2:  # Format ranges like "18-24"
            return f"{match[0]}-{match[1]}"
        elif len(match) == 1:  # Format "80 or older" as "80+"
            return f"{match[0]}+"
        else:
            return None  # Handle unexpected formats

    # Apply formatting function to the AgeCategory column
    data['AgeCategory'] = data['AgeCategory'].apply(format_age)
    return data

# Apply the function to the datasets
heart_combined = standardize_age_category(heart_combined)

# Display the standardized AgeCategory column
print(heart_combined['AgeCategory'].value_counts())


AgeCategory
65-69    90178
60-64    78191
70-74    74533
55-59    66576
80+      60394
50-54    59025
75-79    53998
40-44    50945
45-49    50318
35-39    49075
18-24    48004
30-34    44559
25-29    38945
Name: count, dtype: int64


In [112]:
import re

def encode_diabetic_column_with_categories(data):
    def map_diabetic(value):
        # Convert to lowercase for case-insensitive matching
        value = str(value).lower()  # Convert to string to handle non-string values
        if re.search(r'\byes\b', value) and 'pregnancy' in value:
            return 3  # Pregnancy-related
        elif re.search(r'\byes\b', value):
            return 1  # Yes
        elif re.search(r'\bno\b', value) and 'borderline' in value:
            return 2  # Borderline
        elif re.search(r'\bno\b', value):
            return 0  # No
        else:
            return None  # For unexpected values

    # Use .loc to modify the column safely
    data.loc[:, 'Diabetic'] = data['Diabetic'].apply(map_diabetic)
    return data

# Apply the function to the dataset
heart_combined = encode_diabetic_column_with_categories(heart_combined)

# Display the updated counts for the Diabetic column
print(heart_combined['Diabetic'].value_counts())


Diabetic
0    639278
1    101958
2     17110
3      6395
Name: count, dtype: int64


In [113]:
def encode_genhealth_column(data):
    # Define the mapping for GenHealth
    genhealth_mapping = {
        "Excellent": 5,
        "Very good": 4,
        "Good": 3,
        "Fair": 2,
        "Poor": 1
    }
    # Apply the mapping to the GenHealth column
    data.loc[:, 'GenHealth'] = data['GenHealth'].map(genhealth_mapping)
    return data

# Apply the function to the dataset
heart_combined = encode_genhealth_column(heart_combined)

# Display the updated GenHealth column
print(heart_combined['GenHealth'].value_counts())


GenHealth
4    263407
3    236692
5    138683
2     94936
1     31023
Name: count, dtype: int64


In [114]:
def encode_sex_column(data):
    # Define the mapping for the Sex column
    sex_mapping = {
        "Female": 0,
        "Male": 1
    }
    # Apply the mapping to the Sex column
    data.loc[:, 'Sex'] = data['Sex'].map(sex_mapping)
    return data

# Apply the function to your datasets
heart_combined = encode_sex_column(heart_combined)

# Display the updated Sex column
print(heart_combined['Sex'].value_counts())


Sex
0    403608
1    361133
Name: count, dtype: int64


In [116]:
# Save the cleaned dataset to a CSV file
cleaned_csv_path = "cleaned_heart_data.csv"  # Change the path as needed
heart_combined.to_csv(cleaned_csv_path, index=False)

print(f"Cleaned dataset saved to {cleaned_csv_path}")


Cleaned dataset saved to cleaned_heart_data.csv
