# Health Care Industry

1. What is the shape of the dataset?

In [1]:
import pandas as pd

# Load the cardiovascular dataset from a CSV file in the working directory.
# NOTE(review): read with the default ',' delimiter — confirm that matches this file.
df = pd.read_csv('Cardiovascular dataset.csv')

ModuleNotFoundError: No module named 'pandas'

In [None]:
# (number of rows, number of columns) of the loaded frame; shown as the cell output.
df.shape

2. Are there any missing values in the dataset?

In [None]:
# Per-column count of missing entries; shown as the cell output.
df.isna().sum()

3. What are the unique values for categorical features like gender, cholesterol, gluc?

In [None]:
# Distinct codes present in the 'gender' column.
gender_values = df['gender'].unique()
print("Unique values for 'gender':", gender_values)

In [None]:
# Distinct codes present in the 'cholesterol' column.
cholesterol_values = df['cholesterol'].unique()
print("Unique values for 'cholesterol':", cholesterol_values)

In [None]:
# Distinct codes present in the 'gluc' column.
gluc_values = df['gluc'].unique()
print("Unique values for 'gluc':", gluc_values)

4. What’s the average age of patients (in years)?

In [None]:
# The code treats 'age' as a day count: dividing by 365.25 (leap years
# averaged in) expresses it in years.
df['age_years'] = df['age'].div(365.25)

# Mean age over all rows, reported to two decimals.
average_age = df['age_years'].mean()
print(f"The average age of patients is: {average_age:.2f} years")

5. What is the distribution of the target variable (cardio)?

In [None]:
# Number of rows in each 'cardio' class, most frequent first.
cardio_distribution = df['cardio'].value_counts()
print("Distribution of target variable 'cardio':", cardio_distribution, sep="\n")

6. What is the average age (in years) of patients?

In [None]:
# Re-derive age in years inside this cell so it can run on its own.
age_in_years = df['age'] / 365.25
df['age_years'] = age_in_years

average_age = age_in_years.mean()
print(f"The average age of patients is: {average_age:.2f} years")

7. What is the distribution of BMI?

In [None]:
# BMI = weight (kg) / height (m)^2. 'height' is divided by 100 to go from
# centimetres to metres before squaring.
df['height_m'] = df['height'].div(100)
height_squared = df['height_m'].pow(2)
df['BMI'] = df['weight'].div(height_squared)

# count / mean / std / min / quartiles / max of the derived column.
print("Descriptive statistics for BMI:")
bmi_summary = df['BMI'].describe()
print(bmi_summary)

8.  Are there outliers in height or weight?

In [None]:
def find_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to scan.
        column: Label of the column to evaluate.

    Returns:
        The flagged rows of ``df`` (all columns, original index labels).
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)

    # Distance beyond each quartile at which a value counts as an outlier.
    whisker = 1.5 * (third_quartile - first_quartile)

    too_low = values < first_quartile - whisker
    too_high = values > third_quartile + whisker
    return df[too_low | too_high]

# Flag IQR outliers separately for the two body measurements.
outliers_height = find_outliers_iqr(df, 'height')
outliers_weight = find_outliers_iqr(df, 'weight')

# Report how many rows were flagged for each column.
print(f"Number of outliers in 'height': {len(outliers_height)}")
print(f"Number of outliers in 'weight': {len(outliers_weight)}")

# Show the first few flagged values, but only when something was flagged.
if len(outliers_height):
    print("\nExamples of 'height' outliers:")
    height_sample = outliers_height[['height']].head()
    display(height_sample)

if len(outliers_weight):
    print("\nExamples of 'weight' outliers:")
    weight_sample = outliers_weight[['weight']].head()
    display(weight_sample)

9. Are there implausible blood pressure values (e.g., ap_hi < ap_lo or too high)?

In [None]:
# Rows where ap_hi is below ap_lo — an impossible combination for one reading.
implausible_bp_hi_lo = df[df['ap_hi'].lt(df['ap_lo'])]
print(f"Number of implausible records where ap_hi < ap_lo: {len(implausible_bp_hi_lo)}")

# Rows above the extreme cut-offs chosen for this check:
# ap_hi strictly above 240, ap_lo strictly above 140.
extremely_high_ap_hi = df[df['ap_hi'].gt(240)]
extremely_high_ap_lo = df[df['ap_lo'].gt(140)]

print(f"Number of records with extremely high ap_hi (>240): {len(extremely_high_ap_hi)}")
print(f"Number of records with extremely high ap_lo (>140): {len(extremely_high_ap_lo)}")

# For each non-empty group, show the two pressure columns of its first rows.
if len(implausible_bp_hi_lo):
    print("\nExamples of ap_hi < ap_lo records:")
    display(implausible_bp_hi_lo[['ap_hi', 'ap_lo']].head())

if len(extremely_high_ap_hi):
    print("\nExamples of extremely high ap_hi records:")
    display(extremely_high_ap_hi[['ap_hi', 'ap_lo']].head())

if len(extremely_high_ap_lo):
    print("\nExamples of extremely high ap_lo records:")
    display(extremely_high_ap_lo[['ap_hi', 'ap_lo']].head())

10. How many rows have incorrect blood pressure values?

| Column         | Condition                    | Reason              |
|----------------|------------------------------|---------------------|
| ap_hi          | between 80 and 250           | realistic systolic  |
| ap_lo          | between 50 and 200           | realistic diastolic |
| ap_hi >= ap_lo | systolic must be ≥ diastolic | logical             |

In [None]:
# A reading is treated as correct when all three checks hold
# (range limits are inclusive at both ends):
#   1. ap_hi lies in [80, 250]
#   2. ap_lo lies in [50, 200]
#   3. ap_hi is not below ap_lo
condition_ap_hi_valid = df['ap_hi'].between(80, 250)
condition_ap_lo_valid = df['ap_lo'].between(50, 200)
condition_ap_hi_ge_ap_lo = df['ap_hi'].ge(df['ap_lo'])

# Rows that pass every check.
all_bp_checks_pass = condition_ap_hi_valid & condition_ap_lo_valid & condition_ap_hi_ge_ap_lo
correct_bp_rows = df[all_bp_checks_pass]

# Everything that is not correct is counted as incorrect.
incorrect_bp_count = len(df) - len(correct_bp_rows)

print(f"Number of rows with incorrect blood pressure values: {incorrect_bp_count}")

11. Remove rows with invalid blood pressure, height, or weight.
Valid height: between 120 cm and 220 cm.
Valid weight: between 40 kg and 200 kg.

In [None]:
# Plausibility limits (inclusive): height 120-220, weight 40-200.
condition_height_valid = df['height'].between(120, 220)
condition_weight_valid = df['weight'].between(40, 200)

# The blood-pressure checks are recomputed here so this cell does not depend
# on an earlier cell having been run in the current session.
condition_ap_hi_valid = df['ap_hi'].between(80, 250)
condition_ap_lo_valid = df['ap_lo'].between(50, 200)
condition_ap_hi_ge_ap_lo = df['ap_hi'].ge(df['ap_lo'])

# A row is kept only when every individual check passes.
valid_data_conditions = (
    condition_ap_hi_valid
    & condition_ap_lo_valid
    & condition_ap_hi_ge_ap_lo
    & condition_height_valid
    & condition_weight_valid
)

# .copy() makes df_cleaned an independent frame rather than a slice of df.
df_cleaned = df.loc[valid_data_conditions].copy()

# Rows dropped by the filter.
removed_rows_count = len(df) - len(df_cleaned)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows after cleaning: {len(df_cleaned)}")
print(f"Number of rows removed: {removed_rows_count}")

# Final (rows, columns) of the cleaned frame.
print(f"Shape of the cleaned DataFrame: {df_cleaned.shape}")

12. What is the distribution of cholesterol and glucose levels after cleaning?

In [None]:
# Row count per 'cholesterol' code in the cleaned frame.
print("Distribution of 'cholesterol' levels after cleaning:")
cholesterol_counts_cleaned = df_cleaned['cholesterol'].value_counts()
print(cholesterol_counts_cleaned)


In [None]:
# Row count per 'gluc' code in the cleaned frame.
print("\nDistribution of 'gluc' levels after cleaning:")
gluc_counts_cleaned = df_cleaned['gluc'].value_counts()
print(gluc_counts_cleaned)

13. How many smokers have cardiovascular disease?

In [None]:
# Cleaned rows where both the 'smoke' flag and the 'cardio' flag equal 1.
is_smoker = df_cleaned['smoke'].eq(1)
has_cardio = df_cleaned['cardio'].eq(1)
smokers_with_cardio = df_cleaned[is_smoker & has_cardio]

num_smokers_with_cardio = len(smokers_with_cardio)
print(f"Number of smokers with cardiovascular disease: {num_smokers_with_cardio}")

14.  Does alcohol intake correlate with higher cardio risk?

In [None]:
# Mean of 'cardio' within each 'alco' group; with a 0/1 flag this is the
# share of the group that has cardio == 1.
# NOTE(review): this reads the raw `df`, not `df_cleaned` — confirm intended.
print("Cardio Risk by Alcohol Intake:")
cardio_rate_by_alcohol = df.groupby('alco')['cardio'].mean()
print(cardio_rate_by_alcohol)

Cardio Risk by Alcohol Intake:
alco
0    0.500574
1    0.484325
Name: cardio, dtype: float64


15. What’s the correlation between features?

In [None]:
# Pairwise correlation of all columns (pandas' default method), then the
# 'cardio' column of that matrix sorted from most positive to most negative.
correlation_matrix = df.corr()
cardio_correlations = correlation_matrix['cardio'].sort_values(ascending=False)
print(cardio_correlations)

cardio         1.000000
age            0.238159
cholesterol    0.221147
weight         0.181660
gluc           0.089307
ap_lo          0.065719
ap_hi          0.054475
gender         0.008109
id             0.003799
alco          -0.007330
height        -0.010821
smoke         -0.015486
active        -0.035653
Name: cardio, dtype: float64


16.  Compare mean BMI for cardio vs. non-cardio

In [None]:
# Recompute BMI so this cell runs on its own: weight (kg) / height (m)^2,
# with 'height' divided by 100 to convert centimetres to metres.
df['height_m'] = df['height'].div(100)
df['BMI'] = df['weight'] / df['height_m'].pow(2)

# Average BMI within each 'cardio' group.
mean_bmi_by_cardio = df.groupby('cardio')['BMI'].mean()
print(mean_bmi_by_cardio)

cardio
0    26.548175
1    28.566061
Name: BMI, dtype: float64


17. Plot age distribution for those with and without cardio disease

In [None]:
# NOTE(review): the heading asks for a plot; this cell only prints the mean
# age (in years) of each 'cardio' group.
df['age_years'] = df['age'].div(365.25)
mean_age_by_cardio = df.groupby('cardio')['age_years'].mean()
print(mean_age_by_cardio)

cardio
0    51.695068
1    54.912561
Name: age_years, dtype: float64


18. Boxplot of systolic blood pressure by cardio status

In [None]:
# NOTE(review): the heading asks for a boxplot; this cell only prints the
# mean 'ap_hi' of each 'cardio' group.
mean_ap_hi_by_cardio = df.groupby('cardio')['ap_hi'].mean()
print(mean_ap_hi_by_cardio)

cardio
0    120.432598
1    137.212042
Name: ap_hi, dtype: float64


19.  What is the distribution of cholesterol levels?

In [None]:
# Row count per 'cholesterol' code, most frequent first.
cholesterol_counts_all = df['cholesterol'].value_counts()
print(cholesterol_counts_all)

cholesterol
1    52385
2     9549
3     8066
Name: count, dtype: int64


20. What percentage of patients have above-normal glucose levels?

In [None]:
# Share of rows whose 'gluc' code is greater than 1, as a percentage.
# The mean of a boolean mask is the fraction of True entries.
above_normal_gluc = df['gluc'].gt(1)
gluc_percent = above_normal_gluc.mean() * 100
print(f"{gluc_percent:.2f}%")

15.03%


# Insightful Analysis Questions

1. What percentage of the dataset has cardiovascular disease?

Answer: Approximately 49.97% (about half of the patients).

2. Is there a link between cholesterol and heart disease?

Answer: Yes. As cholesterol levels increase, heart disease becomes much more common:

 * Normal cholesterol (Level 1): 44% have heart disease.

 * Above normal (Level 2): 60% have heart disease.

 * Well above normal (Level 3): 76% have heart disease.

3. Does age impact heart disease prevalence?

Answer: Yes. On average, people with heart disease are older (54.9 years) than those without (51.7 years).


4. Is BMI higher in those with heart disease?

Answer: Yes. Average BMI (Heart Disease): 28.57

Average BMI (No Heart Disease): 26.55

5. Does physical activity reduce heart disease risk?


Answer: Yes. Inactive patients have a 53.6% risk, while physically active patients have a lower risk of 49.1%.


6. Do smokers have more heart disease?

Answer: In this specific dataset, smokers actually show a slightly lower rate (47.5%) than non-smokers (50.2%).

7. Is systolic pressure significantly higher in those with heart disease?


Answer: Yes. The average systolic pressure is notably higher in the heart disease group (137.2 mmHg) compared to the healthy group (120.4 mmHg).