In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the health-screening CSV inside the Kaggle input directory
DATA_PATH = "/kaggle/input/health-screening-data/Health Screening Data.csv"

# Load the whole file into the working DataFrame used by every later cell
df = pd.read_csv(DATA_PATH)

In [None]:
# Evaluate the column index of the loaded frame as the cell's last expression
df.columns

In [None]:
# Materialise the column labels as a plain Python list
column_names = list(df.columns)

# Show the collected labels
print(column_names)

In [None]:
# Summary statistics produced by DataFrame.describe(), printed as plain text
summary_statistics = df.describe()
print(summary_statistics)

In [None]:
# Data type of every column in the frame
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Preprocessing steps
# Per-column count of null entries (isna is the same check as isnull)
null_counts_per_column = df.isna().sum()
print(null_counts_per_column)

In [None]:
# Treat common placeholder tokens as missing values
missing_values = ["n/a", "na", "--", ' ', '?']
df = df.replace(missing_values, pd.NA)

# Total number of missing cells across the whole frame
missing_count = df.isna().values.sum()
print("Total missing values:", missing_count)

In [None]:
# Rows that repeat an earlier row exactly
duplicates = df.loc[df.duplicated()]

# Report the outcome; the duplicate rows are printed only when some exist
if not duplicates.empty:
    print("Duplicates found.")
    print(duplicates)
else:
    print("No duplicates found.")

In [None]:
#since there are no missing values and duplicates
#further preprocessing will be carried out
#removing outliers (1)

import numpy as np

# Function for removing outliers using MAD method
def remove_outliers_mad(df, columns, threshold=3.5):
    """Drop rows lying more than ``threshold`` MADs away from a column's median.

    Columns are processed in the order given, and each column's median and
    MAD (median absolute deviation) are computed on the rows that survived
    the previous columns, so the result can depend on the column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified, a filtered frame is returned.
    columns : iterable of str
        Names of the numeric columns to filter on.
    threshold : float, default 3.5
        Half-width of the accepted band, in units of the (unscaled) MAD.

    Returns
    -------
    pandas.DataFrame
        Rows whose value in every processed column lies inside
        ``[median - threshold * MAD, median + threshold * MAD]``.
        Rows holding NaN in a processed column are dropped, because NaN
        never falls inside the band.
    """
    for column in columns:
        values = df[column]
        median = values.median()
        # pandas' median skips NaN; np.median would propagate NaN and the
        # comparisons below would then reject every row.
        mad = (values - median).abs().median()
        if not mad > 0:
            # MAD == 0 (more than half the values equal the median) collapses
            # the band onto the median and would discard every other row;
            # MAD is NaN when the column has no valid data. Skip in both cases.
            continue
        lower_bound = median - threshold * mad
        upper_bound = median + threshold * mad
        # between() is inclusive on both ends, matching >= / <=.
        df = df[values.between(lower_bound, upper_bound)]
    return df

# Numeric columns screened for outliers
numerical_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

# Filtered copy of the data; `df` itself is left as loaded
df_cleaned_mad = remove_outliers_mad(df, numerical_columns)

# Report the frame dimensions before and after the filtering
shape_report = (
    ("Shape before removing outliers (MAD method):", df.shape),
    ("Shape after removing outliers (MAD method):", df_cleaned_mad.shape),
)
for message, shape in shape_report:
    print(message, shape)



In [None]:
#further preprocessing
#scaling of numerical values through standardization(2)

from sklearn.preprocessing import StandardScaler

# Create the StandardScaler
scaler = StandardScaler()

# Defining numerical columns to be scaled
numerical_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

# Standardize the outlier-free frame produced by remove_outliers_mad, so the
# outlier removal step is actually used and extreme values do not distort the
# fitted mean/std. The result goes into a new frame: `df` keeps its raw values,
# which keeps this cell idempotent and lets later plots show physical units.
df_scaled = df_cleaned_mad.copy()
df_scaled[numerical_columns] = scaler.fit_transform(df_cleaned_mad[numerical_columns])

# Printing the scaled numerical columns
print("Scaled Numerical Columns:")
print(df_scaled[numerical_columns].head())

# Printing the mean and standard deviation of the scaled numerical columns
print("\nMean of Scaled Numerical Columns:")
print(df_scaled[numerical_columns].mean())
print("\nStandard Deviation of Scaled Numerical Columns:")
# StandardScaler divides by the population std (ddof=0); pandas defaults to
# ddof=1, which would print values slightly different from 1.
print(df_scaled[numerical_columns].std(ddof=0))



In [None]:
#further preprocessing
# ordinal encoding of BMICat
from sklearn.preprocessing import OrdinalEncoder

# BMI categories listed from lowest to highest; list position becomes the code
bmi_categories = ['Under Weight', 'Normal', 'Over Weight', 'Obese']

# Encoder restricted to exactly these categories, in this order
encoder = OrdinalEncoder(categories=[bmi_categories])

# The encoder expects a 2D input, so turn the column into an (n_rows, 1) array
bmi_cat_encoded = df['BMICat'].to_numpy().reshape(-1, 1)

# Learn the mapping and store the numeric codes next to the original labels
df['BMICat_encoded'] = encoder.fit(bmi_cat_encoded).transform(bmi_cat_encoded)

# Show label/code pairs for the first rows as a sanity check
print(df.loc[:, ['BMICat', 'BMICat_encoded']].head())



In [None]:
#Now to Visualisation
#Visualisation 1: Understanding Age Distribution using KDE Plot
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's white-grid look for the plots
sns.set(style="whitegrid")

# Filled kernel density estimate of the age-in-years column
plt.figure(figsize=(8, 6))
ax = sns.kdeplot(data=df['AgeinYr'], fill=True, color='royalblue')
ax.set_title('Age Distribution (KDE Plot)')
ax.set_xlabel('Age')
ax.set_ylabel('Density')
plt.show()





In [None]:
#Visualisation 2: Count plot of Body Mass Index Category and presence of cardiovascular disease

import seaborn as sns
import matplotlib.pyplot as plt
# Count plot of BMI category split by cardiovascular disease status.
# The readable BMICat labels are plotted (instead of the 0.0-3.0 ordinal codes)
# and `order` keeps the bars in the same low-to-high sequence as the encoding.
bmi_order = ['Under Weight', 'Normal', 'Over Weight', 'Obese']
sns.countplot(x='BMICat', hue='cardio', data=df, order=bmi_order, palette='coolwarm')
plt.title('Body Mass Index and Cardiovascular Disease')
plt.xlabel('BMI Category')
plt.ylabel('Count')
plt.show()


In [None]:
#Visualisation 3: Pie Chart to show the cardiovascular disease among the Age Groups

import matplotlib.pyplot as plt

# Count the frequency of each cardiovascular outcome (cardio) in every age group.
# fill_value=0 avoids NaN for an outcome absent from a group, which plt.pie
# cannot draw.
cardio_counts = df.groupby('AgeGroup')['cardio'].value_counts().unstack(fill_value=0)

colors = ['skyblue', 'lightcoral']
explode = (0, 0.1)  # pull the second wedge slightly out of the pie
for age_group in cardio_counts.index:
    # A new figure per age group: plt.show() finishes the current figure, so a
    # single figure created before the loop would size only the first pie.
    plt.figure(figsize=(6, 4))
    plt.pie(cardio_counts.loc[age_group], labels=cardio_counts.columns, autopct='%1.1f%%', colors=colors, explode=explode)
    plt.title(f'Distribution of Cardiovascular Disease in {age_group} Age Group')
    plt.legend(title='Cardiovascular Disease', labels=['No', 'Yes'], loc='upper left', bbox_to_anchor=(1, 1))  # legend placed outside the pie
    plt.show()



In [None]:
#Visualisation 4: The Correlation Heatmap of Selected Variables 
import seaborn as sns
import matplotlib.pyplot as plt

# Columns included in the correlation analysis
selected_variables = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc']

# Pairwise correlation between the selected columns
correlation_matrix = df.loc[:, selected_variables].corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(6, 4))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
heatmap_ax.set_title('Correlation Heatmap of Selected Variables')
plt.show()

In [None]:
#Visualisation 5: Body Mass Index (BMI)(Histogram)
import seaborn as sns
import matplotlib.pyplot as plt


# 20-bin histogram of BMI with a KDE overlay; the x axis is limited to 10-50
plt.figure(figsize=(8, 6))
bmi_ax = sns.histplot(df['BMI'], bins=20, kde=True, color='skyblue')
bmi_ax.set_title("BMI Distribution")
bmi_ax.set_xlabel("BMI")
bmi_ax.set_ylabel("Frequency")
bmi_ax.set_xlim(10, 50)
plt.show()

In [None]:
#Visualisation 6: Age Group Analysis (Donut Chart)
import seaborn as sns
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8, 6))

# Share of records per age group: index holds the labels, values the counts
age_group_counts = df['AgeGroup'].value_counts()
labels, sizes = age_group_counts.index, age_group_counts.values
colors = sns.color_palette('pastel')

# Base pie chart with percentage annotations
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)

# A white disc over the centre turns the pie into a donut
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig.gca().add_artist(centre_circle)

# Equal axis scaling keeps the donut circular
plt.title("Age Group Analysis (Donut Chart)")
plt.axis('equal')
plt.show()

In [None]:
#Visualisation 7: Gender Comparison (Violin plots)
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the ap_hi column for each gender value, drawn as violins
plt.figure(figsize=(8, 6))
violin_ax = sns.violinplot(x='gender', y='ap_hi', data=df, palette='bright')
violin_ax.set_title("Gender Comparison")
violin_ax.set_xlabel("Gender")
violin_ax.set_ylabel("Systolic Blood Pressure (ap_hi)")
plt.show()

In [None]:
#Visualisation 8: Cardiovascular Disease (Line chart)
import seaborn as sns
import matplotlib.pyplot as plt

# Table of cardio outcome counts: one row per cholesterol level, one column per outcome
cardio_counts = df.groupby('cholesterol')['cardio'].value_counts().unstack()

# One line per cholesterol level, drawn across the cardio outcome values
plt.figure(figsize=(8, 6))
for cholesterol_level, counts in cardio_counts.iterrows():
    plt.plot(cardio_counts.columns, counts, label=f'Cholesterol {cholesterol_level}')

plt.xticks(cardio_counts.columns)
plt.xlabel("Cardiovascular Disease")
plt.ylabel("Count")
plt.title("Cardiovascular Disease Analysis by Cholesterol Level")
plt.grid(True)
plt.legend(title='Cholesterol', loc='upper right')
plt.show()