# Exploratory Data Analysis on Heart Disease Dataset


In [None]:

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# 1. Data Loading and Initial Inspection
## Loading csv file

In [None]:
# Read the heart-disease records from the CSV file (path is relative to the
# working directory) into a DataFrame, then print the frame.
df = pd.read_csv(filepath_or_buffer='heart_disease.csv')
print(df)

## First few rows

In [None]:
# head must be *called*: without the parentheses print() shows the repr of the
# bound method instead of the first rows of the DataFrame.
print(df.head())


## Shape of the dataset

In [None]:
# Print the (number of rows, number of columns) tuple of the loaded DataFrame.
print(df.shape)


## Column names and data types

In [None]:
# Print a per-column overview: column name, non-null count and dtype.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With the default
# arguments pandas reports these for the numeric columns only.
df.describe()

### Dataset size
#### -> The dataset has 1000 rows and 21 columns.
#### -> It gives a brief overview of the habits, addictions, and clinical conditions (such as blood pressure and diabetes) of patients who suffered heart attacks.
### Types of features 
#### -> It has both numerical as well as categorical data 
### Immediate Issues Observed
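#### -> Several columns contain null values (about 25% of 'Alcohol Consumption' is missing); they are handled in the Data Cleaning & Preprocessing section.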


In [None]:
# (rows, columns) of the DataFrame before any cleaning is applied.
df.shape

# 2. Data Cleaning & Preprocessing
## a. Missing Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:

# Share of missing entries per column, as a percentage, largest first.
df.isna().mean().mul(100).sort_values(ascending=False)



### Since the number of null values is small compared to the number of rows, I filled categorical columns with the mode and numerical columns with the median.
### About 25 percent of the rows in 'Alcohol Consumption' are null, so I filled them with the placeholder value 'Unknown'.


In [None]:

# Missing 'Alcohol Consumption' answers become the explicit label 'Unknown'
# instead of being imputed with an existing category.
df = df.fillna({'Alcohol Consumption': 'Unknown'})

# Total count of missing cells still left in the whole DataFrame.
df.isna().sum().sum()

In [None]:
# Categorical columns: gaps are filled with the most frequent value (mode).
cat_cols = [
    'Diabetes', 'Sugar Consumption', 'High Blood Pressure',
    'High LDL Cholesterol', 'Low HDL Cholesterol',
    'Exercise Habits', 'Smoking', 'Stress Level',
    'Family Heart Disease', 'Gender',
]

# Numeric columns: gaps are filled with the column median.
num_cols = [
    'Age', 'Blood Pressure', 'Cholesterol Level', 'BMI', 'Sleep Hours',
    'Triglyceride Level', 'Fasting Blood Sugar', 'CRP Level',
    'Homocysteine Level',
]

for col in cat_cols:
    # mode() can return several values on ties; the first one is used.
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

for col in num_cols:
    middle_value = df[col].median()
    df[col] = df[col].fillna(middle_value)

# Per-column count of missing values remaining after imputation.
df.isna().sum()


### Duplicate rows needlessly bloat the data, so it is important to check for them and remove any that are found.

In [None]:
# duplicated() alone yields one boolean per row, which is truncated when
# displayed and so cannot show whether any duplicates exist. Summing the flags
# gives the number of rows that repeat an earlier row (0 means none).
df.duplicated().sum()

### Data type conversion is not needed: there are only two data types, float64 and str, and both are appropriate as they are.
### For univariate and bivariate analysis, the values within each column should share a consistent data type.

In [None]:
# Data type of every column, displayed as the cell's output.
df.dtypes

# 3. Univariate Analysis
## Histograms

In [None]:
# Histogram of the Age column. Axis labels and a title make the figure
# self-explanatory; plt.show() renders it (as the heatmap cell does) instead of
# leaving the raw return value of plt.hist as the cell output.
plt.hist(df['Age'])
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Distribution of Age')
plt.show()

## Count plots

In [None]:
# Count of rows per 'Heart Disease Status' value. The column is named through
# x= and the frame passed as data=: in current seaborn the first positional
# parameter of countplot is `data`, so a bare Series would not be used as x.
sns.countplot(x='Heart Disease Status', data=df)
plt.show()

## Box plot

In [None]:
# Box plot of Blood Pressure (missing values were filled in the cleaning step
# above). Labelled so the value axis is identifiable, and rendered explicitly.
plt.boxplot(df['Blood Pressure'])
plt.ylabel('Blood Pressure')
plt.title('Box plot of Blood Pressure')
plt.show()

# 4. Bivariate & Multivariate Analysis
## Scatter plots

In [None]:
# One point per row: Age on the horizontal axis, BMI on the vertical axis.
sns.scatterplot(data=df, y='BMI', x='Age')


## Bar plots

In [None]:
# Bar plot of Fasting Blood Sugar against Heart Disease Status.
# NOTE(review): presumably the status column is categorical text, in which case
# seaborn draws horizontal bars of the mean blood sugar per status - confirm.
sns.barplot(
    data=df,
    x='Fasting Blood Sugar',
    y='Heart Disease Status',
)

## Correlation heatmaps 

In [None]:
# Pairwise correlations; numeric_only=True leaves the text columns out.
corr = df.corr(numeric_only=True)

# annot=True writes each coefficient inside its cell of the heatmap.
sns.heatmap(data=corr, cmap='coolwarm', annot=True)
plt.show()

# 5. Outlier Detection & Handling

In [None]:
# Interquartile-range fences for Age: values more than 1.5 * IQR below the
# first quartile or above the third quartile are reported as outliers.
Q1, Q3 = df['Age'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only the rows whose Age lies strictly outside the two fences.
outliers = df.loc[df['Age'].lt(lower) | df['Age'].gt(upper)]
print(outliers)


In [None]:
# Absolute z-score of every Age value (distance from the mean in standard
# deviations); rows more than 3 standard deviations away count as outliers.
z = np.absolute(stats.zscore(a=df['Age']))
outliers = df.loc[z > 3]
print(outliers)

# 6. Advanced / Niche Visualizations
## Box plot
### It summarizes numerical data and makes outliers easier to identify than other plots do.

In [None]:
# Box plot of Fasting Blood Sugar (missing values were filled in the cleaning
# step above). Labelled so the value axis is identifiable, and rendered
# explicitly.
plt.boxplot(df['Fasting Blood Sugar'])
plt.ylabel('Fasting Blood Sugar')
plt.title('Box plot of Fasting Blood Sugar')
plt.show()

## Violin plot
### It helps visualize numerical data across categories; unlike other plots, it also shows skewness and multiple peaks.

In [None]:
# Distribution of Age drawn separately for each Heart Disease Status value.
sns.violinplot(data=df, y='Age', x='Heart Disease Status')

# 7. Final dataset check
## The dataset did not change much after preprocessing. No rows were removed and no new columns were added; only null values were treated. The data was checked and confirmed to be clean and usable.

## Final shape of dataset

In [None]:
# (rows, columns) after preprocessing; the cleaning cells above only fill
# missing values, they do not drop rows or add columns.
df.shape


# SUMMARY
## 1. The dataset contains a mix of numerical and categorical features.
## 2. Most numerical values fall within realistic ranges, indicating the dataset is free from anomalies.
## 3. Diverse health profiles
## 4. The information provided supports accurate analysis, as it includes all the needed details.
## 5. The dataset describes lifestyle factors, which helps identify habits that may lead to health risks.
## 6. No significant outliers or duplicate rows were found.
## 7. The target variable (Heart Disease Status) is well distributed, making the dataset suitable for further analysis or modeling.