In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from the working directory and clean it in one step:
# dropna() returns a new frame without any row that has a missing value,
# so nothing is mutated in place and re-running the cell gives the same df.
df = pd.read_csv('heart.csv').dropna()

# Statistical Summaries
# Per-column descriptive statistics (count, mean, std, min, quartiles, max).
summary_stats = df.describe()

# Pairwise correlation between columns. The frame is restricted to numeric
# and boolean columns first: recent pandas versions raise on DataFrame.corr()
# when a non-numeric column is present, whereas describe() above simply skips
# such columns. For an all-numeric frame the result is unchanged.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Median of the 'age' column within each distinct value of 'target'.
median_age_target = df.groupby('target')['age'].median()

# Print the results of describe, corr, and median_age_target.
# sep="" stops print() from inserting its default space between the heading's
# trailing newline and the table; that space would push the first line of the
# table one column to the right of the rows beneath it.
print("Summary Statistics:\n", summary_stats, sep="")
print("\nCorrelation Matrix:\n", correlation_matrix, sep="")
print("\nMedian Age by Heart Disease Target:\n", median_age_target, sep="")

# Relational Graph: Line Graph
# Resting blood pressure ('trestbps') against age, coloured by 'target'.
line_fig, line_ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='age', y='trestbps', hue='target', data=df, ax=line_ax)
line_ax.set_title('Resting Blood Pressure Over Age by Heart Disease')
line_ax.set_xlabel('Age')
line_ax.set_ylabel('Resting Blood Pressure')
# Anchor the legend's upper-left corner just past the right edge of the axes.
line_ax.legend(title='Heart Disease', loc='upper left', bbox_to_anchor=(1.05, 1))
line_fig.tight_layout()
line_fig.savefig('line_graph.png')  # Write the figure to disk
plt.close(line_fig)  # Release the figure once it has been saved

# Categorical Graph: Bar Chart
# Cholesterol ('chol') per 'sex' value, with bars split by 'target'.
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='sex', y='chol', hue='target', data=df, ax=bar_ax)
bar_ax.set_title('Cholesterol Levels by Sex and Heart Disease')
bar_ax.set_xlabel('Sex')
bar_ax.set_ylabel('Cholesterol')
# Anchor the legend's upper-left corner just past the right edge of the axes.
bar_ax.legend(title='Heart Disease', loc='upper left', bbox_to_anchor=(1.05, 1))
bar_fig.tight_layout()
bar_fig.savefig('bar_chart.png')  # Write the figure to disk
plt.close(bar_fig)  # Release the figure once it has been saved

# Statistical Graph: Heatmap
# Annotated heatmap of the previously computed correlation matrix.
heat_fig, heat_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, ax=heat_ax)
heat_ax.set_title('Correlation Heatmap of Heart Disease Variables')
heat_fig.tight_layout()
heat_fig.savefig('heatmap.png')  # Write the figure to disk
plt.close(heat_fig)  # Release the figure once it has been saved

# Additional Graph: Box Plot
# Distribution of age per 'thal' value, with boxes split by 'target'.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='thal', y='age', hue='target', data=df, ax=box_ax)
box_ax.set_title('Box Plot of Age by Thalassemia and Heart Disease')
box_ax.set_xlabel('Thalassemia')
box_ax.set_ylabel('Age')
# Anchor the legend's upper-left corner just past the right edge of the axes.
box_ax.legend(title='Heart Disease', loc='upper left', bbox_to_anchor=(1.05, 1))
box_fig.tight_layout()
box_fig.savefig('box_plot.png')  # Write the figure to disk
plt.close(box_fig)  # Release the figure once it has been saved

# Persist each statistical summary as a CSV file in the working directory.
for summary, csv_path in (
    (summary_stats, 'summary_stats.csv'),
    (correlation_matrix, 'correlation_matrix.csv'),
    (median_age_target, 'median_age_target.csv'),
):
    summary.to_csv(csv_path)


Summary Statistics:
                age          sex           cp     trestbps        chol  \
count  1025.000000  1025.000000  1025.000000  1025.000000  1025.00000   
mean     54.434146     0.695610     0.942439   131.611707   246.00000   
std       9.072290     0.460373     1.029641    17.516718    51.59251   
min      29.000000     0.000000     0.000000    94.000000   126.00000   
25%      48.000000     0.000000     0.000000   120.000000   211.00000   
50%      56.000000     1.000000     1.000000   130.000000   240.00000   
75%      61.000000     1.000000     2.000000   140.000000   275.00000   
max      77.000000     1.000000     3.000000   200.000000   564.00000   

               fbs      restecg      thalach        exang      oldpeak  \
count  1025.000000  1025.000000  1025.000000  1025.000000  1025.000000   
mean      0.149268     0.529756   149.114146     0.336585     1.071512   
std       0.356527     0.527878    23.005724     0.472772     1.175053   
min       0.000000     0.