In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from PIL import Image as PilImage

from pathlib import Path 
# Project layout, resolved relative to the directory the notebook is run from:
# the project root is the parent of the working directory, with input files
# under data/ and generated files under output/.
CURRENT_DIR = Path.cwd()
PROJECT_ROOT = CURRENT_DIR.parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

# Read the Excel workbook from the project's data directory into a DataFrame.
data_path = DATA_DIR.joinpath("JM.xlsx")
data = pd.read_excel(data_path)

# Quick inspection: the first rows, followed by the (rows, columns) shape.
for preview in (data.head(), data.shape):
    print(preview)

# Print pandas' own summary of the DataFrame.
data.info()

# Lay the histograms out on a grid that is 4 panels wide, one panel per column.
num_columns = len(data.columns)
# Ceiling division by 4. At least one row is kept because plt.subplots
# rejects a grid with zero rows.
num_rows = max(1, -(-num_columns // 4))

# Create the grid at print resolution (600 dpi) and flatten it so panels can
# be addressed in reading order.
fig, axes = plt.subplots(num_rows, 4, figsize=(15, 10), dpi=600)
axes = axes.flatten()

# Bar colour
hist_color = '#1f77b4'  # blue commonly used in The Lancet

# Draw one histogram per column. zip() pairs each column with its panel and
# stops at the shorter sequence, so no explicit index guard is needed.
for ax, column in zip(axes, data.columns):
    ax.hist(data[column], bins=20, color=hist_color, edgecolor='black')
    ax.set_title(column)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# Remove the panels left over at the end of the grid. Slicing by num_columns
# avoids relying on a loop variable that is undefined when there are no columns.
for unused_ax in axes[num_columns:]:
    fig.delaxes(unused_ax)

# Adjust spacing so titles and axis labels do not overlap.
plt.tight_layout()

def _save_figure_as_cmyk_tiff(figure, temp_path, final_path, dpi=600):
    """Save *figure* as a CMYK TIFF at *final_path*.

    The figure is first rendered to *temp_path* as a TIFF, then re-opened with
    Pillow, converted to CMYK and written to *final_path*. The temporary file
    is removed afterwards, including when rendering or conversion fails.

    Args:
        figure: Matplotlib figure to export.
        temp_path: Location of the intermediate TIFF.
        final_path: Location of the resulting CMYK TIFF.
        dpi: Resolution passed to ``Figure.savefig``.
    """
    try:
        figure.savefig(temp_path, dpi=dpi, format='tiff')
        # Leaving the `with` block closes the image file, which has to happen
        # before the temporary file is deleted; no explicit close() is needed.
        with PilImage.open(temp_path) as img:
            img.convert('CMYK').save(final_path)
    finally:
        # savefig may have failed before creating the file, so check first.
        if os.path.exists(temp_path):
            os.remove(temp_path)


# Write into the project-level OUTPUT_DIR instead of a machine-specific
# absolute path, so the notebook runs on any machine and agrees with the
# PNG export cell, which also uses OUTPUT_DIR.
output_dir = os.fspath(OUTPUT_DIR)
os.makedirs(output_dir, exist_ok=True)

# Export the histogram figure as a CMYK TIFF.
histogram_temp_path = os.path.join(output_dir, "Cardiovascular_Disease_Dataset_histograms_temp.tiff")
histogram_final_path = os.path.join(output_dir, "Cardiovascular_Disease_Dataset_histograms_final.tiff")
_save_figure_as_cmyk_tiff(fig, histogram_temp_path, histogram_final_path)

# Correlation matrix. Restrict to numeric/boolean columns explicitly:
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, whereas
# older versions dropped them silently.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()

# Draw the heatmap on its own figure and keep a handle to it, so the export
# below does not depend on pyplot's notion of the "current" figure.
heatmap_fig = plt.figure(figsize=(10, 8), dpi=600)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')

# Export the heatmap figure as a CMYK TIFF.
heatmap_temp_path = os.path.join(output_dir, "Cardiovascular_Disease_Dataset_correlation_matrix_temp.tiff")
heatmap_final_path = os.path.join(output_dir, "Cardiovascular_Disease_Dataset_correlation_matrix_final.tiff")
_save_figure_as_cmyk_tiff(heatmap_fig, heatmap_temp_path, heatmap_final_path)

# Show plots
plt.show()


In [None]:
# ================== Shared output location ==================
save_dir = OUTPUT_DIR
os.makedirs(save_dir, exist_ok=True)

# -------------------------------------------------
# 1. Save the histograms
# -------------------------------------------------
histogram_png = os.path.join(save_dir, "Cardiovascular_Disease_Dataset_histograms.png")
fig.savefig(histogram_png, dpi=300, format='png', bbox_inches='tight')

# -------------------------------------------------
# 2. Save the correlation heatmap
# -------------------------------------------------
heatmap_png = os.path.join(save_dir, "Cardiovascular_Disease_Dataset_correlation_matrix.png")
# `fig` is the histogram figure, so activating it and calling plt.savefig
# would write the histograms a second time under the heatmap's file name.
# Redraw the heatmap from corr_matrix on a dedicated figure and save that
# figure explicitly instead of relying on pyplot's current-figure state.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
corr_fig.savefig(heatmap_png, dpi=300, format='png', bbox_inches='tight')
# The figure exists only for this export; close it to free its memory.
plt.close(corr_fig)

print(f"\n图片已保存至：\n{histogram_png}\n{heatmap_png}")