In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from collections import defaultdict




In [None]:
# Root directory of the dataset. The traversal below expects the layout
#   <base_dir>/<patient_id>/<0|1>/*.png
base_dir = "path/to/your/dataset"

# Column-oriented accumulator: one list per output column, extended in lockstep
# so that position i in every list describes the same image.
data_info = defaultdict(list)

# Traverse the dataset. Both the directory listing and the glob results are
# sorted because os.listdir() and glob() return entries in arbitrary order;
# sorting makes the row order of `data_df` (and any seeded sampling done on it)
# reproducible across machines.
for patient_folder in sorted(os.listdir(base_dir)):
    patient_path = os.path.join(base_dir, patient_folder)
    if not os.path.isdir(patient_path):
        continue  # ignore stray files sitting next to the patient folders
    for label in ["0", "1"]:  # Directories for IDC-negative and IDC-positive
        label_path = os.path.join(patient_path, label)
        if not os.path.isdir(label_path):
            continue  # this patient has no images of that class
        images = sorted(glob(os.path.join(label_path, "*.png")))
        data_info["patient_id"].extend([patient_folder] * len(images))
        data_info["label"].extend([int(label)] * len(images))
        data_info["file_path"].extend(images)

# Convert to a DataFrame. Passing `columns` pins the schema: without it, a run
# that finds no images yields a frame with no columns at all, and every later
# `data_df["label"]` lookup fails with a KeyError instead of reporting 0 images.
data_df = pd.DataFrame(data_info, columns=["patient_id", "label", "file_path"])

In [None]:
# Summary statistics
def summarize_data(df=None):
    """Print the image count, the class balance and a preview of the dataset.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to summarise; it must contain a ``label`` column. When omitted,
        the notebook-level ``data_df`` is used, so existing zero-argument
        calls behave exactly as before.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if df is None:
        df = data_df  # fall back to the frame built by the loading cell

    print("Dataset Summary")
    print("Total images:", len(df))
    # Relative frequency of each label, expressed in percent.
    print(df["label"].value_counts(normalize=True) * 100)
    print("\nSample rows from the dataset:")
    print(df.head())

In [None]:
# Check for missing data
def check_missing(df=None):
    """Print the number of null entries in every column of the dataset.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``data_df`` is
        used, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    None
        The per-column counts are written to standard output.
    """
    if df is None:
        df = data_df  # fall back to the frame built by the loading cell

    print("Missing Data Check")
    # isnull() marks missing cells; summing the booleans counts them per column.
    print(df.isnull().sum())

In [None]:
def plot_class_distribution(df=None):
    """Draw a bar chart of how many images carry each label.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to plot; it must contain a ``label`` column. When omitted, the
        notebook-level ``data_df`` is used, so existing zero-argument calls
        behave exactly as before.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    if df is None:
        df = data_df  # fall back to the frame built by the loading cell

    fig, ax = plt.subplots(figsize=(8, 5))
    # Colour the bars through `hue` instead of passing `palette` on its own:
    # seaborn 0.13 deprecates palette-without-hue. `dodge=False` keeps a single
    # full-width bar per class, and works on older seaborn releases as well.
    sns.countplot(
        x="label", hue="label", data=df, palette="Set2", dodge=False, ax=ax
    )
    # Assigning `hue` makes seaborn add a legend that merely repeats the tick
    # labels; drop it so the figure looks like the original one.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title("Class Distribution (IDC-negative vs IDC-positive)")
    ax.set_xlabel("Label (0 = Healthy, 1 = IDC)")
    ax.set_ylabel("Number of Images")
    plt.show()

In [None]:
# Calculate percentage of IDC-positive images per patient
def calculate_patient_percentage(df=None, plot=True):
    """Compute, per patient, the share of images belonging to each class.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``patient_id`` and integer ``label`` (0 or 1) columns. When
        omitted, the notebook-level ``data_df`` is used, so existing
        zero-argument calls behave exactly as before.
    plot : bool, default True
        Draw a box plot of the IDC-positive percentage across patients. Pass
        ``False`` to obtain the statistics without rendering a figure.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``patient_id`` with the columns ``IDC-negative`` and
        ``IDC-positive`` (fractions in [0, 1]) and ``IDC-positive-percentage``
        (the positive fraction times 100). ``describe()`` of this frame is
        also printed.
    """
    if df is None:
        df = data_df  # fall back to the frame built by the loading cell

    patient_stats = (
        df.groupby("patient_id")["label"]
        .value_counts(normalize=True)
        .unstack(fill_value=0)
        # unstack() only creates columns for labels that actually occur. Force
        # both classes to exist so the two-name assignment below cannot fail
        # with a length mismatch when the data holds a single class.
        .reindex(columns=[0, 1], fill_value=0.0)
    )
    patient_stats.columns = ["IDC-negative", "IDC-positive"]
    patient_stats["IDC-positive-percentage"] = patient_stats["IDC-positive"] * 100

    if plot:
        fig, ax = plt.subplots(figsize=(12, 6))
        # Pass the series as `x=` explicitly: given positionally, recent seaborn
        # versions read it as `data` and draw a vertical box, which would not
        # match the x-axis label set below.
        sns.boxplot(x=patient_stats["IDC-positive-percentage"], color="orange", ax=ax)
        ax.set_title("Percentage of IDC-positive Images per Patient")
        ax.set_xlabel("IDC-positive Percentage")
        plt.show()

    print(patient_stats.describe())
    return patient_stats


In [None]:
def analyze_coordinates(df=None, plot=True, sample_size=10000, random_state=None):
    """Parse coordinates out of the image file names and scatter-plot them.

    The third and fourth underscore-separated tokens of each file's base name
    are read as the x and y coordinate. A token may be a bare integer
    (``..._1351_1051_...``) or carry an axis prefix (``..._x1351_y1051_...``).

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``file_path`` and ``label`` columns. When omitted, the
        notebook-level ``data_df`` is used, so existing zero-argument calls
        behave exactly as before.
    plot : bool, default True
        Draw the scatter plot. Pass ``False`` to only add the columns.
    sample_size : int, default 10000
        Maximum number of rows to plot. Frames with fewer rows are plotted in
        full instead of raising.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the plotted subset
        reproducible. ``None`` keeps the sampling random.

    Returns
    -------
    None
        Side effect: integer columns ``x_coord`` and ``y_coord`` are written
        onto ``df`` in place.

    Raises
    ------
    ValueError
        If any file name does not contain two parsable coordinates.
    """
    if df is None:
        df = data_df  # fall back to the frame built by the loading cell

    file_names = df["file_path"].map(os.path.basename)
    # Skip the first two tokens, then capture the digits of tokens three and
    # four, tolerating an optional leading axis letter on each.
    coords = file_names.str.extract(r"^[^_]*_[^_]*_[xX]?(\d+)_[yY]?(\d+)(?:[_.]|$)")
    unparsed = coords.isnull().any(axis=1)
    if unparsed.any():
        raise ValueError(
            "Cannot parse x/y coordinates from file name: "
            f"{file_names[unparsed].iloc[0]!r}"
        )
    df["x_coord"] = coords[0].astype(int)
    df["y_coord"] = coords[1].astype(int)

    if not plot:
        return

    # Never request more rows than exist; DataFrame.sample would raise.
    n_points = min(sample_size, len(df))
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        x="x_coord", y="y_coord", hue="label",
        data=df.sample(n_points, random_state=random_state),
        alpha=0.5, palette="coolwarm", ax=ax,
    )
    ax.set_title("Distribution of X-Y Coordinates by Label")
    ax.set_xlabel("X Coordinate")
    ax.set_ylabel("Y Coordinate")
    ax.legend(title="Label", loc="upper right")
    plt.show()

In [None]:
def show_sample_images(df=None, random_state=None):
    """Show a 4x4 grid of randomly chosen images, positives first.

    Up to eight label-1 images are drawn, followed by up to eight label-0
    images. If a class has fewer than eight images, all of them are shown and
    the remaining grid cells stay blank.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``label`` and ``file_path`` columns. When omitted, the
        notebook-level ``data_df`` is used, so existing zero-argument calls
        behave exactly as before.
    random_state : optional
        Forwarded to ``Series.sample`` to make the selection reproducible.
        ``None`` keeps the selection random.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    if df is None:
        df = data_df  # fall back to the frame built by the loading cell

    per_class = 8  # two classes x eight images fill the 4x4 grid
    picks = []
    for label in (1, 0):
        paths = df.loc[df["label"] == label, "file_path"]
        # Cap the request at what is available; sample() raises otherwise.
        chosen = paths.sample(min(per_class, len(paths)), random_state=random_state)
        # Keep the label next to each path so titles never rely on position.
        picks.extend((path, label) for path in chosen)

    fig, axes = plt.subplots(4, 4, figsize=(12, 12))
    for ax in axes.flat:
        ax.axis("off")  # also blanks the cells that receive no image
    for ax, (path, label) in zip(axes.flat, picks):
        # NOTE(review): per the matplotlib docs, cmap only applies to
        # single-channel images; for RGB(A) patches it is ignored.
        ax.imshow(plt.imread(path), cmap="gray")
        ax.set_title(f"Label: {label}")
    plt.tight_layout()
    plt.show()

In [None]:
def _run_analysis():
    """Run every analysis step in notebook order and return the patient stats."""
    summarize_data()
    check_missing()
    plot_class_distribution()
    stats = calculate_patient_percentage()
    analyze_coordinates()
    show_sample_images()
    return stats


# Run the pipeline only when this code executes as the main module, and keep
# the per-patient statistics available at notebook level afterwards.
if __name__ == "__main__":
    patient_stats = _run_analysis()
