In [23]:
import cv2
import numpy as np
import os
from tqdm.notebook import tqdm
from IPython.display import display
import matplotlib.pyplot as plt


def preprocess_image_for_dense_breast(
    image_path, 
    output_size=(1024, 1024),
    normalize_min=-1024,   
    normalize_max=3071,    
    target_mean=0,        
    target_std=1          
):

    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(f"Could not load image {image_path}")

    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    img_clahe = clahe.apply(img)

    img_float = img_clahe.astype(np.float32)
    img_normalized = np.clip(img_float, normalize_min, normalize_max)

    img_normalized = (img_normalized - np.mean(img_normalized)) / (np.std(img_normalized) + 1e-7)

    if target_mean != 0 or target_std != 1:
        img_normalized = img_normalized * target_std + target_mean

    img_uint8 = ((img_normalized - img_normalized.min()) * 255 / 
                 (img_normalized.max() - img_normalized.min()))
    img_uint8 = img_uint8.astype(np.uint8)
    
    img_resized = cv2.resize(img_uint8, output_size, interpolation=cv2.INTER_AREA)
    
    return img_resized



def process_all_images(data_dir, output_dir, output_size=(1024, 1024), show_samples=True):
    os.makedirs(output_dir, exist_ok=True)

    image_files = []
    for root, _, files in os.walk(data_dir):
        for file in files:
            if file.endswith(('.jpg', '.jpeg', '.png', '.tiff')):
                image_files.append((root, file))

    for root, file in tqdm(image_files, desc="Processing images"):
        image_path = os.path.join(root, file)
        
        try:
            if show_samples:
                original = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            
            processed_image = preprocess_image_for_dense_breast(image_path, output_size)

            output_path = os.path.join(output_dir, os.path.relpath(image_path, data_dir))
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            cv2.imwrite(output_path, processed_image)
            
        except Exception as e:
            print(f"Failed to process {image_path}: {e}")

process_all_images(
    r'D:\RESEARCH\MV-DEFEAT\datasets\VinDr-mammo\images',
    r'D:\RESEARCH\MV-DEFEAT\datasets\VinDr-mammo\processed_images',
    output_size=(512, 512)
)

Processing images:   0%|          | 0/20000 [00:00<?, ?it/s]

In [11]:
import cv2
import numpy as np
import os
from tqdm.notebook import tqdm
from IPython.display import display
import matplotlib.pyplot as plt

def crop_breast_region(img):
    _, binary = cv2.threshold(img, 5, 255, cv2.THRESH_BINARY)
    
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    if contours:
        largest_contour = max(contours, key=cv2.contourArea)

        x, y, w, h = cv2.boundingRect(largest_contour)
        
        cropped = img[y:y+h, x:x+w]
        return cropped
    
    return img

def preprocess_image_for_dense_breast(
    image_path, 
    output_size=(1024, 1024),
    normalize_min=-1024,   
    normalize_max=3071,     
    target_mean=0,         
    target_std=1          
):

    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(f"Could not load image {image_path}")

    img = crop_breast_region(img)

    img_float = img.astype(np.float32)
    img_normalized = np.clip(img_float, normalize_min, normalize_max)

    img_normalized = (img_normalized - np.mean(img_normalized)) / (np.std(img_normalized) + 1e-7)

    if target_mean != 0 or target_std != 1:
        img_normalized = img_normalized * target_std + target_mean

    img_uint8 = ((img_normalized - img_normalized.min()) * 255 / 
                 (img_normalized.max() - img_normalized.min()))
    img_uint8 = img_uint8.astype(np.uint8)

    img_resized = cv2.resize(img_uint8, output_size, interpolation=cv2.INTER_AREA)
    
    return img_resized

def process_all_images(data_dir, output_dir, output_size=(1024, 1024), show_samples=True):
    os.makedirs(output_dir, exist_ok=True)

    image_files = []
    for root, _, files in os.walk(data_dir):
        for file in files:
            if file.endswith(('.jpg', '.jpeg', '.png', '.tiff')):
                image_files.append((root, file))

    for root, file in tqdm(image_files, desc="Processing images"):
        image_path = os.path.join(root, file)
        
        try:
            if show_samples:
                original = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            
            processed_image = preprocess_image_for_dense_breast(image_path, output_size)
            
            output_path = os.path.join(output_dir, os.path.relpath(image_path, data_dir))
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            cv2.imwrite(output_path, processed_image)
            
            if show_samples:
                plt.figure(figsize=(12, 6))
                plt.subplot(1, 2, 1)
                plt.imshow(original, cmap='gray')
                plt.title('Original Image')
                plt.subplot(1, 2, 2)
                plt.imshow(processed_image, cmap='gray')
                plt.title('Processed Image')
                plt.show()
                
        except Exception as e:
            print(f"Failed to process {image_path}: {e}")

if __name__ == "__main__":
    process_all_images(
        r'D:\RESEARCH\MV-DEFEAT\datasets\ThongNhat\images',
        r'D:\RESEARCH\MV-DEFEAT\datasets\ThongNhat\processed_images',
        output_size=(512, 512),
        show_samples=False
    )

Processing images:   0%|          | 0/2744 [00:00<?, ?it/s]

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os

def create_kfold_splits(csv_path, n_splits=5, random_state=42):
    """
    Create k-fold splits for the dataset while maintaining class distribution
    """
    df = pd.read_csv(csv_path)
    
    # mapping
    label_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    df['Label_num'] = df['Label'].map(label_map)
    
    # define StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    # add fold
    df['fold'] = -1
    
    # split on label for class distribution
    for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['Label_num'])):
        df.loc[val_idx, 'fold'] = fold
    
    # save fold
    base_path = os.path.dirname(csv_path)
    filename = os.path.basename(csv_path).split('.')[0]
    
    for fold in range(n_splits):
        # train data for current fold
        train_df = df[df['fold'] != fold].copy()
        # validation data for current fold
        val_df = df[df['fold'] == fold].copy()
        
        # save files as patient, label
        train_df[['Patient', 'Label']].to_csv(
            os.path.join(base_path, f'{filename}_fold{fold}_train.csv'), 
            index=False
        )
        val_df[['Patient', 'Label']].to_csv(
            os.path.join(base_path, f'{filename}_fold{fold}_val.csv'), 
            index=False
        )
        
        # print class distribution for each fold
        print(f"\nFold {fold} distributions:")
        print("\nTrain set distribution:")
        print(train_df['Label'].value_counts(normalize=True))
        print("\nValidation set distribution:")
        print(val_df['Label'].value_counts(normalize=True))
    
    # save origin file with fold
    df.to_csv(os.path.join(base_path, f'{filename}_with_folds.csv'), index=False)
    
    return df

if __name__ == "__main__":
    csv_path = 'datasets/ThongNhat/ThongNhat_labels.csv'
    create_kfold_splits(csv_path, n_splits=5)



Fold 0 distributions:

Train set distribution:
Label
C    0.433453
D    0.338129
B    0.196043
A    0.032374
Name: proportion, dtype: float64

Validation set distribution:
Label
C    0.435714
D    0.335714
B    0.192857
A    0.035714
Name: proportion, dtype: float64

Fold 1 distributions:

Train set distribution:
Label
C    0.432675
D    0.337522
B    0.195691
A    0.034111
Name: proportion, dtype: float64

Validation set distribution:
Label
C    0.438849
D    0.338129
B    0.194245
A    0.028777
Name: proportion, dtype: float64

Fold 2 distributions:

Train set distribution:
Label
C    0.434470
D    0.337522
B    0.193896
A    0.034111
Name: proportion, dtype: float64

Validation set distribution:
Label
C    0.431655
D    0.338129
B    0.201439
A    0.028777
Name: proportion, dtype: float64

Fold 3 distributions:

Train set distribution:
Label
C    0.434470
D    0.337522
B    0.195691
A    0.032316
Name: proportion, dtype: float64

Validation set distribution:
Label
C    0.431655
D  

In [1]:
import pandas as pd
import os

def extract_mammogram_info(input_csv_path, output_csv_path):
    """
    Extract specific columns from mammogram annotations file and save to new CSV.
    
    Args:
        input_csv_path (str): Path to input CSV file
        output_csv_path (str): Path to save output CSV file
    """
    df = pd.read_csv(input_csv_path)
    
    selected_columns = [
        'study_id',
        'image_id',
        'laterality',
        'view_position',
        'breast_density'
    ]
    
    df_selected = df[selected_columns]
    
    df_selected['breast_density'] = df_selected['breast_density'].str.replace('DENSITY ', '')
    density_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    df_selected['breast_density'] = df_selected['breast_density'].map(density_mapping)
    
    
    df_selected.to_csv(output_csv_path, index=False)
    
    print(f"Processed {len(df_selected)} records")
    print(f"Saved to: {output_csv_path}")

    print("\nValue counts for breast density:")
    print(df_selected['breast_density'].value_counts())
    print("\nValue counts for view position:")
    print(df_selected['view_position'].value_counts())
    print("\nValue counts for laterality:")
    print(df_selected['laterality'].value_counts())

if __name__ == "__main__":
    input_path = r'D:\RESEARCH\MV-DEFEAT\datasets\VinDr-mammo\breast-level_annotations.csv'
    output_path = r'D:\RESEARCH\MV-DEFEAT\datasets\VinDr-mammo\mammogram_metadata.csv'
    
    extract_mammogram_info(input_path, output_path)

Processed 20000 records
Saved to: D:\RESEARCH\MV-DEFEAT\datasets\VinDr-mammo\mammogram_metadata.csv

Value counts for breast density:
breast_density
2    15292
3     2700
1     1908
0      100
Name: count, dtype: int64

Value counts for view position:
view_position
CC     10001
MLO     9999
Name: count, dtype: int64

Value counts for laterality:
laterality
L    10000
R    10000
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['breast_density'] = df_selected['breast_density'].str.replace('DENSITY ', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['breast_density'] = df_selected['breast_density'].map(density_mapping)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from collections import Counter

def split_dataset(csv_path, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2, random_state=42):
    df = pd.read_csv(csv_path)
    
    print("Original density distribution:")
    original_dist = df['breast_density'].value_counts(normalize=True)
    print(original_dist)
    
    study_density = df.groupby('study_id')['breast_density'].first().reset_index()
    
    train_splitter = GroupShuffleSplit(n_splits=1, train_size=train_ratio, random_state=random_state)
    train_idx, temp_idx = next(train_splitter.split(study_density, groups=study_density['study_id']))
    
    train_studies = study_density.iloc[train_idx]['study_id']
    temp_studies = study_density.iloc[temp_idx]['study_id']
    
    val_ratio_adjusted = val_ratio / (1 - train_ratio)
    val_splitter = GroupShuffleSplit(n_splits=1, train_size=val_ratio_adjusted, random_state=random_state)
    val_idx, test_idx = next(val_splitter.split(
        study_density.iloc[temp_idx], 
        groups=study_density.iloc[temp_idx]['study_id']
    ))
    
    val_studies = temp_studies.iloc[val_idx]
    test_studies = temp_studies.iloc[test_idx]
    
    train_df = df[df['study_id'].isin(train_studies)]
    val_df = df[df['study_id'].isin(val_studies)]
    test_df = df[df['study_id'].isin(test_studies)]
    
    print("\nDataset split statistics:")
    print(f"Total studies: {len(df['study_id'].unique())}")
    print(f"Train studies: {len(train_studies)} ({len(train_df)} images)")
    print(f"Val studies: {len(val_studies)} ({len(val_df)} images)")
    print(f"Test studies: {len(test_studies)} ({len(test_df)} images)")
    
    print("\nDensity distribution in splits:")
    print("\nTrain:")
    print(train_df['breast_density'].value_counts(normalize=True))
    print("\nValidation:")
    print(val_df['breast_density'].value_counts(normalize=True))
    print("\nTest:")
    print(test_df['breast_density'].value_counts(normalize=True))
    
    base_path = csv_path.rsplit('.', 1)[0]
    train_df.to_csv(f"{base_path}_train.csv", index=False)
    val_df.to_csv(f"{base_path}_val.csv", index=False)
    test_df.to_csv(f"{base_path}_test.csv", index=False)
    
    print("\nFiles saved:")
    print(f"{base_path}_train.csv")
    print(f"{base_path}_val.csv")
    print(f"{base_path}_test.csv")
    
    return train_df, val_df, test_df

if __name__ == "__main__":
    csv_path = r'D:\RESEARCH\MV-DEFEAT\datasets\ThongNhat\ThongNhat_labels.csv'
    train_df, val_df, test_df = split_dataset(csv_path)

Original density distribution:


KeyError: 'breast_density'

In [2]:
def split_dataset(csv_path, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2, random_state=42):
    df = pd.read_csv(csv_path)
    
    print("Original density distribution:")
    original_dist = df['Label'].value_counts(normalize=True)
    print(original_dist)
    
    train_splitter = GroupShuffleSplit(n_splits=1, train_size=train_ratio, random_state=random_state)
    train_idx, temp_idx = next(train_splitter.split(df, groups=df['Patient']))
    
    train_df = df.iloc[train_idx]
    temp_df = df.iloc[temp_idx]
    
    val_ratio_adjusted = val_ratio / (1 - train_ratio)
    val_splitter = GroupShuffleSplit(n_splits=1, train_size=val_ratio_adjusted, random_state=random_state)
    val_idx, test_idx = next(val_splitter.split(temp_df, groups=temp_df['Patient']))
    
    val_df = temp_df.iloc[val_idx]
    test_df = temp_df.iloc[test_idx]

    print("\nDataset split statistics:")
    print(f"Total patients: {len(df['Patient'].unique())}")
    print(f"Train patients: {len(train_df['Patient'].unique())} ({len(train_df)} images)")
    print(f"Val patients: {len(val_df['Patient'].unique())} ({len(val_df)} images)")
    print(f"Test patients: {len(test_df['Patient'].unique())} ({len(test_df)} images)")
    
    print("\nDensity distribution in splits:")
    print("\nTrain:")
    print(train_df['Label'].value_counts(normalize=True))
    print("\nValidation:")
    print(val_df['Label'].value_counts(normalize=True))
    print("\nTest:")
    print(test_df['Label'].value_counts(normalize=True))
    
    base_path = csv_path.rsplit('.', 1)[0]
    train_df.to_csv(f"{base_path}_train.csv", index=False)
    val_df.to_csv(f"{base_path}_val.csv", index=False)
    test_df.to_csv(f"{base_path}_test.csv", index=False)
    
    print("\nFiles saved:")
    print(f"{base_path}_train.csv")
    print(f"{base_path}_val.csv")
    print(f"{base_path}_test.csv")
    
    return train_df, val_df, test_df

if __name__ == "__main__":
    csv_path = r'D:\RESEARCH\MV-DEFEAT\datasets\ThongNhat\ThongNhat_labels.csv'
    train_df, val_df, test_df = split_dataset(csv_path)

Original density distribution:
Label
C    0.625551
B    0.196769
D    0.177680
Name: proportion, dtype: float64

Dataset split statistics:
Total patients: 641
Train patients: 384 (404 images)
Val patients: 128 (139 images)
Test patients: 129 (138 images)

Density distribution in splits:

Train:
Label
C    0.633663
B    0.198020
D    0.168317
Name: proportion, dtype: float64

Validation:
Label
C    0.625899
D    0.187050
B    0.187050
Name: proportion, dtype: float64

Test:
Label
C    0.601449
B    0.202899
D    0.195652
Name: proportion, dtype: float64

Files saved:
D:\RESEARCH\MV-DEFEAT\datasets\ThongNhat\ThongNhat_labels_train.csv
D:\RESEARCH\MV-DEFEAT\datasets\ThongNhat\ThongNhat_labels_val.csv
D:\RESEARCH\MV-DEFEAT\datasets\ThongNhat\ThongNhat_labels_test.csv


In [2]:
import pandas as pd

def weighted_split_dataset(csv_path, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2, random_state=42):
    """
    Split dataset using weighted random sampling to handle class imbalance
    """
    df = pd.read_csv(csv_path)
    
    class_counts = df['breast_density'].value_counts()
    class_weights = 1 / class_counts
    class_weights = class_weights / class_weights.sum()  
    
    study_density = df.groupby('study_id')['breast_density'].first().reset_index()
    study_weights = study_density['breast_density'].map(class_weights)
    
    train_studies = study_density['study_id'].sample(
        n=int(len(study_density) * train_ratio),
        weights=study_weights,
        random_state=random_state
    )
    
    remaining_studies = study_density[~study_density['study_id'].isin(train_studies)]
    remaining_weights = remaining_studies['breast_density'].map(class_weights)
    
    val_ratio_adjusted = val_ratio / (1 - train_ratio)
    val_size = int(len(remaining_studies) * val_ratio_adjusted)
    
    val_studies = remaining_studies['study_id'].sample(
        n=val_size,
        weights=remaining_weights,
        random_state=random_state
    )
    
    test_studies = remaining_studies[~remaining_studies['study_id'].isin(val_studies)]['study_id']
    
    train_df = df[df['study_id'].isin(train_studies)]
    val_df = df[df['study_id'].isin(val_studies)]
    test_df = df[df['study_id'].isin(test_studies)]
    
    print("\nWeighted sampling distribution:")
    print("\nTrain distribution:")
    print(train_df['breast_density'].value_counts(normalize=True))
    print("\nValidation distribution:")
    print(val_df['breast_density'].value_counts(normalize=True))
    print("\nTest distribution:")
    print(test_df['breast_density'].value_counts(normalize=True))

    base_path = csv_path.rsplit('.', 1)[0]
    train_df.to_csv(f"{base_path}_weighted_train.csv", index=False)
    val_df.to_csv(f"{base_path}_weighted_val.csv", index=False)
    test_df.to_csv(f"{base_path}_weighted_test.csv", index=False)
    
    return train_df, val_df, test_df

weighted_split_dataset(r'D:\RESEARCH\MV-DEFEAT\datasets\VinDr-mammo\breast-level_annotations.csv')


Weighted sampling distribution:

Train distribution:
breast_density
DENSITY C    0.614500
DENSITY D    0.220000
DENSITY B    0.157167
DENSITY A    0.008333
Name: proportion, dtype: float64

Validation distribution:
breast_density
DENSITY C    0.9815
DENSITY D    0.0135
DENSITY B    0.0050
Name: proportion, dtype: float64

Test distribution:
breast_density
DENSITY C    0.9980
DENSITY D    0.0015
DENSITY B    0.0005
Name: proportion, dtype: float64


(                               study_id                         series_id  \
 16     ac4975eb788af8b7e15cafca9ac9a1c9  bc183447730d58709da1af503d7c469c   
 17     ac4975eb788af8b7e15cafca9ac9a1c9  bc183447730d58709da1af503d7c469c   
 18     ac4975eb788af8b7e15cafca9ac9a1c9  bc183447730d58709da1af503d7c469c   
 19     ac4975eb788af8b7e15cafca9ac9a1c9  bc183447730d58709da1af503d7c469c   
 20     87f322198db11b86e20ad96ea29eb010  3e63436aedec442a7b3bafd0158cfce1   
 ...                                 ...                               ...   
 19991  8db1b8ba11d6d804141f1fa4cf91b614  e26615e35922f3b06ca09a742de0ed8b   
 19996  b3c8969cd2accfa4dbb2aece1f7158ab  69d7f07ea04572dad5e5aa62fbcfc4b7   
 19997  b3c8969cd2accfa4dbb2aece1f7158ab  69d7f07ea04572dad5e5aa62fbcfc4b7   
 19998  b3c8969cd2accfa4dbb2aece1f7158ab  69d7f07ea04572dad5e5aa62fbcfc4b7   
 19999  b3c8969cd2accfa4dbb2aece1f7158ab  69d7f07ea04572dad5e5aa62fbcfc4b7   
 
                                image_id laterality view_posit

In [5]:
import cv2
import numpy as np

def crop_breast_region(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    _, binary = cv2.threshold(gray, 5, 255, cv2.THRESH_BINARY)
    
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if contours:
        largest_contour = max(contours, key=cv2.contourArea)

        x, y, w, h = cv2.boundingRect(largest_contour)

        cropped = img[y:y+h, x:x+w]
        
        return cropped
    
    return None

image_path = r"datasets\patients_images\new_data\VUONG_THI_DUA 13064504\Left - CC.jpg"
cropped_image = crop_breast_region(image_path)

if cropped_image is not None:
    cv2.imwrite('cropped_breast.jpg', cropped_image)