## Datat Augmentation File

In [65]:
import cv2
import os
import albumentations as A
import numpy as np
import pandas as pd

In [66]:
train_file = pd.read_csv("train-metadata.csv",low_memory=False)
train_file.head()


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [67]:
pos_cancer_im = list(train_file.loc[train_file['target'] == 1]['isic_id'])
len(pos_cancer_im)

393

In [68]:
transform = A.Compose([
    A.HorizontalFlip(p=0.5),                      # Horizontal flipping
    A.RandomBrightnessContrast(p=0.2),            # Random brightness/contrast adjustment
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5),  # Shifting, scaling, rotating
    A.Blur(blur_limit=3, p=0.1),                  # Blurring
    A.CLAHE(p=0.2),                               # CLAHE (Contrast Limited Adaptive Histogram Equalization)
    A.RandomGamma(gamma_limit=(80, 120), p=0.1),  # Random gamma adjustment
    A.GaussNoise(var_limit=(10.0, 50.0), p=0.2),  # Adding Gaussian noise
])


In [69]:
input_dir = 'train-image\image'
output_dir = 'augment_pos'

In [70]:
pos_cancer_im[:5]

['ISIC_0082829',
 'ISIC_0096034',
 'ISIC_0104229',
 'ISIC_0119495',
 'ISIC_0157834']

In [71]:
img_path = os.path.join(input_dir, f'ISIC_0082829.jpg')
img_path

'train-image\\image\\ISIC_0082829.jpg'

In [72]:
n_augmentations = 20  # Number of augmented images to generate per original image
au_im_paths = []
au_pos_label = []
for img_name in pos_cancer_im:
    img_path = os.path.join(input_dir, f'{img_name}.jpg')
    image = cv2.imread(img_path)
    
    if image is None:
        continue
    
    for i in range(n_augmentations):
        # Apply augmentation
        augmented = transform(image=image)
        augmented_image = augmented['image']
        
        # Save the augmented image
        output_path = os.path.join(output_dir, f'aug_{i}_{img_name}.jpg')
        cv2.imwrite(output_path, augmented_image)
        # also saving the image path and label in an dataframe
        au_im_paths.append(f'aug_{i}_{img_name}')
        au_pos_label.append(1)



In [81]:
aug_data = pd.DataFrame({'isic_id':au_im_paths,'target':au_pos_label})


In [82]:
aug_data

Unnamed: 0,isic_id,target
0,aug_0_ISIC_0082829,1
1,aug_1_ISIC_0082829,1
2,aug_2_ISIC_0082829,1
3,aug_3_ISIC_0082829,1
4,aug_4_ISIC_0082829,1
...,...,...
7855,aug_15_ISIC_9996602,1
7856,aug_16_ISIC_9996602,1
7857,aug_17_ISIC_9996602,1
7858,aug_18_ISIC_9996602,1


now getting 7860 images for negative


In [74]:
import random
import shutil

In [75]:
negative_img_paths = random.sample(list(train_file.loc[train_file['target'] == 0]['isic_id']),7860) 

In [76]:
len(negative_img_paths)

7860

In [77]:
negative_img_label = [0] * len(negative_img_paths)
len(negative_img_label)

7860

In [78]:
for file_name in negative_img_paths:
    # Construct full file path
    source_path = os.path.join(input_dir, f'{file_name}.jpg')
    destination_path = os.path.join(output_dir, f'{file_name}.jpg')
    
    # Copy the file
    shutil.copy(source_path, destination_path)

print("Files copied successfully!")

Files copied successfully!


In [79]:
len(os.listdir('augment_pos'))

15720

In [80]:
pd2 = pd.DataFrame({'isic_id':negative_img_paths,'target':negative_img_label})
pd2

Unnamed: 0,isic_id,target
0,ISIC_7931446,0
1,ISIC_1968391,0
2,ISIC_0251481,0
3,ISIC_9678329,0
4,ISIC_5942580,0
...,...,...
7855,ISIC_3768468,0
7856,ISIC_3926720,0
7857,ISIC_6621891,0
7858,ISIC_5897469,0


In [83]:
cp = pd.concat((pd2,aug_data))
cp

Unnamed: 0,isic_id,target
0,ISIC_7931446,0
1,ISIC_1968391,0
2,ISIC_0251481,0
3,ISIC_9678329,0
4,ISIC_5942580,0
...,...,...
7855,aug_15_ISIC_9996602,1
7856,aug_16_ISIC_9996602,1
7857,aug_17_ISIC_9996602,1
7858,aug_18_ISIC_9996602,1


In [84]:
cp['target'].value_counts()

target
0    7860
1    7860
Name: count, dtype: int64

In [85]:
cp.to_csv("Aug_train.csv",index=False)