# Generating New CSV Files to Augment our dataset sizes

In [1]:
import pandas as pd
import os

dataset = "Dataset_2017"

## ISIC 2017 dataset class distribtuion:

In [3]:
img_labels = pd.read_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Training_GroundTruth_1.csv")))

#get number of each class
counts = img_labels['melanoma'].value_counts()
num_malig  = counts[1.0]
num_benign = counts[0.0]
print(num_malig)
print(num_benign)
print("Old Ratio:",num_malig/(num_benign))

374
1626
Old Ratio: 0.23001230012300122


## Data Augmentation for Model 2:
Here we apply a transformation to 90% of the malignant images by storing a marker in the csv to indicate it should be transformed

In [None]:
new_img_labels_2 = img_labels.copy(deep=True)
new_img_labels_2["transform"] = 0
i=0
while i < num_malig*0.9:
    # Randomly select a single element
    random_row = img_labels.sample(n=1)  # Randomly select one row
    if(random_row.iloc[0][1] == 1.0):  
        random_row["transform"] = 1
        new_img_labels_2 = pd.concat([new_img_labels_2,random_row],ignore_index=True)
        i+=1
print("New Distribution:")
counts = new_img_labels_2['melanoma'].value_counts()
num_malig  = counts[1.0]
num_benign = counts[0.0]
print(num_malig)
print(num_benign)
print("Ratio:",num_malig/(num_benign))
new_img_labels_2.to_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Training_GroundTruth_2.csv")), index=False)

          image_id  melanoma  seborrheic_keratosis  transform
0     ISIC_0000000       0.0                   0.0          0
1     ISIC_0000001       0.0                   0.0          0
2     ISIC_0000002       1.0                   0.0          0
3     ISIC_0000003       0.0                   0.0          0
4     ISIC_0000004       1.0                   0.0          0
...            ...       ...                   ...        ...
2332  ISIC_0001148       1.0                   0.0          1
2333  ISIC_0013359       1.0                   0.0          1
2334  ISIC_0013975       1.0                   0.0          1
2335  ISIC_0014092       1.0                   0.0          1
2336  ISIC_0000300       1.0                   0.0          1

[2337 rows x 4 columns]
New Distribution:
711
1626
Ratio: 0.43726937269372695


## Data Augmentation 3
Here we reduce the number of benign images until the ratio is 0.463

In [None]:

new_img_labels_3 = new_img_labels_2.copy(deep=True)

counts = new_img_labels_2['melanoma'].value_counts()
ratio = counts[1.0]/counts[0.0]
i=0
while ratio <= 0.463:
    # Randomly select a single element
    random_row = new_img_labels_3.sample(n=1)  # Randomly select one row
    if(random_row.iloc[0][1] == 0.0):  
        new_img_labels_3 = new_img_labels_3.drop(index=random_row.index.tolist()[0])
        counts = new_img_labels_3['melanoma'].value_counts()
        ratio = counts[1.0]/counts[0.0]
        i+=1
print("New Distribution:")
counts = new_img_labels_3['melanoma'].value_counts()
num_malig  = counts[1.0]
num_benign = counts[0.0]
print(num_malig)
print(num_benign)
print("Ratio:",num_malig/(num_benign))
new_img_labels_3.to_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Training_GroundTruth_3.csv")), index=False)

          image_id  melanoma  seborrheic_keratosis  transform
0     ISIC_0000000       0.0                   0.0          0
1     ISIC_0000001       0.0                   0.0          0
2     ISIC_0000002       1.0                   0.0          0
3     ISIC_0000003       0.0                   0.0          0
4     ISIC_0000004       1.0                   0.0          0
...            ...       ...                   ...        ...
2332  ISIC_0001148       1.0                   0.0          1
2333  ISIC_0013359       1.0                   0.0          1
2334  ISIC_0013975       1.0                   0.0          1
2335  ISIC_0014092       1.0                   0.0          1
2336  ISIC_0000300       1.0                   0.0          1

[2246 rows x 4 columns]
New Distribution:
711
1535
Ratio: 0.4631921824104234


### Data Augmentation 4
Here we transform the malignant images and reduce the benign images until we have a ratio of 1.0

In [5]:
new_img_labels_4 = img_labels.copy(deep=True)
new_img_labels_4["transform"] = 0
i=0
while i < num_malig*1.0:
    # Randomly select a single element
    random_row = img_labels.sample(n=1)  # Randomly select one row
    if(random_row.iloc[0][1] == 1.0):  
        random_row["transform"] = 1
        new_img_labels_4 = pd.concat([new_img_labels_4,random_row],ignore_index=True)
        i+=1
counts = new_img_labels_4['melanoma'].value_counts()
ratio = counts[1.0]/counts[0.0]
i=0
while ratio <= 1.0:
    # Randomly select a single element
    random_row = new_img_labels_4.sample(n=1)  # Randomly select one row
    if(random_row.iloc[0][1] == 0.0):  
        new_img_labels_4 = new_img_labels_4.drop(index=random_row.index.tolist()[0])
        counts = new_img_labels_4['melanoma'].value_counts()
        ratio = counts[1.0]/counts[0.0]
        i+=1


print("New Distribution:")
counts = new_img_labels_4['melanoma'].value_counts()
num_malig  = counts[1.0]
num_benign = counts[0.0]
print(num_malig)
print(num_benign)
print("Ratio:",num_malig/(num_benign))
new_img_labels_4.to_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Training_GroundTruth_4.csv")), index=False)

New Distribution:
1085
1084
Ratio: 1.0009225092250922


# Data Manipluation 5
Pulling malignant images from the [PH2 Dataset](https://github.com/vikaschouhan/PH2-dataset):

In [5]:
new_img_labels_5 = img_labels.copy(deep=True)
dataset = "Dataset_2017_ph2"
ph2 = pd.read_csv(os.path.join(dataset,os.path.join("Ground_Truths", "PH2_simple_dataset.csv")))
#bring labels in line with out dataset
ph2 = ph2.rename(columns={'image_name': 'image_id', 'diagnosis': 'melanoma'})

for index,row, in ph2.iterrows():
    #only copy melanoma samples
    if(row.iloc[1] == "Melanoma"):  
        #rename melanoma to true
        row.iloc[1] = 1.0
        row.iloc[1] = 1.0
        new_img_labels_5 = pd.concat([new_img_labels_5,row.to_frame().T],ignore_index=True)
new_img_labels_5["seborrheic_keratosis"] = 0.0
new_img_labels_5["transform"] = 0
print("New Distribution:")
counts = new_img_labels_5['melanoma'].value_counts()
num_malig  = counts[1.0]
num_benign = counts[0.0]
print(num_malig)
print(num_benign)
print("Ratio:",num_malig/(num_benign))
new_img_labels_5.to_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Training_GroundTruth_5.csv")), index=False)

New Distribution:
414
1626
Ratio: 0.25461254612546125


# Data Manipulation 7
Here I am just cleaning up the 2018 dataset csv's. Can only be run once

In [None]:
#can only be ran once
dataset = "Dataset_2018"
train = pd.read_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Training_GroundTruth_1.csv")))
new_train = train.drop(['NV','BCC','AKIEC','BKL','DF','VASC'], axis=1)
new_train.to_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Training_GroundTruth_5.csv")), index=False)
test = pd.read_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Test_GroundTruth_1.csv")))
new_test = test.drop(['NV','BCC','AKIEC','BKL','DF','VASC'], axis=1)
new_test = new_test.rename(columns={'image': 'image_id', 'MEL': 'melanoma'})
new_test.to_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Test_GroundTruth_1.csv")), index=False)

val = pd.read_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Validation_GroundTruth_1.csv")))
new_val = val.drop(['NV','BCC','AKIEC','BKL','DF','VASC'], axis=1)
new_val = new_val.rename(columns={'image': 'image_id', 'MEL': 'melanoma'})
new_val.to_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Validation_GroundTruth_1.csv")), index=False)


### Data distribution of the 2018 Dataset

In [2]:
dataset = "Dataset_2018"
train = pd.read_csv(os.path.join(dataset,os.path.join("Ground_Truths", "Training_GroundTruth_1.csv")))
print("New Distribution:")
counts = train['melanoma'].value_counts()
num_malig  = counts[1.0]
num_benign = counts[0.0]
print(num_malig)
print(num_benign)
print("Ratio:",num_malig/(num_benign))

New Distribution:
1113
8902
Ratio: 0.12502808357672432
