In [1]:
# Widens the notebook container to 90% of the window, giving plots more horizontal room.
# `display` and `HTML` come from the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
import json
import glob
import numpy as np
import pandas as pd

  from IPython.core.display import display, HTML


# JSON Generator 

In [6]:
def get_unique_classes(df, output_col):
    """Return the known class labels present in ``df[output_col]`` with their indices.

    Args:
        df: DataFrame holding the dataset metadata.
        output_col: Name of the column that stores the class label of each row.

    Returns:
        dict mapping label -> index, restricted to the labels
        "Normal", "COVID-19" and "Pneumonia" that occur in the column.
        Indices are fixed (0, 1, 2 respectively) and are not renumbered
        when a label is absent.
    """
    # Fixed label -> index table; its insertion order is also the output order
    label_to_idx = {"Normal": 0, "COVID-19": 1, "Pneumonia": 2}

    # Distinct values found in the output column
    present_labels = np.unique( df[output_col].to_list() )

    # Keeps only the known labels that actually appear in the data
    classes = {}
    for label, idx in label_to_idx.items():
        if label in present_labels:
            classes[label] = idx

    return classes

def get_num_samples(df, output_col):
    """Count samples per class for the whole dataset and for each partition.

    Args:
        df: DataFrame holding the dataset metadata. It must contain a
            "partition" column, which is compared against "train", "val"
            and "test".
        output_col: Name of the column that stores the class label of each row.

    Returns:
        dict keyed by "total", "train", "val" and "test". Each value is a dict
        with the keys "Total", "Normal", "COVID-19" and "Pneumonia" holding
        plain ``int`` counts. "Total" is the number of rows in the partition,
        so it also includes rows whose label is not one of the three known
        classes; such rows get no per-class entry.

    Raises:
        KeyError: if ``output_col`` or the "partition" column is missing.
    """
    partition_list = ["total", "train", "val", "test"]

    # Default order of class labels (single source for the per-class keys)
    default_classes = ["Normal", "COVID-19", "Pneumonia"]

    # Output dict with number of samples per class for each partition
    num_samples_dict = {}

    for part in partition_list:
        # Boolean filtering already yields a new frame and nothing below mutates
        # the data, so no defensive copy of the DataFrame is needed.
        if part == "total":
            sub_df = df
        else:
            sub_df = df[ df["partition"] == part ]

        # Number of rows per label in this partition (missing values are skipped)
        samples_per_class = sub_df[output_col].value_counts().to_dict()

        # Number of examples per class for the current partition; labels that
        # do not occur default to 0. int() keeps the values JSON-serializable.
        samples_dict = { "Total": len(sub_df) }
        for clss in default_classes:
            samples_dict[clss] = int(samples_per_class.get(clss, 0))

        num_samples_dict[part] = samples_dict

    return num_samples_dict

def get_infos_from_csv(csv_path, input_col, output_col):
    """Build the summary dict of a dataset from its metadata CSV file.

    Args:
        csv_path: Path to a ";"-separated CSV file.
        input_col: Column name recorded under "input_col" in the result
            (it is stored as given, not read from the file).
        output_col: Name of the column that stores the class labels.

    Returns:
        dict with the keys "name", "input_col", "output_col", "num_classes",
        "classes" and "num_samples". "name" is the file's basename cut at the
        first "_data" and then at the first ".".
    """
    # Dataset name: basename up to the first "_data", then up to the first "."
    file_name = os.path.basename(csv_path)
    before_suffix = file_name.split("_data")[0]
    dataset_name = before_suffix.split(".")[0]

    # Reads the metadata table
    metadata = pd.read_csv( csv_path, sep = ";" )

    # Class labels present in the file, with their indices
    class_to_idx = get_unique_classes(metadata, output_col)

    # Per-partition, per-class sample counts
    samples_per_partition = get_num_samples(metadata, output_col)

    return {
        "name": dataset_name,
        "input_col": input_col,
        "output_col": output_col,
        "num_classes": len(class_to_idx),
        "classes": class_to_idx,
        "num_samples": samples_per_partition,
    }

In [7]:
# Directory with the per-dataset metadata CSVs (relative to the current working directory)
csv_dir = os.path.join( "..", "..", "..", "..", "Datasets", "COVID19", "CT", "classification" )
csv_path_list = glob.glob(os.path.join(csv_dir, "*.csv"))

for path in csv_path_list:
    dset_dict = get_infos_from_csv(path, input_col = "path", output_col = "class")
    
    # Prints the summary of the current dataset
    for k, v in dset_dict.items():
        print(f"{k}: {v}")
        
    # Swaps only the file extension. str.replace would rewrite every ".csv"
    # occurrence in the path, and would leave the path unchanged (so the JSON
    # would overwrite the CSV) for an upper-case ".CSV" match.
    json_path = os.path.splitext(path)[0] + ".json"

    # Saves the JSON file next to its CSV
    with open(json_path, "w") as json_file:
        json.dump( dset_dict, json_file, indent=4 )
    
    print("\n\n")

name: CNCB
input_col: path
output_col: class
num_classes: 3
classes: {'Normal': 0, 'COVID-19': 1, 'Pneumonia': 2}
num_samples: {'total': {'Total': 115837, 'Normal': 45758, 'COVID-19': 31070, 'Pneumonia': 39009}, 'train': {'Total': 70142, 'Normal': 27485, 'COVID-19': 19198, 'Pneumonia': 23459}, 'val': {'Total': 22833, 'Normal': 9139, 'COVID-19': 5919, 'Pneumonia': 7775}, 'test': {'Total': 22862, 'Normal': 9134, 'COVID-19': 5953, 'Pneumonia': 7775}}





  dset_dict = get_infos_from_csv(path, input_col = "path", output_col = "class")


name: Comp_CNCB_iCTCF
input_col: path
output_col: class
num_classes: 3
classes: {'Normal': 0, 'COVID-19': 1, 'Pneumonia': 2}
num_samples: {'total': {'Total': 88198, 'Normal': 22849, 'COVID-19': 45912, 'Pneumonia': 19437}, 'train': {'Total': 53520, 'Normal': 13741, 'COVID-19': 28116, 'Pneumonia': 11663}, 'val': {'Total': 17298, 'Normal': 4564, 'COVID-19': 8847, 'Pneumonia': 3887}, 'test': {'Total': 17380, 'Normal': 4544, 'COVID-19': 8949, 'Pneumonia': 3887}}



name: Comp_LIDC-SB
input_col: path
output_col: class
num_classes: 2
classes: {'Normal': 0, 'COVID-19': 1}
num_samples: {'total': {'Total': 18460, 'Normal': 3999, 'COVID-19': 14461, 'Pneumonia': 0}, 'train': {'Total': 11071, 'Normal': 2380, 'COVID-19': 8691, 'Pneumonia': 0}, 'val': {'Total': 3658, 'Normal': 778, 'COVID-19': 2880, 'Pneumonia': 0}, 'test': {'Total': 3731, 'Normal': 841, 'COVID-19': 2890, 'Pneumonia': 0}}



name: COVID-19-CT-Seg
input_col: path
output_col: class
num_classes: 1
classes: {'COVID-19': 1}
num_samples: {