In [1]:
# Widens the notebook container to 90% of the screen so that wide tables and plots are easier to read.
# `display` and `HTML` are imported from the public `IPython.display` module; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
import numpy as np
import pandas as pd

  from IPython.core.display import display, HTML


# Path to Dataset and the Mapping from Labels to Findings

In [2]:
# Relative path to the dataset root
data_dir = os.path.join( "..", "..", "..", "..", "Datasets", "COVID19", "Tomografia", "COVIDx CT-3A" )

# Fails fast when the dataset folder is missing; the message names the same dataset version as the path above
assert os.path.exists( data_dir ), "Unable to find the relative path to COVIDx CT-3A, please check data_dir..."

# Dict associating class labels to classes
class_dict = { 0: "Normal", 1: "Pneumonia", 2: "COVID-19" }

# Extract data from TXTs

In [3]:
def extract_from_csvs(import_dir, class_mapping, save_df = False):
    """Builds a dataframe with one row per sample listed in the partition text files.

    Despite its name, the function reads the three "<partition>_COVIDx_CT-3A.txt" files
    (train, val and test) found in import_dir. Every non-blank line must hold six
    whitespace-separated fields: filename, class index, x_min, y_min, x_max and y_max.
    Blank lines are skipped.

    Args:
        import_dir: Directory that contains the partition text files.
        class_mapping: Mapping from the integer class index to the class name.
        save_df: When True, the dataframe is also written to
            "<import_dir>/temp_data/1_metadata_from_txts.csv" using ";" as separator.

    Returns:
        A pandas DataFrame with the columns Filename, Partition, Class, x_min, y_min,
        x_max and y_max.

    Raises:
        AssertionError: If one of the partition text files does not exist.
        ValueError: If a non-blank line does not hold exactly six fields, or if a class
            index or coordinate is not an integer.
    """

    # Dict used to store the extracted metadata
    metadata_dict = { "Filename":  [],      # Filename of the images of the dataset
                      "Partition": [],      # Original partition of the sample
                      "Class":     [],      # Class of the sample
                      "x_min":     [],      # Smallest x value to crop the sample
                      "y_min":     [],      # Smallest y value to crop the sample
                      "x_max":     [],      # Largest  x value to crop the sample
                      "y_max":     [] }     # Largest  y value to crop the sample

    # Iterates between train, validation and test partitions
    for partition in ["train", "val", "test"]:

        # Path to this partition's text file
        txt_path = os.path.join( import_dir, "{}_COVIDx_CT-3A.txt".format(partition) )
        assert os.path.exists( txt_path ), "Unable to find '{}', please check the files in '{}'...".format(txt_path, 
                                                                                                           import_dir)

        # Reads every line up front so the file is already closed while the lines are parsed
        with open(txt_path) as file:
            lines = file.readlines()

        # Iterates through the file's lines and extracts metadata regarding the images of the dataset
        for idx, line in enumerate(lines):

            # split() without arguments tolerates repeated spaces, tabs and the trailing newline
            fields = line.split()

            # Blank lines (e.g. an empty line at the end of the file) do not describe a sample
            if fields:
                filename, class_idx, xmin, ymin, xmax, ymax = fields

                # Adds the metadata to the corresponding list inside metadata_dict
                metadata_dict["Filename"].append( filename )
                metadata_dict["Partition"].append( partition )
                metadata_dict["Class"].append( class_mapping[int(class_idx)] )
                metadata_dict["x_min"].append( int(xmin) )
                metadata_dict["y_min"].append( int(ymin) )
                metadata_dict["x_max"].append( int(xmax) )
                metadata_dict["y_max"].append( int(ymax) )

            # Prints the progress ("\r" keeps the counter on a single output line)
            if (idx % 1000 == 0) or (idx+1 == len(lines)):
                print("\tLine {}/{}".format(str(idx+1).zfill(6), len(lines)), end = "\r")

    # Generates a dataframe from the metadata dictionary
    df = pd.DataFrame.from_dict( metadata_dict )

    # Saves the dataframe as a csv if requested
    if save_df:
        # Generates a path to the new csv file
        path_to_csv = os.path.join( import_dir, "temp_data", "1_metadata_from_txts.csv")

        # Creates the target directory; exist_ok makes a separate existence check unnecessary
        os.makedirs( os.path.dirname(path_to_csv), exist_ok = True )

        # Saves the dataframe
        df.to_csv( path_to_csv, index = False, sep = ";" )

    return df

# Parses the three partition files into a single dataframe (nothing is written to disk here)
txt_data_df = extract_from_csvs(import_dir = data_dir, class_mapping = class_dict, save_df = False)

# The trailing spaces overwrite what is left of the "\r" progress line
print("{} lines in total...".format(len(txt_data_df)) + " " * 30)
txt_data_df.head()

425024 lines in total...                              


Unnamed: 0,Filename,Partition,Class,x_min,y_min,x_max,y_max
0,NCP_96_1328_0032.png,train,COVID-19,9,94,512,405
1,NCP_96_1328_0035.png,train,COVID-19,10,106,512,405
2,NCP_96_1328_0036.png,train,COVID-19,10,105,512,406
3,NCP_96_1328_0037.png,train,COVID-19,11,104,512,406
4,NCP_96_1328_0038.png,train,COVID-19,11,103,512,406


# Combine TXT & CSV Data

In [4]:
def get_csv_df(import_dir):
    """Loads "<import_dir>/metadata.csv" and prepares it for look-ups by patient id.

    Rows whose "source" column equals "MosMedData" are dropped (the original author notes
    that those samples are not part of the dataset because they are not verified). The
    remaining rows are indexed by "patient id"; the column itself is kept as well.

    Args:
        import_dir: Directory that contains metadata.csv.

    Returns:
        The filtered pandas DataFrame, indexed by patient id.
    """
    # Reads the comma-separated metadata file
    raw_df = pd.read_csv( os.path.join( import_dir, "metadata.csv" ), sep = "," )

    # Keeps every row that does not come from MosMedData
    keep_mask = raw_df["source"] != "MosMedData"
    filtered_df = raw_df[ keep_mask ].reset_index(drop=True)

    # drop = False keeps "patient id" available as a regular column next to the index
    return filtered_df.set_index( keys = "patient id", drop = False )

def match_fname2pid( filename, ids ):
    """Returns the patient id from ids that the given slice filename belongs to.

    A patient id matches when the filename contains "<patient_id>_" or "<patient_id>-".
    Matches at the very beginning of the filename take precedence, because a plain
    substring test is ambiguous: an id such as "CP_96" is also contained in
    "NCP_96_1328_0032.png". When several ids match as a prefix, the longest one wins.
    If no id matches as a prefix, the first id contained anywhere in the filename is
    returned, which preserves the previous behaviour for such filenames.

    Note that every id is inspected for each call, since the longest prefix can only be
    known after the whole list has been seen.

    Args:
        filename: Filename of a slice.
        ids: Iterable of patient ids (strings).

    Returns:
        The matching patient id.

    Raises:
        AssertionError: If no patient id matches the filename.
    """
    best_prefix = None       # Longest id found so far that the filename starts with
    first_contained = None   # First id found somewhere else inside the filename

    # Iterates through the list of patient ids
    for patient_id in ids:

        # Most filenames use "<patient_id>_", however some use "<patient_id>-"
        tokens = ( patient_id + "_", patient_id + "-" )

        if filename.startswith( tokens ):
            if (best_prefix is None) or (len(patient_id) > len(best_prefix)):
                best_prefix = patient_id

        elif (first_contained is None) and any( token in filename for token in tokens ):
            first_contained = patient_id

    if best_prefix is not None:
        return best_prefix

    if first_contained is not None:
        return first_contained

    # As all filenames should match some patient_id, this point should never be reached.
    # Raised explicitly so that the check is not stripped when Python runs with -O
    raise AssertionError( "Couldn't match file '{}' to any patient_id...".format(filename) )
    
def combine_dfs(import_dir, txt_df, save_df = False):
    """Adds the patient-level columns of metadata.csv to every row of txt_df.

    Each filename in txt_df["Filename"] is matched to a patient id with match_fname2pid,
    and the metadata row of that patient (from get_csv_df) supplies the new columns.
    The dataframe passed by the caller is not modified; the combined data is returned as
    a new dataframe.

    Args:
        import_dir: Directory that contains metadata.csv.
        txt_df: Dataframe with a "Filename" column, one row per sample.
        save_df: When True, the combined dataframe is also written to
            "<import_dir>/temp_data/2_combined_metadata.csv" using ";" as separator.

    Returns:
        A copy of txt_df extended with the columns "patient id", "source", "country",
        "sex", "age", "slice selection", "verified finding", "view" and "modality".

    Raises:
        AssertionError: If a filename cannot be matched to any patient id.
    """

    # Reads metadata csv as a dataframe
    csv_df = get_csv_df(import_dir)

    # Lists all filenames and all patient ids
    filenames   = txt_df["Filename"].to_list()
    patient_ids = csv_df["patient id"].to_list()

    # Creates a dict to list values for all columns that will be added to the result
    new_columns_dict = { "patient id": [], "source": [], "country": [], "sex": [], "age": [], 
                         "slice selection": [], "verified finding": [], "view": [], "modality": []}

    # Iterates through filenames to match all filenames to corresponding patient_ids
    for idx, fname in enumerate(filenames):

        # Gets a patient_id from metadata.csv that matches the current filename in txt_df
        p_id = match_fname2pid( fname, patient_ids )

        # Uses the current sample's patient id to extract its metadata from metadata.csv
        csv_row = csv_df.loc[p_id]

        # Appends each element of csv_row to its corresponding list in new_columns_dict
        for key, values in new_columns_dict.items():
            values.append(csv_row[key])

        # Prints the progress ("\r" keeps the counter on a single output line)
        if (idx % 2500 == 0) or (idx+1 == len(filenames)):
            print("Matching Filenames and Patient_Ids: {}/{}".format(idx+1, len(filenames)), end="\r")

    # Works on a copy so that the dataframe owned by the caller keeps its original columns
    combined_df = txt_df.copy()

    # Once the lists in new_columns_dict are completely filled, adds each list as a new column
    for key, values in new_columns_dict.items():
        combined_df[key] = values

    # Saves the dataframe as a csv if requested
    if save_df:
        # Generates a path to the new csv file
        path_to_csv = os.path.join( import_dir, "temp_data", "2_combined_metadata.csv")

        # Creates the target directory; exist_ok makes a separate existence check unnecessary
        os.makedirs( os.path.dirname(path_to_csv), exist_ok = True )

        # Saves the dataframe
        combined_df.to_csv( path_to_csv, index = False, sep = ";" )

    return combined_df

# Attaches the patient-level metadata to every slice (the result is not written to disk here)
combined_data_df = combine_dfs(import_dir = data_dir, txt_df = txt_data_df, save_df = False)
combined_data_df.head()

Matching Filenames and Patient_Ids: 425024/425024

Unnamed: 0,Filename,Partition,Class,x_min,y_min,x_max,y_max,patient id,source,country,sex,age,slice selection,verified finding,view,modality
0,NCP_96_1328_0032.png,train,COVID-19,9,94,512,405,NCP_96,CNCB,China,M,74.0,Expert,Yes,Axial,CT
1,NCP_96_1328_0035.png,train,COVID-19,10,106,512,405,NCP_96,CNCB,China,M,74.0,Expert,Yes,Axial,CT
2,NCP_96_1328_0036.png,train,COVID-19,10,105,512,406,NCP_96,CNCB,China,M,74.0,Expert,Yes,Axial,CT
3,NCP_96_1328_0037.png,train,COVID-19,11,104,512,406,NCP_96,CNCB,China,M,74.0,Expert,Yes,Axial,CT
4,NCP_96_1328_0038.png,train,COVID-19,11,103,512,406,NCP_96,CNCB,China,M,74.0,Expert,Yes,Axial,CT


# Adjust Column Names & Order

In [5]:
def adjust_columns( import_dir, df, new_col_order ):
    """Reorders and renames the columns of df and stores the result as a csv file.

    The columns are arranged as listed in new_col_order, their names are converted to
    lower-case with " " replaced by "_", and empty cells are filled with "N/A". The result
    is always written to "<import_dir>/combined_metadata.csv" using ";" as separator.

    Args:
        import_dir: Directory in which combined_metadata.csv is written.
        df: Dataframe to adjust.
        new_col_order: Column names, in the desired order.

    Returns:
        The adjusted dataframe.
    """
    # reindex returns a new frame holding the requested columns in the requested order
    reordered_df = df.reindex( columns = new_col_order )

    # Maps every requested column to its lower-case name with "_" instead of " "
    renamed_columns = { name: name.lower().replace(" ", "_") for name in new_col_order }

    # Applies the new names and replaces empty cells with "N/A"
    adjusted_df = reordered_df.rename( columns = renamed_columns ).fillna( "N/A" )

    # Saves the dataframe as a csv
    adjusted_df.to_csv( os.path.join( import_dir, "combined_metadata.csv"), index = False, sep = ";" )

    return adjusted_df

# Column order of the final csv; adjust_columns lower-cases these names and swaps " " for "_"
new_column_order = [
    "Filename", "patient id", "source", "Class", "country", "sex", "age",
    "Partition", "slice selection", "x_min", "y_min", "x_max", "y_max",
    "verified finding", "view", "modality",
]

# Reorders/renames the columns and writes combined_metadata.csv into data_dir
adjusted_data_df = adjust_columns( import_dir = data_dir, df = combined_data_df, new_col_order = new_column_order )
adjusted_data_df.head()

Unnamed: 0,filename,patient_id,source,class,country,sex,age,partition,slice_selection,x_min,y_min,x_max,y_max,verified_finding,view,modality
0,NCP_96_1328_0032.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,9,94,512,405,Yes,Axial,CT
1,NCP_96_1328_0035.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,10,106,512,405,Yes,Axial,CT
2,NCP_96_1328_0036.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,10,105,512,406,Yes,Axial,CT
3,NCP_96_1328_0037.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,11,104,512,406,Yes,Axial,CT
4,NCP_96_1328_0038.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,11,103,512,406,Yes,Axial,CT


# Move Obsolete Files

In [6]:
# Directory to store obsolete files
obsolete_dir = os.path.join( data_dir, "obsolete_metadata")

# Creates obsolete dir if needed; exist_ok makes a separate existence check unnecessary
os.makedirs(obsolete_dir, exist_ok = True)

# Moves files to the obsolete dir
for fname in ["metadata.csv", "train_COVIDx_CT-3A.txt", "val_COVIDx_CT-3A.txt", "test_COVIDx_CT-3A.txt"]:
    
    # Old path to file
    i_path = os.path.join( data_dir, fname )
    
    # New path to file
    e_path = os.path.join( obsolete_dir, fname )
    
    # Skips files that a previous run of this cell already moved, so the cell can be re-run safely
    if (not os.path.exists(i_path)) and os.path.exists(e_path):
        continue
    
    # Moves the file; os.replace overwrites an existing destination on every platform, whereas
    # os.rename raises on Windows when the destination already exists
    os.replace(i_path, e_path)