# Libraries

In [1]:
import os, os.path
import numpy as np
from pathlib import Path

import pandas as pd

In [2]:
# Project layout, derived from the directory the notebook was started in:
# the notebooks folder is its parent and the repository root is one level
# above the notebooks folder.
notebooks_path = Path.cwd().parent
repo_path = notebooks_path.parent

# Switch to the notebooks folder and show where we ended up.
os.chdir(notebooks_path)
print(os.getcwd())

/home/ricardino/Documents/MAIA/tercer_semestre/CAD/Projecte/Machine_Learning/notebooks


# Functions

## Retrieve paths

In [12]:
def get_test_paths(classif):
    """Collect the test image paths and their id numbers for one task.

    The folder ``<repo_path>/images/<classif>/test`` is searched recursively
    in a single pass, so both returned lists always describe the same files
    in the same order.

    Args:
        classif (str): classification task folder, ``binary`` or ``3_classes``.

    Returns:
        tuple[list[Path], list[str]]: the file paths relative to ``repo_path``
        and, for each of them, the file name with ``.jpg`` and every ``x``
        removed (e.g. ``xx0001.jpg`` -> ``0001``).
    """
    test_dir = Path(repo_path) / f'images/{classif}/test'
    # One traversal feeds both lists: a second walk is wasted I/O and could
    # disagree with the first one if the folder changes in between.
    found = [entry for entry in test_dir.glob('**/*') if entry.is_file()]

    files = [entry.relative_to(repo_path) for entry in found]
    # Removing every 'x' already covers the 'xx' / 'xxx' prefixes.
    num = [entry.name.replace('.jpg', '').replace('x', '') for entry in found]

    return files, num
    

In [18]:
def get_paths(classif, set_name, class_name):
    """List the files of one class folder together with their class prefix.

    The folder ``<repo_path>/images/<classif>/<set_name>/<class_name>`` is
    searched recursively. The search runs on the absolute folder and the
    results are made relative to ``repo_path`` afterwards, so the outcome does
    not depend on the current working directory.

    Args:
        classif (str): binary or 3_classes
        set_name (str): train, val or test
        class_name (str): lesion folder name (nevus, etc.)

    Returns:
        tuple[list[Path], list[str]]: the file paths relative to ``repo_path``
        and, for each of them, the first three characters of the file name.
    """
    class_dir = Path(repo_path) / f'images/{classif}/{set_name}/{class_name}'
    # Single traversal so paths and class prefixes are guaranteed to line up.
    found = [entry for entry in class_dir.glob('**/*') if entry.is_file()]

    files = [entry.relative_to(repo_path) for entry in found]
    classes = [entry.name[:3] for entry in found]

    return files, classes


def info_dataframe(classif, set_name, class_name):
    """Build the metadata table for the files of one class folder.

    Args:
        classif (str): classification task, ``binary`` or ``3_classes``.
        set_name (str): data split, stored in the ``set`` column.
        class_name (str): class folder handed to ``get_paths``.

    Returns:
        pd.DataFrame: one row per file, ordered by path, with the columns
        ``path``, ``class``, ``classif`` and ``set``; a ``label`` column is
        added for the ``binary`` and ``3_classes`` tasks.
    """
    file_paths, prefixes = get_paths(classif, set_name, class_name)

    # Rows in alphabetical path order, renumbered from zero.
    df = (
        pd.DataFrame({'path': file_paths, 'class': prefixes})
        .sort_values('path')
        .reset_index(drop=True)
    )
    df['classif'] = f'{classif}'
    df['set'] = f'{set_name}'

    # Numeric label derived from the three-letter class prefix.
    if classif == 'binary':
        # 'nev' is class 1, everything else is class 0.
        is_nevus = df['class'] == 'nev'
        df.loc[is_nevus, 'label'] = 1
        df.loc[~is_nevus, 'label'] = 0
    if classif == '3_classes':
        for prefix, code in (('bcc', 0), ('mel', 1), ('scc', 2)):
            df.loc[df['class'] == prefix, 'label'] = code

    return df

In [15]:
def info_dataframe_test(classif):
    """Build the metadata table for the unlabeled test images of one task.

    Args:
        classif (str): classification task, ``binary`` or ``3_classes``.

    Returns:
        pd.DataFrame: one row per test file, ordered by path, with the
        columns ``path``, ``num``, ``classif``, ``set`` (always ``test``) and
        ``label`` (always ``unknown``).
    """
    # The test folder only needs to be listed once.
    paths_list, num = get_test_paths(classif)

    # Rows in alphabetical path order, renumbered from zero.
    df = (
        pd.DataFrame({'path': paths_list, 'num': num})
        .sort_values('path')
        .reset_index(drop=True)
    )
    df['classif'] = f'{classif}'
    df['set'] = 'test'
    df['label'] = 'unknown'  # test images have no ground truth

    return df

In [16]:
# Gather the test metadata of both tasks and stack it into a single table.
test_frames = []
for classif in ('binary', '3_classes'):
    currentInfo = info_dataframe_test(classif)
    test_frames.append(currentInfo)
allInfo_df = pd.concat(test_frames, ignore_index=True)

# Running row number used as identifier, then save as a tab-separated file.
allInfo_df['ID'] = allInfo_df.index
allInfo_df.to_csv(str(repo_path) + '/data/meta_test.csv', sep='\t', index=False)
allInfo_df


Unnamed: 0,path,num,classif,set,label,ID
0,images/binary/test/xx0001.jpg,0001,binary,test,unknown,0
1,images/binary/test/xx0002.jpg,0002,binary,test,unknown,1
2,images/binary/test/xx0003.jpg,0003,binary,test,unknown,2
3,images/binary/test/xx0004.jpg,0004,binary,test,unknown,3
4,images/binary/test/xx0005.jpg,0005,binary,test,unknown,4
...,...,...,...,...,...,...
1236,images/3_classes/test/xxx0222.jpg,0222,3_classes,test,unknown,1236
1237,images/3_classes/test/xxx0223.jpg,0223,3_classes,test,unknown,1237
1238,images/3_classes/test/xxx0224.jpg,0224,3_classes,test,unknown,1238
1239,images/3_classes/test/xxx0225.jpg,0225,3_classes,test,unknown,1239


# 1. Getting Metadata

In [21]:
# Tasks and splits to index
classif_list = ['binary','3_classes']
set_list = ['train','val']

# One metadata frame per (task, split, class folder), stacked at the end.
collected_frames = []
for classif in classif_list:
    for set_name in set_list:
        # Class folders depend on the task.
        class_list = ['nevus', 'others'] if classif == 'binary' else ['bcc', 'mel', 'scc']
        for class_name in class_list:
            currentInfo = info_dataframe(classif, set_name, class_name)
            collected_frames.append(currentInfo)
allInfo_df = pd.concat(collected_frames, ignore_index=True)

# Running row number used as identifier.
allInfo_df['ID'] = allInfo_df.index
# allInfo_df.to_csv('data/meta_info.csv', sep='\t', index=False)

# 2. Add NH paths

In [13]:
# #Add NH paths to meta csv file as a new column
# meta = pd.read_csv(str(repo_path) + '/data/meta_info.csv', sep='\t') #For labels
# meta['NH_path'] = [x.replace('images','images_NH') for x in meta.path.values]
# #save meta file as csv
# meta.to_csv(str(repo_path) + '/data/meta_info.csv', sep='\t', index=False)

# Creating class to manipulate metadata

In [249]:
class path_label():
    """Paths and labels of the metadata rows of one task and one split."""

    def __init__(self, meta, classif, set_name) -> None:
        """Keep the rows of ``meta`` matching ``classif`` and ``set_name``.

        Args:
            meta: table with ``classif``, ``set``, ``path`` and ``label`` columns.
            classif: value the ``classif`` column must equal.
            set_name: value the ``set`` column must equal.

        Sets ``self.paths`` (list of the ``path`` values) and ``self.labels``
        (NumPy array of the ``label`` values) for the selected rows.
        """
        # Both conditions at once instead of two successive filters.
        wanted = (meta['classif'] == classif) & (meta['set'] == set_name)
        subset = meta.loc[wanted]
        self.paths = list(subset['path'])
        self.labels = np.array(subset['label'])