In [1]:
# Imports
# import os
import sys
import json
import pandas as pd

In [14]:
def check_extension(instring, extensions):
    """Report whether a file name ends with one of the given extensions.

    Args:
        instring: File name or path to inspect.
        extensions: Iterable of extension strings, e.g. ``[".csv", ".tsv"]``.

    Returns:
        bool: True if ``instring`` ends with any entry of ``extensions``;
        False otherwise, including when ``extensions`` is empty.
    """
    # str.endswith accepts a tuple of suffixes, which replaces the manual loop
    # and always yields an explicit bool instead of falling through to None.
    return instring.endswith(tuple(extensions))


def open_json(json_path):
    """Load a JSON file and return its parsed content.

    Args:
        json_path: Path to the JSON file to read.

    Returns:
        The deserialised document, exactly as produced by ``json.load``.
    """
    # Decode explicitly as UTF-8: without it open() uses the locale encoding,
    # which can mangle or reject non-ASCII text in the file.
    with open(json_path, encoding="utf-8") as file:
        return json.load(file)


class Homogeneizer:
    """Translate a laboratory metadata file into the common table layout.

    The methods are meant to be called in this order: ``associate_dict``,
    ``load_dataframe``, ``load_dictionary``, ``translate_dataframe``,
    ``verify_translated_dataframe``.

    Attributes:
        filename: Path of the metadata file given at construction.
        dictionary_path: Schema file name chosen by ``associate_dict``
            (None until a schema has been matched).
        dictionary: Parsed schema, set by ``load_dictionary``.
        centre: Initialised to None; not used by any method yet.
        dataframe: Input data, set by ``load_dataframe``.
        translated_dataframe: Output frame; created empty with the columns
            listed under ``new_table_headers`` in the configuration file.
    """

    def __init__(self, filename):
        """Store the file name and build the empty output frame.

        Args:
            filename: Path of the metadata file to homogenise.
        """
        self.filename = filename
        self.dictionary_path = None
        self.dictionary = None
        self.centre = None
        self.dataframe = None

        # To Do: replace string with local file system for testing
        # Header path can be found in conf/configuration.json

        header_path = "Schemas/configuration.json"
        self.translated_dataframe = pd.DataFrame(
            columns=open_json(header_path)["new_table_headers"]
        )

    def associate_dict(self):
        """Pick the schema file that corresponds to the input file name.

        Every key of ``Schemas/institution_to_schema.json`` is searched,
        case-insensitively, in the base name of ``self.filename``. When all
        matching keys point to the same schema it is stored in
        ``self.dictionary_path``. When nothing matches a message is printed
        and ``self.dictionary_path`` is left untouched. When the matches point
        to different schemas the interpreter is asked to exit.
        """
        path_to_institution_json = "Schemas/institution_to_schema.json"
        institution_dict = open_json(path_to_institution_json)

        # Only the last path component takes part in the match; compute it
        # once instead of on every iteration.
        basename = self.filename.split("/")[-1].lower()
        detected = [
            schema
            for key, schema in institution_dict.items()
            if key.lower() in basename
        ]
        unique_schemas = set(detected)

        if len(unique_schemas) == 0:
            print(f"No file could be found matching with the '{self.filename}' filename given.")
        elif len(unique_schemas) > 1:
            print("some problems arised!!!")  # change this to an elegant form
            sys.exit()  # maybe check which ones are being mixed or when none is being found
        else:
            self.dictionary_path = detected[0]  # first item, they are all equal
            print(f"JSON file found successfully: {self.dictionary_path}")  # delete this after testing

    def load_dataframe(self):
        """Read the metadata file into ``self.dataframe``.

        The reader is chosen from the file extension (compared
        case-insensitively): Excel, OpenDocument, CSV or TSV. For any other
        extension a message is printed and ``self.dataframe`` is left as is.
        """
        excel_extensions = [".xlsx", ".xls", ".xlsm", ".xlsb"]
        odf_extension = [".odf", ".ods"]
        csv_extensions = [".csv"]
        tsv_extensions = [".tsv"]

        # Lower-case only for the comparison; the file is opened by its real name.
        lowered = self.filename.lower()

        if check_extension(lowered, excel_extensions):
            self.dataframe = pd.read_excel(self.filename, header=0)
        elif check_extension(lowered, odf_extension):
            # Needs a special package
            self.dataframe = pd.read_excel(self.filename, engine="odf", header=0)
        elif check_extension(lowered, csv_extensions):
            self.dataframe = pd.read_csv(self.filename, sep=",", header=0)
        elif check_extension(lowered, tsv_extensions):
            self.dataframe = pd.read_csv(self.filename, sep="\t", header=0)
        else:
            print(f"Unsupported file extension for '{self.filename}': no dataframe was loaded.")

    def load_dictionary(self):
        """Parse the schema named by ``self.dictionary_path`` into ``self.dictionary``.

        When no schema path is set, a message is printed and nothing is loaded.
        """
        if self.dictionary_path is None:
            # Guard: concatenating None to the path below would raise TypeError.
            print(f"No schema is associated with the '{self.filename}' file: dictionary not loaded.")
            return

        # To Do: replace string with local file system for testing
        path_to_tools = ""
        dict_path = path_to_tools + "Schemas/" + self.dictionary_path
        self.dictionary = open_json(dict_path)

    def translate_dataframe(self):
        """Fill ``self.translated_dataframe`` using the loaded schema.

        ``equivalence`` entries map an output column to an input column, which
        is copied over; empty or missing input columns are reported. The
        ``constants`` entries set a whole output column to a fixed value, but
        only for columns that already exist in the output frame. Nothing is
        done when the schema or the input dataframe has not been loaded.
        """
        if self.dictionary is None or self.dataframe is None:
            print("Nothing to translate: the schema or the input dataframe has not been loaded.")
            return

        # NOTE(review): the template cells further down this notebook spell this
        # key "equivalences" - confirm which spelling the schema files use.
        for key, value in self.dictionary["equivalence"].items():
            if len(value) == 0:
                print(f"Found empty equivalence in the '{self.dictionary_path}' schema: '{key}'")
            elif value in self.dataframe.columns:
                self.translated_dataframe[key] = self.dataframe[value]
            else:
                print(f"Column '{value}' indicated in the '{self.dictionary_path}' schema could not be found.")

        for key, value in self.dictionary["constants"].items():
            if key in self.translated_dataframe.columns:
                self.translated_dataframe[key] = value
            else:
                print(f"Value '{key}' in schema not found in the resulting dataframe")

    def verify_translated_dataframe(self):
        """Report whether the translation preserved the number of rows.

        Only the row counts of the input and output frames are compared; a
        message is printed either way. Nothing is compared when no input
        dataframe has been loaded.
        """
        if self.dataframe is None:
            print("Nothing to verify: the input dataframe has not been loaded.")
            return

        if self.dataframe.shape[0] != self.translated_dataframe.shape[0]:
            print("Different number of rows after translation")
        else:
            print("Same number of rows after translation")

    def export_translated_dataframe(self):
        """Placeholder for exporting the translated dataframe (not implemented yet)."""
    

# Run the whole homogenisation pipeline on the ISCIII reception spreadsheet.
# The calls are order-dependent: each step sets attributes that the next one reads.
infile_path = "Input/ISCIII_reception.xlsx"
ISCIII = Homogeneizer(infile_path)
ISCIII.associate_dict()  # choose the schema file from the input file name
ISCIII.load_dataframe()  # read the input file into ISCIII.dataframe
ISCIII.load_dictionary()  # parse the chosen schema into ISCIII.dictionary
ISCIII.translate_dataframe()  # fill ISCIII.translated_dataframe from the schema
ISCIII.verify_translated_dataframe()  # print whether the row count was preserved

No file could be found matching with the 'ISCIII/reception.xlsx' filename given.


FileNotFoundError: [Errno 2] No such file or directory: 'ISCIII/reception.xlsx'

# Schema template

## Import

In [None]:
import json
import pandas as pd

## Files

In [None]:
# Preview the "Datos" sheet of the reception workbook, using its first row as header.
infile = "Input/ISCIII_reception.xlsx"
pd.read_excel(io=infile, header=0, sheet_name="Datos")

## General schemas

### Funcionamiento básico

* Recepción del archivo
* Detección del laboratorio del que se trata
    * Recorrer las claves del diccionario de laboratorios; si se detecta una en el nombre del archivo, se toma ese laboratorio
    * En caso de no encontrar ninguna, ¿usar questionnaire para que el usuario indique cuál es? **O RAISE ERROR**
    * En caso de encontrar varias, ¿usar questionnaire para seleccionar cuál es?
* Selección del _json_file_path_ usando el diccionario
* Dos opciones:
    * Modificar la dataframe para hacerla la dataframe final
    * Crear la dataframe final e ir rellenándola con la dataframe inicial
* Profit
    


In [None]:
# Template of the laboratory dictionary: laboratory name -> JSON schema path.
lab_dict = {
    "lab_name": "json_file_path",
    "lab2_name": "json_file_path",
    "lab3_name": "json_file_path",
    "lab4_name": None,
}

# Template of a per-laboratory schema.
term_dict = {
    # final term -> initial term
    "equivalences": {
        "termino_final": "termino_inicial",
        "termino_final_2": "termino_inicial_2",
    },
    # final term -> constant value
    "constants": {"termino_final": "valor constante"},
    # final terms that are not included
    "empty": ["terminos_finales_no_incluidos"],
    # final term -> {location: initial term}
    "outer": {"termino_final": {"localización": "término_inicial"}},
}


In [None]:
# Ordered list of column names; presumably the headers of the final,
# homogenised table (TODO confirm against the configuration file).
# NOTE(review): "Biological Sample Storage Condition " ends with a space, and
# "Diagnostic Pcr Ct Value-2" uses a hyphen where "Diagnostic Pcr Ct Value 1"
# uses a space - confirm both are intentional.
final_format = [
    "Public Health sample id (SIVIES)",
    "Sample ID given by originating laboratory",
    "Sample ID given by the submitting laboratory",
    "Sample ID given in the microbiology lab",
    "Sample ID given if multiple rna-extraction or passages",
    "Sample ID given for sequencing",
    "ENA Sample ID",
    "GISAID Virus Name",
    "GISAID id",
    "Originating Laboratory",
    "Submitting Institution",
    "Sample Collection Date",
    "Sample Received Date",
    "Purpose of sampling",
    "Biological Sample Storage Condition ",
    "Specimen source",
    "Environmental Material",
    "Environmental System",
    "Collection Device",
    "Host",
    "Host Age",
    "Host Gender",
    "Sequencing Date",
    "Rna Extraction Protocol",
    "Commercial All-in-one library kit",
    "Library Preparation Kit",
    "Enrichment Protocol",
    "If Enrichment Protocol. If Other,Specify",
    "Enrichment panel/assay",
    "If Enrichment panel/assay. If Other, Specify",
    "Enrichment panel/assay version",
    "Number Of Samples In Run",
    "Runid",
    "Sequencing Instrument Model",
    "Flowcell Kit",
    "Source material",
    "Capture method",
    "Sequencing technique",
    "Library Layout",
    "Gene Name 1",
    "Diagnostic Pcr Ct Value 1",
    "Gene Name 2",
    "Diagnostic Pcr Ct Value-2",
    "Analysis Authors",
    "Author Submitter",
    "Authors",
    "Sequence file R1 fastq",
    "Sequence file R2 fastq"
]

## ISCIII schemas

In [None]:
# Draft schema for ISCIII files: final column name -> column name in the ISCIII sheet.
# NOTE(review): "Public Health sample id" has no "(SIVIES)" suffix, unlike the
# first entry of final_format - confirm which spelling is intended.
# NOTE(review): Homogeneizer.translate_dataframe reads the key "equivalence"
# (singular) - confirm the spelling used by the schema files.
term_dict_ISCIII = {
    "equivalences": {
        "Public Health sample id": "Código SiViEs",
        "Sample ID given by originating laboratory": "Ref Hospital",
        "Sample ID given by the submitting laboratory": "ID CNM",
        "Sample ID given in the microbiology lab": "ID VI-VRP",
        "Sample ID given if multiple rna-extraction or passages": "ID VI-VRP",
        "Sample ID given for sequencing": "ID VI-VRP",
        "GISAID id": "ID GISAID",
        "Originating Laboratory": "Hospital",
        "Sample Collection Date": "Fecha de toma ",
        "Sample Received Date": "Fecha recepción",
        "Host Age": "Grupo edad",
        "Diagnostic Pcr Ct Value 1": "PCR genE",
        "Sequencing Date": "Fecha de secuenciación",
        # No source column has been identified for this one yet.
        "Rna Extraction Protocol": "",
    },
    "constants": {},
    "empty": [],
    "outer": {},
}

In [None]:
# Display the ISCIII mapping so it can be inspected visually.
term_dict_ISCIII