# Dataset example IDEA4RC to OMOP CDM

In [None]:
import pandas as pd
import random
import numpy as np
from datetime import datetime, timedelta

A continuación, procedemos a crear los ficheros csv para cada tabla de IDEA4RC como muestra de ejemplo para posteriormente hacer el mapeo.

Cada columna la rellenaremos con su "Vocabulary o Variable as modifier" de Athena, si es que alguno de los dos está disponible.

Cuando tengamos las dos disponibles, cogeremos como su único valor su correspondiente "Variable as modifier". Y, meteremos posteriormente su "Vocabulary" a la tabla correspondiente de mapeo a OMOP (en el siguiente código, cuando hay un # con números, significarán los vocabularios correspondientes a otras tablas, es decir, que las variables disponen de "Vocabulary y Variable as modifier".


### patient.IDEA4RCcsv

In [None]:
def non_repeating_number_generator(start, end, X):
    generated_numbers = set()
    count = 0
    while count < X:
        number = random.randint(start, end)
        if number not in generated_numbers:
            generated_numbers.add(number)
            yield number
            count += 1
    return generated_numbers

random_ids = non_repeating_number_generator(0,51,50)
# Crear DataFrame con la columna 'id'
data = {'id': random_ids}
df_patient_IDEA4RC = pd.DataFrame(data)

# Definir las variables y sus posibles valores
replacement_values = {
    'Gender': (8507, 8532),
    'Race': (8527, 8516, 8515, 8657),
    'Birth year': 0,
    'Country of Residence':4329169,
    'Smoking': 1585856, #(36309332, 45883458, 45879404),
    'Cigarettes pack years smoked during life': 903650,
    'Alcohol': 1586197, # (4074035, 4117706, 37204556, 4220362),
    'Height/weight (BMI)': 4245997,
    'Charslon Comorbidity Index': 42538860,
    'Comorbidity': 46235351, #(4188539, 4188540),
    'Myocardial infarction':4329847,
    'Congestive heart failure':319835,
    'Peripheral vascular disease':321052,
    'Cerebrovascular accident (except hemiplegia)':381316,
    'Dementia': 4182210,
    'Chronic pulmonary disease':4186898,
    'Connective tissue disease':4344165,
    'Ulcer':4177703,
    'Mild liver disease':194984,
    'Moderate to severe liver disease': 194984,
    'Diabetes (without complications)': 201820,
    'Diabetes with end organ damage': 201820,
    'Hemiplegia': 374022,
    'Moderate to severe renal disease': 198124,
    'Solid tumor (non metastatic)': 443392,
    'Metastatic solid tumor': 443392,
    'Leukemia': 317510,
    'Lymphoma': 44499278,
    'Multiple myeloma': 437233,
    'AIDS': 4267414,
    'Eastern Cooperative Oncology Group performance status (ECOG PS) at diagnosis': 36305384,
    'ECOG PS label': 36303470, #Vocabulary = Variable as modifier
    'Karnofsy index at diagnosis': 4169154,
    'Karnofsy index label': 36303744, #Vocabulary = Variable as modifier
    'No Genetic syndrome WHO 2020': 37204336, #4211787
    'Olliers disease': 4145177,
    'Maffuci syndrome': 4187683,
    'Li-Fraumeni syndrome': 4323645,
    'McCune-Albright syndrome': 37117262,
    'Multiple osteochondromas': 37396802,
    'Neurofibromatosis type 1': 377252,
    'Rothmund-Thomson syndrome': 4286355,
    'Werner syndrome': 4197821,
    'Retinoblastoma': 4158977,
    'Paget disease': 75910,
    'Other Genetic syndrome WHO 2020': 1340204,
    'Occurrence of other cancer': 1340204, #4266186
    'Previous cancer treatment': 1340204 #(4273629, 4170755, 4121697, 42535584, 4295112)
}

# Función para obtener un valor aleatorio de un elemento en cada columna del diccionario anterior 
def get_random_value(val):
    if isinstance(val, tuple):
        return np.random.choice(val)
    return val

# Añadir las columnas del diccionario al DataFrame
for key, value in replacement_values.items():
    df_patient_IDEA4RC[key] = [get_random_value(value) for _ in range(len(df_patient_IDEA4RC['id']))]

# Ajustar las opciones de pandas para mostrar todas las columnas
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [None]:
# Mostrar el DataFrame resultante
df_patient_IDEA4RC.head(5)

Observamos que tenemos un dataframe con los números correspondientes al Vocabulario de OMOP. Los números "0" significan valores nulos que tendrán que someterse a cambios al número que corresponda en ATHENA.

In [None]:
# Guardar DataFrame en un archivo CSV
df_patient_IDEA4RC.to_csv('patientsIDEA4RC.csv', index=False)

### hospitalDataIDEA4RC.csv

In [None]:
def hospitalNameGen():
    prefijos = ['San', 'Santa', 'Nuestra Señora', 'Sagrado Corazón', 'Sanatorio', 'Clínica', 'Centro']
    nombres = ['María', 'Jorge', 'Lourdes', 'José', 'Cruz', 'Esperanza', 'Salud', 'Vida', 'Cristo', 'Ángel', 'Victoria', 'Amparo']
    tipos = ['Hospital', 'Clínica', 'Centro Médico', 'Instituto Médico', 'Pabellón', 'Policlínico', 'Sanatorio', 'Consultorio', 'Asistencia Médica']

    return f"{random.choice(prefijos)} {random.choice(nombres)} {random.choice(tipos)} {random.choice(['de la Paz', 'de la Salud', 'General', 'Regional', 'Metropolitano', 'Especializado'])}"

hospital_names = [hospitalNameGen() for _ in range(10)]
df_hospital_Data_IDEA4RC = pd.DataFrame()
df_hospital_Data_IDEA4RC['id'] = (non_repeating_number_generator(0, 50, 10)).copy()
df_hospital_Data_IDEA4RC['Hospital Name']=[
    hospital_names[_] for _ in range(len(df_hospital_Data_IDEA4RC['id']))
]

In [None]:
df_hospital_Data_IDEA4RC.to_csv('hospitalDataIDEA4RC.csv', index=False)

In [None]:
df_hospital_Data_IDEA4RC.head(3)

### patientsFollowUpIDEA4RC.csv

In [None]:
patient_f_u_columns = {
    #"Patient": ,
    "Status at last follow-up": 0,
    "Patient Follow Up date": 0, #no puede ser un valor arbitrario, explicado en el archivo de mapeo de esta tabla
    "New cancer diagnosis": (4188539, 4188540),
    "Date of new cancer diagnosis": 0, 
    "New cancer topography": (36534215, 44498973),  # y otros valores0, 
    "Last Contact": 0,
}

df_patients_FollowUp_IDEA4RC = pd.DataFrame()

# Crear el nuevo dataframe con la columna 'id' de df
df_patients_FollowUp_IDEA4RC['Patient'] = df_patient_IDEA4RC[['id']].copy()

# Añadir las columnas del diccionario al DataFrame
for key, value in patient_f_u_columns.items():
    df_patients_FollowUp_IDEA4RC[key] = [get_random_value(value) for _ in range(len(df_patients_FollowUp_IDEA4RC['Patient']))]

In [None]:
# Guardar DataFrame en un archivo CSV
df_patients_FollowUp_IDEA4RC.to_csv('patientsFollowUpIDEA4RC.csv', index=False)

In [None]:
df_patients_FollowUp_IDEA4RC.head(3)

### hospitalPatientsRecordIDEA4RC.csv

In [12]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

# Definir la función para generar una fecha aleatoria
def randDate(start_date=datetime(1900, 1, 1), end_date=datetime.now()):
    # Calcular la diferencia en segundos entre las fechas de inicio y fin
    delta = end_date - start_date
    int_delta = delta.days * 24 * 60 * 60
    random_second = random.randrange(int_delta)
    return start_date + timedelta(seconds=random_second)

# Crear el DataFrame vacío
df_hospital_PatientsRecord_IDEA4RC = pd.DataFrame()

# Rellenar la columna 'id' con números enteros aleatorios
df_hospital_PatientsRecord_IDEA4RC['id'] = non_repeating_number_generator(0, 300, 100)

# Obtener listas de ids de hospitales y pacientes
hospital_names = df_hospital_Data_IDEA4RC['id'].tolist()
patients_ids = df_patient_IDEA4RC['id'].tolist()

# Rellenar las columnas 'Patient' y 'Hospital' con elecciones aleatorias
df_hospital_PatientsRecord_IDEA4RC['Patient'] = np.random.choice(patients_ids, size=50)
df_hospital_PatientsRecord_IDEA4RC['Hospital'] = np.random.choice(hospital_names, size=50)

# Rellenar la columna 'Date of first contact with the hospital' con fechas aleatorias
df_hospital_PatientsRecord_IDEA4RC['Date of first contact with the hospital'] = [
    randDate() for _ in range(50)
]

# Ver el resultado
print(df_hospital_PatientsRecord_IDEA4RC.head())


SyntaxError: invalid syntax (4219028881.py, line 18)

In [13]:
df_hospital_PatientsRecord_IDEA4RC.to_csv('hospitalPatientsRecordIDEA4RC.csv', index=False)

NameError: name 'df_hospital_PatientsRecord_IDEA4RC' is not defined

In [None]:
df_hospital_PatientsRecord_IDEA4RC.head(3)

### cancerEpisodeDEA4RC.csv

It seems that I'm dealing with the issue of what to do with "Histology (WHO 2017) H&N subgroup." I'm not sure if I should leave it as plain text or if it might get lost when transferred to another database? Also, there's a code in "Histology Squamous" that appears to have no definition. The same issue arises with "Histology Adenocarcinoma." What to do with the "NOS".

For Histology Sinonasal undifferentiated carcinoma I am following the example given in PatientFollowUp, and I am assigning it a 0 value since we do not have any "clear value" as Vocabulary or Variable as a modifier. I need more information. Same for other subsite, but I consider this one to be more clear.

In [None]:
replacement_values= {
    'Type of biopsy' : (4171863,4321878,4321986, 4228202,4279903),
    'Biopsy done by' : hospital_names, #Institution where diagnostic procedure was performed, culd be the ID assigned from the original hospital.
    'Age at diagnosis' : np.random.randint(0, 120, size=100), #We generate random numbers that will work as the age
    'Radiotherapy induced sarcoma' : 37117814,
    'Grading' : 4159955,
    'Histology Squamous' : (4078953, 4147603, 4206785,4277415,4247661,4029973,4298030,4191609,37156145,37204531,37204532),
    'Histology Adenocarcinoma' : (4312929, 4050978, 4247921, 4238334, 605208,4022895, 4253608, 4030121, 4232456, 4164740, 4148292, 
                                  4146684, 4287334, 37116978, 4029680, 4182993, 37116976, 4271564, 4212379),
    'Histology Neuroendocrine' : (6714029,37018672,4029971),
    'Histology Odontogenic Carcinoma' : (4098585, 37153944, 37158144, 37153976, 37117009, 37116887, 37116966, 37312303),
    'Histology Sinonasal undifferentiated carcinoma' : 0,
    'Histology subgroup Adipocytic tumours' : (4102790, 4284833, 4101123, 4010104, 4300687),
    'Histology subgroup Fibroblastic and myofibroblastic tumours' : (45766452, 4264204, 45766468, 4030132, 4295283, 4030130, 4028562, 4021373,
                                                                     45766454,4029679, 45766450, 4178132, 4202374, 45771359, 45771361, 45766456),
    'Histology subgroup So-called fibrohistiocytic tumours' : (4029525, 4218898, 4030140, 45766525, 4029655),
    'Vascular tumours' : (4207381, 3661612, 4281384, 45766493, 4030138, 45766494, 45766492),
    'Pericytic (perivascular) tumours' : 40571984,
    'Smooth muscle tumours' : (4327493, 607803, 600675, 4029023, 4201415, 4173148, 4059632, 4029023),
    'Skeletal muscle tumours' : (4031038, 4272931, 4298312, 4029528, 4029024),
    'Chondro-osseous tumours' : 4298240,
    'Peripheral nerve sheath tumours': (4062426, 4102399, 4005360, 4027842),
    'Tumours of uncertain differentiation' : (4096931, 4029526, 45766448, 4268491, 4244886, 4339208, 4155072, 4328092, 4029522, 4307938, 45771358,
                                              45766447,4304780, 45766495, 4029021, 4239519, 4028565, 45766485, 4239956, 4029526),
    'Undifferentiated small round cell sarcomas of bone and soft tissue' : 45766528,
    'Endometrial stromal and related tumours' : (4290926,4028557),
    'Miscellaneous mesenchymal tumors': (45771357, 4040991, 4323562, 4337106, 4028710, 4029172, 4028709, 4029173, 42872917, 37311513),
    'Mixed epithelial and mesenchymal tumours': (4135215),
    'Nasal cavity and paranasal sinuses subsite': (4088327, 4050631, 4182234, 4204534, 4097172),
    'Nasopharynx subsite': (4068978, 4146741, 4323979, 4220236),
    'Hypopharynx subsite': (4147741, 4262980, 4102600, 4268741),
    'Oropharynx subsite': (4169910, 4177005, 4095284, 4138234, 4068841, 4065073, 4173123, 4283238, 4078066, 4045435, 4235649),
    'Larynx subsite': (4047227, 4001049, 4002724, 4279711),
    'Oral cavity subsite': (36769609, 4077986, 4172794, 4047029, 4210919, 4181419, 4103066, 44498040, 4234542, 4057160, 4144069, 36769865),
    'Lip subsite': (4027546, 4134589, 4027545, 36769700, 36768548, 4245168, 4307358),
    'Upper and Lower limbs subsite': (4310976, 4220073, 4032953, 4027910, 4348994, 4165266, 4211098, 
                                   4142359, 4107527, 4217806, 4138531, 4090445, 4291935, 4088464, 
                                   42605854, 4136969, 4109459, 4263335, 4096078, 4157836),
    'Trunk wall subsite': (4088614, 4244412, 4330865),
    'Intra abdominal subsite': (4220536, 4271699, 4193451, 4154880, 4157844, 4314264, 4094242, 4229753, 4075991, 4288283, 4302112, 4065077,
                                4316356, 4215634, 4144541, 4233095),
    'Intra thoracic subsite': (4213162, 4217142, 4150673, 4088610, 4167648, 4001033, 4035327, 4272388),
    'Genito urinary subsite': (4047078, 4271678, 4078617, 4232481, 4034365, 4277442, 4172591, 4238088, 4077311, 4149192, 4049448, 4151444, 4296022, 4166066),
    'Head and Neck subsite': (4122633),
    'Breast subsite': (4298444),
    'Other subsite' : 0,
    'Tumor Size': (36768664),
    'Depth' : (36768749, 36768911),
    'Biopsy Mitotic count' : 4227243,
    'Plasmatic EBV DNA at baseline' : 3043849,
    'HPV status' : 46236082,
    'CRP – C reactive protein tested ' : 3000965
}

df_cancer_Episode_IDEA4RC = pd.DataFrame()
df_cancer_Episode_IDEA4RC['id']= non_repeating_number_generator(0, 300, 80)
df_cancer_Episode_IDEA4RC['Patient'] = np.random.choice(patients_ids, size=len(df_cancer_Episode_IDEA4RC['id']))
df_cancer_Episode_IDEA4RC['Date of diagnosis (biopsy or surgical piece)']= [
    randDate() for _ in range(len(df_cancer_Episode_IDEA4RC['id']))
]

for key, value in replacement_values.items():
    df_cancer_Episode_IDEA4RC[key] = [get_random_value(value) for _ in range(len(df_cancer_Episode_IDEA4RC['id']))]


In [None]:
df_cancer_Episode_IDEA4RC.to_csv('cancerEpisodeIDEA4RC.csv', index=False)

In [None]:
df_cancer_Episode_IDEA4RC.head(5)