# Mapping

Creamos el grafo:

In [1]:
from rdflib import Graph, Namespace, Literal, FOAF, RDF, XSD
# RDF graph that accumulates every triple produced by this notebook.
g = Graph()

# `dat` mints URIs for the individual resources; `ont` provides the ontology
# terms (classes and properties) used to describe them.
dat = Namespace('https://junjingw.github.io/bacteria-ontologia/data/')
ont = Namespace('https://junjingw.github.io/bacteria-ontologia/ontology#')

# Bind explicit prefixes so the serialized Turtle uses `dat:` / `ont:` instead
# of auto-generated prefixes, which keeps the output readable and stable.
g.bind('dat', dat)
g.bind('ont', ont)

Obtenemos los datos subidos a GitHub:

In [2]:
import pandas as pd
# Read the CSV published in the project's GitHub repository, keeping only its
# first five columns (selected by position).
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/Junjingw/bacteria-ontologia/refs/heads/main/all_ARM_taxid.csv',
    usecols=list(range(5)),
)

# Preview the first two rows.
df.head(2)

Unnamed: 0,pathogen_name,taxonid,ARG_name,frequency,antibiotic_name
0,Staphylococcus aureus,1280,Tetracycline-resistant ribosomal protection pr...,0.12148,Tetracycline antibiotic
1,Staphylococcus aureus,1280,VanH,0.00053,glycoPeptide antibiotic


In [3]:
# Running counters used to build unique, sequential identifiers for each kind
# of entity (bacteria, ARGs, mutations and antibiotics). All of them start at 1.
n_bac = n_arg = n_mut = n_ant = 1

Enlazamos los datos y los añadimos al grafo:

In [4]:
# Maps every pathogen name to the URI of its Bacteria resource.
bacteria_to_uri = dict()

# One row per distinct (pathogen_name, taxonid) pair, indexed by pathogen name.
bacterias_df = df[['pathogen_name', 'taxonid']].drop_duplicates().set_index('pathogen_name')

# Iterate over (name, taxon id) pairs instead of looking each taxon id up by
# label: if a name were paired with more than one taxon id, the label lookup
# would return a Series rather than a scalar and produce a malformed literal
# (and the name would receive a second URI).
for bacteria, taxonid in bacterias_df.taxonid.items():
    bacteria_uri = bacteria_to_uri.get(bacteria)
    if bacteria_uri is None:
        # First time this pathogen is seen: mint its URI and describe it.
        bacteria_uri = dat[f'bac{n_bac}']
        bacteria_to_uri[bacteria] = bacteria_uri

        g.add((bacteria_uri, RDF.type, ont.Bacteria))
        g.add((bacteria_uri, FOAF.name, Literal(bacteria)))

        n_bac += 1

    # Skip missing taxon ids, and cast to a built-in int so the xsd:integer
    # lexical form never carries a float suffix such as "1280.0".
    if pd.notna(taxonid):
        g.add((bacteria_uri, ont.taxonId, Literal(int(taxonid), datatype=XSD.integer)))

In [5]:
# Maps every antibiotic name to the URI of its Antibiotic resource.
antibiotic_to_uri = dict()

# A single `antibiotic_name` cell may list several names separated by commas:
# split them, flatten the result and keep each distinct name once.
# NOTE(review): tokens are not stripped, so "a, b" would yield " b" as a
# separate antibiotic — confirm the CSV has no spaces after the commas.
antibiotic_names = df['antibiotic_name'].str.split(',').explode().unique()

for ant in antibiotic_names:
    # Mint the next sequential antibiotic URI and remember it under its name.
    antibiotic_uri = dat[f'ant{n_ant}']
    antibiotic_to_uri[ant] = antibiotic_uri

    # Describe the resource: its class and its human-readable name.
    g.add((antibiotic_uri, RDF.type, ont.Antibiotic))
    g.add((antibiotic_uri, FOAF.name, Literal(ant)))

    n_ant += 1

In [6]:
# Maps every ARG name to the URI of its ARG resource.
arg_to_uri = dict()

# Series indexed by ARG name with one antibiotic name per row (comma-separated
# cells are split and flattened, so the index contains repeated ARG names).
arg_df = df[['ARG_name', 'antibiotic_name']].set_index('ARG_name').antibiotic_name.str.split(',').explode()

# Group the antibiotics of each ARG in a single pass. Inner dicts are used as
# insertion-ordered sets, so ARGs and their antibiotics keep first-appearance
# order. This avoids repeated label lookups on a non-unique index (a full scan
# per ARG) and the scalar-vs-Series special case those lookups required.
antibiotics_by_arg = dict()
for arg, ant in arg_df.items():
    antibiotics = antibiotics_by_arg.setdefault(arg, dict())
    # A missing antibiotic is a float NaN, not a string: skip it instead of
    # failing, while still registering the ARG itself.
    if pd.notna(ant):
        antibiotics[ant] = None

for arg, antibiotics in antibiotics_by_arg.items():
    # Mint the next sequential ARG URI and remember it under the ARG name.
    arg_uri = dat[f'arg{n_arg}']
    arg_to_uri[arg] = arg_uri

    # Describe the resource: its class and its human-readable name.
    g.add((arg_uri, RDF.type, ont.ARG))
    g.add((arg_uri, FOAF.name, Literal(arg)))

    # Link the ARG to every antibiotic listed for it.
    for ant in antibiotics:
        g.add((arg_uri, ont.resistantTo, antibiotic_to_uri[ant]))

    n_arg += 1

In [7]:
# Maps every row label of the table to the URI of its Mutation resource.
mut_to_uri = dict()

# Working copy of the table without the columns this step does not read.
mut_df = df.drop(columns=['taxonid', 'antibiotic_name'])

for mutation in mut_df.index:
    # Resolve the resources this row refers to (both were created earlier).
    bacteria_uri = bacteria_to_uri[mut_df.at[mutation, 'pathogen_name']]
    arg_uri = arg_to_uri[mut_df.at[mutation, 'ARG_name']]
    frequency = mut_df.at[mutation, 'frequency']

    # Every row becomes its own Mutation resource with a sequential URI.
    mutation_uri = dat[f'mut{n_mut}']
    mut_to_uri[mutation] = mutation_uri

    # Describe the mutation and connect it to its ARG and to its bacteria.
    g.add((mutation_uri, RDF.type, ont.Mutation))
    g.add((mutation_uri, ont.frequency, Literal(frequency, datatype=XSD.float)))
    g.add((mutation_uri, ont.linkedToARG, arg_uri))
    g.add((bacteria_uri, ont.hasMutation, mutation_uri))

    n_mut += 1
    

Realizamos la reconciliación (_reconciling_) con Wikidata para obtener los signos (síntomas, causas y efectos) de cada patógeno:

In [8]:
import wikidata_requests as wkd_req
# Work on a copy with `pathogen_name` as a regular column so it can be mapped.
temp = bacterias_df.reset_index()

# Two-step lookup through the `wikidata_requests` helpers: each pathogen name
# is passed to `get_qualifier`, and that result is passed to `get_signs`.
# NOTE(review): presumably `get_qualifier` returns a Wikidata identifier and
# `get_signs` a list of sign labels — confirm against the helper module.
qualifiers = temp['pathogen_name'].apply(wkd_req.get_qualifier)
temp['signs'] = qualifiers.apply(wkd_req.get_signs)

# Restore the pathogen name as the index, now with the extra `signs` column.
bacterias_df = temp.set_index('pathogen_name')

In [9]:
# One row per (bacteria, sign) pair. `explode` turns an empty or missing list
# of signs into a single NaN row, so those are dropped: a bacterium without
# signs simply contributes no `hasSign` triple instead of raising a TypeError
# when the NaN is iterated.
exploded_signs_df = bacterias_df.signs.explode().dropna()

# Iterating the pairs directly handles bacteria with one sign and with many
# signs uniformly, without a scalar-vs-Series special case.
for bacteria, sign in exploded_signs_df.items():
    g.add((bacteria_to_uri[bacteria], ont.hasSign, Literal(sign)))

Guardamos todos estos datos en nuestro _triplestore_ (a esta escala será simplemente un archivo Turtle):

In [10]:
# Write the whole graph to `data.ttl` in Turtle syntax.
g.serialize(destination='data.ttl', format='turtle')

<Graph identifier=N9b3dd1450ad644e4a39fc427b626ec06 (<class 'rdflib.graph.Graph'>)>