# Mapping

In [8]:
import pandas as pd
import glob
from rdflib import Graph, Namespace, Literal, FOAF, RDF, XSD
# Local helper module written by us (Wikidata requests)
import wikidata_requests as wkd_req

Se convertirán todos los archivos de la carpeta `data` cuyo nombre siga el patrón `AMR_taxid_*.txt`.

In [9]:
# Glob pattern selecting every per-taxon input file inside the data folder.
files = 'data/AMR_taxid_*.txt'
# glob returns matches in arbitrary, OS-dependent order. Sort them so the rows
# (and therefore the numbered URIs minted later) come out the same on every run.
list_files = sorted(glob.glob(files))
list_files

['data\\AMR_taxid_1280.txt',
 'data\\AMR_taxid_1639.txt',
 'data\\AMR_taxid_1773.txt',
 'data\\AMR_taxid_197.txt',
 'data\\AMR_taxid_28901.txt',
 'data\\AMR_taxid_573.txt']

Unimos todos los archivos y preprocesamos los datos:

In [10]:
# Read every tab-separated input file, keeping only the columns used by the
# mapping, and stack them into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [
        pd.read_csv(
            file_path,
            sep='\t',
            usecols=[
                'pathogen_name',
                'taxonid',
                'ARG_name',
                'frequency',
                'antibiotic_name',
                'drugclass'
            ]
        ) for file_path in list_files
    ],
    ignore_index=True
)

# Count fully duplicated rows once and reuse the value for both the message
# and the decision to drop them.
duplicates = df.duplicated().sum()
print(f'There are {duplicates} duplicated rows.')
if duplicates:
    print('Removing them...')
    df = df.drop_duplicates()

df = (
    df
    # Drop rows where both antibiotic_name and drugclass are missing
    .dropna(subset=['antibiotic_name', 'drugclass'], how='all')
    # If there isn't an antibiotic_name, the resistance applies to the whole
    # drugclass, so the drugclass is used as the antibiotic name
    .assign(antibiotic_name=lambda frame: frame['antibiotic_name'].fillna(frame['drugclass']))
    # Drugclass is not needed anymore
    .drop(columns='drugclass')
)

# df.to_csv('all_ARM_taxid.csv', index=False)

There are 0 duplicated rows.


Creamos el grafo:

In [11]:
# In-memory RDF graph that collects every triple produced by this notebook.
g = Graph()
# Namespace for the individuals (bacteria, antibiotics, ARGs, mutations).
dat = Namespace('https://junjingw.github.io/bacteria-ontologia/data/')
# Namespace for the ontology terms (classes and properties).
ont = Namespace('https://junjingw.github.io/bacteria-ontologia/ontology#')

# Bind readable prefixes so the serialized Turtle uses dat:/ont:/foaf:
# instead of auto-generated ones. This does not change the triples.
g.bind('dat', dat)
g.bind('ont', ont)
g.bind('foaf', FOAF)

# Counters for the unique identifiers for each entity
# (numeric suffix of the bac/arg/mut/ant URIs).
n_bac = 1
n_arg = 1
n_mut = 1
n_ant = 1

Enlazamos los datos y los añadimos al grafo:

In [12]:
# Maps each pathogen name to the URI minted for it, for use by later cells.
bacteria_to_uri = dict()
# One row per distinct (pathogen_name, taxonid) pair, indexed by pathogen name.
bacterias_df = df[['pathogen_name', 'taxonid']].drop_duplicates().set_index('pathogen_name')
# Number the bacteria from 1 with enumerate rather than resuming from the
# global counter, so re-running this cell yields the same URIs instead of
# minting new ones for the same bacteria.
for n_bac, (bacteria, taxonid) in enumerate(bacterias_df.taxonid.items(), start=1):
    # Extract linked data
    bacteria_uri = dat[f'bac{n_bac}']
    bacteria_to_uri[bacteria] = bacteria_uri

    # Add corresponding triples
    g.add((bacteria_uri, RDF.type, ont.Bacteria))
    g.add((bacteria_uri, FOAF.name, Literal(bacteria)))
    g.add((bacteria_uri, ont.taxonId, Literal(taxonid, datatype=XSD.integer)))

# Leave the counter pointing at the next free identifier, as before.
n_bac = len(bacterias_df) + 1

In [13]:
# Maps each individual antibiotic name to the URI minted for it.
antibiotic_to_uri = dict()
# antibiotic_name may hold several comma-separated names. Split them and keep
# each distinct name once, in order of first appearance. Names are not
# trimmed, so they match the keys produced by the same split in the ARG cell.
antibiotic_names = df.antibiotic_name.str.split(',').explode().unique()
# Number the antibiotics from 1 with enumerate rather than resuming from the
# global counter, so re-running this cell yields the same URIs.
for n_ant, ant in enumerate(antibiotic_names, start=1):
    # Extract linked data
    antibiotic_uri = dat[f'ant{n_ant}']
    antibiotic_to_uri[ant] = antibiotic_uri

    # Add corresponding triples
    g.add((antibiotic_uri, RDF.type, ont.Antibiotic))
    g.add((antibiotic_uri, FOAF.name, Literal(ant)))

# Leave the counter pointing at the next free identifier, as before.
n_ant = len(antibiotic_names) + 1

In [14]:
# Maps each ARG name to the URI minted for it.
arg_to_uri = dict()
# Series of single antibiotic names indexed by ARG_name: one entry per
# (ARG, antibiotic) pair after splitting the comma-separated names.
arg_df = df[['ARG_name', 'antibiotic_name']].set_index('ARG_name').antibiotic_name.str.split(',').explode()
arg_names = arg_df.index.unique()
# Number the ARGs from 1 with enumerate rather than resuming from the global
# counter, so re-running this cell yields the same URIs.
for n_arg, arg in enumerate(arg_names, start=1):
    # Extract linked data
    arg_uri = dat[f'arg{n_arg}']
    arg_to_uri[arg] = arg_uri

    # Add corresponding triples
    g.add((arg_uri, RDF.type, ont.ARG))
    g.add((arg_uri, FOAF.name, Literal(arg)))

    # Add antibiotic resistance information.
    # Selecting with a list label always returns a Series, even when the ARG
    # has a single antibiotic, so no str/Series special-casing is needed.
    for ant in arg_df.loc[[arg]].unique():
        g.add((arg_uri, ont.resistantTo, antibiotic_to_uri[ant]))

# Leave the counter pointing at the next free identifier, as before.
n_arg = len(arg_names) + 1

In [15]:
# Maps each row label of mut_df to the URI of the mutation minted for that row.
mut_to_uri = dict()
# Every remaining row becomes one mutation node; only pathogen_name, ARG_name
# and frequency are needed here.
mut_df = df.drop(columns=['taxonid', 'antibiotic_name'])
# Number the mutations from 1 with enumerate rather than resuming from the
# global counter, so re-running this cell yields the same URIs.
# itertuples reads each row once instead of doing three label lookups per row.
for n_mut, row in enumerate(mut_df.itertuples(), start=1):
    # Extract linked data
    mutation_uri = dat[f'mut{n_mut}']
    bacteria_uri = bacteria_to_uri[row.pathogen_name]
    arg_uri = arg_to_uri[row.ARG_name]
    mut_to_uri[row.Index] = mutation_uri

    # Add corresponding triples: the mutation node carries the frequency and
    # links the bacteria to the ARG.
    g.add((mutation_uri, RDF.type, ont.Mutation))
    g.add((mutation_uri, ont.frequency, Literal(row.frequency, datatype=XSD.float)))
    g.add((mutation_uri, ont.linkedToARG, arg_uri))
    g.add((bacteria_uri, ont.hasMutation, mutation_uri))

# Leave the counter pointing at the next free identifier, as before.
n_mut = len(mut_df) + 1
    

Realizamos la reconciliación (_reconciling_) con Wikidata para obtener los signos (síntomas, causas y efectos) de cada patógeno:

In [16]:
# Work on a flat copy so pathogen_name is an ordinary column.
temp = bacterias_df.reset_index()
# First pass: one wikidata_requests lookup per pathogen name
# (presumably resolves the Wikidata entity; confirm in wikidata_requests).
qualifiers = temp['pathogen_name'].apply(wkd_req.get_qualifier)
# Second pass: turn each lookup result into the signs stored for that pathogen.
temp = temp.assign(signs=qualifiers.apply(wkd_req.get_signs))
# Restore pathogen_name as the index, now with the extra 'signs' column.
bacterias_df = temp.set_index('pathogen_name')

In [17]:
# One entry per (pathogen, sign) pair. A pathogen with an empty sign list
# explodes to NaN, which used to crash the loop by iterating a float.
# Those missing entries are dropped, so such pathogens get no hasSign triple.
exploded_signs_df = bacterias_df.signs.explode().dropna()
# Each remaining entry is a single sign, so add it directly. The graph
# ignores a triple that is already present.
for bacteria, sign in exploded_signs_df.items():
    g.add((bacteria_to_uri[bacteria], ont.hasSign, Literal(sign)))

Guardamos todos estos datos en nuestro triplestore (a esta escala será simplemente un archivo turtle):

In [18]:
# Write the whole graph to data.ttl in Turtle syntax.
g.serialize(destination='data.ttl', format='turtle')

<Graph identifier=N784fdef78afa41668699973cd1a16fcf (<class 'rdflib.graph.Graph'>)>