# Process COHD Clinical Data

Jupyter Notebook to download and preprocess files for the [COHD Clinical Data](http://cohd.smart-api.info/) transformation to BioLink RDF.

Sample files are [available on GitHub](https://github.com/MaastrichtU-IDS/d2s-scripts-repository/tree/master/resources/cohd-sample). The complete dataset is distributed as a 27G archive.

### Download files

In [9]:
import os
import glob
import requests
import functools
import shutil
import pandas as pd 

def download_file(url):
    """Download ``url`` into the current working directory and unpack it.

    The local file name is the last path segment of ``url``. The response
    body is streamed to disk, so the payload is never held fully in memory.
    ``.tar.gz`` and ``.zip`` files are extracted into the current directory;
    any other ``.gz`` file is decompressed next to the download, with the
    ``.gz`` suffix removed.

    Args:
        url: HTTP(S) address of the file to fetch.

    Returns:
        The name of the downloaded file, relative to the working directory.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # Local import: only needed for the plain-gzip branch below.
    import gzip

    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        # Fail loudly instead of saving an HTTP error page as if it were data.
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            # Make the raw stream undo any transport-level content encoding.
            r.raw.read = functools.partial(r.raw.read, decode_content=True)
            shutil.copyfileobj(r.raw, f)
    print(local_filename + ' downloaded.')
    if local_filename.endswith(('.tar.gz', '.zip')):
        shutil.unpack_archive(local_filename, '.')
    elif local_filename.endswith('.gz'):
        # shutil.unpack_archive has no handler for a bare (non-tar) .gz file,
        # so decompress it directly.
        with gzip.open(local_filename, 'rb') as src, \
                open(local_filename[:-len('.gz')], 'wb') as dst:
            shutil.copyfileobj(src, dst)
    return local_filename

def convert_tsv_to_csv(tsv_file):
    """Write a comma-separated copy of a tab-separated file.

    The output is placed next to the input, with the input's extension
    replaced by ``.csv``. The DataFrame index is not written.

    Args:
        tsv_file: Path of the tab-separated file to convert.
    """
    # splitext instead of slicing off four characters, so the output name is
    # correct whatever the length of the input extension.
    csv_file = os.path.splitext(tsv_file)[0] + '.csv'
    # NOTE: the whole table is loaded into memory before being written back.
    csv_table = pd.read_table(tsv_file, sep='\t')
    csv_table.to_csv(csv_file, index=False)

# Dataset identifier and the folders derived from it
dataset_id = 'cohd'
input_folder = f'/notebooks/workspace/input/{dataset_id}'
mapping_folder = f'/notebooks/datasets/{dataset_id}/mapping'
# Relative paths used from here on resolve against the input folder
os.chdir(input_folder)

In [10]:
# Location of the COHD sample files, and the names of the files to fetch
sample_base_url = 'https://raw.githubusercontent.com/MaastrichtU-IDS/d2s-scripts-repository/master/resources/cohd-sample/'
sample_file_names = (
    'concepts.tsv',
    'paired_concept_counts_associations.tsv',
    'dataset.tsv',
    'domain_concept_counts.tsv',
    'domain_pair_concept_counts.tsv',
    'patient_count.tsv',
    'single_concept_counts.tsv',
)
files_to_download = [sample_base_url + file_name for file_name in sample_file_names]
# Complete 27G dataset: https://filedn.com/ll1efYfBhLaV67ONaCyMlKh/cohd-v2.tar.gz

# Fetch every file in turn; download_file unpacks archives when needed
for file_url in files_to_download:
    download_file(file_url)

# The RMLStreamer mapping reads CSV, so convert the (27G in the complete
# dataset) associations TSV
convert_tsv_to_csv('paired_concept_counts_associations.tsv')

ConnectionError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /MaastrichtU-IDS/d2s-scripts-repository/master/resources/cohd-sample/concepts.tsv (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fe269d1d460>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

### Split the large associations CSV file

In [16]:
# Split paired_concept_counts_associations.csv into fixed-size chunks, give
# every chunk the CSV header, and generate one RML mapping file per chunk.

# Keep the header so it can be written at the top of every chunk. It is also
# encoded back with the encoding it was read with, so the chunks can be copied
# as raw bytes further down.
with open('paired_concept_counts_associations.csv') as f:
    csv_header = f.readline().strip()
    header_bytes = (csv_header + '\n').encode(f.encoding)

# ignore_errors: the folder does not exist yet on a first run
shutil.rmtree('split', ignore_errors=True)
os.makedirs('split', exist_ok=True)
# Using bash command to split, as it is more efficient for those operations
split_line_count = '100'
split_status = os.system('split -l ' + str(split_line_count) + ' paired_concept_counts_associations.csv split/paired_concept_counts_associations-split_')
if split_status != 0:
    # Without chunks the loop below would silently do nothing
    raise RuntimeError('split command failed with status ' + str(split_status))

# Remove the header line from the first chunk (bash command); the header is
# written back to every chunk below
os.system('sed -i -e "1d" split/paired_concept_counts_associations-split_aa')

# Start from an empty mapping/split folder (it may not exist on a first run)
shutil.rmtree(mapping_folder + '/split', ignore_errors=True)
os.makedirs(mapping_folder + '/split', exist_ok=True)

# The mapping template is the same for every chunk: read it once
with open(mapping_folder + '/associations-mapping.rml.ttl') as f:
    mapping_template = f.read()

# glob returns paths in arbitrary order; sorting makes chunk index N follow
# the order of the split suffixes (aa, ab, ...)
listing = sorted(glob.glob('split/paired_*'))
for count, filename in enumerate(listing):
    # Write the chunk under its final .csv name with the header prepended.
    # Done in Python so the header text is never passed through a shell or a
    # sed expression, and so an empty chunk still gets its header.
    split_csv_file = 'split/paired_concept_counts_associations_' + str(count) + '.csv'
    with open(filename, 'rb') as chunk, open(split_csv_file, 'wb') as out:
        out.write(header_bytes)
        shutil.copyfileobj(chunk, out)
    os.remove(filename)

    split_rml_file = 'split/associations-mapping-' + str(count) + '.rml.ttl'
    # Write the RML mapping for this chunk in mapping/split, pointing at the
    # chunk's own CSV file
    file_content = mapping_template.replace('paired_concept_counts_associations.csv', 'paired_concept_counts_associations_' + str(count) + '.csv')
    with open(mapping_folder + '/' + split_rml_file, "w") as f:
        f.write(file_content)
    # Print command to run on OpenShift DSRI
    print('oc exec  -- /opt/flink/bin/flink run -c io.rml.framework.Main /opt/RMLStreamer.jar --path /mnt/datasets/cohd/mapping/' + split_rml_file + ' --outputPath /mnt/workspace/import/openshift-rmlstreamer-associations-mapping_rml_ttl-cohd-' + str(count) + '.nt --job-name "[d2s] RMLStreamer associations-mapping.rml.ttl - cohd"')

# Leave count equal to the number of chunks produced
count = len(listing)

oc exec  -- /opt/flink/bin/flink run -c io.rml.framework.Main /opt/RMLStreamer.jar --path /mnt/datasets/cohd/mapping/split/associations-mapping-0.rml.ttl --outputPath /mnt/workspace/import/openshift-rmlstreamer-associations-mapping_rml_ttl-cohd-0.nt --job-name "[d2s] RMLStreamer associations-mapping.rml.ttl - cohd"
oc exec  -- /opt/flink/bin/flink run -c io.rml.framework.Main /opt/RMLStreamer.jar --path /mnt/datasets/cohd/mapping/split/associations-mapping-1.rml.ttl --outputPath /mnt/workspace/import/openshift-rmlstreamer-associations-mapping_rml_ttl-cohd-1.nt --job-name "[d2s] RMLStreamer associations-mapping.rml.ttl - cohd"
oc exec  -- /opt/flink/bin/flink run -c io.rml.framework.Main /opt/RMLStreamer.jar --path /mnt/datasets/cohd/mapping/split/associations-mapping-2.rml.ttl --outputPath /mnt/workspace/import/openshift-rmlstreamer-associations-mapping_rml_ttl-cohd-2.nt --job-name "[d2s] RMLStreamer associations-mapping.rml.ttl - cohd"
oc exec  -- /opt/flink/bin/flink run -c io.rml.fr

0