# Process COHD Clinical Data

Jupyter Notebook to download and preprocess files for the [COHD Clinical Data](http://cohd.smart-api.info/) transformation to BioLink RDF.

Sample files are [available on GitHub](https://github.com/MaastrichtU-IDS/d2s-scripts-repository/tree/master/resources/cohd-sample). Be aware that the complete dataset includes a 27 GB TSV file.

### Download files

In [74]:
import os
import glob
import requests
import functools
import shutil
import pandas as pd 

def download_file(url):
    """Download `url` into the current working directory and unpack it if needed.

    The local file name is the last path segment of the URL. The response body
    is streamed to disk, so the file never has to fit in memory.

    Args:
        url: HTTP(S) address of the file to fetch.

    Returns:
        The name of the downloaded file (relative to the working directory).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        # Fail before touching the disk, otherwise the server's error page
        # would be saved as if it were the requested data
        r.raise_for_status()
        # Make the raw stream undo any Content-Encoding (gzip/deflate) applied in transit
        r.raw.read = functools.partial(r.raw.read, decode_content=True)
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    print(local_filename + ' downloaded.')
    if local_filename.endswith(('.tar.gz', '.tgz', '.zip')):
        shutil.unpack_archive(local_filename, '.')
    elif local_filename.endswith('.gz'):
        # A plain gzip file is not an archive: shutil.unpack_archive would
        # raise ReadError, so decompress it next to the download instead
        import gzip
        with gzip.open(local_filename, 'rb') as compressed, open(local_filename[:-3], 'wb') as f:
            shutil.copyfileobj(compressed, f)
    return local_filename

def convert_tsv_to_csv(tsv_file):
    """Convert a tab-separated file to a comma-separated file next to it.

    The output file has the same name as the input with its extension
    replaced by `.csv` (whatever the length of the original extension).
    The whole table is loaded in memory by pandas before being written.

    Args:
        tsv_file: path to the tab-separated input file.

    Returns:
        The path of the CSV file that was written.
    """
    # splitext handles any extension length, unlike slicing off 4 characters
    csv_file = os.path.splitext(tsv_file)[0] + '.csv'
    csv_table = pd.read_table(tsv_file, sep='\t')
    csv_table.to_csv(csv_file, index=False)
    return csv_file

# Identifier of the dataset and folder where its input files are stored
dataset_id = 'cohd'
input_folder = '/notebooks/workspace/input/cohd'
# Create the folder on first use so that os.chdir does not fail on a fresh workspace
os.makedirs(input_folder, exist_ok=True)
# Use input folder as working folder: downloads and conversions use relative paths
os.chdir(input_folder)

In [75]:
# Common prefix of all the COHD sample files hosted on GitHub
sample_base_url = 'https://raw.githubusercontent.com/MaastrichtU-IDS/d2s-scripts-repository/master/resources/cohd-sample/'
sample_filenames = [
    'concepts.tsv',
    'paired_concept_counts_associations.tsv',
    'dataset.tsv',
    'domain_concept_counts.tsv',
    'domain_pair_concept_counts.tsv',
    'patient_count.tsv',
    'single_concept_counts.tsv',
]
files_to_download = [sample_base_url + filename for filename in sample_filenames]
# Alternative source kept for reference (presumably the complete dataset archive):
# https://filedn.com/ll1efYfBhLaV67ONaCyMlKh/cohd-v2.tar.gz
for file_url in files_to_download:
    download_file(file_url)

# Only the associations file is converted to CSV
convert_tsv_to_csv('paired_concept_counts_associations.tsv')

concepts.tsv downloaded.
paired_concept_counts_associations.tsv downloaded.
dataset.tsv downloaded.
domain_concept_counts.tsv downloaded.
domain_pair_concept_counts.tsv downloaded.
patient_count.tsv downloaded.
single_concept_counts.tsv downloaded.
done


### Split the large associations CSV file

In [77]:
# Split the associations CSV into files of `rows_per_file` data rows, each one
# starting with the header line. Done in pure Python so that it does not depend
# on the `split`/`sed` shell tools and the header never has to be escaped.
source_csv = 'paired_concept_counts_associations.csv'
split_folder = 'split'
split_prefix = os.path.join(split_folder, 'paired_concept_counts_associations')
rows_per_file = 100

os.makedirs(split_folder, exist_ok=True)
# Remove the chunks of a previous run so that re-running the cell is idempotent
for stale_file in glob.glob(split_prefix + '*'):
    os.remove(stale_file)

# Number of split files written so far (also used as the file name suffix)
count = 0
with open(source_csv) as f:
    csv_header = f.readline().strip()
    while True:
        # zip() stops as soon as range() is exhausted, so at most
        # `rows_per_file` lines are consumed from the file per iteration
        rows = [row for _, row in zip(range(rows_per_file), f)]
        if not rows:
            break
        # Chunks are numbered in file order: 0 holds the first rows of the source
        with open(split_prefix + '_' + str(count) + '.csv', 'w') as split_file:
            split_file.write(csv_header + '\n')
            split_file.writelines(rows)
        count += 1

0