# Project - Dataset creation

## Initialization

Import libraries

In [1]:
import pandas as pd
import numpy as np
from time import time, ctime, strftime, gmtime

Import parallel computing libraries and register processors

In [2]:
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend
from joblib import Parallel, parallel_backend, register_parallel_backend

On Noto, start the ipyparallel controller with `ipcontroller --ip="*"` before running the next cell.

In [3]:
# Connect to the ipyparallel cluster started under the 'default' profile.
c = Client(profile='default')
print('profile:', c.profile)
print("IDs:", c.ids) # ids of the engines currently connected to the client
# Load-balanced view wrapped by the joblib backend factory registered below.
bview = c.load_balanced_view()
# Register the factory under the name 'ipyparallel' so that it can later be
# selected with parallel_backend('ipyparallel').
register_parallel_backend('ipyparallel',
                          lambda : IPythonParallelBackend(view=bview))

profile: default
IDs: [0, 1, 2, 3, 4, 5, 6, 7]


Read the datasets. The approach is inspired by [this study](https://www.kaggle.com/cgueret/covid-19-risk-factor-predictor/output).

In [4]:
# Predicted COVID-19 risk score per risk factor, read from a path relative to
# the notebook's working directory.
predictions = pd.read_csv('Kaggle/predicted_covid19_risk_factors.csv')
# Open Targets release 20.02 target list (provides the `ensembl_id` column used below).
targets = pd.read_csv('https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_target_list.csv.gz',
                          compression='gzip')
# Open Targets release 20.02 disease list (provides `efo_id` and `disease_full_name`).
diseases = pd.read_csv('https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_disease_list.csv.gz',
                           compression='gzip')

Explore predictions dataset

In [5]:
# Preview the predictions; besides two leftover index columns, only
# 'Risk Factors' and 'Score' are used in the rest of the notebook.
predictions.head()

Unnamed: 0.1,Unnamed: 0,Risk Factors,Score
0,0,Platelet storage pool disease,0.852844
1,1,abnormality of the urinary system physiology,0.828951
2,2,Bronchiolitis,0.781315
3,3,endocrine system disease,0.774616
4,4,Hernia,0.766383


Describe predictions dataset

In [6]:
# 'Risk Factors' is the merge key used below: count == unique means no name is repeated.
predictions['Risk Factors'].describe()

count                                   7217
unique                                  7217
top       tumor of cranial and spinal nerves
freq                                       1
Name: Risk Factors, dtype: object

In [7]:
# Preview the disease list: `disease_full_name` is matched against 'Risk Factors'
# in the merge below and `efo_id` becomes the disease identifier.
diseases.head()

Unnamed: 0,efo_id,disease_full_name,number_of_associations
0,EFO_1000984,inflammatory breast carcinoma,174
1,MONDO_0004093,esophageal basaloid carcinoma,1
2,EFO_0006352,laryngeal squamous cell carcinoma,796
3,EFO_1000514,Salivary Gland Adenosquamous Carcinoma,0
4,EFO_1001965,pharyngeal squamous cell carcinoma,428


Link disease EFO code to Covid Score.

In [8]:
# Attach the EFO identifier to every predicted risk factor (left join on the
# disease name), keep the three columns used downstream, then order the rows
# by `efo_id` and renumber them from 0.
prediction_extended = (
    predictions
    .merge(diseases,
           how = 'left',
           left_on = 'Risk Factors',
           right_on = 'disease_full_name')
    [['efo_id', 'disease_full_name', 'Score']]
    .sort_values(by = 'efo_id', ascending = True)
    .reset_index(drop = True)
)
prediction_extended.head()

Unnamed: 0,efo_id,disease_full_name,Score
0,DOID_0050890,synucleinopathy,0.001827
1,EFO_0000094,B-cell acute lymphoblastic leukemia,0.001435
2,EFO_0000095,chronic lymphocytic leukemia,0.002511
3,EFO_0000096,neoplasm of mature B-cells,0.003721
4,EFO_0000174,Ewing sarcoma,0.000201


In [9]:
# Row count after the merge; comparing it with the number of predictions shows
# whether the join duplicated any row.
print('There are {} lines (i.e. diseases) in the dataset.'.format(prediction_extended.shape[0]))

There are 7217 lines (i.e. diseases) in the dataset.


---

## Function definitions

In [10]:
import glob

def dir_stats(dir_info = True,
              directory = "200_by_100/",
              file_prefix = "X_raw_disease_200_target_100_",
              file_suffix = ".csv"):
    """Count the files in `directory` whose name matches `file_prefix` + * + `file_suffix`.

    Parameters
    ----------
    dir_info : bool
        When truthy, print the directory and the number of matching files.
    directory : str
        Directory part of the glob pattern. It is concatenated as-is with the
        file name, so it must already end with a path separator.
    file_prefix, file_suffix : str
        Fixed start and end of the file names to count.

    Returns
    -------
    int
        Number of paths matched by the glob pattern.
    """
    pattern = directory + file_prefix + "*" + file_suffix
    matching_paths = glob.glob(pattern)
    n_files = len(matching_paths)

    if not dir_info:
        return n_files

    print('Number of files in directory {} : {}'.format(directory,n_files))
    return n_files

In [11]:
from datetime import datetime

def get_csvs(dir_info = True,
             directory = "200_by_100/",
             file_prefix = "X_raw_disease_200_target_100_",
             file_suffix = ".csv"):
    """Load every chunk file written so far and stack them into one frame.

    The chunk files are expected to be numbered consecutively from 0, i.e.
    `directory + file_prefix + str(i) + file_suffix` for i in 0..n_files-1,
    where n_files is the count returned by `dir_stats`.

    Parameters
    ----------
    dir_info : bool
        Forwarded to `dir_stats`; when truthy the file count is printed.
    directory, file_prefix, file_suffix : str
        Location and naming scheme of the chunk files.

    Returns
    -------
    done_diseases : list
        Row labels of the stacked frame (empty list when there is no file).
    done_targets : list
        Column labels of the stacked frame without 'COVID' (empty list when
        there is no file).
    n_files : int
        Number of chunk files found.
    df : pandas.DataFrame or None
        The stacked frame, or None when there is no file.

    Raises
    ------
    ValueError
        If the stacked frame has no 'COVID' column.
    """
    n_files = dir_stats(dir_info = dir_info,
                        directory = directory,
                        file_prefix = file_prefix,
                        file_suffix = file_suffix)

    if n_files == 0:
        return [], [], n_files, None

    # Read all chunks first and concatenate once: DataFrame.append copied the
    # whole accumulated frame on every iteration and no longer exists in
    # pandas >= 2.0.
    chunks = [pd.read_csv(directory + file_prefix + str(i) + file_suffix, index_col=0)
              for i in range(n_files)]
    df = pd.concat(chunks)

    done_diseases = df.index.tolist()
    done_targets = df.columns.tolist()
    # 'COVID' is the label column appended by print_dataset, not a target.
    done_targets.remove('COVID')

    return done_diseases, done_targets, n_files, df

In [12]:
def create_lists(list_info = False,
                 disease_ids = prediction_extended.efo_id.tolist(),
                 target_ids = targets.ensembl_id.tolist(),
                 n_diseases = 200,
                 n_targets = 100):
    """Draw a random, duplicate-free, sorted sample of disease ids and target ids.

    Sampling is done without replacement, so an id can never appear twice in
    a returned list. If fewer distinct ids are available than requested, all
    of them are returned (the list is then shorter than requested).

    Note that the default values of `disease_ids` and `target_ids` are
    evaluated once, when this function is defined.

    Parameters
    ----------
    list_info : bool
        When truthy, print the number of diseases and targets drawn.
    disease_ids, target_ids : iterable
        Pools of ids to sample from.
    n_diseases, n_targets : int
        Requested sample sizes.

    Returns
    -------
    (list, list)
        Sorted disease ids and sorted target ids.
    """
    samples = []
    # Diseases are drawn before targets, as in the original call order.
    for ids, n_requested in ((disease_ids, n_diseases), (target_ids, n_targets)):
        # Order-preserving de-duplication of the pool, so that sampling
        # without replacement really yields distinct ids.
        pool = list(dict.fromkeys(ids))
        size = min(n_requested, len(pool))
        if size > 0:
            sample = np.random.choice(pool, size, replace=False).tolist()
        else:
            # np.random.choice cannot sample from an empty pool.
            sample = []
        sample.sort()
        samples.append(sample)

    disease_list, target_list = samples

    if list_info:
        # Report what was actually drawn, which may be less than requested.
        print('Number of diseases : ', len(disease_list))
        print('Number of targets :  ', len(target_list))

    return disease_list, target_list

In [13]:
def update_lists(done_diseases: list, done_targets: list, n_diseases):
    """Sample diseases and targets that have not been processed yet.

    Parameters
    ----------
    done_diseases : list
        Disease ids to exclude from `prediction_extended.efo_id`.
    done_targets : list
        Target ids to exclude from `targets.ensembl_id`; its length is also
        the number of targets requested.
    n_diseases : int
        Number of diseases requested.

    Returns
    -------
    (list, list)
        The disease list and target list produced by `create_lists`.
    """
    disease_already_done = prediction_extended.efo_id.isin(done_diseases)
    target_already_done = targets.ensembl_id.isin(done_targets)

    remaining_disease_ids = prediction_extended.loc[~disease_already_done, 'efo_id'].tolist()
    remaining_target_ids = targets.loc[~target_already_done, 'ensembl_id'].tolist()

    new_diseases, new_targets = create_lists(list_info = False,
                                             disease_ids = remaining_disease_ids,
                                             target_ids = remaining_target_ids,
                                             n_diseases = n_diseases,
                                             n_targets = len(done_targets))

    return new_diseases, new_targets

Associations may be checked [here](https://www.targetvalidation.org/disease/EFO_0005774/associations) by replacing the disease ID (`EFO_0005774` in this example) in the URL.

Associations may also be queried with [the API](https://api.opentargets.io/v3/platform/docs/swagger-ui#/public/getAssociationFilter).

Here is a sample query : [https://platform-api.opentargets.io/v3/platform/public/association/filter?disease=DOID_0050890&fields=association_score.overall&fields=target.id&fields=disease.id&size=10000](https://platform-api.opentargets.io/v3/platform/public/association/filter?disease=DOID_0050890&fields=association_score.overall&fields=target.id&fields=disease.id&size=10000)

[Open Targets Data Download page](https://www.targetvalidation.org/downloads/data)

In [14]:
from opentargets import OpenTargetsClient

def build_dataset(disease_list: list, target_list: list):
    """Query Open Targets for every (disease, target) pair and build the raw matrix.

    Parameters
    ----------
    disease_list : list
        Disease ids; one matrix row per entry, in this order.
    target_list : list
        Target ids; one matrix column per entry, in this order.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(disease_list), len(target_list) + 1). Cell
        [d, t] holds the overall association score returned for that pair
        (0 when nothing is returned). The last column holds the `Score` of
        the disease, looked up in `prediction_extended` by `efo_id` (NaN when
        the disease id is not found there).
    """
    n_diseases = len(disease_list)
    n_targets = len(target_list)

    X_raw = np.zeros((n_diseases,
                      n_targets + 1))

    # The score must be looked up by disease id. Indexing
    # prediction_extended.Score with the row position inside disease_list
    # would return the scores of the first rows of prediction_extended,
    # whatever diseases were sampled.
    score_by_id = dict(zip(prediction_extended.efo_id, prediction_extended.Score))
    unknown_ids = [disease_id for disease_id in disease_list
                   if disease_id not in score_by_id]
    if unknown_ids:
        print('WARNING : no COVID score found for ', unknown_ids)
    X_raw[:, n_targets] = [score_by_id.get(disease_id, np.nan)
                           for disease_id in disease_list]

    ot = OpenTargetsClient()
    start_time = time()
    tmp_time = time()

    print('--------------------------------------------------')
    print('BUILD Start time : ', ctime())
    print('--------------------------------------------------')

    # NOTE(review): no joblib.Parallel call is made inside this context, so
    # the loops below run one request after the other; the context is kept
    # only to leave the execution environment unchanged.
    with parallel_backend('ipyparallel'):

        for disease_n, disease_id in enumerate(disease_list):

            for target_n, target_id in enumerate(target_list):
                search = ot.filter_associations(disease = disease_id,
                                                target = target_id,
                                                fields = ['association_score.overall',
                                                          'target.id',
                                                          'disease.id'])
                for r in search:
                    # Keep only records reported for exactly this disease id
                    # (presumably the API can also return related diseases;
                    # TODO confirm).
                    if r['disease']['id'] == disease_id:
                        X_raw[disease_n, target_n] = r['association_score']['overall']

            # Progress report every 5 diseases.
            if (disease_n + 1) % 5 == 0:
                print('--', disease_n + 1, ' of ', n_diseases)
                print('Time : {:.2f} seconds.'.format(time() - start_time))
                print('Time since last print : {:.2f} seconds.'.format(time() - tmp_time))
                tmp_time = time()

    print('--------------------------------------------------')
    print('BUILD End time : ', ctime())
    print('--------------------------------------------------')
    return X_raw

In [15]:
def print_dataset(df: pd.DataFrame,
                  disease_list: list,
                  target_list: list,
                  directory = "200_by_100/",
                  file_prefix = "X_raw_disease_200_target_100_",
                  file_n = "0",
                  file_suffix = ".csv"):
    """Write one chunk of the dataset to a CSV file.

    Parameters
    ----------
    df : array-like or pandas.DataFrame
        Data with one row per disease and one column per target plus a final
        column for the COVID score.
    disease_list : list
        Row labels.
    target_list : list
        Column labels for the targets; 'COVID' is appended as the last label.
    directory, file_prefix, file_n, file_suffix
        Concatenated (with `file_n` converted to str) to form the output path.
    """
    column_labels = target_list + ['COVID']
    output_path = directory + file_prefix + str(file_n) + file_suffix

    labelled = pd.DataFrame(df, index = disease_list, columns = column_labels)
    labelled.to_csv(output_path)

In [16]:
def print_full_dataset(dir_info = True,
                       directory = "200_by_100/",
                       file_prefix = "X_raw_disease_200_target_100_",
                       file_suffix = ".csv"):
    """Stack all chunk files and write them as one timestamped CSV.

    The output file is written to `directory` and named
    "Full-<YYYY_mm_dd-HH_MM_SS->" + `file_prefix` + "<rows>_by_<columns>"
    + `file_suffix`. Because the name starts with "Full-", it is not counted
    as a chunk file by the `file_prefix` + * glob used in `dir_stats`.

    Parameters
    ----------
    dir_info : bool
        Forwarded to `get_csvs`; when truthy the chunk count is printed.
    directory, file_prefix, file_suffix : str
        Location and naming scheme of the chunk files.

    Raises
    ------
    FileNotFoundError
        If no chunk file exists, so there is nothing to write.
    """
    _, _, _, df = get_csvs(dir_info = dir_info,
                           directory = directory,
                           file_prefix = file_prefix,
                           file_suffix = file_suffix)

    # get_csvs returns None instead of a frame when no chunk file was found;
    # fail with an explicit message rather than an AttributeError on None.
    if df is None:
        raise FileNotFoundError(
            'No file matching {}{}*{} : nothing to write.'.format(directory,
                                                                  file_prefix,
                                                                  file_suffix))

    dt_string = datetime.now().strftime("%Y_%m_%d-%H_%M_%S-")
    n_rows, n_cols = df.shape
    df.to_csv(directory
              + "Full-" + dt_string
              + file_prefix
              + str(n_rows) + '_by_' + str(n_cols)
              + file_suffix)

In [17]:
import warnings

def fxn():
    """Emit a DeprecationWarning with the message "deprecated"."""
    warnings.warn(message="deprecated", category=DeprecationWarning)

# The "ignore" filter is only active inside this `with` block: the previous
# warning filters are restored as soon as the block exits.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    fxn()

In [18]:
# MAIN

def create_dataset(n_runs = 1,
                   dir_info = True,
                   directory = "200_by_100/",
                   file_prefix = "X_raw_disease_200_target_100_",
                   file_suffix = ".csv",
                   list_info = False,
                   disease_ids = prediction_extended.efo_id.tolist(),
                   target_ids = targets.ensembl_id.tolist(),
                   n_diseases = 100,
                   n_targets = 200):
    """Build and save `n_runs` new chunks of the dataset.

    Each run:
      1. reads the existing chunk files (`get_csvs`);
      2. picks the diseases and targets of the new chunk: a fresh random
         sample of both when no chunk exists yet, otherwise new diseases that
         are not in any existing chunk and the target columns of the existing
         chunks;
      3. queries the associations (`build_dataset`);
      4. writes the chunk (`print_dataset`) and a timestamped copy of the
         whole dataset (`print_full_dataset`).

    When fewer than `n_diseases` unprocessed diseases remain, the chunk is
    built with the remaining ones; when none remain, the runs stop early.

    Parameters
    ----------
    n_runs : int
        Number of chunks to build.
    dir_info : bool
        When truthy, the number of existing chunk files is printed.
    directory, file_prefix, file_suffix : str
        Location and naming scheme of the chunk files.
    list_info : bool
        Forwarded to `create_lists` for the first chunk.
    disease_ids, target_ids : list
        Pools used for the first chunk only; later chunks draw diseases from
        `prediction_extended` through `update_lists`. The defaults are
        evaluated once, when this function is defined.
    n_diseases : int
        Number of diseases per chunk.
    n_targets : int
        Number of targets; only used for the first chunk.

    NOTE(review): the defaults n_diseases=100 / n_targets=200 do not match
    the "200_by_100" naming of the default directory and prefix; confirm
    which one is intended.
    """
    separator = '============================================================'

    g_start_time = ctime()
    print('PROGRAM START : ', g_start_time)
    print(separator)
    print(separator)
    g_start_time_time = time()

    for _run in range(n_runs):
        start_time = ctime()

        done_diseases, done_targets, n_files, _ = get_csvs(dir_info = dir_info,
                                                           directory = directory,
                                                           file_prefix = file_prefix,
                                                           file_suffix = file_suffix)

        if n_files == 0:
            disease_list, target_list = create_lists(list_info = list_info,
                                                     disease_ids = disease_ids,
                                                     target_ids = target_ids,
                                                     n_diseases = n_diseases,
                                                     n_targets = n_targets)
        else:
            # Never ask for more distinct diseases than are left: redrawing
            # until a full-size sample has no duplicate cannot terminate
            # once fewer than n_diseases diseases remain.
            not_done = ~prediction_extended.efo_id.isin(done_diseases)
            n_remaining = prediction_extended.loc[not_done, 'efo_id'].nunique()
            batch_size = min(n_diseases, n_remaining)
            if batch_size == 0:
                print('No disease left to process, stopping.')
                break

            # Accumulate distinct ids over successive draws. Ids already
            # picked are excluded from the next draw, so each draw can only
            # bring new ids.
            picked = set()
            while len(picked) < batch_size:
                drawn, _ = update_lists(done_diseases = done_diseases + list(picked),
                                        done_targets = done_targets,
                                        n_diseases = batch_size - len(picked))
                picked.update(drawn)

            disease_list = sorted(picked)
            # Every chunk must share the target columns of the existing ones.
            target_list = done_targets

        init_time = ctime()
        init_time_time = time()

        X_raw = build_dataset(disease_list, target_list)

        # Chunks are numbered from 0, so the next free number is n_files.
        print_dataset(df = X_raw,
                      disease_list = disease_list,
                      target_list = target_list,
                      directory = directory,
                      file_prefix = file_prefix,
                      file_n = n_files,
                      file_suffix = file_suffix)

        print_full_dataset(dir_info = dir_info,
                           directory = directory,
                           file_prefix = file_prefix,
                           file_suffix = file_suffix)

        done_time = ctime()
        # Measured from the end of the list initialisation (build + write).
        done_time_length = time() - init_time_time

        print(separator)
        print('RUN STARTED : ', start_time)
        print('INIT DONE :   ', init_time)
        print('RUN DONE :    ', done_time)
        print(separator)
        print('Total execution time : {}'.format(strftime("%H:%M:%S", gmtime(done_time_length))))
        print(separator)

    g_done_time = ctime()
    g_done_time_length = time() - g_start_time_time
    print(separator)
    print(separator)
    print('PROGRAM DONE : ', g_done_time)
    print('PROGRAM DONE IN {} seconds'.format(strftime("%H:%M:%S", gmtime(g_done_time_length))))

In [None]:
# Number of chunks to build in this execution.
n_runs = 3

# Location and naming scheme of the chunk files (also reused by the check cell below).
dir_info = True
directory = "50_by_100/"
file_prefix = "X_raw_disease_50_target_100_"
file_suffix = ".csv"

# Sampling settings. disease_ids, target_ids and n_targets are only used by
# create_dataset when no chunk file exists yet; later chunks reuse the target
# columns of the existing files.
list_info = False
disease_ids = prediction_extended.efo_id.tolist()
target_ids = targets.ensembl_id.tolist()
n_diseases = 50
n_targets = 100

create_dataset(n_runs = n_runs,
               dir_info = dir_info,
               directory = directory,
               file_prefix = file_prefix,
               file_suffix = file_suffix,
               list_info = list_info,
               disease_ids = disease_ids,
               target_ids = target_ids,
               n_diseases = n_diseases,
               n_targets = n_targets)

PROGRAM START :  Sun May 10 11:25:04 2020
Number of files in directory 50_by_100/ : 141


In [22]:
# Reload every chunk written so far and check that no disease was processed twice.
done_diseases, done_targets, n_files, _ = get_csvs(
    dir_info = dir_info,
    directory = directory,
    file_prefix = file_prefix,
    file_suffix = file_suffix,
)

n_done = len(done_diseases)
n_unique = len(np.unique(done_diseases))
print('Done diseases : ', n_done)
print('Unique diseases : ', n_unique)

import collections
print('============================================================')
# Ids that label more than one row of the stacked chunks.
occurrences = collections.Counter(done_diseases)
print('Duplicates : ', [disease for disease in occurrences if occurrences[disease] > 1])

Number of files in directory 50_by_100/ : 141
Done diseases :  7050
Unique diseases :  7050
Duplicates :  []


In [23]:
# Size of the disease pool, to compare with the number of diseases already processed above.
len(disease_ids)

7217