# Project

## Initialization

Import libraries

In [1]:
import pandas as pd
import numpy as np

Import the parallel computing libraries and register the ipyparallel engines as a joblib backend

In [2]:
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend
from joblib import Parallel, parallel_backend, register_parallel_backend

In [3]:
# Connect to the ipyparallel cluster that runs under the 'default' profile.
c = Client(profile='default')
print('profile:', c.profile)
print("IDs:", c.ids) # Process id numbers
# Register a joblib backend named 'ipyparallel' that dispatches work through a
# load-balanced view of the cluster; later cells select it by that name.
bview = c.load_balanced_view()
register_parallel_backend('ipyparallel',
                          lambda : IPythonParallelBackend(view=bview))

profile: default
IDs: [0, 1, 2, 3]


Read datasets

In [4]:
# Predicted COVID-19 risk factors, read from a CSV in the working directory.
predictions = pd.read_csv('predicted_covid19_risk_factors.csv')
# Open Targets release 20.02 target list (gzip-compressed CSV fetched by URL).
targets = pd.read_csv('https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_target_list.csv.gz',
                          compression='gzip')
# Open Targets release 20.02 disease list (gzip-compressed CSV fetched by URL).
diseases = pd.read_csv('https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_disease_list.csv.gz',
                           compression='gzip')

Explore predictions dataset

In [5]:
# Preview the first rows of the predictions table.
predictions.head()

Unnamed: 0.1,Unnamed: 0,Risk Factors,Score
0,0,Platelet storage pool disease,0.852844
1,1,abnormality of the urinary system physiology,0.828951
2,2,Bronchiolitis,0.781315
3,3,endocrine system disease,0.774616
4,4,Hernia,0.766383


Describe predictions dataset

In [6]:
# Summarise the risk-factor names; count equal to unique means no name is repeated.
predictions['Risk Factors'].describe()

count                                     7217
unique                                    7217
top       adrenocortical carcinoma, hereditary
freq                                         1
Name: Risk Factors, dtype: object

In [7]:
# Preview the first rows of the Open Targets disease list.
diseases.head()

Unnamed: 0,efo_id,disease_full_name,number_of_associations
0,EFO_1000984,inflammatory breast carcinoma,174
1,MONDO_0004093,esophageal basaloid carcinoma,1
2,EFO_0006352,laryngeal squamous cell carcinoma,796
3,EFO_1000514,Salivary Gland Adenosquamous Carcinoma,0
4,EFO_1001965,pharyngeal squamous cell carcinoma,428


Link each disease's EFO code to its predicted COVID-19 score.

In [8]:
# Attach the Open Targets disease ID to every predicted risk factor by matching
# the risk-factor name against the disease's full name. The left join keeps all
# prediction rows, so a risk factor with no exact name match gets a missing
# efo_id and disease_full_name. The result is ordered by efo_id and re-indexed
# from 0.
prediction_extended = (
    predictions
    .merge(diseases,
           how = 'left',
           left_on = 'Risk Factors',
           right_on = 'disease_full_name')
    [['efo_id', 'disease_full_name', 'Score']]
    .sort_values(by = 'efo_id', ascending = True)
    .reset_index(drop = True)
)
prediction_extended.head()

Unnamed: 0,efo_id,disease_full_name,Score
0,DOID_0050890,synucleinopathy,0.001827
1,EFO_0000094,B-cell acute lymphoblastic leukemia,0.001435
2,EFO_0000095,chronic lymphocytic leukemia,0.002511
3,EFO_0000096,neoplasm of mature B-cells,0.003721
4,EFO_0000174,Ewing sarcoma,0.000201


In [9]:
# Report the row count of the merged table (one row per predicted risk factor).
print('There are {} lines (i.e. diseases) in the dataset.'.format(prediction_extended.shape[0]))

There are 7217 lines (i.e. diseases) in the dataset.


---

## Function definitions

In [10]:
import glob

def dir_stats(show = True,
              directory = "200_by_100/",
              file_prefix = "X_raw_disease_200_target_100",
              file_suffix = ".csv"):
    """Count the files matching ``directory + file_prefix + "*" + file_suffix``.

    Parameters
    ----------
    show : bool
        When true, print the count together with the directory name.
    directory : str
        Directory part of the glob pattern; it is concatenated as-is, so it
        must carry its own trailing separator.
    file_prefix : str
        Start of the file names to count.
    file_suffix : str
        End of the file names to count.

    Returns
    -------
    int
        Number of paths matched by the glob pattern.
    """
    pattern = "".join([directory, file_prefix, "*", file_suffix])
    matches = glob.glob(pattern)
    n_files = len(matches)

    if not show:
        return n_files

    print('There are {} files in directory {}'.format(n_files, directory))
    return n_files

In [11]:
def get_csvs(directory = "200_by_100/",
             file_prefix = "X_raw_disease_200_target_100",
             file_suffix = ".csv"):
    """Collect the disease and target IDs already stored in previous result files.

    The files are expected to be named
    ``directory + file_prefix + str(i) + file_suffix`` for ``i`` in
    ``0 .. n_files - 1``, where ``n_files`` is the count reported by
    ``dir_stats``.

    Parameters
    ----------
    directory : str
        Directory holding the result files (with trailing separator).
    file_prefix : str
        Common start of the result file names.
    file_suffix : str
        Common end of the result file names.

    Returns
    -------
    done_diseases : list
        Row labels (disease IDs) found across all result files.
    done_targets : list
        Column labels of the result files without the 'COVID' column.
    n_files : int
        Number of result files found.
    """
    n_files = dir_stats(show = True,
                        directory = directory,
                        file_prefix = file_prefix,
                        file_suffix = file_suffix)

    if n_files == 0:
        return [], [], n_files

    # The result files are written with the disease IDs as the DataFrame index,
    # i.e. as the first CSV column. Reading that column back as the index is
    # what makes `df.index` hold disease IDs instead of 0..n-1, and keeps the
    # saved index from showing up as a spurious 'Unnamed: 0' target.
    frames = [pd.read_csv(directory + file_prefix + str(i) + file_suffix,
                          index_col = 0)
              for i in range(n_files)]
    # Concatenate once rather than growing the frame file by file.
    df = pd.concat(frames)

    done_diseases = df.index.tolist()
    done_targets = [column for column in df.columns if column != 'COVID']

    return done_diseases, done_targets, n_files

In [12]:
def create_lists(show = False,
                 disease_ids = None,
                 target_ids = None,
                 n_diseases = 200,
                 n_targets = 100):
    """Draw a random, sorted sample of disease IDs and of target IDs.

    Sampling is done without replacement, so no ID appears twice in a list.
    Missing values and repeated IDs are removed from the pools before sampling.

    Parameters
    ----------
    show : bool
        When true, print the requested sample sizes.
    disease_ids : Series-like, optional
        Pool of disease IDs. Defaults to ``prediction_extended.efo_id``,
        looked up when the function is called so it reflects the current table.
    target_ids : Series-like, optional
        Pool of target IDs. Defaults to ``targets.ensembl_id``, also looked up
        at call time.
    n_diseases : int
        Number of diseases to draw.
    n_targets : int
        Number of targets to draw.

    Returns
    -------
    disease_list, target_list : list, list
        The two samples, each sorted in ascending order.

    Raises
    ------
    ValueError
        If more IDs are requested than the corresponding pool contains.
    """
    # Resolve the defaults at call time: binding the global Series in the
    # signature would freeze whatever they were when this cell was executed.
    if disease_ids is None:
        disease_ids = prediction_extended.efo_id
    if target_ids is None:
        target_ids = targets.ensembl_id

    disease_pool = pd.Series(disease_ids).dropna().drop_duplicates().tolist()
    target_pool = pd.Series(target_ids).dropna().drop_duplicates().tolist()

    # replace=False: np.random.choice samples WITH replacement by default,
    # which can return the same disease or target several times.
    disease_list = sorted(np.random.choice(disease_pool, n_diseases, replace = False).tolist())
    target_list = sorted(np.random.choice(target_pool, n_targets, replace = False).tolist())

    if show:
        print('Number of diseases : ', n_diseases)
        print('Number of targets :  ', n_targets)

    return disease_list, target_list

In [13]:
def update_lists(done_diseases: list, done_targets: list):
    """Build the disease and target lists for a follow-up run.

    Parameters
    ----------
    done_diseases : list
        Disease IDs that already appear in previous result files.
    done_targets : list
        Target IDs used by the previous result files.

    Returns
    -------
    disease_list : list
        Every non-missing ``efo_id`` of ``prediction_extended`` that is not in
        ``done_diseases``, in the table's order.
    target_list : list
        A copy of ``done_targets``, so new files keep the same columns.
    """
    not_done = ~prediction_extended.efo_id.isin(done_diseases)
    # Return the values computed here. The previous version returned the names
    # `disease_list` / `target_list`, which this function never assigned.
    disease_list = prediction_extended.loc[not_done, 'efo_id'].dropna().tolist()
    target_list = list(done_targets)

    # NOTE(review): this returns ALL remaining diseases, not a batch of 200 --
    # confirm whether the caller should subsample before querying the API.
    return disease_list, target_list

In [14]:
from opentargets import OpenTargetsClient
from time import time, ctime

def create_dataset(disease_list: list, target_list: list):
    """Query Open Targets for every (disease, target) pair and build the score matrix.

    Parameters
    ----------
    disease_list : list
        Disease IDs; one matrix row per entry, in this order.
    target_list : list
        Target IDs; one matrix column per entry, in this order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(disease_list), len(target_list) + 1)``. Cell
        ``[i, j]`` holds the overall association score returned for disease
        ``i`` and target ``j`` (0 when no matching association is returned).
        The last column holds the disease's ``Score`` from
        ``prediction_extended`` (NaN if the disease ID is not in that table).
    """
    n_diseases = len(disease_list)
    n_targets = len(target_list)

    X_raw = np.zeros((n_diseases,
                      n_targets + 1))

    # Map disease ID -> predicted score. The score has to be looked up by ID:
    # indexing `prediction_extended.Score` with the loop counter returns the
    # score of whichever disease sits at that row position, not of the disease
    # being processed.
    score_by_disease = (prediction_extended
                        .dropna(subset = ['efo_id'])
                        .drop_duplicates(subset = 'efo_id')
                        .set_index('efo_id')['Score'])

    ot = OpenTargetsClient()
    start_time = time()
    tmp_time = time()

    print('Start time : ', ctime())
    # NOTE(review): no joblib `Parallel(...)` call is made inside this context,
    # so the queries below still run one after another -- confirm whether the
    # loop was meant to be dispatched to the ipyparallel engines.
    with parallel_backend('ipyparallel'):

        for disease_n, disease_id in enumerate(disease_list):

            for target_n, target_id in enumerate(target_list):
                search = ot.filter_associations(disease = disease_id,
                                                target = target_id,
                                                fields = ['association_score.overall',
                                                          'target.id',
                                                          'disease.id'])
                for r in search:
                    # Keep only results reported for exactly this disease ID.
                    if r['disease']['id'] == disease_id:
                        X_raw[disease_n, target_n] = r['association_score']['overall']

            X_raw[disease_n, n_targets] = score_by_disease.get(disease_id, np.nan)
            print('-----', disease_n + 1, ' of ', n_diseases)
            if (disease_n + 1) % 5 == 0:
                print('---Time : {:.2f} seconds.'.format(time() - start_time))
                print('---Time since last print : {:.2f} seconds.'.format(time() - tmp_time))
                tmp_time = time()

    print('End time : ', ctime())
    return X_raw

In [15]:
def print_dataset(df: pd.DataFrame,
                  disease_list: list,
                  target_list: list,
                  directory = "200_by_100/",
                  file_prefix = "X_raw_disease_200_target_100",
                  file_n = "0",
                  file_suffix = ".csv"):
    """Write the score matrix to ``directory + file_prefix + file_n + file_suffix``.

    Parameters
    ----------
    df : array-like or pandas.DataFrame
        Matrix with one row per disease and one column per target plus a final
        COVID score column (anything ``pd.DataFrame`` accepts as data).
    disease_list : list
        Row labels, written as the first CSV column.
    target_list : list
        Column labels for the target columns; 'COVID' is appended for the
        last column.
    directory : str
        Output directory (with trailing separator); created if missing.
    file_prefix : str
        Start of the output file name.
    file_n : str or int
        File number inserted between prefix and suffix.
    file_suffix : str
        End of the output file name.
    """
    import os

    # Create the output directory up front so the write cannot fail on a
    # missing folder. An empty string means "current directory".
    if directory:
        os.makedirs(directory, exist_ok = True)

    # str(): callers pass the file count as an int, which cannot be
    # concatenated to the string parts of the path.
    path = directory + file_prefix + str(file_n) + file_suffix

    pd.DataFrame(df,
                 index = disease_list,
                 columns = list(target_list) + ['COVID']
                ).to_csv(path)

In [None]:
# MAIN
# Build one more result file: reuse the lists derived from existing files when
# there are any, otherwise draw a fresh random sample of diseases and targets.
import os

directory = "200_by_100/"
file_prefix = "X_raw_disease_200_target_100"
file_suffix = ".csv"

# Make sure the output directory exists before starting the long API run, so
# the final write cannot fail on a missing folder.
os.makedirs(directory, exist_ok = True)

done_diseases, done_targets, n_files = get_csvs(directory = directory,
                                                file_prefix = file_prefix,
                                                file_suffix = file_suffix)

show = False
disease_ids = prediction_extended.efo_id
target_ids = targets.ensembl_id
n_diseases = 200
n_targets = 100

if n_files == 0:
    disease_list, target_list = create_lists(show = show,
                                             disease_ids = disease_ids,
                                             target_ids = target_ids,
                                             n_diseases = n_diseases,
                                             n_targets = n_targets)
else:
    disease_list, target_list = update_lists(done_diseases = done_diseases,
                                             done_targets = done_targets)

X_raw = create_dataset(disease_list, target_list)

# Existing files are numbered from 0, so the next free number equals the file
# count. It is passed as a string because the file name is built by string
# concatenation.
print_dataset(df = X_raw,
              disease_list = disease_list,
              target_list = target_list,
              directory = directory,
              file_prefix = file_prefix,
              file_n = str(n_files),
              file_suffix = file_suffix)

There are 0 files in directory 200_by_100/


  self.api_specs = yaml.load(self.swagger_yaml)


Start time :  Fri May  8 18:23:48 2020
----- 1  of  200
----- 2  of  200
----- 3  of  200
----- 4  of  200
----- 5  of  200
---Time : 39.15 seconds.
---Time since last print : 39.15 seconds.
----- 6  of  200
----- 7  of  200
----- 8  of  200
----- 9  of  200
----- 10  of  200
---Time : 79.79 seconds.
---Time since last print : 40.64 seconds.
----- 11  of  200
----- 12  of  200
----- 13  of  200
----- 14  of  200
----- 15  of  200
---Time : 121.57 seconds.
---Time since last print : 41.78 seconds.
----- 16  of  200
----- 17  of  200
----- 18  of  200
----- 19  of  200
----- 20  of  200
---Time : 159.18 seconds.
---Time since last print : 37.61 seconds.
----- 21  of  200
----- 22  of  200
----- 23  of  200
----- 24  of  200
----- 25  of  200
---Time : 202.29 seconds.
---Time since last print : 43.11 seconds.
----- 26  of  200
----- 27  of  200
----- 28  of  200
----- 29  of  200
----- 30  of  200
---Time : 244.18 seconds.
---Time since last print : 41.89 seconds.
----- 31  of  200
----- 