# Project

Import libraries

In [1]:
import pandas as pd
import numpy as np

Import parallel computing libraries

In [2]:
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend
from joblib import Parallel, parallel_backend, register_parallel_backend

In [3]:
# Connect to the ipyparallel cluster running under the 'default' profile.
c = Client(profile='default')
print('profile:', c.profile)
print("IDs:", c.ids)  # ids reported by the client for the connected engines

# View handed to the joblib backend registered below.
bview = c.load_balanced_view()


def _ipyparallel_backend_factory():
    """Return a joblib backend bound to the notebook's `bview`."""
    return IPythonParallelBackend(view=bview)


# Make the backend selectable by name via parallel_backend('ipyparallel').
register_parallel_backend('ipyparallel', _ipyparallel_backend_factory)

profile: default
IDs: [0, 1, 2, 3]


Read datasets

In [4]:
# Predicted risk-factor scores, read from a CSV next to the notebook.
predictions = pd.read_csv('predicted_covid19_risk_factors.csv')

# Target and disease lists, downloaded as gzip-compressed CSV files.
targets = pd.read_csv(
    'https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_target_list.csv.gz',
    compression='gzip',
)
diseases = pd.read_csv(
    'https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_disease_list.csv.gz',
    compression='gzip',
)

Explore predictions dataset

In [5]:
# Preview the first five rows of the predictions table.
predictions.head(n=5)

Unnamed: 0.1,Unnamed: 0,Risk Factors,Score
0,0,Platelet storage pool disease,0.852844
1,1,abnormality of the urinary system physiology,0.828951
2,2,Bronchiolitis,0.781315
3,3,endocrine system disease,0.774616
4,4,Hernia,0.766383


Describe predictions dataset

In [6]:
# Summary of the risk-factor names (count, unique, top, freq).
predictions.loc[:, 'Risk Factors'].describe()

count                  7217
unique                 7217
top       Citrin deficiency
freq                      1
Name: Risk Factors, dtype: object

In [7]:
# Preview the first five rows of the disease list.
diseases.head(n=5)

Unnamed: 0,efo_id,disease_full_name,number_of_associations
0,EFO_1000984,inflammatory breast carcinoma,174
1,MONDO_0004093,esophageal basaloid carcinoma,1
2,EFO_0006352,laryngeal squamous cell carcinoma,796
3,EFO_1000514,Salivary Gland Adenosquamous Carcinoma,0
4,EFO_1001965,pharyngeal squamous cell carcinoma,428


In [8]:
#test = diseases.sort_values(by = 'efo_id')
#test.disease_full_name.tolist().index('inflammatory breast carcinoma')


In [9]:
# Attach an efo_id to every predicted risk factor by matching its name against
# disease_full_name. This is a left join, so risk factors with no match are
# kept with a missing efo_id / disease_full_name.
_merged = predictions.merge(
    diseases,
    how='left',
    left_on='Risk Factors',
    right_on='disease_full_name',
)

# Keep only the id, name and score; order by efo_id and renumber the rows.
prediction_extended = (
    _merged[['efo_id', 'disease_full_name', 'Score']]
    .sort_values(by='efo_id', ascending=True)
    .reset_index(drop=True)
)
prediction_extended.head()

Unnamed: 0,efo_id,disease_full_name,Score
0,DOID_0050890,synucleinopathy,0.001827
1,EFO_0000094,B-cell acute lymphoblastic leukemia,0.001435
2,EFO_0000095,chronic lymphocytic leukemia,0.002511
3,EFO_0000096,neoplasm of mature B-cells,0.003721
4,EFO_0000174,Ewing sarcoma,0.000201


In [10]:
# Spot check: find the row position of one efo_id, then read its Score.
_efo_row = prediction_extended['efo_id'].tolist().index('EFO_0000094')
prediction_extended['Score'][_efo_row]

0.0014345049858093262

In [11]:
# Draw a reproducible random subset of targets and diseases.
#
# Sampling is done WITHOUT replacement from the de-duplicated, non-missing ids:
# np.random.choice defaults to replace=True, which can put the same id in the
# list several times and therefore duplicate rows/columns of the feature matrix.
# Missing efo_id values (risk factors that did not match a disease in the left
# merge) are excluded so they can never be sampled.
SAMPLE_SIZE = 200
SAMPLE_SEED = 42
_sampler = np.random.RandomState(SAMPLE_SEED)

_target_pool = targets.ensembl_id.dropna().unique()
_disease_pool = prediction_extended.efo_id.dropna().unique()

# Cap the sample at the pool size so replace=False cannot raise on small pools.
target_list = sorted(
    _sampler.choice(_target_pool,
                    size=min(SAMPLE_SIZE, len(_target_pool)),
                    replace=False).tolist())
disease_list = sorted(
    _sampler.choice(_disease_pool,
                    size=min(SAMPLE_SIZE, len(_disease_pool)),
                    replace=False).tolist())

print(len(target_list))
print(len(disease_list))

features = target_list + disease_list

200
200


In [None]:
from opentargets import OpenTargetsClient
from time import time

# Feature matrix: one row per sampled disease, one column per sampled target,
# plus a final column holding the disease's predicted COVID-19 score.
X_raw = np.zeros((len(disease_list),
                  len(target_list) + 1))
covid_col = len(target_list)

# Score lookup keyed by efo_id. The score must be fetched by disease id:
# disease_list is a random sample, so its position `disease_n` does not match
# the row number of that disease in prediction_extended.
score_by_efo = (prediction_extended
                .dropna(subset=['efo_id'])
                .drop_duplicates(subset='efo_id')
                .set_index('efo_id')['Score'])

ot = OpenTargetsClient()
start_time = time()
tmp_time = time()

# NOTE(review): parallel_backend only selects the backend for joblib.Parallel
# calls made inside the block; this loop makes none, so the queries appear to
# run sequentially in the kernel — confirm whether parallelism was intended.
with parallel_backend('ipyparallel'):
    for disease_n, disease_id in enumerate(disease_list):
        for target_n, target_id in enumerate(target_list):
            search = ot.filter_associations(disease = disease_id,
                                        target = target_id,
                                        fields = ['association_score.overall', 'target.id', 'disease.id'])
            for r in search:
                # Keep only associations reported for exactly this disease id.
                if r['disease']['id'] == disease_id:
                    X_raw[disease_n, target_n] = r['association_score']['overall']
        # NaN marks a disease id with no known score instead of raising
        # part-way through the long-running loop.
        X_raw[disease_n, covid_col] = score_by_efo.get(disease_id, np.nan)
        print('-----X_raw creation : {:.2f} % done.'.format((disease_n + 1) / len(disease_list) * 100))
        print('-----', disease_n + 1, ' of ', len(disease_list))
        if (disease_n + 1) % 5 == 0:
            print('---Time : {:.2f} seconds.'.format(time() - start_time))
            print('---Time since last print : {:.2f} seconds.'.format(time() - tmp_time))
            tmp_time = time()

-----X_raw creation : 0.50 % done.
----- 1  of  200
-----X_raw creation : 1.00 % done.
----- 2  of  200
-----X_raw creation : 1.50 % done.
----- 3  of  200
-----X_raw creation : 2.00 % done.
----- 4  of  200
-----X_raw creation : 2.50 % done.
----- 5  of  200
---Time : 87.20 seconds.
---Time since last print : 87.20 seconds.
-----X_raw creation : 3.00 % done.
----- 6  of  200
-----X_raw creation : 3.50 % done.
----- 7  of  200
-----X_raw creation : 4.00 % done.
----- 8  of  200
-----X_raw creation : 4.50 % done.
----- 9  of  200
-----X_raw creation : 5.00 % done.
----- 10  of  200
---Time : 182.06 seconds.
---Time since last print : 94.85 seconds.
-----X_raw creation : 5.50 % done.
----- 11  of  200
-----X_raw creation : 6.00 % done.
----- 12  of  200
-----X_raw creation : 6.50 % done.
----- 13  of  200
-----X_raw creation : 7.00 % done.
----- 14  of  200
-----X_raw creation : 7.50 % done.
----- 15  of  200
---Time : 278.05 seconds.
---Time since last print : 96.00 seconds.
-----X_raw 

-----X_raw creation : 61.50 % done.
----- 123  of  200
-----X_raw creation : 62.00 % done.
----- 124  of  200
-----X_raw creation : 62.50 % done.
----- 125  of  200
---Time : 2350.93 seconds.
---Time since last print : 117.93 seconds.
-----X_raw creation : 63.00 % done.
----- 126  of  200
-----X_raw creation : 63.50 % done.
----- 127  of  200


In [None]:
import os

# Rows of X_raw are diseases; columns are the sampled targets followed by the
# COVID score column, so the column labels come from target_list.
os.makedirs("test", exist_ok=True)  # to_csv does not create missing directories
pd.DataFrame(X_raw, index = disease_list, columns = target_list + ['COVID']).to_csv("test/X_raw_short.csv")

In [None]:
# Preview the first five rows of the raw feature matrix.
pd.DataFrame(data=X_raw).head(n=5)