# Project

Import libraries

In [1]:
import pandas as pd
import numpy as np

Import parallel computing libraries

In [2]:
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend
from joblib import Parallel, parallel_backend, register_parallel_backend

In [3]:
# Connect to the ipyparallel cluster started under the 'default' profile and
# report which engines answered.
c = Client(profile='default')
print(f'profile: {c.profile}')
print(f"IDs: {c.ids}")  # engine id numbers
bview = c.load_balanced_view()


def _make_ipyparallel_backend():
    """Build the joblib backend that dispatches work through ``bview``."""
    return IPythonParallelBackend(view=bview)


# Expose the cluster to joblib under the backend name 'ipyparallel'.
register_parallel_backend('ipyparallel', _make_ipyparallel_backend)

profile: default
IDs: [0, 1, 2, 3]


Read datasets

In [4]:
# Locations of the three input tables: the local model predictions and the
# Open Targets 20.02 target / disease lists (gzip-compressed CSV).
PREDICTIONS_CSV = 'predicted_covid19_risk_factors.csv'
TARGET_LIST_URL = 'https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_target_list.csv.gz'
DISEASE_LIST_URL = 'https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_disease_list.csv.gz'

predictions = pd.read_csv(PREDICTIONS_CSV)
targets = pd.read_csv(TARGET_LIST_URL, compression='gzip')
diseases = pd.read_csv(DISEASE_LIST_URL, compression='gzip')

Explore predictions dataset

In [5]:
# Preview the first rows of the predictions table.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the output look like
# row indexes written by earlier to_csv calls - TODO confirm and drop at read time.
predictions.head()

Unnamed: 0.1,Unnamed: 0,Risk Factors,Score
0,0,Platelet storage pool disease,0.852844
1,1,abnormality of the urinary system physiology,0.828951
2,2,Bronchiolitis,0.781315
3,3,endocrine system disease,0.774616
4,4,Hernia,0.766383


Describe predictions dataset

In [6]:
# Summarise the 'Risk Factors' column; count == unique in the output means
# every risk-factor name appears exactly once.
predictions['Risk Factors'].describe()

count                                                  7217
unique                                                 7217
top       46,XX disorder of sex development induced by a...
freq                                                      1
Name: Risk Factors, dtype: object

In [7]:
# Preview the Open Targets disease list (efo_id, disease_full_name, number_of_associations).
diseases.head()

Unnamed: 0,efo_id,disease_full_name,number_of_associations
0,EFO_1000984,inflammatory breast carcinoma,174
1,MONDO_0004093,esophageal basaloid carcinoma,1
2,EFO_0006352,laryngeal squamous cell carcinoma,796
3,EFO_1000514,Salivary Gland Adenosquamous Carcinoma,0
4,EFO_1001965,pharyngeal squamous cell carcinoma,428


In [8]:
# Disabled scratch check: position of 'inflammatory breast carcinoma' in the
# disease list once it is sorted by efo_id.
#test = diseases.sort_values(by = 'efo_id')
#test.disease_full_name.tolist().index('inflammatory breast carcinoma')


In [9]:
# Attach the Open Targets efo_id to every predicted risk factor by matching the
# risk-factor name against disease_full_name, keep only the id / name / score
# columns, then order the rows by efo_id with a fresh 0..n-1 index.
prediction_extended = (
    predictions
    .merge(
        diseases,
        how='left',
        left_on='Risk Factors',
        right_on='disease_full_name',
    )
    .loc[:, ['efo_id', 'disease_full_name', 'Score']]
    .sort_values(by='efo_id', ascending=True)
    .reset_index(drop=True)
)
prediction_extended.head()

Unnamed: 0,efo_id,disease_full_name,Score
0,DOID_0050890,synucleinopathy,0.001827
1,EFO_0000094,B-cell acute lymphoblastic leukemia,0.001435
2,EFO_0000095,chronic lymphocytic leukemia,0.002511
3,EFO_0000096,neoplasm of mature B-cells,0.003721
4,EFO_0000174,Ewing sarcoma,0.000201


In [20]:
# Spot check: find the row position of EFO_0000094 in the efo_id column and read
# the Score stored under that index label. Raises ValueError if the id is absent.
prediction_extended.Score[prediction_extended.efo_id.tolist().index('EFO_0000094')]

0.0014345049858093262

In [10]:
# Size of the random target sample and the seed that makes it repeatable
# across kernel restarts.
N_TARGETS = 1000
SEED = 42

# Draw the sample WITHOUT replacement from the de-duplicated Ensembl ids.
# np.random.choice defaults to replace=True, which can return the same target
# several times and would yield duplicate feature columns (and duplicate API
# calls) downstream.
target_pool = targets.ensembl_id.dropna().unique()
rng = np.random.default_rng(SEED)
target_list = sorted(
    rng.choice(target_pool,
               size=min(N_TARGETS, len(target_pool)),  # never ask for more than exist
               replace=False).tolist()
)

# Disease ids in sorted order; prediction_extended is sorted by efo_id as well,
# so position i here is expected to line up with row i of that frame.
disease_list = sorted(prediction_extended.efo_id.tolist())

print(len(target_list))
print(len(disease_list))

# Feature names: sampled targets first, then diseases.
features = target_list + disease_list

['ENSG00000002726', 'ENSG00000005884', 'ENSG00000007541', 'ENSG00000007952', 'ENSG00000008516', 'ENSG00000010072', 'ENSG00000013392', 'ENSG00000019144', 'ENSG00000035862', 'ENSG00000039523']
['DOID_0050890', 'EFO_0000094', 'EFO_0000095', 'EFO_0000096', 'EFO_0000174', 'EFO_0000178', 'EFO_0000180', 'EFO_0000181', 'EFO_0000182', 'EFO_0000183']
----------
1000
7217


In [None]:
import os
from time import time

from opentargets import OpenTargetsClient

# Association matrix: one row per disease, one column per sampled target, plus a
# final column holding the predicted risk score of the disease.
# Start from zeros rather than np.empty: when the API returns no association for
# a (disease, target) pair nothing is written to that cell, and np.empty would
# leave uninitialised memory there instead of a score of 0.
X_raw = np.zeros((len(disease_list),
                  len(target_list) + 1))

# Create the output directory up front so the final to_csv cannot fail after
# the (very long) download loop has finished.
os.makedirs("test", exist_ok=True)

ot = OpenTargetsClient()
start_time = time()
tmp_time = time()

# NOTE(review): parallel_backend only affects joblib.Parallel calls made inside
# the block; this loop makes none, so the requests below run serially in the
# notebook kernel. Kept as-is to avoid changing how the cell executes.
with parallel_backend('ipyparallel'):
    for disease_n, disease_id in enumerate(disease_list):
        for target_n, target_id in enumerate(target_list):
            search = ot.filter_associations(disease = disease_id,
                                        target = target_id,
                                        fields = ['association_score.overall', 'target.id', 'disease.id'])
            # Keep only the row reported for exactly this disease id; rows for
            # other disease ids are skipped. Stop at the first match so a later
            # non-matching row cannot reset the score to 0.
            for r in search:
                if r['disease']['id'] == disease_id:
                    X_raw[disease_n, target_n] = r['association_score']['overall']
                    break
            # Progress report every 500 targets, with the time spent on that batch.
            if (target_n + 1) % 500 == 0:
                print('X_raw line creation : {:.2f} % done. '.format((target_n + 1) / len(target_list) * 100))
                print(target_n + 1, ' of ', len(target_list))
                print('Time : {:.2f} seconds.'.format(time() - tmp_time))
                tmp_time = time()
        # Last column: predicted score of the disease. Positional lookup relies on
        # disease_list and prediction_extended sharing the same efo_id ordering.
        X_raw[disease_n, len(target_list)] = prediction_extended.Score.iloc[disease_n]
        print('-----X_raw creation : {:.2f} % done.'.format((disease_n + 1) / len(disease_list) * 100))
        print(disease_n + 1, ' of ', len(disease_list))
pd.DataFrame(X_raw).to_csv("test/X_raw.csv")

X_raw line creation : 50.00 % done. 
500  of  1000
Time : 5.63 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 5.74 seconds.
-----X_raw creation : 0.01 % done.
1  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 5.31 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 5.29 seconds.
-----X_raw creation : 0.03 % done.
2  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 5.68 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 5.77 seconds.
-----X_raw creation : 0.04 % done.
3  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 15.48 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 31.19 seconds.
-----X_raw creation : 0.06 % done.
4  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 46.45 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 31.62 seconds.
-----X_raw creation : 0.07 % done.
5  of  7217
X_raw line creation : 50.00 % d

X_raw line creation : 50.00 % done. 
500  of  1000
Time : 52.11 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 39.10 seconds.
-----X_raw creation : 0.60 % done.
43  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 54.84 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 35.46 seconds.
-----X_raw creation : 0.61 % done.
44  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 52.86 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 34.17 seconds.
-----X_raw creation : 0.62 % done.
45  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 53.73 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 36.84 seconds.
-----X_raw creation : 0.64 % done.
46  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 51.20 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 39.76 seconds.
-----X_raw creation : 0.65 % done.
47  of  7217
X_raw line creation 

X_raw line creation : 50.00 % done. 
500  of  1000
Time : 39.82 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 30.99 seconds.
-----X_raw creation : 1.18 % done.
85  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 48.81 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 33.47 seconds.
-----X_raw creation : 1.19 % done.
86  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 49.49 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 33.31 seconds.
-----X_raw creation : 1.21 % done.
87  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 49.37 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 30.85 seconds.
-----X_raw creation : 1.22 % done.
88  of  7217
X_raw line creation : 50.00 % done. 
500  of  1000
Time : 49.99 seconds.
X_raw line creation : 100.00 % done. 
1000  of  1000
Time : 30.52 seconds.
-----X_raw creation : 1.23 % done.
89  of  7217
X_raw line creation 