# Project

Import libraries

In [None]:
import pandas as pd
import numpy as np

Import parallel computing libraries

In [None]:
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend
from joblib import Parallel, parallel_backend, register_parallel_backend

In [None]:
# Connect to the ipyparallel cluster started under the 'default' profile.
c = Client(profile='default')
print(f'profile: {c.profile}')
print(f"IDs: {c.ids}")  # Process id numbers

# Load-balanced view that the joblib backend below dispatches through.
bview = c.load_balanced_view()


def _ipyparallel_backend_factory():
    """Return a joblib backend bound to the load-balanced view ``bview``."""
    return IPythonParallelBackend(view=bview)


# Make the backend selectable by name via parallel_backend('ipyparallel').
register_parallel_backend('ipyparallel', _ipyparallel_backend_factory)

Read datasets

In [None]:
# Risk-factor predictions, read from a CSV file in the working directory.
predictions = pd.read_csv('predicted_covid19_risk_factors.csv')

# Target list of the Open Targets 20.02 data release (gzip-compressed CSV).
targets = pd.read_csv(
    'https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_target_list.csv.gz',
    compression='gzip',
)

# Disease list of the same release (gzip-compressed CSV).
diseases = pd.read_csv(
    'https://storage.googleapis.com/open-targets-data-releases/20.02/output/20.02_disease_list.csv.gz',
    compression='gzip',
)

Explore predictions dataset

In [None]:
# Preview the first five rows (the default for head) of the predictions table.
predictions.head(n=5)

Describe predictions dataset

In [None]:
# Summary statistics of the 'Risk Factors' column.
predictions.loc[:, 'Risk Factors'].describe()

In [None]:
# Preview the first five rows (the default for head) of the disease list.
diseases.head(n=5)

In [None]:
# Attach disease identifiers to each predicted risk factor by matching the
# risk-factor name against the disease full name. The left join keeps every
# prediction row, including those without a matching disease. The result is
# then ordered by efo_id and given a fresh 0..n-1 index.
prediction_extended = (
    predictions
    .merge(diseases,
           how='left',
           left_on='Risk Factors',
           right_on='disease_full_name')
    .loc[:, ['efo_id', 'disease_full_name', 'Score']]
    .sort_values(by='efo_id', ascending=True)
    .reset_index(drop=True)
)
prediction_extended.head()

In [None]:
# Number of targets / diseases to sample for this test run.
n_targets_wanted = 100
n_diseases_wanted = 200

# Candidate identifiers. Missing values are dropped (predictions without a
# matching disease carry no efo_id after the left merge) and duplicates are
# collapsed so each identifier can be drawn at most once.
target_pool = targets.ensembl_id.dropna().unique()
disease_pool = prediction_extended.efo_id.dropna().unique()

# np.random.choice samples WITH replacement by default, which would yield
# repeated IDs (duplicate rows/columns and redundant API queries downstream).
# Sample without replacement, capped at the pool size so a small pool does
# not raise.
target_list = sorted(
    np.random.choice(target_pool,
                     size=min(n_targets_wanted, len(target_pool)),
                     replace=False).tolist()
)
disease_list = sorted(
    np.random.choice(disease_pool,
                     size=min(n_diseases_wanted, len(disease_pool)),
                     replace=False).tolist()
)
# To use every disease instead of a random sample:
#     disease_list = sorted(disease_pool.tolist())

print(len(target_list))
print(len(disease_list))

# Combined identifier list: targets first, then diseases.
features = target_list + disease_list

In [None]:
from time import ctime, time

from opentargets import OpenTargetsClient

# One row per sampled disease; one column per sampled target plus a final
# column that holds the disease's predicted score.
n_targets = len(target_list)
X_raw = np.zeros((len(disease_list), n_targets + 1))

# Score lookup keyed by disease id. disease_list is a random sample, so the
# position of a disease in that list is NOT its row label in
# prediction_extended; indexing Score by position would pick unrelated rows.
# Rows without an efo_id are dropped and the first row is kept when an id
# occurs more than once.
score_by_disease = (
    prediction_extended
    .dropna(subset=['efo_id'])
    .drop_duplicates(subset='efo_id')
    .set_index('efo_id')['Score']
)

ot = OpenTargetsClient()
start_time = time()
tmp_time = time()

print('Start time : ', ctime())
# NOTE(review): no joblib Parallel(...) call is issued inside this context, so
# the loops below appear to run serially in this process. Confirm whether the
# Open Targets client uses joblib internally; otherwise the backend selection
# has no effect on the queries.
with parallel_backend('ipyparallel'):
    for disease_n, disease_id in enumerate(disease_list):
        for target_n, target_id in enumerate(target_list):
            search = ot.filter_associations(disease=disease_id,
                                            target=target_id,
                                            fields=['association_score.overall', 'target.id', 'disease.id'])
            # Keep only results whose disease id is exactly the queried one;
            # cells with no such result stay at 0.
            for r in search:
                if r['disease']['id'] == disease_id:
                    X_raw[disease_n, target_n] = r['association_score']['overall']
        # Last column: predicted score of this disease (NaN when the id has
        # no entry in prediction_extended).
        X_raw[disease_n, n_targets] = score_by_disease.get(disease_id, np.nan)
        print('-----', disease_n + 1, ' of ', len(disease_list))
        # Report elapsed time every 5 diseases.
        if (disease_n + 1) % 5 == 0:
            print('---Time : {:.2f} seconds.'.format(time() - start_time))
            print('---Time since last print : {:.2f} seconds.'.format(time() - tmp_time))
            tmp_time = time()

print('End time : ', ctime())

In [None]:
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
output_path = Path("test/X_raw_disease_200_target_100.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Rows are labelled with disease ids, columns with target ids plus the final
# 'COVID' score column.
pd.DataFrame(X_raw,
             index=disease_list,
             columns=target_list + ['COVID']).to_csv(output_path)

In [None]:
# Preview the first five rows (the default for head) of the raw matrix.
pd.DataFrame(X_raw).head(n=5)