# Lists of cell lines and compounds

In [1]:
import pandas as pd
from LambdaZero.examples.drug_comb.utils import get_project_root
from LambdaZero.examples.drug_comb.datasets.drugcomb_data_score import DrugCombScoreNoPPI
from LambdaZero.examples.drug_comb.datasets.drugcomb_score_l1000_data import DrugCombScoreL1000NoPPI
import os

In [2]:
def drugs_for_cell_line(ds, cl):
    """
    Count how often each drug appears in the experiments of one cell line.

    The rows of ``ds._score_data`` whose 'cell_line_name' equals ``cl`` are
    selected, their 'drug_row' and 'drug_col' entries are pooled together,
    and ``value_counts()`` of the pooled names is returned: a Series indexed
    by drug name whose values are occurrence counts, most frequent first.
    """
    scores = ds._score_data
    on_cell_line = scores[scores['cell_line_name'] == cl]
    pooled_drugs = pd.concat([on_cell_line['drug_row'], on_cell_line['drug_col']])

    return pooled_drugs.value_counts()

## Load data

**Note**: we access the data through the dataset objects rather than the raw files, because many raw examples are not used in practice (e.g. when the drug CID is missing for one of the two drugs).

In [3]:
# Instantiate the DrugComb score dataset ("NoPPI" presumably means without protein-protein interactions — confirm)
dataset = DrugCombScoreNoPPI()

Dataset loaded.
	 273550 drug comb experiments among 2699 drugs
	 fingeprints with radius 4 and nbits 1024
	 566781 drug target interactions
	 0 prot-prot interactions


We need to reprocess the dataset in order to get the score dataframe

In [4]:
# Re-run processing explicitly: later cells read the score dataframe from dataset._score_data
# (presumably process() is what populates it — confirm against the dataset class)
dataset.process()

Processing drug nodes..
Processing protein nodes..
Processing drug protein edges..
Processing drug drug interaction edges..


## Load data with L1000

In [6]:
# Instantiate the L1000 variant of the DrugComb score dataset (again a "NoPPI" class)
datasetl1000 = DrugCombScoreL1000NoPPI()

Dataset loaded.
	 3982 drug comb experiments among 1387 drugs
	 fingeprints with radius 4 and nbits 1024
	 388058 drug target interactions
	 0 prot-prot interactions
	 3982 pairs of expression profiles


In [7]:
# Re-run processing explicitly: later cells read the score dataframe from datasetl1000._score_data
# (presumably process() is what populates it — confirm against the dataset class)
datasetl1000.process()

L1000 dataset loaded.
Processing drug nodes..
Processing protein nodes..
Processing drug protein edges..
Processing drug drug interaction edges


## Ranked list of cell lines

The cell lines are ranked according to the number of examples tested on each cell line in our dataset

In DrugComb:

In [9]:
# Occurrences of each cell line name in the DrugComb score dataframe, largest first; show the top 20
drugcomb_counts_by_cell_line = dataset._score_data['cell_line_name'].value_counts()
drugcomb_counts_by_cell_line.head(20)

KBM-7          29889
NCIH23          4533
SW-620          4531
HT29            4520
HCT116          4509
SK-OV-3         4479
UACC62          4454
NCI-H460        4451
T-47D           4430
OVCAR3          4379
DIPG25          4370
ACHN            2890
OVCAR-5         2886
SF-268          2884
MCF7            2884
NCI/ADR-RES     2883
SN12C           2882
HCT-15          2882
SNB-19          2880
UACC-257        2879
Name: cell_line_name, dtype: int64

In DrugComb-inter-L1000:

In [10]:
# Occurrences of each cell line name in the DrugComb-inter-L1000 score dataframe, largest first; show at most 20
l1000_counts_by_cell_line = datasetl1000._score_data['cell_line_name'].value_counts()
l1000_counts_by_cell_line.head(20)

HT29     1334
MCF7     1026
A375      680
A549      462
VCAP      440
LNCAP      40
Name: cell_line_name, dtype: int64

In [11]:
# Check how many examples DrugComb has for each cell line present in DrugComb-inter-L1000.
# The per-cell-line counts do not depend on the loop variable, so they are computed once up front.
drugcomb_cell_line_counts = dataset._score_data['cell_line_name'].value_counts()
for cell_line in datasetl1000._score_data['cell_line_name'].unique():
    # .get(..., 0) reports 0 instead of raising KeyError for a cell line that is absent from DrugComb
    print(cell_line, drugcomb_cell_line_counts.get(cell_line, 0))

A375 1652
HT29 4520
LNCAP 1596
VCAP 1596
MCF7 2884
A549 2855


**Top**: list of most prevalent cell lines in the DrugComb dataset

**Bottom**: list of most prevalent cell lines in the intersection between DrugComb and L1000

**HT29** - **MCF7** - **A549** look like good candidates for drugcomb+l1000 but also for drugcomb alone

**KBM-7** is by far the most frequent cell line in drugcomb (alone)

**Note**: we might want to use DrugComb-inter-L1000 even though it is smaller, as we get a better fit on the data in practice. Also, we may better see the benefits of active learning when our model has not perfectly fitted the cell line.

## List of Compounds in DrugComb-inter-L1000

For each cell line, list the most frequent drugs

In [12]:
# The returned Series is sorted by count, so head(40) keeps the 40 most frequent drugs
# instead of displaying every drug tested on this cell line.
drugs_for_cell_line(datasetl1000, 'HT29').head(40)

DASATINIB            111
SUNITINIB            111
LAPATINIB            111
ZOLINZA              108
SORAFENIB            107
TEMOZOLOMIDE         107
BORTEZOMIB           104
5-FU                  97
SN-38                 96
MK-2206               84
AZD1775               80
BEZ-235               76
ERLOTINIB             76
MK-5108               72
ABT-888               72
PD325901              72
DINACICLIB            72
DOXORUBICIN           60
METFORMIN             60
VEMURAFENIB           48
CRIZOTINIB            47
RALOXIFENE            40
ANASTROZOLE           39
VANDETANIB            39
VISMODEGIB            39
LENALIDOMIDE          37
MITOXANTRONE          37
GEFITINIB             37
CHLORAMBUCIL          37
MEGESTROL ACETATE     37
ALTRETAMINE           37
THIOGUANINE           37
ROMIDEPSIN            36
NILOTINIB             36
MERCAPTOPURINE        35
IMATINIB              35
LETROZOLE             35
IMIQUIMOD             35
IFOSFAMIDE            34
IXABEPILONE           34


In [13]:
# The returned Series is sorted by count, so head(40) keeps the 40 most frequent drugs
# instead of displaying every drug tested on this cell line.
drugs_for_cell_line(datasetl1000, 'MCF7').head(40)

VEMURAFENIB                 60
CRIZOTINIB                  59
RALOXIFENE                  50
VANDETANIB                  49
AXITINIB                    49
VISMODEGIB                  49
ANASTROZOLE                 47
MEGESTROL ACETATE           46
5-FU                        46
EXEMESTANE                  46
MITOXANTRONE                46
GEFITINIB                   46
THIOGUANINE                 46
DEXRAZOXANE                 46
LENALIDOMIDE                46
ALTRETAMINE                 46
CHLORAMBUCIL                46
NILOTINIB                   46
ZOLINZA                     45
SN-38                       45
MITOTANE                    45
IFOSFAMIDE                  44
TRETINOIN                   44
IMATINIB                    44
IMIQUIMOD                   44
THALIDOMIDE                 44
BORTEZOMIB                  44
DASATINIB                   44
CELECOXIB                   44
METHOXSALEN                 44
THIOTEPA                    44
HYDROXYUREA                 44
SUNITINI

In [14]:
# The returned Series is sorted by count, so head(40) keeps the 40 most frequent drugs
# (or all of them when fewer than 40 drugs were tested on this cell line).
drugs_for_cell_line(datasetl1000, 'A549').head(40)

VEMURAFENIB          41
CRIZOTINIB           40
RALOXIFENE           34
VANDETANIB           33
AXITINIB             33
NILOTINIB            31
MITOXANTRONE         31
ALTRETAMINE          31
CHLORAMBUCIL         31
MEGESTROL ACETATE    31
GEFITINIB            31
EXEMESTANE           31
DEXRAZOXANE          31
ZOLINZA              30
ANASTROZOLE          30
MITOTANE             30
SUNITINIB            29
LETROZOLE            29
THALIDOMIDE          29
IMATINIB             29
TRETINOIN            29
LAPATINIB            29
IMIQUIMOD            29
SORAFENIB            29
DASATINIB            29
METHOXSALEN          29
TEMOZOLOMIDE         29
THIOTEPA             29
CELECOXIB            29
METHOTREXATE         28
dtype: int64

Let us check how many drugs are tested on all the three main cell lines:

In [15]:
# Drugs tested on every one of the three main cell lines of DrugComb-inter-L1000:
# seed the set with the drugs of the first cell line, then intersect with each remaining one.
main_cell_lines = ['HT29', 'MCF7', 'A549']
all_drugs = set(drugs_for_cell_line(datasetl1000, main_cell_lines[0]).index)
for cell_line in main_cell_lines[1:]:
    all_drugs = all_drugs.intersection(drugs_for_cell_line(datasetl1000, cell_line).index)

print(all_drugs)
print(len(all_drugs))

{'CELECOXIB', 'LETROZOLE', 'CRIZOTINIB', 'VEMURAFENIB', 'SUNITINIB', 'MITOXANTRONE', 'NILOTINIB', 'GEFITINIB', 'RALOXIFENE', 'ANASTROZOLE', 'ZOLINZA', 'THALIDOMIDE', 'MEGESTROL ACETATE', 'LAPATINIB', 'IMIQUIMOD', 'IMATINIB', 'DASATINIB', 'VANDETANIB', 'METHOXSALEN', 'CHLORAMBUCIL', 'SORAFENIB', 'ALTRETAMINE', 'TEMOZOLOMIDE'}
23


## List of Compounds in DrugComb alone

In [22]:
# 40 most frequent drugs on HT29 in DrugComb
drugs_for_cell_line(ds=dataset, cl='HT29').head(n=40)

DASATINIB            203
TEMOZOLOMIDE         202
LAPATINIB            199
SUNITINIB            199
ZOLINZA              196
SORAFENIB            192
BORTEZOMIB           187
SN-38                157
5-FU                 157
MITOMYCINE           155
CYCLOPHOSPHAMIDE     151
METHOTREXATE         148
PACLITAXEL           146
MK-8776              144
ADM HYDROCHLORIDE    140
MK-2206              136
MK-4827              132
AZD1775              132
BEZ-235              124
ERLOTINIB            124
MK-8669              124
MK-5108              120
GELDANAMYCIN         120
PD325901             120
ABT-888              120
DINACICLIB           120
CRIZOTINIB            95
VEMURAFENIB           93
RALOXIFENE            83
ABIRATERONE           80
VISMODEGIB            80
VANDETANIB            80
AXITINIB              80
ANASTROZOLE           80
CABAZITAXEL           78
AZACYTIDINE           78
NILOTINIB             78
CHLORAMBUCIL          77
URACIL MUSTARD        77
EXEMESTANE            77


In [23]:
# 40 most frequent drugs on MCF7 in DrugComb
drugs_for_cell_line(ds=dataset, cl='MCF7').head(n=40)

ADM HYDROCHLORIDE      140
CRIZOTINIB              95
VEMURAFENIB             94
RALOXIFENE              83
AXITINIB                80
CABAZITAXEL             80
ABIRATERONE             80
VISMODEGIB              80
VANDETANIB              80
ANASTROZOLE             79
VINBLASTINE SULFATE     78
AZACYTIDINE             78
ROMIDEPSIN              78
SN-38                   78
GEFITINIB               77
ALTRETAMINE             77
ZOLINZA                 77
LENALIDOMIDE            77
NILOTINIB               77
LOMUSTINE               77
THIOGUANINE             77
MEGESTROL ACETATE       77
CHLORAMBUCIL            77
TENIPOSIDE              77
MITOXANTRONE            77
EXEMESTANE              77
DEXRAZOXANE             77
URACIL MUSTARD          77
5-FU                    77
PEMETREXED              77
DACARBAZINE             76
ANTIBIOTIC AD 32        76
MITOMYCINE              75
TRETINOIN               75
DASATINIB               75
THIOTEPA                75
THALIDOMIDE             75
C

In [24]:
# 40 most frequent drugs on A549 in DrugComb
drugs_for_cell_line(ds=dataset, cl='A549').head(n=40)

ADM HYDROCHLORIDE          135
CRIZOTINIB                  95
VEMURAFENIB                 94
RALOXIFENE                  83
ABIRATERONE                 80
AXITINIB                    80
VANDETANIB                  80
VISMODEGIB                  79
CABAZITAXEL                 78
ANASTROZOLE                 78
VINBLASTINE SULFATE         78
MEGESTROL ACETATE           77
LENALIDOMIDE                77
THIOGUANINE                 77
DEXRAZOXANE                 77
DACARBAZINE                 77
ALTRETAMINE                 77
GEFITINIB                   77
ZOLINZA                     77
CHLORAMBUCIL                77
5-FU                        77
MITOXANTRONE                77
SN-38                       77
EXEMESTANE                  77
URACIL MUSTARD              77
LOMUSTINE                   77
TENIPOSIDE                  77
PEMETREXED                  77
AZACYTIDINE                 76
NILOTINIB                   76
ROMIDEPSIN                  76
FULVESTRANT                 75
BLEOMYCI

In [25]:
# 40 most frequent drugs on KBM-7 in DrugComb
drugs_for_cell_line(ds=dataset, cl='KBM-7').head(n=40)

CLOFARABINE                      486
DESOXYCORTICOSTERONE PIVALATE    245
5-FU                             244
ZALEPLON                         244
FLUMAZENIL                       244
IMATINIB                         244
MIGLITOL                         244
ABACAVIR                         244
PARGYLINE                        244
ILOPROST                         244
SPIRONOLACTONE                   244
FELBAMATE                        244
DIGITOXIN                        244
ULIPRISTAL ACETATE               244
BOSENTAN                         244
VINBLASTINE                      244
METHYLERGONOVINE                 244
AZACITIDINE                      244
SULFAMETER                       244
PHENPROCOUMON                    244
OXYPHENBUTAZONE                  244
BUSPIRONE                        244
MERCAPTOPURINE                   244
DRONEDARONE                      244
PROBUCOL                         244
LUBIPROSTONE                     244
TETRABENAZINE                    244
E

Let us check how many drugs are tested on all the four main cell lines:

In [20]:
# Drugs tested on every one of the four main cell lines of DrugComb:
# seed the set with the drugs of the first cell line, then intersect with each remaining one.
main_cell_lines = ['HT29', 'MCF7', 'A549', 'KBM-7']
all_drugs = set(drugs_for_cell_line(dataset, main_cell_lines[0]).index)
for cell_line in main_cell_lines[1:]:
    all_drugs = all_drugs.intersection(drugs_for_cell_line(dataset, cell_line).index)

print(all_drugs)
print(len(all_drugs))

{'CARMUSTINE', '5-FU', 'SUNITINIB', 'BORTEZOMIB', 'MITOXANTRONE', 'GEFITINIB', 'ANASTROZOLE', 'BUSULFAN', 'ZOLINZA', 'THALIDOMIDE', 'IMATINIB', 'ALLOPURINOL', 'HYDROXYUREA', 'METHOTREXATE', 'SN-38', 'CHLORAMBUCIL', 'ALTRETAMINE', 'SORAFENIB', 'TEMOZOLOMIDE', 'THIOTEPA', 'MERCAPTOPURINE', 'MITOTANE'}
22
