In [178]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np

## Read data

In [23]:
de_data_train = pq.read_table("../data/de_train.parquet").to_pandas()
de_data_train

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.104720,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.884380,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,Atorvastatin,LSM-5771,CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F...,False,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,...,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,NK cells,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,-0.455549,0.188181,0.595734,-0.100299,0.786192,...,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,T cells CD4+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.338168,-0.109079,0.270182,-0.436586,-0.069476,...,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,T cells CD8+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,...,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


## Divide into train and test

In [50]:
# Cell types where all (cell_type, sm) pairs will be used for training
train_only_cell_types     = ["T cells CD4+", "T cells CD8+", "T regulatory cells"]
# Cell types where only some (cell_type, sm) pairs will be used for training
train_and_test_cell_types = ["B cells", "Myeloid cells", "NK cells"]

In [68]:
# Create a dict mapping cell_name -> list of sm given for cell_name
sm_names_by_cell_type = de_data_train.groupby("cell_type")["sm_name"].unique().to_dict()
# Get list of small molecules given for cell types with a reduced set of (cell_type, sm) pairs
train_and_test_sm = sm_names_by_cell_type["B cells"]

In [83]:
# For cell types where only some (cell_type, sm) pairs will be used for training
# Choose which small molecules will be used for training and which for test
num_b_sm       = len(sm_names_by_cell_type["B cells"])
num_myeloid_sm = len(sm_names_by_cell_type["Myeloid cells"])
num_nk_sm      = len(sm_names_by_cell_type["NK cells"])

b_cell_train       = sm_names_by_cell_type["B cells"][:num_b_sm//2]
myeloid_cell_train = sm_names_by_cell_type["Myeloid cells"][:num_myeloid_sm//2]
nk_cell_train      = sm_names_by_cell_type["NK cells"][:num_nk_sm//2]

b_cell_test       = sm_names_by_cell_type["B cells"][num_b_sm//2:]
myeloid_cell_test = sm_names_by_cell_type["Myeloid cells"][num_myeloid_sm//2:]
nk_cell_test      = sm_names_by_cell_type["NK cells"][num_nk_sm//2:]

In [118]:
# Create training combinations with all (cell_type, sm) pairs for train only cell types
training_combinations = dict((cell_type, sm_names_by_cell_type[cell_type]) for cell_type in train_only_cell_types)

In [119]:
# Include training (cell_type, sm) pairs from train_test cell types
training_combinations["B cells"] = b_cell_train
training_combinations["Myeloid cells"] = myeloid_cell_train
training_combinations["NK cells"] = nk_cell_train

In [120]:
# Create testing combinations
testing_combinations = {}
testing_combinations["B cells"] = b_cell_test
testing_combinations["Myeloid cells"] = myeloid_cell_test
testing_combinations["NK cells"] = nk_cell_test

In [121]:
training_combinations

{'T cells CD4+': array(['Clotrimazole', 'Mometasone Furoate', 'Idelalisib', 'Vandetanib',
        'Bosutinib', 'Ceritinib', 'Lamivudine', 'Crizotinib',
        'Cabozantinib', 'Flutamide', 'Dasatinib', 'Selumetinib',
        'Trametinib', 'ABT-199 (GDC-0199)', 'Oxybenzone', 'Vorinostat',
        'Raloxifene', 'Linagliptin', 'Lapatinib', 'Canertinib',
        'Disulfiram', 'Vardenafil', 'Palbociclib', 'Ricolinostat',
        'Dabrafenib', 'Proscillaridin A;Proscillaridin-A', 'IN1451',
        'Ixabepilone', 'CEP-18770 (Delanzomib)', 'RG7112', 'MK-5108',
        'Resminostat', 'IMD-0354', 'Alvocidib', 'LY2090314',
        'Methotrexate', 'LDN 193189', 'Tacalcitol', 'Colchicine', 'R428',
        'TL_HRAS26', 'BMS-387032', 'CGP 60474', 'TIE2 Kinase Inhibitor',
        'PD-0325901', 'Isoniazid', 'GSK-1070916', 'Masitinib',
        'Saracatinib', 'CC-401', 'Decitabine', 'Ketoconazole',
        'HYDROXYUREA', 'BAY 61-3606', 'Navitoclax', 'Porcn Inhibitor III',
        'GW843682X', 'Prednisolo

In [122]:
testing_combinations

{'B cells': array(['Porcn Inhibitor III', 'Belinostat', 'Foretinib', 'MLN 2238',
        'Penfluridol', 'Dactolisib', 'O-Demethylated Adapalene',
        'Oprozomib (ONX 0912)', 'CHIR-99021'], dtype=object),
 'Myeloid cells': array(['Porcn Inhibitor III', 'Belinostat', 'Foretinib', 'MLN 2238',
        'Penfluridol', 'Dactolisib', 'O-Demethylated Adapalene',
        'Oprozomib (ONX 0912)', 'CHIR-99021'], dtype=object),
 'NK cells': array(['AZD4547', 'Foretinib', 'Tivozanib', 'Quizartinib',
        'IKK Inhibitor VII', 'UNII-BXU45ZH6LI', 'Chlorpheniramine',
        'Tivantinib', 'CEP-37440', 'TPCA-1', 'AZ628', 'OSI-930', 'AZD3514',
        'Vanoxerine', 'PF-03814735', 'MLN 2238', 'Dovitinib', 'K-02288',
        'Midostaurin', 'I-BET151', 'STK219801', 'PRT-062607', 'AT 7867',
        'Sunitinib', 'Penfluridol', 'BMS-536924', 'Perhexiline',
        'BI-D1870', 'FK 866', 'Mubritinib (TAK 165)', 'Doxorubicin',
        'Pomalidomide', 'Colforsin', 'Phenylbutazone', 'Protriptyline',
        'B

In [133]:
# Convert into (cell_type, sm) pairs
training_pairs = set({})
for cell_type in training_combinations.keys():
    for sm in training_combinations[cell_type]:
        training_pairs.add(cell_type+", "+sm)

testing_pairs = set({})
for cell_type in testing_combinations.keys():
    for sm in testing_combinations[cell_type]:
        testing_pairs.add(cell_type+", "+sm)

list(training_pairs)[:10]

['T regulatory cells, IMD-0354',
 'T cells CD8+, Dasatinib',
 'T cells CD8+, Dactolisib',
 'T cells CD4+, Vardenafil',
 'T cells CD4+, Dabrafenib',
 'T cells CD4+, Belinostat',
 'T regulatory cells, Scriptaid',
 'T cells CD8+, GLPG0634',
 'T regulatory cells, BMS-536924',
 'T cells CD4+, BX 912']

In [134]:
all_column_names = de_data_train.columns
gene_names = all_column_names[5:]

In [135]:
de_data_train

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.104720,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.884380,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,Atorvastatin,LSM-5771,CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F...,False,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,...,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,NK cells,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,-0.455549,0.188181,0.595734,-0.100299,0.786192,...,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,T cells CD4+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.338168,-0.109079,0.270182,-0.436586,-0.069476,...,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,T cells CD8+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,...,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


In [137]:
cell_sm_gene_tuples = de_data_train.melt(id_vars=["cell_type", "sm_name"], value_vars=gene_names, var_name="gene", value_name="DE")

In [138]:
cell_sm_gene_tuples

Unnamed: 0,cell_type,sm_name,gene,DE
0,NK cells,Clotrimazole,A1BG,0.104720
1,T cells CD4+,Clotrimazole,A1BG,0.915953
2,T cells CD8+,Clotrimazole,A1BG,-0.387721
3,T regulatory cells,Clotrimazole,A1BG,0.232893
4,NK cells,Mometasone Furoate,A1BG,4.290652
...,...,...,...,...
11181549,T regulatory cells,Atorvastatin,ZZEF1,-0.979951
11181550,NK cells,Riociguat,ZZEF1,-0.064760
11181551,T cells CD4+,Riociguat,ZZEF1,-0.337082
11181552,T cells CD8+,Riociguat,ZZEF1,-1.733112


In [139]:
cell_sm_gene_tuples["cell_type_sm_pair"] = cell_sm_gene_tuples["cell_type"]+", "+cell_sm_gene_tuples["sm_name"]

In [140]:
cell_sm_gene_tuples

Unnamed: 0,cell_type,sm_name,gene,DE,cell_type_sm_pair
0,NK cells,Clotrimazole,A1BG,0.104720,"NK cells, Clotrimazole"
1,T cells CD4+,Clotrimazole,A1BG,0.915953,"T cells CD4+, Clotrimazole"
2,T cells CD8+,Clotrimazole,A1BG,-0.387721,"T cells CD8+, Clotrimazole"
3,T regulatory cells,Clotrimazole,A1BG,0.232893,"T regulatory cells, Clotrimazole"
4,NK cells,Mometasone Furoate,A1BG,4.290652,"NK cells, Mometasone Furoate"
...,...,...,...,...,...
11181549,T regulatory cells,Atorvastatin,ZZEF1,-0.979951,"T regulatory cells, Atorvastatin"
11181550,NK cells,Riociguat,ZZEF1,-0.064760,"NK cells, Riociguat"
11181551,T cells CD4+,Riociguat,ZZEF1,-0.337082,"T cells CD4+, Riociguat"
11181552,T cells CD8+,Riociguat,ZZEF1,-1.733112,"T cells CD8+, Riociguat"


In [151]:
training_tuples = cell_sm_gene_tuples[cell_sm_gene_tuples["cell_type_sm_pair"].isin(training_pairs)]
training_tuples

Unnamed: 0,cell_type,sm_name,gene,DE,cell_type_sm_pair
0,NK cells,Clotrimazole,A1BG,0.104720,"NK cells, Clotrimazole"
1,T cells CD4+,Clotrimazole,A1BG,0.915953,"T cells CD4+, Clotrimazole"
2,T cells CD8+,Clotrimazole,A1BG,-0.387721,"T cells CD8+, Clotrimazole"
3,T regulatory cells,Clotrimazole,A1BG,0.232893,"T regulatory cells, Clotrimazole"
4,NK cells,Mometasone Furoate,A1BG,4.290652,"NK cells, Mometasone Furoate"
...,...,...,...,...,...
11181548,T cells CD8+,Atorvastatin,ZZEF1,-1.000626,"T cells CD8+, Atorvastatin"
11181549,T regulatory cells,Atorvastatin,ZZEF1,-0.979951,"T regulatory cells, Atorvastatin"
11181551,T cells CD4+,Riociguat,ZZEF1,-0.337082,"T cells CD4+, Riociguat"
11181552,T cells CD8+,Riociguat,ZZEF1,-1.733112,"T cells CD8+, Riociguat"


In [152]:
testing_tuples = cell_sm_gene_tuples[cell_sm_gene_tuples["cell_type_sm_pair"].isin(testing_pairs)]
testing_tuples

Unnamed: 0,cell_type,sm_name,gene,DE,cell_type_sm_pair
233,B cells,Porcn Inhibitor III,A1BG,0.161235,"B cells, Porcn Inhibitor III"
234,Myeloid cells,Porcn Inhibitor III,A1BG,0.034656,"Myeloid cells, Porcn Inhibitor III"
275,B cells,Belinostat,A1BG,2.225073,"B cells, Belinostat"
276,Myeloid cells,Belinostat,A1BG,3.519499,"Myeloid cells, Belinostat"
309,NK cells,AZD4547,A1BG,0.679407,"NK cells, AZD4547"
...,...,...,...,...,...
11181534,NK cells,CGM-097,ZZEF1,0.149938,"NK cells, CGM-097"
11181538,NK cells,TGX 221,ZZEF1,0.037210,"NK cells, TGX 221"
11181542,NK cells,Azacitidine,ZZEF1,1.068045,"NK cells, Azacitidine"
11181546,NK cells,Atorvastatin,ZZEF1,-0.827473,"NK cells, Atorvastatin"


In [155]:
training_tuples = training_tuples.drop("cell_type_sm_pair", axis=1)
testing_tuples = testing_tuples.drop("cell_type_sm_pair", axis=1)

training_tuples

Unnamed: 0,cell_type,sm_name,gene,DE
0,NK cells,Clotrimazole,A1BG,0.104720
1,T cells CD4+,Clotrimazole,A1BG,0.915953
2,T cells CD8+,Clotrimazole,A1BG,-0.387721
3,T regulatory cells,Clotrimazole,A1BG,0.232893
4,NK cells,Mometasone Furoate,A1BG,4.290652
...,...,...,...,...
11181548,T cells CD8+,Atorvastatin,ZZEF1,-1.000626
11181549,T regulatory cells,Atorvastatin,ZZEF1,-0.979951
11181551,T cells CD4+,Riociguat,ZZEF1,-0.337082
11181552,T cells CD8+,Riociguat,ZZEF1,-1.733112


## Convert to numpy array for training some baseline models

In [166]:
# Convert cell_type, sm_name and gene to categorical types
training_tuples["cell_type"] = training_tuples["cell_type"].astype('category')
training_tuples["sm_name"] = training_tuples["sm_name"].astype('category')
training_tuples["gene"] = training_tuples["gene"].astype('category')

testing_tuples["cell_type"] = testing_tuples["cell_type"].astype('category')
testing_tuples["sm_name"] = testing_tuples["sm_name"].astype('category')
testing_tuples["gene"] = testing_tuples["gene"].astype('category')

In [173]:
training_numerical = training_tuples[["cell_type", "sm_name", "gene"]].apply(lambda x: x.cat.codes)
training_numerical["DE"] = training_tuples["DE"]

In [175]:
testing_numerical = testing_tuples[["cell_type", "sm_name", "gene"]].apply(lambda x: x.cat.codes)
testing_numerical["DE"] = testing_tuples["DE"]

In [181]:
training_numerical

Unnamed: 0,cell_type,sm_name,gene,DE
0,2,39,0,0.104720
1,3,39,0,0.915953
2,4,39,0,-0.387721
3,5,39,0,0.232893
4,2,84,0,4.290652
...,...,...,...,...
11181548,4,14,18210,-1.000626
11181549,5,14,18210,-0.979951
11181551,3,116,18210,-0.337082
11181552,4,116,18210,-1.733112


In [182]:
testing_numerical

Unnamed: 0,cell_type,sm_name,gene,DE
233,0,54,0,0.161235
234,1,54,0,0.034656
275,0,16,0,2.225073
276,1,16,0,3.519499
309,2,7,0,0.679407
...,...,...,...,...
11181534,2,19,18210,0.149938
11181538,2,65,18210,0.037210
11181542,2,10,18210,1.068045
11181546,2,9,18210,-0.827473


In [201]:
train = training_numerical.to_numpy()
test = testing_numerical.to_numpy()
train

array([[ 2.00000000e+00,  3.90000000e+01,  0.00000000e+00,
         1.04720474e-01],
       [ 3.00000000e+00,  3.90000000e+01,  0.00000000e+00,
         9.15953241e-01],
       [ 4.00000000e+00,  3.90000000e+01,  0.00000000e+00,
        -3.87720759e-01],
       ...,
       [ 3.00000000e+00,  1.16000000e+02,  1.82100000e+04,
        -3.37082043e-01],
       [ 4.00000000e+00,  1.16000000e+02,  1.82100000e+04,
        -1.73311173e+00],
       [ 5.00000000e+00,  1.16000000e+02,  1.82100000e+04,
        -7.50681338e-01]])

In [202]:
trainX, trainY = train[:, :3], train[:, 3]
testX, testY = test[:, :3], test[:, 3]

## Very dumb baseline

In [207]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

In [205]:
mean_regressor = DummyRegressor(strategy="mean")
mean_regressor.fit(trainX, trainY)

DummyRegressor()

In [208]:
predictions = mean_regressor.predict(testX)
mse = mean_squared_error(testY, predictions)
print(mse)

10.761522395105882


## Vaguely less dumb baseline

In [209]:
from sklearn.linear_model import LinearRegression

In [210]:
linear_regressor = LinearRegression().fit(trainX, trainY)

In [212]:
predictions = linear_regressor.predict(testX)
mse = mean_squared_error(testY, predictions)
print(mse)

10.693809068738759
