In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np

## Read data

In [None]:
# Load the training table from parquet via pyarrow and convert it to a pandas DataFrame.
de_data_train = (
    pq.read_table("../data/de_train.parquet")
    .to_pandas()
)
# Show the loaded frame as the cell output.
de_data_train

## Divide into train and test

In [None]:
# Cell types for which every (cell_type, sm) pair goes into the training split
train_only_cell_types = [
    "T cells CD4+",
    "T cells CD8+",
    "T regulatory cells",
]
# Cell types for which only part of the (cell_type, sm) pairs goes into the
# training split
train_and_test_cell_types = [
    "B cells",
    "Myeloid cells",
    "NK cells",
]

In [None]:
# Build a dict mapping cell_type -> distinct sm_name values recorded for that cell_type
sm_names_by_cell_type = (
    de_data_train
    .groupby("cell_type")["sm_name"]
    .unique()
    .to_dict()
)
# Small molecules recorded for "B cells", one of the cell types that is only
# partially used for training
train_and_test_sm = sm_names_by_cell_type["B cells"]

In [None]:
# For the partially held-out cell types, split each one's small molecules in
# two, in the order they are stored in sm_names_by_cell_type: the first
# n // 2 entries go to training and the rest to testing (so with an odd
# count the test side gets the extra entry).
num_b_sm, num_myeloid_sm, num_nk_sm = (
    len(sm_names_by_cell_type[name])
    for name in ("B cells", "Myeloid cells", "NK cells")
)

b_cell_train, b_cell_test = (
    sm_names_by_cell_type["B cells"][: num_b_sm // 2],
    sm_names_by_cell_type["B cells"][num_b_sm // 2 :],
)
myeloid_cell_train, myeloid_cell_test = (
    sm_names_by_cell_type["Myeloid cells"][: num_myeloid_sm // 2],
    sm_names_by_cell_type["Myeloid cells"][num_myeloid_sm // 2 :],
)
nk_cell_train, nk_cell_test = (
    sm_names_by_cell_type["NK cells"][: num_nk_sm // 2],
    sm_names_by_cell_type["NK cells"][num_nk_sm // 2 :],
)

In [None]:
# Start the training combinations with every small molecule of the
# train-only cell types (cell_type -> its full list of sm names)
training_combinations = {
    cell_type: sm_names_by_cell_type[cell_type]
    for cell_type in train_only_cell_types
}

In [None]:
# Add the training half of each partially held-out cell type
training_combinations.update(
    {
        "B cells": b_cell_train,
        "Myeloid cells": myeloid_cell_train,
        "NK cells": nk_cell_train,
    }
)

In [None]:
# Testing combinations: the held-out half of each partially used cell type
testing_combinations = {
    "B cells": b_cell_test,
    "Myeloid cells": myeloid_cell_test,
    "NK cells": nk_cell_test,
}

In [None]:
# Display the training split: cell_type -> small molecules used for training.
training_combinations

In [None]:
# Display the testing split: cell_type -> small molecules held out for testing.
testing_combinations

In [None]:
# Flatten both splits into sets of "cell_type, sm" string keys
training_pairs = set()
for cell_type, sm_names in training_combinations.items():
    for sm in sm_names:
        training_pairs.add(", ".join((cell_type, sm)))

testing_pairs = set()
for cell_type, sm_names in testing_combinations.items():
    for sm in sm_names:
        testing_pairs.add(", ".join((cell_type, sm)))

# Peek at a few training keys (a set has no defined order)
list(training_pairs)[:10]

In [None]:
# Number of leading columns in de_data_train that are NOT gene columns.
# The melt step uses "cell_type" and "sm_name" as identifiers; the other
# leading columns are presumably additional per-row metadata — TODO confirm
# this count against the parquet schema if the input file changes.
NUM_METADATA_COLUMNS = 5

all_column_names = de_data_train.columns
# Everything after the metadata columns is treated as a gene column.
gene_names = all_column_names[NUM_METADATA_COLUMNS:]

In [None]:
# Display the wide-format frame again for reference before it is reshaped.
de_data_train

In [None]:
# Reshape from wide to long: one row per (cell_type, sm_name, gene), with the
# gene column's value stored under "DE".
cell_sm_gene_tuples = de_data_train.melt(
    id_vars=["cell_type", "sm_name"],
    value_vars=gene_names,
    var_name="gene",
    value_name="DE",
)

In [None]:
# Display the long-format frame produced by melt.
cell_sm_gene_tuples

In [None]:
# Add a "cell_type, sm_name" key column, built in the same format as the
# entries of training_pairs / testing_pairs, so rows can be matched to a split.
cell_sm_gene_tuples["cell_type_sm_pair"] = (
    cell_sm_gene_tuples["cell_type"]
    + ", "
    + cell_sm_gene_tuples["sm_name"]
)

In [None]:
# Display the long-format frame, now including the cell_type_sm_pair key column.
cell_sm_gene_tuples

In [None]:
# Keep only the rows whose (cell_type, sm) key belongs to the training split
training_tuples = cell_sm_gene_tuples.loc[
    cell_sm_gene_tuples["cell_type_sm_pair"].isin(training_pairs)
]
training_tuples

In [None]:
# Keep only the rows whose (cell_type, sm) key belongs to the testing split
testing_tuples = cell_sm_gene_tuples.loc[
    cell_sm_gene_tuples["cell_type_sm_pair"].isin(testing_pairs)
]
testing_tuples

In [None]:
# Remove the helper key column now that the rows have been assigned to a split.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
training_tuples = training_tuples.drop(columns="cell_type_sm_pair", errors="ignore")
testing_tuples = testing_tuples.drop(columns="cell_type_sm_pair", errors="ignore")

training_tuples

## Convert to numpy array for training some baseline models

In [None]:
# Convert cell_type, sm_name and gene to categorical types that share ONE
# category list per column across the train and test frames.
#
# Casting each frame with .astype('category') on its own derives the categories
# from the values present in that frame only, so the integer codes taken from
# .cat.codes would not line up: the same code could stand for a different
# small molecule in train than in test. Building the dtype from the union of
# both frames guarantees a value maps to the same code everywhere.
categorical_columns = ["cell_type", "sm_name", "gene"]
shared_dtypes = {
    column: pd.CategoricalDtype(
        # Sorted so the code assignment is deterministic across runs.
        categories=sorted(
            set(training_tuples[column].unique())
            | set(testing_tuples[column].unique())
        )
    )
    for column in categorical_columns
}

# astype returns new frames, so the filtered slices are not mutated in place.
training_tuples = training_tuples.astype(shared_dtypes)
testing_tuples = testing_tuples.astype(shared_dtypes)

In [None]:
# Replace the three categorical columns by their integer category codes and
# carry the DE value over unchanged.
training_numerical = (
    training_tuples[["cell_type", "sm_name", "gene"]]
    .apply(lambda column: column.cat.codes)
    .assign(DE=training_tuples["DE"])
)

In [None]:
# Replace the three categorical columns by their integer category codes and
# carry the DE value over unchanged.
testing_numerical = (
    testing_tuples[["cell_type", "sm_name", "gene"]]
    .apply(lambda column: column.cat.codes)
    .assign(DE=testing_tuples["DE"])
)

In [None]:
# Display the integer-coded training frame (cell_type, sm_name, gene codes plus DE).
training_numerical

In [None]:
# Display the integer-coded testing frame (cell_type, sm_name, gene codes plus DE).
testing_numerical

In [None]:
# Turn both coded frames into plain numpy arrays (one row per tuple)
train, test = training_numerical.to_numpy(), testing_numerical.to_numpy()
train

In [None]:
# Columns 0-2 hold the cell_type / sm_name / gene codes (features);
# column 3 holds the DE value (target).
trainX = train[:, :3]
trainY = train[:, 3]
testX = test[:, :3]
testY = test[:, 3]

## Very dumb baseline

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Baseline that ignores the features and uses the "mean" strategy on the
# training targets.
mean_regressor = DummyRegressor(strategy="mean")
mean_regressor.fit(X=trainX, y=trainY)

In [None]:
# Report the mean squared error of the mean baseline on both splits
train_mse = mean_squared_error(
    y_true=trainY,
    y_pred=mean_regressor.predict(trainX),
)
test_mse = mean_squared_error(
    y_true=testY,
    y_pred=mean_regressor.predict(testX),
)
print(f"Train MSE: {train_mse}, Test MSE: {test_mse}")

## Vaguely less dumb baselines

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares fit on the three integer-coded features.
# NOTE(review): the features are category codes, so the model treats them as
# ordered numeric values — confirm this is intended for a baseline.
linear_regressor = LinearRegression().fit(X=trainX, y=trainY)

In [None]:
# Report the mean squared error of the linear model on both splits
train_mse = mean_squared_error(
    y_true=trainY,
    y_pred=linear_regressor.predict(trainX),
)
test_mse = mean_squared_error(
    y_true=testY,
    y_pred=linear_regressor.predict(testX),
)
print(f"Train MSE: {train_mse}, Test MSE: {test_mse}")

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression (alpha=10) on the same features, evaluated on both splits
ridge_regressor = Ridge(alpha=10)
ridge_regressor.fit(X=trainX, y=trainY)
train_mse = mean_squared_error(
    y_true=trainY,
    y_pred=ridge_regressor.predict(trainX),
)
test_mse = mean_squared_error(
    y_true=testY,
    y_pred=ridge_regressor.predict(testX),
)
print(f"Train MSE: {train_mse}, Test MSE: {test_mse}")