### Preparing train and test data splits for cell type annotation application

In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from collections import Counter
import pickle

In [4]:
rootdir = "/path/to/data/"

# Panel of tissues to test: every sub-directory of rootdir whose name does
# not contain "results".
dir_list = [
    entry
    for entry in os.listdir(rootdir)
    if "results" not in entry and os.path.isdir(os.path.join(rootdir, entry))
]
dir_list

['pancreas',
 'liver',
 'blood',
 'lung',
 'spleen',
 'placenta',
 'colorectum',
 'kidney',
 'brain']

In [5]:
# Load the dictionary of per-sample tables recording which cell barcodes
# passed the QC filtering applied by Geneformer, so that the same cells are
# used for the comparison.
filter_dict_path = f"{rootdir}deepsort_filter_dict.pickle"
with open(filter_dict_path, "rb") as fp:
    filter_dict = pickle.load(fp)

# Example entry: the filter table of a single sample.
filter_dict["human_Placenta9595_data"]

Unnamed: 0,filter_pass,original_cell_id
0,0,C_1
1,1,C_2
2,0,C_3
3,1,C_4
4,0,C_5
...,...,...
9590,1,C_9591
9591,1,C_9592
9592,1,C_9593
9593,1,C_9594


In [None]:
# For every tissue directory: gather the QC-passing cells of each sample,
# drop rare cell types, split the cells 80/20 into train/test and write the
# expression data and matching cell type annotations back into the tissue
# directory.
for dir_name in tqdm(dir_list):

    subrootdir = f"{rootdir}{dir_name}/"

    # Per-sample frames are gathered in lists and concatenated once per
    # tissue; growing a DataFrame with pd.concat inside the loop re-copies
    # everything accumulated so far on every file.
    data_frames = []
    celltype_frames = []

    # NOTE(review): input paths below are built from subrootdir rather than
    # the walked subdir, so matching files in nested directories would not be
    # found -- presumably each tissue directory is flat; confirm.
    for subdir, dirs, files in os.walk(subrootdir):
        for i, file in enumerate(files):
            if not file.endswith("_data.csv"):
                continue

            file_prefix = file.replace("_data.csv", "")
            sample_prefix = file.replace(".csv", "")
            filter_df = filter_dict[sample_prefix]
            sample_to_analyze = list(
                filter_df[filter_df["filter_pass"] == 1]["original_cell_id"]
            )

            # collect data for each tissue, keeping only cells that passed QC;
            # cell ids are prefixed with the file's position in `files` so
            # that ids from different samples do not clash
            df_i = pd.read_csv(f"{subrootdir}{file}", index_col=0)
            df_i = df_i[sample_to_analyze]
            df_i.columns = [f"{i}_{cell_id}" for cell_id in df_i.columns]
            data_frames.append(df_i)

            # collect cell type metadata, using the same id prefix as the data
            ct_df_i = pd.read_csv(f"{subrootdir}{file_prefix}_celltype.csv", index_col=0)
            ct_df_i.columns = ["Cell", "Cell_type"]
            ct_df_i["Cell"] = [f"{i}_{cell_id}" for cell_id in ct_df_i["Cell"]]
            celltype_frames.append(ct_df_i)

    if not data_frames:
        # Fail with a clear message instead of an opaque error further down.
        raise ValueError(f"no '*_data.csv' files found under {subrootdir}")

    df = pd.concat(data_frames, axis=1)
    ct_df = pd.concat(celltype_frames, axis=0)

    # per published scDeepsort method, filter data for cell types >0.5% of data
    # (counts are taken over every annotated cell in the metadata, which is
    # not restricted to the QC-passing cells)
    ct_counts = Counter(ct_df["Cell_type"])
    total_count = sum(ct_counts.values())
    nonrare_cell_types = [
        cell_type
        for cell_type, count in ct_counts.items()
        if count > (total_count * 0.005)
    ]
    nonrare_cells = list(ct_df[ct_df["Cell_type"].isin(nonrare_cell_types)]["Cell"])
    df = df[df.columns.intersection(nonrare_cells)]

    # split into 80/20 train/test data (cells are columns, hence the transposes)
    # NOTE(review): the split is unseeded and unstratified, so results differ
    # between runs and the label-set assertion below can fail when a cell type
    # ends up in only one of the two splits.
    train, test = train_test_split(df.T, test_size=0.2)
    train = train.T
    test = test.T

    # save filtered train/test data
    train.to_csv(f"{subrootdir}{dir_name}_filtered_data_train.csv")
    test.to_csv(f"{subrootdir}{dir_name}_filtered_data_test.csv")

    # split metadata into train/test data; explicit copies so that adding the
    # temporary "order" column does not write into a slice of ct_df
    ct_df_train = ct_df[ct_df["Cell"].isin(list(train.columns))].copy()
    ct_df_test = ct_df[ct_df["Cell"].isin(list(test.columns))].copy()

    # reorder the metadata rows to follow the column order of the data
    train_order_dict = {cell_id: pos for pos, cell_id in enumerate(train.columns)}
    test_order_dict = {cell_id: pos for pos, cell_id in enumerate(test.columns)}
    ct_df_train["order"] = [train_order_dict[cell_id] for cell_id in ct_df_train["Cell"]]
    ct_df_test["order"] = [test_order_dict[cell_id] for cell_id in ct_df_test["Cell"]]
    ct_df_train = ct_df_train.sort_values("order").drop("order", axis=1)
    ct_df_test = ct_df_test.sort_values("order").drop("order", axis=1)

    # sanity checks: metadata lines up with the data, and both splits contain
    # the same set of cell types
    assert list(ct_df_train["Cell"]) == list(train.columns)
    assert list(ct_df_test["Cell"]) == list(test.columns)
    train_labels = set(ct_df_train["Cell_type"])
    test_labels = set(ct_df_test["Cell_type"])
    assert train_labels == test_labels

    # save train/test cell type annotations
    ct_df_train.to_csv(f"{subrootdir}{dir_name}_filtered_celltype_train.csv")
    ct_df_test.to_csv(f"{subrootdir}{dir_name}_filtered_celltype_test.csv")
                