### We used Geneformer to inform our manual cell type annotation of the replicate data and to identify which cells belonged to the Epithelial and Stromal classes for axes classification

In [None]:
import sys

sys.path.append("/home/amonell/Geneformer")
from geneformer import TranscriptomeTokenizer
import numpy as np
import os

### Set the path to the reference data

In [None]:
import scanpy as sc

# Load the reference data from disk into `ad`.
reference_path = "/mnt/sata1/Analysis_Alex/timecourse_final/analysis/cleaned/final_celltyped_and_axes.h5ad"
ad = sc.read(reference_path)

In [None]:
import pandas as pd

# Export the index of ad.var as a one-column CSV (no row labels are written).
var_names = ad.var.index.tolist()
var_name_table = pd.DataFrame(var_names)
var_name_table.to_csv(r"/mnt/sata1/Analysis_Alex/Geneformer/adcsv.csv", index=None)

### Converting gene names to human homolog Ensembl IDs
We provide the gProfiler conversion file read below in the current directory

In [None]:
# Read the gProfiler conversion table, then keep only the first row for each
# distinct value of "initial_alias".
ensembl = pd.read_csv("gProfiler_hsapiens_9-18-2023_2-31-49 PM.csv")
ensembl = ensembl.drop_duplicates("initial_alias")

In [None]:
# Attach the converted Ensembl IDs to ad.var and make them the var index.
# NOTE(review): IDs are matched to the rows of ad.var purely by position
# (missing values are dropped first, then the list is cut to the number of
# rows), not by gene name -- confirm that the table rows line up with ad.var.
converted_ids = ensembl["converted_alias"].dropna().tolist()
n_var_rows = len(ad.var.index)
ad.var["ensembl_id"] = converted_ids[:n_var_rows]
ad.var.index = ad.var["ensembl_id"].tolist()

# Sum ad.X along axis 1 and flatten, giving one total per row of ad.X.
row_totals = np.sum(ad.X, axis=1)
ad.obs["n_counts"] = np.array(row_totals).flatten()

# Every row of ad.obs gets the same constant organ label.
ad.obs["organ_major"] = "SI"

In [None]:
# Create the output directory for the reference loom file. exist_ok=True lets
# this cell be re-run without raising FileExistsError when the directory is
# already there.
os.makedirs("/mnt/sata1/Analysis_Alex/Geneformer/loom_xenium", exist_ok=True)

In [None]:
# Write `ad` out as a loom file inside the loom_xenium directory.
reference_loom_path = "/mnt/sata1/Analysis_Alex/Geneformer/loom_xenium/ad.loom"
ad.write_loom(reference_loom_path)

In [None]:
import pickle

# Number the rows of ad.obs consecutively, starting at 0.
ad.obs["input_ids"] = list(range(len(ad.obs.index)))

# Map every entry of the ad.var index to True, then pickle that dictionary.
d = dict.fromkeys(ad.var.index.tolist(), True)
with open("/mnt/sata1/Analysis_Alex/Geneformer/ids.pkl", "wb") as pickle_file:
    pickle.dump(d, pickle_file)

In [None]:
# Build the Geneformer tokenizer and run it on the loom_xenium directory.
# NOTE(review): the dictionary presumably maps loom attribute names to the
# names used in the tokenized output, and the three tokenize_data arguments
# are presumably (input directory, output directory, output prefix) --
# confirm against the Geneformer API.
custom_attributes = {"Subtype": "cell_type", "organ_major": "organ_major"}
tk = TranscriptomeTokenizer(custom_attributes, nproc=4)

xenium_loom_dir = "/mnt/sata1/Analysis_Alex/Geneformer/loom_xenium"
xenium_tokenized_dir = "/mnt/sata1/Analysis_Alex/Geneformer/loom_xenium/tokenized"
tk.tokenize_data(xenium_loom_dir, xenium_tokenized_dir, "train_xenium")

### Tokenize datasets to annotate

In [None]:
import glob

# Reload the gProfiler conversion table and keep one row (the first) per
# distinct "initial_alias" value.
gprofiler_table = pd.read_csv("gProfiler_hsapiens_9-18-2023_2-31-49 PM.csv")
ensembl = gprofiler_table.drop_duplicates(subset="initial_alias")

# Non-missing converted IDs from the gProfiler table, in table order. This list
# does not depend on the replicate, so it is built once rather than once per
# loop iteration.
# NOTE(review): below, IDs are assigned to the rows of ad.var purely by
# position, not by gene name -- confirm the table rows line up with ad.var.
converted_ids = [i for i in ensembl["converted_alias"] if not pd.isna(i)]

# Convert each replicate ("day*" folder) to a loom file and tokenize it.
for filename in glob.glob("/mnt/sata1/Analysis_Alex/timecourse_replicates/day*"):
    outname = os.path.basename(filename)
    loom_dir = "/mnt/sata1/Analysis_Alex/Geneformer/loom_" + outname
    try:
        # os.mkdir raises FileExistsError if the directory already exists, so
        # a replicate whose loom directory is already present is reported by
        # the handler below and skipped.
        os.mkdir(loom_dir)
        ad = sc.read(
            os.path.join(
                "/mnt/sata1/Analysis_Alex/timecourse_replicates",
                outname,
                "adatas/06_reference_mapped.h5ad",
            )
        )

        # Use the converted IDs as the var index.
        ad.var["ensembl_id"] = converted_ids[: len(ad.var.index)]
        ad.var.index = ad.var.ensembl_id.tolist()

        # One total per row of ad.X, plus constant organ and placeholder
        # "Subtype" columns.
        ad.obs["n_counts"] = np.array(np.sum(ad.X, axis=1)).flatten()
        ad.obs["organ_major"] = "SI"
        ad.obs["Subtype"] = 0
        ad.write_loom(loom_dir + "/" + outname + ".loom")

        tk = TranscriptomeTokenizer(
            {"Subtype": "cell_type", "organ_major": "organ_major"}, nproc=4
        )
        tk.tokenize_data(
            loom_dir,
            loom_dir + "/tokenized",
            "train_" + outname,
        )
    except Exception as err:
        # Best-effort batch: report the failing replicate together with the
        # reason and move on to the next one. Catching Exception (not a bare
        # except) lets KeyboardInterrupt and SystemExit still stop the loop.
        print(f"{outname}: {type(err).__name__}: {err}")