# Preparation for LDSC mapping of GWAS results to HLCA cell types:

### Import modules

In [1]:
import scanpy as sc
import os
import pandas as pd

For pretty code formatting:

In [2]:
%load_ext lab_black

### Set paths and load files:

In [10]:
# Input: the HLCA AnnData object (h5ad file).
path_HLCA = "../../data/HLCA_core_h5ads/HLCA_v2.h5ad"
# Output: directory where we will store the top 1000 genes for every cell
# type; these gene sets will be used for the LDSC GWAS analysis.
dir_gene_sets_for_LDSC = "../../results/LDSC_GWAS_to_celltype/celltype_genesets/"
# Output: directory for the differential expression analysis (DEA) results.
dir_DEA_results_coarse_cts = "../../results/DEAs/grouped_manann_vs_rest/"

In [5]:
# Read the HLCA object from the h5ad file at `path_HLCA`; all later cells
# operate on this `adata` object.
adata = sc.read_h5ad(path_HLCA)

### Perform differential expression analysis and store the results in the format required by LDSC:

Perform differential expression analysis, one-versus-all (each cell type versus all other cells):

In [7]:
# Rank genes per group of `adata.obs["manual_ann_grouped"]` using a Wilcoxon
# test. No `reference` is passed, so this relies on scanpy's default
# comparison (presumably each group versus the rest -- confirm in the docs).
# `use_raw=False`: do not use `adata.raw` for the test
# (NOTE(review): i.e. the values in `adata.X` are tested -- confirm).
sc.tl.rank_genes_groups(
    adata, groupby="manual_ann_grouped", use_raw=False, method="wilcoxon"
)

Generate a dictionary that maps our gene names to Ensembl IDs:

In [8]:
# Lookup table: gene name (index of adata.var) -> value of the "gene_ids"
# column of adata.var for that gene.
gene_name_to_ens_id = {
    name: ensembl_id
    for name, ensembl_id in zip(adata.var.index, adata.var["gene_ids"])
}

Write the DEA results to file, and write the top 1000 genes per cell type for the LDSC GWAS stratification later on. Use Ensembl IDs rather than gene names.

In [11]:
# Make sure both output directories exist, so that the (expensive) DEA results
# are not lost to a FileNotFoundError on the first write.
os.makedirs(dir_DEA_results_coarse_cts, exist_ok=True)
os.makedirs(dir_gene_sets_for_LDSC, exist_ok=True)

# For every cell type: store the full DEA table as csv, and store the Ensembl
# ids of the top 1000 (non-HLA) genes as a one-id-per-line .GeneSet file.
for ct in adata.obs.manual_ann_grouped.unique():
    # extract the DEA dataframe for this cell type
    dea_df = sc.get.rank_genes_groups_df(adata, group=ct)
    # add ensembl ids
    dea_df["gene_ids"] = dea_df.names.map(gene_name_to_ens_id)
    # remove spaces from cell type name, so that it can be used in file names
    ct_no_spaces = ct.replace(" ", "_")
    # write the full DEA table to csv
    dea_df.to_csv(
        os.path.join(dir_DEA_results_coarse_cts, f"{ct_no_spaces}_vs_rest.csv")
    )
    # exclude HLA genes, since they have funny LD patterns
    dea_df_filt = dea_df.loc[~dea_df.names.str.startswith("HLA-"), :]
    # keep the (at most) 1000 genes with the highest scores
    dea_df_final = dea_df_filt.sort_values(by="scores", ascending=False).iloc[:1000]
    # Make sure these do not include genes with negative test statistics.
    # This check runs BEFORE the gene set file is opened, so that a failing
    # cell type does not leave an empty (or truncated) .GeneSet file behind.
    if (dea_df_final.scores < 0).any():
        raise ValueError(
            "Note that the top1000 of your DEA are partly lower-expressed genes! Exiting."
        )
    # write one ensembl id per line; the file is only written, never read back
    with open(
        os.path.join(dir_gene_sets_for_LDSC, f"{ct_no_spaces}.GeneSet"),
        "w",
    ) as f:
        f.writelines(ens_id + "\n" for ens_id in dea_df_final["gene_ids"].values)

Also store all genes (excluding HLA genes) to file, as the control gene set for LDSC:

In [12]:
# Control gene set: one line per gene in adata.var, holding its "gene_ids"
# value; genes whose name starts with "HLA-" are left out.
with open(
    os.path.join(dir_gene_sets_for_LDSC, "control.GeneSet"), "w+"
) as control_file:
    control_file.writelines(
        ensembl_id + "\n"
        for name, ensembl_id in zip(adata.var.index, adata.var.gene_ids)
        if not name.startswith("HLA-")
    )