In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import os
import pandas as pd

Show the expression of individual genes across perturbation (guide RNA) groups, with no gating applied.

In [None]:
# Load the AnnData object from disk. NOTE(review): this is a hard-coded
# absolute path, so the notebook only runs on a machine with this mount;
# the file name suggests the cells were already filtered on Leiden
# clusters — confirm against the upstream notebook.
adata = sc.read("/mnt/sata2/Analysis_Alex_2/perturb1/final_filtered_on_leiden.h5ad")

In [None]:
# Guide RNAs whose cells are kept for the per-gene comparison below.
corresponding_guide_rnas = ["sgCd19", "sgThy1", "sgCxcr3"]

# Boolean subsetting returns a *view* of `adata`. Take an explicit copy so
# that replacing `.X` below acts on an independent object instead of
# relying on an implicit view-to-actual conversion (previously forced by
# assigning `P14s.X = None` first).
P14s = adata[adata.obs["guide_rnas"].isin(corresponding_guide_rnas)].copy()

# Store the expression matrix as a dense float64 array, converting once.
# A sparse matrix must be densified explicitly: np.array() on a sparse
# matrix yields a 0-d object array rather than the data.
new_X = P14s.X
if hasattr(new_X, "toarray"):
    new_X = new_X.toarray()
new_X = np.array(new_X, dtype=np.float64)
P14s.X = new_X

# Library-size normalisation followed by log(1 + x), both applied in place.
sc.pp.normalize_total(P14s)
sc.pp.log1p(P14s)

# Displayed as the cell output: number of cells per guide RNA.
P14s.obs.groupby("guide_rnas").size()

In [None]:
# Rebind `adata` to an independent copy of the normalised, log-transformed
# guide subset. From this cell on, `adata` no longer refers to the full
# dataset that was read from disk.
adata = P14s.copy()

In [None]:
# Output directory (relative to the current working directory) that
# receives the per-gene p-value CSVs and bar-chart PDFs written below.
savedir = "figures/overall_expression"

In [None]:
# Create the output directory together with any missing parents.
# os.mkdir would raise FileNotFoundError when "figures/" does not exist yet;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(savedir, exist_ok=True)

In [None]:
def plot_gene_change(adata, gene, save_dir):
    """Plot mean expression of one gene per guide RNA and save pairwise t-tests.

    Cells are grouped by ``adata.obs["guide_rnas"]``. For each group the mean
    of the gene's values in ``adata.X`` is drawn as a bar, with the standard
    error of the mean (sample standard deviation / sqrt(group size)) as the
    error bar. An independent two-sample t-test is run for every pair of
    groups.

    Two files are written to ``save_dir``:

    * ``{gene}_pvals.csv`` with columns ``i``, ``j``, ``p_value``, where
      ``i`` and ``j`` are positions in the sorted unique guide names
      (the same left-to-right order as the bars);
    * ``panel_s1_{gene}_mean_expression.pdf`` with the bar chart.

    Parameters
    ----------
    adata
        AnnData-like object providing ``var.index``, ``obs["guide_rnas"]``
        and an expression matrix ``X`` (dense or sparse).
    gene : str
        Name to look up in ``adata.var.index``. If the name occurs more than
        once, the first matching column is used.
    save_dir : str
        Existing directory that receives the CSV and PDF.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``gene`` is not present in ``adata.var.index``.
    """
    from scipy.stats import ttest_ind

    indices = np.where(adata.var.index == gene)[0]
    if len(indices) == 0:
        raise KeyError(f"Gene {gene!r} not found in adata.var.index")

    # Extract the gene's column once (instead of re-slicing per group) and
    # flatten it to one value per cell; densify first if X is sparse.
    expression = adata[:, indices[:1]].X
    if hasattr(expression, "toarray"):
        expression = expression.toarray()
    expression = np.asarray(expression).ravel()

    # Group cells by guide, taken from the `adata` that was passed in rather
    # than from a notebook-level global.
    guides = np.asarray(adata.obs["guide_rnas"])
    variable_names = np.unique(guides)
    vals = [expression[guides == name] for name in variable_names]

    means = [np.mean(group) for group in vals]
    # SEM uses each group's own size, not the size of the last group visited.
    sems = [np.std(group, ddof=1) / np.sqrt(len(group)) for group in vals]

    # Pairwise t-tests for every unordered pair (i < j) of guide groups.
    n_groups = len(vals)
    pvals = pd.DataFrame(
        [
            [i, j, ttest_ind(vals[i], vals[j]).pvalue]
            for i in range(n_groups)
            for j in range(i + 1, n_groups)
        ],
        columns=["i", "j", "p_value"],
    )
    pvals.to_csv(os.path.join(save_dir, f"{gene}_pvals.csv"), index=False)

    # Draw the bars exactly once, using the explicit Axes interface.
    fig, ax = plt.subplots()
    ax.bar(variable_names, means, yerr=sems, capsize=5)
    ax.set_xlabel("Guide RNA")
    ax.set_ylabel(f"Mean {gene} Expression")
    ax.set_title(f"{gene} mean expression in cells with each guide")
    ax.tick_params(axis="x", labelrotation=45)  # keep long guide names readable
    fig.tight_layout()  # avoid clipping the rotated labels
    fig.savefig(
        os.path.join(save_dir, f"panel_s1_{gene}_mean_expression.pdf"), dpi=400
    )
    plt.show()
    # Release the figure; this function is called once per gene in a loop.
    plt.close(fig)


# Genes to summarise: each one gets its own bar chart and p-value table.
genes_to_plot = [
    "Klrg1",
    "Cxcr3",
    "Thy1",
    "Gzma",
    "Gzmb",
    "Tcf7",
    "Il7r",
    "Il18",
    "Klf2",
]
for gene in genes_to_plot:
    plot_gene_change(adata, gene, savedir)