In [None]:
import pandas as pd 
import numpy as np 
import os
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import anndata 
import scanpy as sc 
from matplotlib.pyplot import rc_context
import seaborn as sns 
from itertools import zip_longest

import utils.DE_plotting_tools as plot_utils


In [None]:
import importlib 
# Re-import the local plotting helpers (utils.DE_plotting_tools, imported above
# as plot_utils) so that edits to that module are picked up without restarting
# the kernel.
importlib.reload(plot_utils)

In [None]:
# In the all-gene DE results, look at the Olfrs enriched in KO and investigate whether they express Rhbdf2.
# Additionally, compare the WT-vs-KO DE Olfrs with the nasal occlusion DE Olfrs.
# Is the Tsukahara nasal-occlusion DE Olfr set consistent with Santoro's?

In [None]:
# DE statistics for a handful of genes of interest.
# The all-gene WT-vs-KO table is loaded here so that this cell also works on a
# fresh kernel; it previously relied on DE_df being created by a later cell.
DE_df = pd.read_csv('../DE_out/WTvKO_ALL/DE_allgene_WTvsKO_ALL.csv', index_col = 0 )
genes_of_interest = ['Rhbdf2', 'Dlg2', 'Lrrc3b', 'Kirrel2', 'S100a5', 'Pcp4l1']
DE_df.loc[DE_df.symbol.isin(genes_of_interest), ['symbol', 'logFC', 'FDR', 'PValue']]

In [None]:
# Alternative input (n6 batch only), kept for reference:
# DE_df = pd.read_csv('../DE_out/Blobel-15045/DE_allgene_WTvsKO_n6.csv', index_col = 0 )
DE_df = pd.read_csv('../DE_out/WTvKO_ALL/DE_allgene_WTvsKO_ALL.csv', index_col = 0 )

# Masks shared by both directions: gene symbol contains 'Olfr', FDR below 0.05
is_olfr = DE_df.symbol.str.contains('Olfr')
is_significant = DE_df.FDR < 0.05

# Negative logFC -> collected as wt_Olfrs, positive logFC -> collected as ko_Olfrs
wt_Olfrs = DE_df.loc[is_olfr & (DE_df.logFC < 0) & is_significant, 'symbol'].values
ko_Olfrs = DE_df.loc[is_olfr & (DE_df.logFC > 0) & is_significant, 'symbol'].values

# All significant Olfrs, WT-side first, then KO-side
diff_Olfrs = [*wt_Olfrs, *ko_Olfrs]
len(diff_Olfrs)

In [None]:
# Occlu_diff_Olfr contains the Olfrs that are differentially expressed (either up or down)
# in closed mouse nostrils (RNA-seq). The table is read from the Santoro_2020 folder;
# later cells use its `id` and `fold_diff` columns.
Occlu_diff_Olfr = pd.read_csv('../files/Santoro_2020/Occlu_diff_Olfr.csv', index_col = 0 )
# Occlu_diff_Olfr.head()

### Compare Olfr between labels

In [None]:
"""
Label every Olfr with its iR2 class (KO+_Olfr / KO-_Olfr / na) and its occlusion
class (fold_diff), then join both labels into one `group` column.
The value counts of `group` are used to draw a Venn diagram manually in PowerPoint.
"""
Occlu_diff_Olfr = pd.read_csv('../files/Santoro_2020/Occlu_diff_Olfr.csv', index_col = 0 )
Occlu_diff_Olfr = Occlu_diff_Olfr.rename(columns={'id':'symbol'})

iR2 = pd.read_csv('../DE_out/WTvKO_ALL/DE_Olfr_WTvsKO_ALL.csv', index_col=0)
# Outer merge keeps Olfrs that occur in only one of the two tables. Column names
# shared by both tables get an _x (iR2 table) / _y (occlusion table) suffix,
# which is why the DE FDR is read from FDR_x below.
label_df = pd.merge(iR2, Occlu_diff_Olfr, how='outer', on = 'symbol')

# Vectorised labelling; rows with a missing logFC/FDR fail both conditions -> 'na'
passes_fdr = label_df['FDR_x'] <= 0.2
label_df['iR2'] = np.select(
    [(label_df['logFC'] > 0) & passes_fdr,
     (label_df['logFC'] < 0) & passes_fdr],
    ['KO+_Olfr', 'KO-_Olfr'],
    default='na')

# .copy() so the assignments below modify an independent frame, not a slice of label_df
df = label_df[['symbol', 'iR2', 'fold_diff']].copy()
# Olfrs absent from the occlusion table have no fold_diff after the outer merge
df['fold_diff'] = df['fold_diff'].fillna('na')
df['group'] = df['iR2'] + '_' + df['fold_diff']

# print(df.value_counts('group'))

In [None]:
"""
Correlation of logFC between Santoro 2020 (closed/open nostril) and Blobel iR2 (KO/WT).
"""

Occlu_diff_Olfr = pd.read_csv('../files/Santoro_2020/Occlu_diff_Olfr.csv', index_col = 0 )
Occlu_diff_Olfr = Occlu_diff_Olfr.rename(columns={'id':'symbol'})

iR2 = pd.read_csv('../DE_out/WTvKO_ALL/DE_Olfr_WTvsKO_ALL.csv')
# Optional: restrict to significant Olfrs
# iR2 = iR2[iR2.FDR < 0.05]
# Inner merge: only Olfrs present in both tables can be compared
label_df = pd.merge(iR2, Occlu_diff_Olfr, how='inner', on = 'symbol')

fig, ax = plt.subplots()
ax.scatter(x = label_df['logFC'],
           y = label_df['Log2(cl/op)'])

# Symmetric limits around 0, padded by 10% beyond the largest absolute value on
# either axis. (The previous version scaled |min| by 0.9, which cut off the most
# negative points whenever they were the largest in magnitude.)
axis_limit = 1.1 * max(label_df['logFC'].abs().max(),
                       label_df['Log2(cl/op)'].abs().max())
ax.set_xlim(-axis_limit, axis_limit)
ax.set_ylim(-axis_limit, axis_limit)

# Quadrant guides and a square aspect so both logFC scales are directly comparable
ax.axhline(y=0, color='grey', linestyle='--')
ax.axvline(x=0, color='grey', linestyle='--')
ax.set_aspect('equal', adjustable='box')

ax.set_xlabel('logFC(ko/wt)')
ax.set_ylabel('Log2(cl/op)')

In [None]:
""" 
Some Olfrs are not represented among the mOSNs of the Tsukahara adata, likely
because they are rarely expressed.
Check which fraction of each Olfr group occurs as a top_Olfr in the UMAP table,
and compare it with the fraction for randomly selected Olfrs.
"""
import random

iR2 = pd.read_csv('../files/iR2_Olfr.csv', index_col = 0)
Occlu_diff_Olfr = pd.read_csv('../files/Santoro_2020/Occlu_diff_Olfr.csv', index_col = 0 )
Occlu_diff_Olfr = Occlu_diff_Olfr.rename(columns={'id':'symbol'})

umap_df = pd.read_csv('../output/WTvKO_ALL/tsukahara_occlusion/umap/umap_df.csv', index_col = 0)

# Build the lookup once; set membership replaces re-scanning unique() per Olfr
expressed_olfrs = set(umap_df.top_Olfr.dropna().unique())


def present_fraction(symbols):
    """Return (n, fraction) of `symbols` that occur in `expressed_olfrs`.

    The fraction is NaN for an empty group instead of raising ZeroDivisionError.
    """
    flags = [olfr in expressed_olfrs for olfr in symbols]
    fraction = sum(flags) / len(flags) if flags else float('nan')
    return len(flags), fraction


for ir2 in iR2.iR2.unique():
    n_olfr, fraction = present_fraction(iR2[iR2.iR2 == ir2].symbol)
    print(f'{ir2}: n = {n_olfr}, present_pct = {np.round(fraction, 2)}')
for _diff in Occlu_diff_Olfr.fold_diff.unique():
    n_olfr, fraction = present_fraction(Occlu_diff_Olfr[Occlu_diff_Olfr.fold_diff == _diff].symbol)
    print(f'{_diff}: n = {n_olfr}, present_pct = {np.round(fraction, 2)}')

print("")

# Random baseline: repeatedly draw 30 Olfrs from the full DE table
all_olfr = pd.read_csv('../DE_out/WTvKO_ALL/DE_Olfr_WTvsKO_ALL.csv')['symbol'].reset_index(drop = True)
n_iterations = 100   # renamed from `iter`, which shadowed the builtin
sample_size = 30
rng = random.Random(0)   # fixed seed so the baseline is reproducible across runs
random_average = []
for _ in range(n_iterations):
    random_olfr = all_olfr.iloc[rng.sample(range(len(all_olfr)), sample_size)]
    _, fraction = present_fraction(random_olfr)
    random_average.append(fraction)
print(f'Random average: iterations = {n_iterations}, average = {np.round(np.mean(random_average), 2)}')


### Santoro Nares Occlusion diff Olfrs

##### Blobel analysis with Occlu label

In [None]:
# Volcano plot: WT-vs-KO Olfr DE, coloured by occlusion class

plot_df = pd.read_csv('../DE_out/Blobel-15045/DE_Olfr_WTvsKO_n6.csv', index_col = 0 )
# Load the occlusion table here: earlier cells rebind Occlu_diff_Olfr with `id`
# renamed to `symbol`, which breaks the `.id` look-ups below on a top-to-bottom run.
Occlu_diff_Olfr = pd.read_csv('../files/Santoro_2020/Occlu_diff_Olfr.csv', index_col = 0 )

open_ids = Occlu_diff_Olfr.loc[Occlu_diff_Olfr.fold_diff == 'Open+_Olfr', 'id']
close_ids = Occlu_diff_Olfr.loc[Occlu_diff_Olfr.fold_diff == 'Close+_Olfr', 'id']

# (legend name, row mask, colour) for each trace
trace_specs = [
    ('NA', ~plot_df.symbol.isin(Occlu_diff_Olfr.id), 'grey'),
    ('Occlu_down', plot_df.symbol.isin(open_ids), '#990011'),
    ('Occlu_up', plot_df.symbol.isin(close_ids), '#317773'),
]

fig = go.Figure()
plotted_frames = []
for trace_name, row_mask, trace_color in trace_specs:
    subset = plot_df[row_mask]
    plotted_frames.append(subset)
    fig.add_trace(go.Scatter(x=subset['logFC'],
                             y=-np.log10(subset['FDR']),
                             text=subset['symbol'],
                             mode='markers',
                             name=trace_name,
                             marker=dict(size = 10, color = trace_color, opacity=0.3)))

# Fix both axis ranges (symmetric in x) so they do not auto-resize when traces
# are toggled in the legend. Ranges are taken from the plotted rows only.
plotted = pd.concat(plotted_frames)
x_max = plotted['logFC'].abs().max()
y_max = (-np.log10(plotted['FDR'])).max()

fig.update_layout(xaxis_range = [(-x_max*1.1), (x_max*1.1)],
                  yaxis_range =[-1, (y_max*1.1)])

# Horizontal significance threshold at FDR = 0.05, i.e. y = -log10(0.05).
# xref='paper' makes the line span the full plot width for any x range.
fig.add_shape(type='line', xref='paper', x0=0, x1=1,
              y0=-np.log10(0.05), y1=-np.log10(0.05),
              line=dict(color='violet', width=3, dash='dash'))

# The y values are -log10(FDR), so label them as such in the hover box and axis
fig.update_traces(
    textposition='top center',
    hovertemplate = '<b>%{text}</b>' + '<br>LogFC: %{x}'+ '<br>-log10(FDR): %{y}<br>')

fig.update_layout(
    title='Rhbdf2 DE',
    xaxis_title='logFC (KO/WT)',
    yaxis_title='-log10(FDR)',
    autosize=True,
    template='simple_white'
)

fig.show()
# fig.write_html("../output/Blobel_15045/Santoro_occlusion//WTvsKO_SantoroOcclu.html")


In [None]:
# Box + scatter plot of WT-vs-KO logFC for each occlusion class
plot_df = pd.read_csv('../DE_out/Blobel-15045/DE_Olfr_WTvsKO_n6.csv', index_col = 0 )
# Load the occlusion table here: earlier cells rebind Occlu_diff_Olfr with `id`
# renamed to `symbol`, which breaks the merge on `id` on a top-to-bottom run.
Occlu_diff_Olfr = pd.read_csv('../files/Santoro_2020/Occlu_diff_Olfr.csv', index_col = 0 )

plot_df = pd.merge(plot_df, Occlu_diff_Olfr,
                   left_on='symbol', right_on='id', how='left')

# Class -> colour; the key order is also the plotting order
occlu_colors = {'Open+_Olfr': '#990011', 'Close+_Olfr': '#317773', 'na': 'grey'}

# Olfrs absent from the occlusion table have no fold_diff after the left merge
plot_df['fold_diff'] = pd.Categorical(plot_df['fold_diff'].fillna('na'), list(occlu_colors))
plot_df = plot_df.sort_values('fold_diff')

# FDR_x is the FDR from the DE table (the merge suffixes the overlapping column)
plot_df = plot_df[plot_df.FDR_x < 0.7]

# Colours are mapped by class name, so they stay correct even if a class has no
# rows left after the FDR filter (index-based assignment would shift them).
fig = px.box(plot_df,
             x="fold_diff",
             y="logFC",
             color = 'fold_diff',
             color_discrete_map = occlu_colors,
             points="all")

fig.update_yaxes(zeroline=True, zerolinewidth=2, zerolinecolor='LightPink')

# Pairwise comparisons between the three boxes, addressed by trace index
fig = plot_utils.add_p_value_annotation(fig, [[0,1],[1,2], [0,2]], test_type='ranksums')

fig.update_layout(
    yaxis = {'title' : ''},
    xaxis = {'title' : ''},
    title='<br>\
    <span style="font-size: 10px;"> </span>',
    autosize=True,
    template='simple_white',
    margin=dict(l=50,r=50,b=100,t=100,pad=10),
    font=dict(
        size=15,  # Set the font size here
    )
)

fig.show()
# fig.write_html("../output/Blobel_15045/Santoro_occlusion//WTvsKO_SantoroOcclu_logFC_box_FDR07.html")


In [None]:
# TODO: run a significance test between WT and KO within the up, down, and na groups

In [None]:
# Box + scatter plot of logFC, split by occlusion class and by DE direction (WT / KO)
plot_df = pd.read_csv('../DE_out/Blobel-15045/DE_Olfr_WTvsKO_n6.csv', index_col = 0 )
# Load the occlusion table here: earlier cells rebind Occlu_diff_Olfr with `id`
# renamed to `symbol`, which breaks the merge on `id` on a top-to-bottom run.
Occlu_diff_Olfr = pd.read_csv('../files/Santoro_2020/Occlu_diff_Olfr.csv', index_col = 0 )

plot_df = pd.merge(plot_df, Occlu_diff_Olfr,
                   left_on='symbol', right_on='id', how='left')

# Class -> colour; the key order is also the plotting order
occlu_colors = {'Open+_Olfr': '#990011', 'Close+_Olfr': '#317773', 'na': 'grey'}

# Olfrs absent from the occlusion table have no fold_diff after the left merge
plot_df['fold_diff'] = pd.Categorical(plot_df['fold_diff'].fillna('na'), list(occlu_colors))
plot_df = plot_df.sort_values(['fold_diff', 'logFC'])

# .copy() so the `group` column is added to an independent frame, not a filtered view
plot_df = plot_df[plot_df.FDR_x < 0.7].copy()

# Separate in groups for plotting and calculating significance:
# negative logFC -> 'WT_<class>', positive logFC -> 'KO_<class>'.
# Rows with logFC == 0 keep a missing group, as before.
direction = pd.Series(np.nan, index=plot_df.index, dtype=object)
direction[plot_df.logFC < 0] = 'WT'
direction[plot_df.logFC > 0] = 'KO'
plot_df['group'] = direction + '_' + plot_df['fold_diff'].astype(str)

# Both directions of a class share the class colour; mapping by group name keeps
# the colours right even when one of the six groups is empty.
group_colors = {f'{side}_{occlu_class}': color
                for occlu_class, color in occlu_colors.items()
                for side in ('WT', 'KO')}

fig = px.box(plot_df,
             x="group",
             y="logFC",
             color = 'group',
             color_discrete_map = group_colors,
             points="all")

fig.update_yaxes(zeroline=True, zerolinewidth=2, zerolinecolor='LightPink')

# WT-vs-KO comparison within each occlusion class, addressed by trace index
fig = plot_utils.add_p_value_annotation(fig, [[0,1],[2,3], [4,5]], test_type='ranksums')

fig.update_layout(
    yaxis = {'title' : ''},
    xaxis = {'title' : ''},
    title='<br>\
    <span style="font-size: 10px;"> </span>',
    autosize=True,
    template='simple_white',
    margin=dict(l=50,r=50,b=100,t=100,pad=10),
    font=dict(
        size=15,  # Set the font size here
    )
)

fig.show()
# fig.write_html("../output/Blobel_15045/Santoro_occlusion//WTvsKO_SantoroOcclu_logFC_box_FDR07.html")


In [None]:
# Olfr count bar graph: for each occlusion class and DE direction, bar height is
# the per-sample total of the selected Olfrs averaged over the six samples, and
# the bar label is the number of Olfrs in the group.

logfc_cutoff = 0
fdr_cutoff = 0.7

de_df = pd.read_csv('../DE_out/Blobel-15045/DE_Olfr_WTvsKO_n6.csv', index_col = 0 )
# Load the occlusion table here: earlier cells rebind Occlu_diff_Olfr with `id`
# renamed to `symbol`, which breaks the `.id` look-ups below on a top-to-bottom run.
Occlu_diff_Olfr = pd.read_csv('../files/Santoro_2020/Occlu_diff_Olfr.csv', index_col = 0 )

sample_cols = ['No1', 'No2', 'No3', 'No4', 'No5', 'No6']

# (group prefix, fold_diff label in the occlusion table or None for "not listed", colour)
occlu_classes = [('OccluDown', 'Open+_Olfr', '#990011'),
                 ('OccluUp', 'Close+_Olfr', '#317773'),
                 ('na', None, 'grey')]

passes_fdr = de_df.FDR <= fdr_cutoff
# With logfc_cutoff == 0 a row with logFC exactly 0 counts for both directions
direction_masks = [('WT', de_df.logFC <= -logfc_cutoff),
                   ('KO', de_df.logFC >= logfc_cutoff)]

records = []
for group_prefix, fold_label, color in occlu_classes:
    if fold_label is None:
        in_class = ~de_df.symbol.isin(Occlu_diff_Olfr.id)
    else:
        class_ids = Occlu_diff_Olfr.loc[Occlu_diff_Olfr.fold_diff == fold_label, 'id']
        in_class = de_df.symbol.isin(class_ids)
    for direction, in_direction in direction_masks:
        selected = de_df.loc[in_class & in_direction & passes_fdr, sample_cols]
        records.append({'group': f'{group_prefix}_{direction}',
                        'color': color,
                        'n_olfr': len(selected),
                        # sum over Olfrs per sample, then mean over samples
                        'mean': selected.sum().mean()})

# plot_df ends up holding the per-group summary, as in the previous version
plot_df = pd.DataFrame(records, columns = ['group', 'color', 'n_olfr', 'mean'])

# Create a figure and plot it
fig = go.Figure()
fig.add_trace(
    go.Bar(x = plot_df['group'],
           y = plot_df['mean'],
           text = plot_df['n_olfr'],
           textposition='outside',
           marker=dict(
               color=plot_df['color'])
          )
)

# fig = plot_utils.add_p_value_annotation(fig, [[0,1]], test_type='ranksums')

# Leave 10% headroom so the outside text labels are not clipped
fig.update_layout(yaxis_range =[0, (plot_df['mean'].max()*1.1)])

fig.update_layout(
    yaxis = {'title' : ''},
    xaxis = {'title' : ''},
    title='<br>\
    <span style="font-size: 10px;"> </span>',
    autosize=True,
    # width=1000,
    # height=800,
    template='simple_white',
    font=dict(
        size=15,  # Set the font size here
    )
)
fig.show()
# fig.write_html("../output/Blobel_15045/Santoro_occlusion//WTvsKO_SantoroOcclu_bar.html")


### Tsukahara Naris Occlusion 

In [None]:
adata = anndata.read_csv('../files/Tsukahara_2021/GSE173947_ChronicOccl_umi_counts.csv')
meta = pd.read_csv('../files/Tsukahara_2021/GSE173947_ChronicOccl_metadata.csv', index_col = 0 )

# In the nares-occlusion experiment: R = occluded (closed) nostril, L = open nostril
for side_code, nostril_state in {'L': 'open', 'R': 'close'}.items():
    meta.loc[meta.nostril == side_code, 'nostril'] = nostril_state

# Occlusion class of each cell's top Olfr (DE up/down genes, Santoro et al., 2020, Cell).
# 'Open+_Olfr' in the table is relabelled 'Close-_Olfr' here; cells whose Olfr is not listed stay 'na'.
Occlu_diff_Olfr = pd.read_csv('../files/Santoro_2020/Occlu_diff_Olfr.csv', index_col = 0 )
meta['fold_diff'] = 'na'
occlu_relabel = {'Open+_Olfr': 'Close-_Olfr', 'Close+_Olfr': 'Close+_Olfr'}
for table_label, cell_label in occlu_relabel.items():
    class_ids = Occlu_diff_Olfr[Occlu_diff_Olfr.fold_diff == table_label].id.values
    meta.loc[meta.top_Olfr.isin(class_ids), 'fold_diff'] = cell_label

# iR2 class of each cell's top Olfr, from the DE of ALL WT-vs-KO Rhbdf2 samples (FDR < 0.2)
iR2 = pd.read_csv('../DE_out/WTvKO_ALL/DE_Olfr_WTvsKO_ALL.csv', index_col = 0 )
meta['iR2'] = 'na'
for ir2_label, in_direction in (('KO-_Olfr', iR2.logFC < 0), ('KO+_Olfr', iR2.logFC > 0)):
    class_symbols = iR2[in_direction & (iR2.FDR < 0.2)].symbol
    meta.loc[meta.top_Olfr.isin(class_symbols), 'iR2'] = ir2_label

# Attach the per-cell metadata to the AnnData observations (joined on the cell index)
adata.obs = pd.merge(adata.obs, meta, left_index = True, right_index = True)
# Keep an untouched copy before any filtering / normalisation
raw_adata = adata.copy()
# or_adata = adata[:,adata.var.index.str.contains('Olfr')].copy()

##### preprocessing ... 

In [None]:
# Scanpy session setup: logging verbosity, header printout, and default figure styling
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Path of the processed AnnData file; later cells read the analysis results from it
results_file = '../files/Tsukahara_2021/GSE173947_ChronicOccl.h5ad'  # the file that will store the analysis results

In [None]:
# Basic QC filtering, applied to adata in place:
# keep cells with at least 200 detected genes, then genes detected in at least 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Flag mitochondrial genes by their 'mt-' name prefix and compute per-cell QC metrics
# (inplace=True stores them on adata; pct_counts_mt, total_counts and
# n_genes_by_counts are the columns plotted and filtered on below)
adata.var['mt'] = adata.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Diagnostic scatter plots used to choose the QC cutoffs in the next cell
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
# Visualize the graphs above to decide the cutoff.
# Slicing an AnnData returns a view; .copy() makes it a real object so the
# in-place steps below do not operate on (and implicitly copy) a view.
adata = adata[adata.obs.pct_counts_mt < 6, :].copy()
# adata = adata[adata.obs.n_genes_by_counts < 2500, :]

# Library-size normalisation to 10,000 counts per cell, then log1p
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# Freeze the normalised, log-transformed data (all genes) as .raw before subsetting;
# later cells read gene expression from adata.raw
adata.raw = adata

# Continue with the highly variable genes only
adata = adata[:, adata.var.highly_variable].copy()
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)


In [None]:
# Dimensionality reduction, neighbourhood graph, and clustering
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.leiden(adata, resolution = 0.2)
sc.tl.paga(adata)
sc.pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph

# A PAGA-initialised `sc.tl.umap(adata, init_pos='paga')` used to run here and was
# immediately overwritten by the default-initialised call below, so it only cost
# compute time. The default call is kept because it produced the stored embedding.
# NOTE(review): swap in `init_pos='paga'` if the PAGA layout was the intended one.
sc.tl.umap(adata)

# sc.tl.leiden(adata)

In [None]:
# UMAP overviews coloured by: leiden cluster; nostril state and the two Olfr
# class labels stored in adata.obs; and expression of Rhbdf2 and S100a5
sc.pl.umap(adata, color=['leiden'], size = 5)
sc.pl.umap(adata, color=['nostril', 'fold_diff', 'iR2'], size = 5)
sc.pl.umap(adata, color=['Rhbdf2', 'S100a5'], size = 5)

In [None]:
# adata.write(results_file)


# Exception: save under a separate '_2' file name so that the existing results_file is not overwritten.
# NOTE(review): the following cells read results_file, not this '_2' file — confirm which one is intended.
adata.write('../files/Tsukahara_2021/GSE173947_ChronicOccl_2.h5ad')

#### Rank genes groups (Leiden clusters)

In [None]:
# Reload the processed AnnData from disk so this section does not depend on the preprocessing cells above
results_file = '../files/Tsukahara_2021/GSE173947_ChronicOccl.h5ad'  # the file that will store the analysis results
adata = sc.read(results_file)

In [None]:
# Rank genes per leiden cluster with the Wilcoxon method; later cells read the result from adata.uns['rank_genes_groups']
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

In [None]:
# Plot the top 25 ranked genes per group, each panel with its own y axis
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# First two ranked genes of every leiden cluster, as {cluster: [gene, gene]}
top_ranked = pd.DataFrame(adata.uns['rank_genes_groups']['names']).iloc[:2]
diff_genes = top_ranked.to_dict(orient='list')

# Expression of those genes per cluster, colour scale fixed to [0, 5]
sc.pl.matrixplot(adata, diff_genes,
                 groupby='leiden',
                 cmap = 'viridis',
                 vmin = 0,
                 vmax = 5)

In [None]:
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

# Two columns per group: '<group>_n' (gene names) and '<group>_s' (scores)
rgg_columns = {
    f'{group}_{key[:1]}': result[key][group]
    for group in groups
    for key in ('names', 'scores')
}

# Save the first 100 ranked genes of every group to csv
pd.DataFrame(rgg_columns).iloc[:100].to_csv('../output/Blobel_15045/tsukahara_occlusion/rank_gene_groups/rgg_score_lieden.csv')

#### umap, genes visualization

In [None]:
"""
Build `umap_df`: the per-cell metadata (adata.obs) plus the UMAP coordinates and
the expression of a few genes of interest read from adata.raw.
"""
results_file = '../files/Tsukahara_2021/GSE173947_ChronicOccl.h5ad'
adata = anndata.read_h5ad(results_file)

# Extract meta with umap coordinates (columns 0 and 1 of the embedding)
umap_df = adata.obs.copy()
umap_df['umap_x'] = adata.obsm['X_umap'][:, 0]
umap_df['umap_y'] = adata.obsm['X_umap'][:, 1]

# Genes to visualise; keep only those that are actually present in adata.raw
interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']
interested_genes = list(adata.raw.var_names[adata.raw.var_names.isin(interested_genes)])

# One '<gene>_counts' column per gene.
# NOTE(review): the values are whatever adata.raw stores — confirm whether these
# are raw counts or normalised values before describing them as counts.
for _gene in interested_genes:
    _expr = adata.raw.X[:, adata.raw.var_names == _gene]
    if hasattr(_expr, 'toarray'):
        # sparse storage: densify before turning it into a column
        _expr = _expr.toarray()
    # flatten the (n_cells, 1) slice into a plain 1-D column
    umap_df[f'{_gene}_counts'] = np.asarray(_expr).ravel()

# umap_df.to_csv('../output/fig_image/umap/Tsukahara_narisOcclu/umap_df.csv')

In [None]:
# Persist the UMAP / expression table.
# NOTE(review): other cells read umap_df.csv from different directories
# (../output/WTvKO_ALL/... and ../output/Blobel_15045/...) — confirm whether those are meant to be this file.
umap_df.to_csv('../output/fig_image/umap/Tsukahara_narisOcclu/umap_df.csv')

In [None]:
# Side-by-side UMAPs, one panel per gene
fig, axes = plt.subplots(1, 2, figsize=(11, 5))
for panel, panel_gene in zip(axes, ('Rhbdf2', 'S100a5')):
    sc.pl.umap(adata, color = [panel_gene], size = 15,
               frameon=False, show=False, ax=panel)

In [None]:
"""
Individual UMAPs of the genes of interest, for the figure.
"""

interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']
for _gene in interested_genes:
    # Exact column name instead of a substring search: a substring match can pick
    # an unrelated column, and indexing [0] raised IndexError for a missing gene.
    _gene_col = f'{_gene}_counts'
    if _gene_col not in umap_df.columns:
        print(f'{_gene}: column {_gene_col} not found in umap_df, skipped')
        continue

    # Sort so that the highest-expressing cells are drawn last (on top)
    plot_df = umap_df.sort_values(_gene_col).copy()
    fig = px.scatter(x = plot_df.umap_x,
                     y = plot_df.umap_y,
                     color = plot_df[_gene_col],
                     color_continuous_scale=[(0, '#DBE5EB'),(0.5, '#67879B'),  (1, '#073763')] # Dark Blue
                    )

    fig.update_traces(marker={'size': 5})
    fig.update_layout(
                    title = f'{_gene} expression',
                    #   width=500,
                    height=600,
                    plot_bgcolor='rgba(0,0,0,0)',
                    xaxis_visible=False,
                    yaxis_visible=False,
    )
    # Same scale on both axes so the embedding is not stretched
    fig.update_yaxes(
        scaleanchor = "x",
        scaleratio = 1,
    )
    fig.show()
    # fig.write_html(f'../output/fig_image/umap/{_gene}_MOE.html')

In [None]:
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.cm import ScalarMappable
from matplotlib.ticker import ScalarFormatter

# Grid of UMAPs, one panel per gene of interest, each with its own colorbar.

# custom_cmap = LinearSegmentedColormap.from_list('custom_cmap', [(0, '#DBE5EB'),(0.5, '#67879B'),  (1, '#073763')])
custom_cmap = 'viridis'

# Set to a number (e.g. 3) to force the same upper colour limit in every panel,
# for instance to match the scale of the iR2 / iR1 figures. None = per-gene maximum.
# The colour limit and the colorbar labels now always agree; previously the top
# tick was labelled "3" whatever the real maximum was.
shared_vmax = None

# Create subplot with multiple axes
interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']
# interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17']

fig, axes = plt.subplots(4, 2, figsize=(12, 20)
                        #  sharex=True, sharey=True
                         )
for ax, gene in zip_longest(axes.flatten(), interested_genes):
    if ax is None:
        # more genes than panels: nothing left to draw on
        break

    # Exact column name instead of a substring search (which could match the wrong column)
    gene_col = f'{gene}_counts' if gene else None
    if gene_col is None or gene_col not in umap_df.columns:
        # unused panel, or gene without an expression column: leave it blank
        ax.axis("off")
        continue

    # Sort so that the highest-expressing cells are drawn last (on top)
    plot_df = umap_df.sort_values(gene_col).copy()
    value_min = plot_df[gene_col].min()
    value_mid = plot_df[gene_col].median()
    value_max = plot_df[gene_col].max() if shared_vmax is None else shared_vmax

    # Plot scatter plot for each gene
    scatter = ax.scatter(plot_df['umap_x'], plot_df['umap_y'],
                         c=plot_df[gene_col], cmap=custom_cmap, s=1,
                         vmin=value_min,
                         vmax=value_max)
    ax.set_title(f'{gene} expression')
    ax.axis("off")
    ax.set_aspect('equal')

    # Colorbar taken from the scatter itself, with ticks at min / median / max
    # labelled with their actual values
    fig.colorbar(scatter, ax=ax, label=gene,
                 ticks=[value_min, value_mid, value_max],
                 format='%.1f',
                 shrink=0.5)

# tight_layout recomputes the subplot spacing, so no manual subplots_adjust is needed
fig.tight_layout()

# plt.savefig('../output/fig_image/umap/Tsukahara_narisOcclu/umap_activitygenes.png')
# plt.savefig('../output/fig_image/umap/Tsukahara_narisOcclu/umap_activitygenes_2.png')


####  open vs close nostril

##### Rank gene group

In [None]:
# Reload the processed AnnData from disk for the open-vs-close nostril comparison
results_file = '../files/Tsukahara_2021/GSE173947_ChronicOccl.h5ad'  # the file that will store the analysis results
adata = sc.read(results_file)

In [None]:
# Rank genes between the nostril conditions (Wilcoxon)
sc.tl.rank_genes_groups(adata, 'nostril', method='wilcoxon')

result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

# Two columns per condition: '<group>_n' (gene names) and '<group>_s' (scores)
rgg_nostril_columns = {
    f'{group}_{key[:1]}': result[key][group]
    for group in groups
    for key in ('names', 'scores')
}

# Show the first 100 ranked genes; append the .to_csv call below to save them
pd.DataFrame(rgg_nostril_columns).iloc[:100]
# .to_csv('../output/Blobel_15045/tsukahara_occlusion/rank_gene_groups/rgg_score_nostril.csv')

In [None]:
# Plot the top 20 ranked genes for each nostril condition
sc.pl.rank_genes_groups(adata, n_genes=20)

In [None]:
# First five ranked genes of every nostril condition, as {condition: [genes]}
top_ranked = pd.DataFrame(adata.uns['rank_genes_groups']['names']).iloc[:5]
diff_genes = top_ranked.to_dict(orient='list')

# Optional arguments that were tried: vmin = 0, vmax = 5,
# standard_scale='var', colorbar_title='column scaled\nexpression'
sc.pl.matrixplot(adata, diff_genes,
                 groupby='nostril',
                 cmap = 'viridis')

##### umap 

In [None]:
# Load a saved UMAP table; the next cell uses its umap_x, umap_y and nostril columns.
# NOTE(review): this path differs from the one umap_df.csv is written to earlier in the notebook — confirm it is the same table.
umap_df = pd.read_csv('../output/Blobel_15045/tsukahara_occlusion/umap/umap_df.csv', index_col=0)

In [None]:
# Occlusion umap: cells coloured by nostril state
manual_color = ['#624185', '#f2ad73']

plot_df = umap_df.copy()
# color_discrete_sequence hands the colours to the traces in creation order.
# Unlike assigning manual_color[i] per trace afterwards, it cannot raise
# IndexError when there are more categories than colours.
fig = px.scatter(x = plot_df.umap_x,
                 y = plot_df.umap_y,
                 color = plot_df.nostril,
                 color_discrete_sequence = manual_color
                 )

fig.update_traces(marker={'size': 3})

fig.update_layout(
                  width=600, height=600,
                  plot_bgcolor='rgba(0,0,0,0)',
                  xaxis_visible=False,
                  yaxis_visible=False,
    )
fig.show()
# fig.write_html('../output/Blobel_15045/tsukahara_occlusion/umap/Nostril_umap.html')

##### iR2 expression between Olfr across Nostril

In [None]:
# Mean Rhbdf2 value (from adata.raw) per top_Olfr and nostril
_rhbdf2 = adata.raw.X[:, adata.raw.var_names == 'Rhbdf2']
if hasattr(_rhbdf2, 'toarray'):
    # sparse storage: densify before building the frame
    _rhbdf2 = _rhbdf2.toarray()

# Both keys are made categorical and grouped with observed=False on purpose:
# every (top_Olfr, nostril) combination then gets a row, with NaN where an Olfr
# has no cells in that nostril. The following cells rely on those NaN rows to
# drop Olfrs that are missing in one nostril. Spelling this out removes the
# dependence on the obs dtypes and on the pandas default for `observed`.
count_df = pd.DataFrame({
    'top_Olfr': pd.Categorical(adata.obs['top_Olfr'].values),
    'nostril': pd.Categorical(adata.obs['nostril'].values),
    'Rhbdf2_counts': np.asarray(_rhbdf2).ravel(),
})
count_df = (count_df
            .groupby(['top_Olfr', 'nostril'], observed=False)['Rhbdf2_counts']
            .mean()
            .reset_index())

# Add back iR2 information (one iR2 / fold_diff label per top_Olfr)
count_df = pd.merge(count_df,
                    adata.obs[['top_Olfr', 'iR2', 'fold_diff']].reset_index(drop = True).drop_duplicates(),
                    on = 'top_Olfr', how ='left')

In [None]:

from scipy import stats

# Line graph of the mean Rhbdf2 value per Olfr in the closed vs open nostril,
# annotated with a paired t-test across Olfrs.

count_df = count_df.sort_values(['iR2', 'nostril'])

# Filter out Olfrs that only have cells in either the open or the closed nostril
# (their mean for the missing nostril is NaN)
na_olfr = count_df[count_df.Rhbdf2_counts.isna()].top_Olfr.values
count_df = count_df[~(count_df.top_Olfr.isin(na_olfr))]

fig = go.Figure()

# Adding a line for each unique Olfr; every Olfr gets its own legend entry
for olfr in count_df['top_Olfr'].unique():
    subset = count_df[count_df['top_Olfr'] == olfr]

    fig.add_trace(go.Scatter(
        x=subset['nostril'],
        y=subset['Rhbdf2_counts'],
        mode='lines+markers',
        hovertemplate=olfr,
        name = olfr,
        showlegend=True,
        opacity=0.5,
        # line=dict(color=color_iR2[iR2_value])
    ))


# Since each Olfr is its own trace in fig.data, calculate the significance
# manually and pass it to the annotation helper.
# Close and open values are paired explicitly on top_Olfr, so the paired test
# does not depend on the two subsets happening to be in the same row order.
value_cols = ['top_Olfr', 'Rhbdf2_counts']
paired = pd.merge(count_df.loc[count_df.nostril == 'close', value_cols],
                  count_df.loc[count_df.nostril == 'open', value_cols],
                  on='top_Olfr', suffixes=('_close', '_open'))
t, p = stats.ttest_rel(paired['Rhbdf2_counts_close'],
                       paired['Rhbdf2_counts_open'])

if p >= 0.05:
    significance = 'ns'
elif p >= 0.01:
    significance = '*'
elif p >= 0.001:
    significance = '**'
else:
    significance = '***'
p_list = [f'{significance} <br>p {round(p, 3)}<br>t {round(t,3)}']


fig = plot_utils.add_p_value_annotation(fig, [[0,1]],
                                        just_annotate = p_list,
                                        test_type = 'ranksums',
                                        y_padding = False)

# Update layout
fig.update_layout(
    title='Line Graph of Rhbdf2 Counts by Nostril',
    xaxis_title='Nostril',
    yaxis_title='Rhbdf2 average per cell',
    template='simple_white',
    margin=dict(l=50,r=50,b=100,t=100,pad=10),
    font=dict(
        size=10,  # Set the font size here
    )
)

# Show the plot
fig.show()
# fig.write_html('../output/Blobel_15045/tsukahara_occlusion/Rhbdf2_exp/Nostril_Rhbdf2mean.html')

In [None]:

from scipy import stats

# Line colour for each iR2 label. 'KO-_Olfr' is accepted alongside 'WT+_Olfr' because
# the iR2 labelling and UMAP cells of this notebook name that set 'KO-_Olfr' and draw
# it in the same green; without the key such rows would raise KeyError below.
color_iR2 = {'KO+_Olfr': '#EF5350', 
             'WT+_Olfr': '#66BB6A', 
             'KO-_Olfr': '#66BB6A', 
             'na': '#D3D3D3'}

# x-axis category = "<iR2 label>_<nostril>", ordered by label then nostril
count_df['combined_category'] = count_df['iR2'].astype(str) + '_' + count_df['nostril'].astype(str)
count_df = count_df.sort_values(['iR2', 'nostril'])

# Filter out Olfrs that have a missing Rhbdf2 count in either nostril
na_olfr = count_df[count_df.Rhbdf2_counts.isna()].top_Olfr.values
count_df = count_df[~(count_df.top_Olfr.isin(na_olfr))]

fig = go.Figure()

# iR2 labels that already own a legend entry (one entry per label, not per Olfr)
first_occurrence = set()

# One line per Olfr
for olfr in count_df['top_Olfr'].unique():
    subset = count_df[count_df['top_Olfr'] == olfr]
    iR2_value = subset.iR2.unique().astype(str)[0]

    show_legend = iR2_value not in first_occurrence
    first_occurrence.add(iR2_value)

    fig.add_trace(go.Scatter(
        x=subset['combined_category'], 
        y=subset['Rhbdf2_counts'], 
        mode='lines+markers', 
        hovertemplate=olfr, 
        name=iR2_value,          # legend text is the iR2 label
        legendgroup=iR2_value,   # toggling the legend entry toggles the whole label
        showlegend=show_legend,  # only the first trace of each label is listed
        opacity=0.5, 
        line=dict(color=color_iR2[iR2_value])
    ))

# Each Olfr is its own trace in fig.data, so the significance labels are computed here
# and handed to the annotation helper as ready-made text.
p_list = []
for g in count_df.iR2.unique():
    group_counts = count_df[count_df.iR2 == g]
    # Pair close/open explicitly on top_Olfr: a paired t-test is only valid when row k
    # of both samples is the same Olfr, which positional slicing does not guarantee.
    paired = pd.merge(
        group_counts.loc[group_counts.nostril == 'close', ['top_Olfr', 'Rhbdf2_counts']],
        group_counts.loc[group_counts.nostril == 'open', ['top_Olfr', 'Rhbdf2_counts']],
        on='top_Olfr', suffixes=('_close', '_open'), validate='one_to_one')
    t, p = stats.ttest_rel(paired['Rhbdf2_counts_close'], paired['Rhbdf2_counts_open'])

    if np.isnan(p):
        stars = 'n/a'   # too few pairs for a test; never report this as significant
    elif p >= 0.05:
        stars = 'ns'
    elif p >= 0.01:
        stars = '*'
    elif p >= 0.001:
        stars = '**'
    else:
        stars = '***'
    p_list.append(f'{stars} <br>p {round(p, 3)}<br>{g}')

# One (close, open) column pair per iR2 label: [0,1], [2,3], ...
annotation_pairs = [[2 * k, 2 * k + 1] for k in range(len(p_list))]
fig = plot_utils.add_p_value_annotation(fig, annotation_pairs, 
                                        just_annotate = p_list, 
                                        test_type = 'ttest_rel',  # the test actually run above
                                        y_padding = False)

# Update layout
fig.update_layout(
    title='Line Graph of Rhbdf2 Counts by Nostril',
    xaxis_title='Nostril',
    yaxis_title='Rhbdf2 average per cell',
    template='simple_white',
    margin=dict(l=50,r=50,b=100,t=100,pad=10),
    font=dict(
        size=10,  # Set the font size here
    )
)

# Show the plot
fig.show()
# fig.write_html('../output/Blobel_15045/tsukahara_occlusion/Rhbdf2_exp/Nostril_iR2_Rhbdf2mean.html')

In [None]:


# Column of count_df used to group and colour the Olfr lines
group = 'fold_diff'
color_group = {'Close+_Olfr': '#990011', 
               'Open+_Olfr': '#317773', 
               'na': '#D3D3D3'}

# x-axis category = "<group label>_<nostril>", ordered by label then nostril
count_df['combined_category'] = count_df[group].astype(str) + '_' + count_df['nostril'].astype(str)
count_df = count_df.sort_values([group, 'nostril'])

# Filter out Olfrs that have a missing Rhbdf2 count in either nostril
na_olfr = count_df[count_df.Rhbdf2_counts.isna()].top_Olfr.values
count_df = count_df[~(count_df.top_Olfr.isin(na_olfr))]

fig = go.Figure()

# Group labels that already own a legend entry (one entry per label, not per Olfr)
first_occurrence = set()

# One line per Olfr
for olfr in count_df['top_Olfr'].unique():
    subset = count_df[count_df['top_Olfr'] == olfr]
    group_value = subset[group].unique().astype(str)[0]

    show_legend = group_value not in first_occurrence
    first_occurrence.add(group_value)

    fig.add_trace(go.Scatter(
        x=subset['combined_category'], 
        y=subset['Rhbdf2_counts'], 
        mode='lines+markers', 
        hovertemplate=olfr, 
        name=group_value,         # legend text is the group label
        legendgroup=group_value,  # toggling the legend entry toggles the whole label
        showlegend=show_legend,   # only the first trace of each label is listed
        opacity=0.5, 
        line=dict(color=color_group[group_value])
    ))

# Each Olfr is its own trace in fig.data, so the significance labels are computed here
# and handed to the annotation helper as ready-made text.
p_list = []
for g in count_df[group].unique():
    group_counts = count_df[count_df[group] == g]
    # Pair close/open explicitly on top_Olfr: a paired t-test is only valid when row k
    # of both samples is the same Olfr, which positional slicing does not guarantee.
    paired = pd.merge(
        group_counts.loc[group_counts.nostril == 'close', ['top_Olfr', 'Rhbdf2_counts']],
        group_counts.loc[group_counts.nostril == 'open', ['top_Olfr', 'Rhbdf2_counts']],
        on='top_Olfr', suffixes=('_close', '_open'), validate='one_to_one')
    _, p = stats.ttest_rel(paired['Rhbdf2_counts_close'], paired['Rhbdf2_counts_open'])

    if np.isnan(p):
        stars = 'n/a'   # too few pairs for a test; never report this as significant
    elif p >= 0.05:
        stars = 'ns'
    elif p >= 0.01:
        stars = '*'
    elif p >= 0.001:
        stars = '**'
    else:
        stars = '***'
    p_list.append(f'{stars} <br>{round(p, 3)}')

# One (close, open) column pair per group label: [0,1], [2,3], ...
annotation_pairs = [[2 * k, 2 * k + 1] for k in range(len(p_list))]
fig = plot_utils.add_p_value_annotation(fig, annotation_pairs, 
                                        just_annotate = p_list, 
                                        test_type = 'ttest_rel', 
                                        y_padding = False)

# Update layout
fig.update_layout(
    title='Line Graph of Rhbdf2 Counts by Nostril',
    xaxis_title='Nostril',
    yaxis_title='Rhbdf2 average per cell',
    template='simple_white',
    margin=dict(l=50,r=50,b=100,t=100,pad=10),
    font=dict(
        size=10,  # Set the font size here
    )
)

# Show the plot
fig.show()
# fig.write_html('../output/Blobel_15045/tsukahara_occlusion/Rhbdf2_exp/Nostril_Occlu_Rhbdf2mean.html')

In [None]:

# Per-Olfr difference (close - open) of the Rhbdf2 counts, computed in one aligned
# subtraction instead of a Python loop with repeated boolean scans. When an Olfr has
# several rows for one nostril the first row is used.
first_rows = count_df.drop_duplicates(['top_Olfr', 'nostril']).set_index('top_Olfr')
pair_diff = (first_rows.loc[first_rows.nostril == 'close', 'Rhbdf2_counts']
             - first_rows.loc[first_rows.nostril == 'open', 'Rhbdf2_counts']).rename('pair_diff')
# Drop any previous pair_diff first so the cell can be re-run safely
count_df = count_df.drop(columns='pair_diff', errors='ignore').join(pair_diff, on='top_Olfr')

# One row per Olfr: pair_diff is identical on its close and open rows
plot_df = count_df.drop_duplicates('top_Olfr')

plt.figure(figsize=(10, 6))
# Reference line at "no difference between nostrils"
plt.axvline(x=0, color='black', alpha = 0.5,
            linestyle='--', dashes=(5,10)
            )

# Colours are keyed by label so they cannot drift with the row order of plot_df
fold_diff_palette = {'Close+_Olfr': '#990011', 
                     'Open+_Olfr': '#317773', 
                     'na': '#D3D3D3'}
sns.kdeplot(data=plot_df, x='pair_diff', 
            hue='fold_diff', 
            common_norm=False,   # each group is normalised on its own
            fill=False, 
            alpha=0.6, linewidth=5, 
            palette = fold_diff_palette,
            legend=True
)
plt.xlabel('Close vs Open Rhbdf2 count difference')
plt.ylabel('Density')
plt.title('KDE of Difference by Group')
# plt.savefig('../output/Blobel_15045/tsukahara_occlusion/Olfr_nostril_distance/fold_diff_distance_kdeplot.png', dpi=300) 
plt.show()

  

#### Occlu down vs up 

##### Rank genes group 

In [None]:
results_file = '../files/Tsukahara_2021/GSE173947_ChronicOccl.h5ad'  # AnnData file read below
adata = sc.read(results_file)

# Label cells without a fold_diff group as 'na'. The column is assigned back explicitly:
# chained assignment (obs.fold_diff[mask] = ...) is not guaranteed to write through and
# fails outright when fold_diff is a categorical that lacks an 'na' category.
fold_diff = adata.obs['fold_diff']
if isinstance(fold_diff.dtype, pd.CategoricalDtype) and 'na' not in fold_diff.cat.categories:
    fold_diff = fold_diff.cat.add_categories('na')
adata.obs['fold_diff'] = fold_diff.fillna('na')

In [None]:
# Wilcoxon rank-sum marker test between the fold_diff groups
sc.tl.rank_genes_groups(adata, 'fold_diff', method='wilcoxon')

result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

# Two columns per group, "<group>_n" (names) and "<group>_s" (scores)
rgg_columns = {
    f'{grp_name}_{field[:1]}': result[field][grp_name]
    for grp_name in groups
    for field in ('names', 'scores')
}
rgg_table = pd.DataFrame(rgg_columns)
# Keep the first 100 rows per group and save them to csv
rgg_table[0:100].to_csv('../output/Blobel_15045/tsukahara_occlusion/rank_gene_groups/rgg_score_Occlu.csv')

In [None]:
# First five ranked gene names per group, as {group: [gene, ...]}
ranked_names = pd.DataFrame(adata.uns['rank_genes_groups']['names'])
diff_genes = ranked_names[0:5].to_dict(orient='list')

# Matrix plot of those genes per fold_diff group, scaled per gene ('var')
sc.pl.matrixplot(
    adata,
    diff_genes,
    groupby='fold_diff',
    cmap='viridis',
    standard_scale='var',
    colorbar_title='column scaled\nexpression',
)

##### umap sub-plots 

In [None]:
# Extract cell metadata together with the UMAP coordinates
umap_df = adata.obs.copy()
x, y = zip(*adata.obsm['X_umap'])
umap_df['umap_x'] = x
umap_df['umap_y'] = y 

# Fill missing groups with 'na' and fix the category order. The column is rebuilt in a
# single assignment; chained assignment (umap_df.fold_diff[mask] = ...) may silently
# write to a temporary and errors on a categorical without an 'na' category.
umap_df['fold_diff'] = pd.Categorical(umap_df['fold_diff'].astype(object).fillna('na'), 
                                      ['Close+_Olfr', 'Open+_Olfr', 'na'])
umap_df = umap_df.sort_values('fold_diff')

# umap_df.to_csv('../output/Blobel_15045/tsukahara_occlusion/umap/umap_df.csv')

In [None]:
# Reload the UMAP table from its csv; the first csv column is used as the index.
umap_csv_path = '../output/Blobel_15045/tsukahara_occlusion/umap/umap_df.csv'
umap_df = pd.read_csv(umap_csv_path, index_col=0)
# umap_df['fold_diff'] = pd.Categorical(umap_df['fold_diff'], ['Close+_Olfr', 'Open+_Olfr', 'na'])


In [None]:
# Occlusion umap: one facet per nostril, points coloured by fold_diff group
# Colours are keyed by label, so they stay correct however many facets there are and
# even when a label is absent from one nostril (positional trace indexing does not).
fold_diff_colors = {'Close+_Olfr': '#990011', 
                    'Open+_Olfr': '#317773', 
                    'na': '#D3D3D3'}

# Descending sort puts 'na' first so it is drawn first, underneath the other groups
plot_df = umap_df.sort_values('fold_diff', ascending=False).copy()
fig = px.scatter(x = plot_df.umap_x, 
                 y = plot_df.umap_y,
                 color = plot_df.fold_diff, 
                 facet_col = plot_df.nostril, 
                 color_discrete_map = fold_diff_colors
                 )
fig.update_traces(marker={'size': 3, 'opacity': 0.5})

# Naked styling 
# Facet titles: keep only the value after "="
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_xaxes(range=[plot_df.umap_x.min(), plot_df.umap_x.max()], showticklabels=False)
fig.update_yaxes(range=[plot_df.umap_y.min(), plot_df.umap_y.max()], showticklabels=False)
fig.update_layout(
                #   width=600, height=600,
                  plot_bgcolor='rgba(0,0,0,0)',
                  xaxis_visible=False,  
                  yaxis_visible=False,   
    )
fig.show()
# fig.write_html('../output/Blobel_15045/tsukahara_occlusion/umap/Nostril_Occlu_umap.html')

In [None]:
# Distance table from the plot_utils helper, grouped by fold_diff
distances_df = plot_utils.umap_euclidean_distance(umap_df, by='fold_diff')

# Mean per (Olfr, group), sorted by ascending distance
distances_mean = (
    distances_df
    .groupby(['top_Olfr', 'group'], as_index=False)
    .mean()
    .sort_values('distance')
)

In [None]:
# Per-odor UMAP panels: for every odor, one panel per Olfr (at most top_x) on a 2 x 5 grid.
# NOTE(review): `odors`, `distances_dict`, `control` and the `umap_df.odor` column are not
# defined in the visible part of this notebook -- this cell looks carried over from an
# odor-exposure analysis; confirm it is still meant to run here.
top_x = 10
# Two colours per odor; only the second entry ([1]) is used below
manual_color = {'ACE2h': ['#D3D3D3', '#2ca02c'], 
                'OCT2h': ['#D3D3D3', '#d62728']}


for _odor in odors: 
    # Olfrs whose group flag is True for this odor, largest mean distance first
    Olfr_to_plot = distances_dict[_odor]['mean'][distances_dict[_odor]['mean'].group == True].sort_values('distance', ascending=False).top_Olfr.unique()
    # keep only Olfrs present in umap_df, then the first top_x
    Olfr_to_plot = [i for i in Olfr_to_plot if i in umap_df.top_Olfr.values][0:top_x]
    # random 10% of cells used as grey background (no seed: differs between runs)
    background_umap = umap_df.sample(frac=0.1)

    # One figure per odor, one panel per Olfr
    fig, axes = plt.subplots(2, 5, 
                            figsize=(15, 8), 
                            sharex=True, sharey=True)

    # zip_longest pads with None so unused panels can be switched off
    for ax, olfr in zip_longest(axes.flatten(), Olfr_to_plot):
        if olfr is not None: 
            # Plot background scatter
            ax.scatter(background_umap['umap_x'], background_umap['umap_y'], 
                marker='o', alpha=0.3, label=None, 
                s=10, c='#D3D3D3')
            # Plot main scatter with color
            # cells from this odor and from `control`
            plot_df = umap_df[umap_df.odor.isin([_odor]+[control])]
            # one marker shape per odor value, in order of appearance
            for _o, shape in zip(plot_df['odor'].unique(), ['o','x']):
                subset = plot_df[(plot_df['odor'] == _o) & 
                                (plot_df['top_Olfr'] == olfr)]
                ax.scatter(subset['umap_x'], subset['umap_y'], 
                        marker=shape, label=_o,
                        s=30,
                        c=manual_color[_odor][1])
                ax.set_title(olfr)
                ax.axis("off")
                ax.set_box_aspect(1)
        else: 
            ax.axis("off")
    # `ax` is the last grid axis here; if that panel is unused it has no labelled artists
    ax.legend(loc="best", frameon=False)
    plt.suptitle(f'{_odor} active Olfr pairwise distance')
    plt.subplots_adjust(wspace=0, hspace=0.1)
    # Show plot
    fig.tight_layout()


In [None]:
# Individual-Olfr UMAP panels for the Open+/Close+ groups: one figure per group with
# the top_x Olfrs of largest mean open-vs-close distance, one panel each.
top_x = 10 # number of Olfr to plot 
plot_groups = ['Open+_Olfr', 'Close+_Olfr']
nostril_colors = ['#624185','#ffa345']   # one colour per nostril value, in order of appearance

# Hoisted out of the loops: membership set and nostril levels do not change per panel
known_olfrs = set(umap_df.top_Olfr.values)
nostril_levels = umap_df['nostril'].unique()
# Seeded so the grey background is identical on every run and in every figure
background_umap = umap_df.sample(frac=0.5, random_state=0)

for _group in plot_groups: 
    Olfr_to_plot = distances_mean[distances_mean.group == _group].sort_values('distance', ascending=False).top_Olfr.unique()
    Olfr_to_plot = [i for i in Olfr_to_plot if i in known_olfrs][0:top_x]

    fig, axes = plt.subplots(2, 5, 
                            figsize=(15, 8), 
                            sharex=True, sharey=True)

    legend_ax = None   # last panel that actually holds data
    # zip_longest pads with None so unused panels can be switched off
    for ax, olfr in zip_longest(axes.flatten(), Olfr_to_plot):
        ax.axis("off")
        if olfr is None:
            continue

        # Plot background scatter
        ax.scatter(background_umap['umap_x'], background_umap['umap_y'], 
                marker='o', alpha=0.5, label=None, 
                s=3, c='#D3D3D3')

        # Plot the cells of this Olfr, one colour per nostril
        olfr_cells = umap_df[umap_df['top_Olfr'] == olfr]
        for nostril, color in zip(nostril_levels, nostril_colors):
            subset = olfr_cells[olfr_cells['nostril'] == nostril]
            ax.scatter(subset['umap_x'], subset['umap_y'], marker='o', label=nostril, 
                    s=30, c=color)
        ax.set_title(olfr)
        ax.set_box_aspect(1)
        legend_ax = ax

    # Put the legend on a populated panel; the last grid axis is blank (no labelled
    # artists) whenever the group has fewer than 10 Olfrs.
    if legend_ax is not None:
        legend_ax.legend(loc="best", frameon=False)
    plt.suptitle(f'Nostril_Olfr_Occlu_{_group}')
    plt.subplots_adjust(wspace=0, hspace=0.1)
    # Show plot
    fig.tight_layout()

# plt.savefig('../output/Blobel_15045/tsukahara_occlusion/umap/Nostril_Olfr_Occlu_down.png', dpi=300) 

###### old deprecated

In [None]:
# Deprecated overview: every 'Close+_Olfr' Olfr on a 7 x 12 grid, one panel per Olfr
fig, axes = plt.subplots(7, 12, figsize=(20, 15), sharex=True, sharey=True)

# 'Close+_Olfr' Olfrs, largest mean distance first, restricted to those in umap_df
close_ranked = distances_mean[distances_mean.group == 'Close+_Olfr'].sort_values('distance', ascending=False)
Olfr_to_plot = [name for name in close_ranked.top_Olfr.unique()
                if name in umap_df.top_Olfr.values]

for ax, olfr in zip_longest(axes.flatten(), Olfr_to_plot):
    # Panels beyond the Olfr list are only switched off
    if olfr is None:
        ax.axis("off")
        continue

    # Grey background: a fresh random half of all cells for each panel
    background_umap = umap_df.sample(frac=0.5)
    ax.scatter(
        background_umap['umap_x'], background_umap['umap_y'],
        marker='o', alpha=0.5, label=None, s=3, c='#D3D3D3',
    )

    # Cells of this Olfr, one colour per nostril
    plot_df = umap_df.copy()
    is_olfr = plot_df['top_Olfr'] == olfr
    for nostril, color in zip(plot_df['nostril'].unique(), ['#624185', '#ffa345']):
        subset = plot_df[(plot_df['nostril'] == nostril) & is_olfr]
        ax.scatter(subset['umap_x'], subset['umap_y'],
                   marker='o', label=nostril, s=3, c=color)
        ax.set_title(olfr)
        ax.axis("off")
        ax.set_box_aspect(1)

# ax.legend(loc="best")
plt.suptitle('Nostril_Olfr_Occlu_Close+_Olfr')
plt.subplots_adjust(wspace=0, hspace=0.1)
# Show plot
fig.tight_layout()

# plt.savefig('../output/Blobel_15045/tsukahara_occlusion/umap/Nostril_Olfr_Occlu_up.png', dpi=300) 

##### Olfr pairwise comparison 

In [None]:
# Box plot of the pairwise distances, one box per Olfr, coloured by group

# Leave out the 'na' and 'shuffled' groups; order by group, then largest distance first
excluded_groups = ['na', 'shuffled']
plot_df = (
    distances_df[~(distances_df.group.isin(excluded_groups))]
    .sort_values(['group', 'distance'], ascending=[True, False])
)
fig = px.box(plot_df, x="top_Olfr", y="distance", color='group')
#  points="all"

fig.update_traces(marker={'size': 2, 'opacity': 0.3})

# Colours are assigned by trace position
manual_color = ['#990011','#317773', '#D3D3D3']
for i, trace in enumerate(fig.data):
    trace['marker']['color'] = manual_color[i]

layout_options = dict(
    yaxis={'title': 'Pairwise euclidean distance'},
    xaxis={'title': ''},
    title='Pairwise euclidean distance comparison between nostrils(open/close) for each Olfr<br>',
    autosize=True,
    template='simple_white',
    margin=dict(l=50, r=50, b=100, t=100, pad=10),
    font=dict(size=10),  # Set the font size here
)
fig.update_layout(**layout_options)

fig.show()
# fig.write_html("../output/Blobel_15045/tsukahara_occlusion/Olfr_nostril_distance/fold_diff_distance.html")


In [None]:
# Box plot of the pairwise distances, one box per group

plot_df = distances_df.copy()
fig = px.box(plot_df, x="group", y="distance", color='group')
#  points="all"

# Colours are assigned by trace position
manual_color = ['#990011','#317773', '#D3D3D3', '#D3D3D3']
for i, trace in enumerate(fig.data):
    trace['marker']['color'] = manual_color[i]

# Significance annotations for the listed box index pairs
compared_boxes = [[0,1], [0,2], [1,2], [2,3]]
fig = plot_utils.add_p_value_annotation(fig, compared_boxes, test_type='ranksums')

layout_options = dict(
    yaxis={'title': 'Pairwise euclidean distance'},
    xaxis={'title': ''},
    # title='Pairwise euclidean distance comparison between nostrils(open/close) for each Olfr<br>\
    # <span style="font-size: 10px;"> </span>',
    autosize=True,
    template='simple_white',
    margin=dict(l=50, r=50, b=100, t=150, pad=10),
    font=dict(size=15),  # Set the font size here
)
fig.update_layout(**layout_options)

fig.show()
# fig.write_html("../output/Blobel_15045/tsukahara_occlusion/Olfr_nostril_distance/fold_diff_distance_grouped.html")


In [None]:
# Density curves of the pairwise distances: 'shuffled' as a thin dashed black line,
# every other group as a thick coloured line.
plt.figure(figsize=(10, 6))

is_shuffled = distances_df.group.isin(['shuffled'])

# 'shuffled' rows only
plot_df = distances_df[is_shuffled]
sns.kdeplot(
    data=plot_df, x='distance', hue='group',
    common_norm=False, fill=False,
    alpha=0.3, linewidth=3,
    palette=['#000000'],
    linestyle='--',
)

# all remaining groups
plot_df = distances_df[~is_shuffled]
sns.kdeplot(
    data=plot_df, x='distance', hue='group',
    common_norm=False, fill=False,
    alpha=0.6, linewidth=5,
    palette=['#990011','#317773', '#D3D3D3'],
    legend=True,
)

plt.xlabel('Pairwise euclidean distance')
plt.ylabel('Density')
plt.title('Histogram of Distances by Group')
# plt.savefig('../output/Blobel_15045/tsukahara_occlusion/Olfr_nostril_distance/fold_diff_distance_kdeplot.png', dpi=300) 
plt.show()


#### iR2 WT vs KO 

##### Rank gene groups

In [None]:
# Load the "_2" copy of the chronic-occlusion AnnData for the iR2 analysis
# (alternative input: '../files/Tsukahara_2021/GSE173947_ChronicOccl.h5ad').
occl_h5ad_path = '../files/Tsukahara_2021/GSE173947_ChronicOccl_2.h5ad'
adata = sc.read(occl_h5ad_path)
# adata.obs.iR2[adata.obs.iR2.isna()] = 'na'

In [None]:
# Label every cell by the WT-vs-KO status of its top Olfr, for visualisation.
# DE_Olfr_WTvsKO_ALL is the table used for the downstream analysis.
iR2 = pd.read_csv('../DE_out/WTvKO_ALL/DE_Olfr_WTvsKO_ALL.csv', index_col = 0 )

# Olfr symbols with FDR < 0.2, split by the sign of logFC
passes_fdr = iR2.FDR < 0.2
ko_minus_symbols = iR2[(iR2.logFC < 0) & passes_fdr].symbol
ko_plus_symbols = iR2[(iR2.logFC > 0) & passes_fdr].symbol

# Default is 'na'; KO- is written first, then KO+
adata.obs['iR2'] = 'na'
adata.obs.loc[adata.obs.top_Olfr.isin(ko_minus_symbols), 'iR2'] = 'KO-_Olfr'
adata.obs.loc[adata.obs.top_Olfr.isin(ko_plus_symbols), 'iR2'] = 'KO+_Olfr'

# Alternative labellings tried previously (same logFC sign / FDR < 0.2 rule, other DE tables):
#   '../DE_out/Blobel-15045/DE_Olfr_WTvsKO_n6.csv'     -> adata.obs['iR2_n6']
#   '../DE_out/WTvKO_ALL/DE_allgene_WTvsKO_ALL.csv'    -> adata.obs['iR2_allgene']
#   '../DE_out/Blobel-15045/DE_allgene_WTvsKO_n6.csv'  -> adata.obs['iR2_n6_allgene']
# sc.pl.umap(adata, color=['iR2', 'iR2_n6', 'iR2_allgene', 'iR2_n6_allgene'], size = 30)
sc.pl.umap(adata, color=['iR2'], size=30)


In [None]:
# Wilcoxon rank-sum marker test between the iR2 groups
sc.tl.rank_genes_groups(adata, 'iR2', method='wilcoxon')

result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

# Save the top 100 rows per group. Columns are written as "<group>_names",
# "<group>_scores", "<group>_pvals_adj" (three per group, names first) because the GO
# cell that reads this file takes every third column as a group and looks up
# "<group>_names" / "<group>_pvals_adj"; the abbreviated "_n"/"_s" headers with no
# adjusted p-values could not be read back by it.
rgg_fields = ['names', 'scores', 'pvals_adj']
pd.DataFrame(
    {f'{group}_{key}': result[key][group]
    for group in groups for key in rgg_fields})[0:100].to_csv('../output/Blobel_15045/tsukahara_occlusion/rank_gene_groups/rgg_score_iR2.csv')

In [None]:
# First five ranked gene names per group, as {group: [gene, ...]}
ranked_names = pd.DataFrame(adata.uns['rank_genes_groups']['names'])
diff_genes = ranked_names[0:5].to_dict(orient='list')

# Matrix plot of those genes per iR2 group (unscaled)
sc.pl.matrixplot(
    adata,
    diff_genes,
    groupby='iR2',
    cmap='viridis',
    #  vmin = 0, 
    #  vmax = 1, 
    #  standard_scale='var',
    #  colorbar_title='column scaled\nexpression'
)

##### umap subplots

In [None]:
# Extract cell metadata together with the UMAP coordinates
umap_df = adata.obs.copy()
x, y = zip(*adata.obsm['X_umap'])
umap_df['umap_x'] = x
umap_df['umap_y'] = y 

# Fill missing labels with 'na' and fix the category order. The column is rebuilt in a
# single assignment; chained assignment (umap_df.iR2[mask] = ...) may silently write to
# a temporary and errors on a categorical without an 'na' category.
umap_df['iR2'] = pd.Categorical(umap_df['iR2'].astype(object).fillna('na'), 
                                ['KO+_Olfr', 'KO-_Olfr', 'na'])
umap_df = umap_df.sort_values('iR2')

# Save the table so the plotting cells can reload it without the AnnData object
umap_df.to_csv('../output/WTvKO_ALL/tsukahara_occlusion/umap/umap_df.csv')

In [None]:
# Reload the saved UMAP table. index_col=0 restores the saved index (as the other
# reads of umap_df.csv in this notebook do) instead of leaving it as an extra column.
umap_df = pd.read_csv('../output/WTvKO_ALL/tsukahara_occlusion/umap/umap_df.csv', index_col = 0)

In [None]:
# iR2 umap: all cells, coloured by iR2 label

# Colours are keyed by label, so a missing or reordered label cannot shift them
# (assigning by trace position can).
iR2_colors = {'KO+_Olfr': '#EF5350', 
              'KO-_Olfr': '#66BB6A', 
              'na': '#D3D3D3'}

# Descending sort puts 'na' first so it is drawn first, underneath the other labels
plot_df = umap_df.sort_values('iR2', ascending=False).copy()
fig = px.scatter(x = plot_df.umap_x, 
                 y = plot_df.umap_y, 
                 color = plot_df.iR2, 
                 color_discrete_map = iR2_colors
                 )

fig.update_traces(marker={'size': 3})

fig.update_layout(
                  width=600, height=600,
                  plot_bgcolor='rgba(0,0,0,0)',
                  xaxis_visible=False,  
                  yaxis_visible=False,   
    )
fig.show()
# fig.write_html('../output/WTvKO_ALL/tsukahara_occlusion/umap/Nostril_iR2_umap.html')

In [None]:
# iR2 umap split by nostril: one facet per nostril, points coloured by iR2 label
# Colours are keyed by label, so they stay correct however many facets there are and
# even when a label is absent from one nostril (positional trace indexing does not).
iR2_colors = {'KO+_Olfr': '#EF5350', 
              'KO-_Olfr': '#66BB6A', 
              'na': '#D3D3D3'}

# Descending sort puts 'na' first so it is drawn first, underneath the other labels
plot_df = umap_df.sort_values('iR2', ascending=False).copy()
fig = px.scatter(x = plot_df.umap_x, 
                 y = plot_df.umap_y,
                 color = plot_df.iR2, 
                 facet_col = plot_df.nostril, 
                 color_discrete_map = iR2_colors
                 )
fig.update_traces(marker={'size': 3})

# Naked styling 
# Facet titles: keep only the value after "="
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_xaxes(range=[plot_df.umap_x.min(), plot_df.umap_x.max()], showticklabels=False)
fig.update_yaxes(range=[plot_df.umap_y.min(), plot_df.umap_y.max()], showticklabels=False)
fig.update_layout(
                #   width=600, height=600,
                  plot_bgcolor='rgba(0,0,0,0)',
                  xaxis_visible=False,  
                  yaxis_visible=False,   
    )
fig.show()
# fig.write_html('../output/WTvKO_ALL/tsukahara_occlusion/umap/Nostril_iR2_umap_split.html')

In [None]:
# Distance table from the plot_utils helper, grouped by iR2
distances_df = plot_utils.umap_euclidean_distance(umap_df, by='iR2')

# Mean per (Olfr, group), sorted by ascending distance
distances_mean = (
    distances_df
    .groupby(['top_Olfr', 'group'], as_index=False)
    .mean()
    .sort_values('distance')
)

# Ordered categorical; any group value outside these three categories becomes NaN
iR2_group_order = ['KO+_Olfr', 'na', 'KO-_Olfr']
distances_mean['group'] = pd.Categorical(
    distances_mean['group'], categories=iR2_group_order, ordered=True
)

In [None]:
# Individual-Olfr UMAP panels for the KO+/KO- groups: one figure per group with the
# top_x Olfrs of largest mean open-vs-close distance, one panel each.
top_x = 10 # number of Olfr to plot 
plot_groups = ['KO+_Olfr', 'KO-_Olfr']
nostril_colors = ['#624185','#ffa345']   # one colour per nostril value, in order of appearance

# Hoisted out of the loops: membership set and nostril levels do not change per panel
known_olfrs = set(umap_df.top_Olfr.values)
nostril_levels = umap_df['nostril'].unique()
# Seeded so the grey background is identical on every run and in every figure
background_umap = umap_df.sample(frac=0.5, random_state=0)

for _group in plot_groups: 
    Olfr_to_plot = distances_mean[distances_mean.group == _group].sort_values('distance', ascending=False).top_Olfr.unique()
    Olfr_to_plot = [i for i in Olfr_to_plot if i in known_olfrs][0:top_x]

    fig, axes = plt.subplots(2, 5, 
                            figsize=(15, 8), 
                            sharex=True, sharey=True)

    legend_ax = None   # last panel that actually holds data
    # zip_longest pads with None so unused panels can be switched off
    for ax, olfr in zip_longest(axes.flatten(), Olfr_to_plot):
        ax.axis("off")
        if olfr is None:
            continue

        # Plot background scatter
        ax.scatter(background_umap['umap_x'], background_umap['umap_y'], 
                marker='o', alpha=0.5, label=None, 
                s=3, c='#D3D3D3')

        # Plot the cells of this Olfr, one colour per nostril
        olfr_cells = umap_df[umap_df['top_Olfr'] == olfr]
        for nostril, color in zip(nostril_levels, nostril_colors):
            subset = olfr_cells[olfr_cells['nostril'] == nostril]
            ax.scatter(subset['umap_x'], subset['umap_y'], marker='o', label=nostril, 
                    s=30, c=color)
        ax.set_title(olfr)
        ax.set_box_aspect(1)
        legend_ax = ax

    # Put the legend on a populated panel; the last grid axis is blank (no labelled
    # artists) whenever the group has fewer than 10 Olfrs.
    if legend_ax is not None:
        legend_ax.legend(loc="best", frameon=False)
    plt.suptitle(f'Nostril_Olfr_Occlu_{_group}')
    plt.subplots_adjust(wspace=0, hspace=0.1)
    # Show plot
    fig.tight_layout()

# plt.savefig('../output/WTvKO_ALL/tsukahara_occlusion/umap/Nostril_Olfr_Occlu_down.png', dpi=300) 

In [None]:
# Debug check: the single Olfr at the midpoint of group_Olfr.
# NOTE(review): group_Olfr is only assigned in a later cell (the median-representative
# plot), so this cell fails on a fresh top-to-bottom run.
list(group_Olfr[int(len(group_Olfr)/2):int(len(group_Olfr)/2)+1])

In [None]:
# Debug check: list the group labels present in distances_df
distances_df.group.unique()

In [None]:
# Debug check: current value of the leftover loop variable `olfr` (depends on which cell ran last)
olfr

In [None]:
# Debug check: group of the Olfr held in the leftover variable `olfr`.
# .item() raises unless exactly one distances_mean row matches.
distances_mean[distances_mean['top_Olfr'] == olfr].group.item()

In [None]:
"""
For fig. Plot representative of median distance for both KO+ and KO- OR groups 
"""

# Four Olfrs per group, taken from the middle of the distance ranking: top row KO+, bottom row KO-
plot_groups = ['KO+_Olfr', 'KO-_Olfr']
nostril_colors = ['#624185','#ffa345']   # one colour per nostril value, in order of appearance

# Rows whose group is NaN (labels outside the declared categories) are dropped
distances_mean = distances_mean[~distances_mean['group'].isna()].reset_index(drop=True)

Olfr_to_plot = []
for _group in plot_groups: 
    group_Olfr = distances_mean[distances_mean.group == _group].sort_values('distance', ascending=False).top_Olfr.unique()
    median_pos = int(len(group_Olfr)/2)
    Olfr_to_plot += list(group_Olfr[median_pos:median_pos+4])

fig, axes = plt.subplots(2, 4, 
                        figsize=(15, 8), 
                        sharex=True, sharey=True)
# Seeded so the grey background is identical on every run
background_umap = umap_df.sample(frac=0.5, random_state=0)
nostril_levels = umap_df['nostril'].unique()

legend_ax = None   # last panel that actually holds data
# zip_longest pads with None so unused panels can be switched off
for ax, olfr in zip_longest(axes.flatten(), Olfr_to_plot):
    ax.axis("off")
    if olfr is None:
        continue

    # Plot background scatter
    ax.scatter(background_umap['umap_x'], background_umap['umap_y'], 
            marker='o', alpha=0.5, label=None, 
            s=3, c='#D3D3D3')

    # Plot the cells of this Olfr, one colour per nostril
    olfr_cells = umap_df[umap_df['top_Olfr'] == olfr]
    for nostril, color in zip(nostril_levels, nostril_colors):
        subset = olfr_cells[olfr_cells['nostril'] == nostril]
        ax.scatter(subset['umap_x'], subset['umap_y'], marker='o', label=nostril, 
                s=30, c=color)

    # Panel title "<group>_<Olfr>"; looked up once per panel.
    # .item() raises unless exactly one distances_mean row matches the Olfr.
    olfr_group = distances_mean[distances_mean["top_Olfr"] == olfr].group.item()
    ax.set_title(f'{olfr_group}_{olfr}')
    ax.set_box_aspect(1)
    legend_ax = ax

# Put the legend on a populated panel; the last grid axis is blank (no labelled
# artists) when fewer than eight Olfrs were selected.
if legend_ax is not None:
    legend_ax.legend(loc="best", frameon=False)
# plt.suptitle(f'Nostril_Olfr_Occlu_{_group}')
plt.subplots_adjust(wspace=0, hspace=0.1)
# Show plot
fig.tight_layout()

# plt.savefig('../output/WTvKO_ALL/tsukahara_occlusion/umap/Nostril_Olfr_Occlu_down.png', dpi=300) 

##### Olfr pairwise comparison 

In [None]:
# Reload the saved UMAP table, keeping only the columns selected below
umap_columns = ['nostril', 'iR2', 'umap_x', 'umap_y', 'top_Olfr']
umap_df_full = pd.read_csv('../output/WTvKO_ALL/tsukahara_occlusion/umap/umap_df.csv', index_col=0)
umap_df = umap_df_full[umap_columns]
umap_df.head()

In [None]:
# Distance table from the plot_utils helper, grouped by iR2
distances_df = plot_utils.umap_euclidean_distance(umap_df, by='iR2')

# Mean per (Olfr, group), sorted by ascending distance
distances_mean = (
    distances_df
    .groupby(['top_Olfr', 'group'], as_index=False)
    .mean()
    .sort_values('distance')
)

# Ordered categorical; any group value outside these three categories becomes NaN
iR2_group_order = ['KO+_Olfr', 'na', 'KO-_Olfr']
distances_mean['group'] = pd.Categorical(
    distances_mean['group'], categories=iR2_group_order, ordered=True
)

In [None]:
# Box plot of the pairwise distances, one box per Olfr, coloured by iR2 group

# Leave out the 'na' and 'shuffled' groups; order by group, then largest distance first
excluded_groups = ['na', 'shuffled']
plot_df = (
    distances_df[~(distances_df.group.isin(excluded_groups))]
    .sort_values(['group', 'distance'], ascending=[True, False])
)
fig = px.box(plot_df, x="top_Olfr", y="distance", color='group')
#  points="all"

fig.update_traces(marker={'size': 2, 'opacity': 0.3})

# Colours are assigned by trace position
manual_color = ['#EF5350','#66BB6A', '#D3D3D3']
for i, trace in enumerate(fig.data):
    trace['marker']['color'] = manual_color[i]

fig.update_layout(
    yaxis={'title': 'Pairwise euclidean distance'},
    xaxis={'title': ''},
    title='Pairwise euclidean distance comparison between nostrils(open/close) for each Olfr<br>\
    <span style="font-size: 10px;"> </span>',
    autosize=True,
    template='simple_white',
    margin=dict(l=50, r=50, b=100, t=100, pad=10),
    font=dict(size=15),  # Set the font size here
)

fig.show()
# fig.write_html("../output/WTvKO_ALL/tsukahara_occlusion/Olfr_nostril_distance/iR2_distance.html")


In [None]:
# Violin plot of the pairwise distances per iR2 group, drawn on an equal-size random
# sample of each group.

# min_n = min(distances_df.value_counts('group')) 
min_n = 1000   # rows sampled per group
iR2_group_order = ['KO+_Olfr', 'na', 'KO-_Olfr']

# Re-order the groups on a copy. Labels outside iR2_group_order (e.g. 'shuffled')
# become NaN and are dropped by the groupby; distances_df itself is left untouched so
# other cells still see the original labels.
ordered_df = distances_df.assign(group=pd.Categorical(distances_df['group'], iR2_group_order))

# Up to min_n rows per group (capped at the group size so a small group cannot raise),
# concatenated in category order.
plot_df = pd.concat(
    [group_rows.sample(min(min_n, len(group_rows)), random_state=0)
     for _, group_rows in ordered_df.groupby('group', observed=True)],
    ignore_index=True)

plot_df = plot_df.sort_values('group')
fig = go.Figure()
for _group in plot_df.group.unique(): 
    fig.add_trace(go.Violin(x=plot_df[plot_df.group == _group].group, 
                            y=plot_df[plot_df.group == _group].distance, 
                            opacity=0.8,
                            name=_group,
                            points = 'all', pointpos=0, 
                            meanline_visible=True
                            ))
    
# manually assign color
# manual_color = ['#EF5350', '#D3D3D3','#66BB6A']

# Colours are looked up by trace name in the mapping returned by the helper
manual_color = plot_utils.distinct_colors(plot_df.group.unique(), 
                                          custom_color=['#EF5350', '#D3D3D3','#19b2e6'])
for i, _data in enumerate(fig.data): 
    fig.data[i]['marker']['color'] = manual_color[_data['name']]

fig = plot_utils.downsample_fig(fig, max_points=200)
fig = plot_utils.add_p_value_annotation(fig, [[0,1], [0,2], [1,2]], test_type='ranksums')

fig.update_layout(
    yaxis = {'title' : 'Pairwise euclidean distance'},
    xaxis = {'title' : ''},
    # title='Pairwise euclidean distance comparison between nostrils(open/close) for each Olfr<br>\
    # <span style="font-size: 10px;"> </span>',
    autosize=True,
    template='simple_white',
    margin=dict(l=50,r=50,b=100,t=150,pad=10),
    font=dict(
        size=15,  # Set the font size here
    )
)
fig.show()
# fig.write_html("../output/WTvKO_ALL/tsukahara_occlusion/Olfr_nostril_distance/iR2_distance_grouped.html")
# fig.write_html('../output/fig_image/Violin/tsukahara_occlusion/iR2_distance_grouped.html')

In [None]:
# Density curves of the pairwise distances per iR2 group, drawn on an equal-size
# random sample of each group.
plt.figure(figsize=(8, 2.5))

# plot_df = distances_df[(distances_df.group.isin(['shuffled']))]
# sns.kdeplot(data=plot_df, x='distance', 
#             hue='group', 
#             common_norm=False, 
#             fill=False, 
#             alpha=0.3, linewidth=3, 
#             palette = ['#000000'],
#             linestyle = '--', 
# )
min_n = 1000   # rows sampled per group
iR2_group_order = ['KO+_Olfr', 'na', 'KO-_Olfr']

# Re-order the groups on a copy. Labels outside iR2_group_order (e.g. 'shuffled')
# become NaN and are dropped by the groupby; distances_df itself is left untouched so
# other cells still see the original labels.
ordered_df = distances_df.assign(group=pd.Categorical(distances_df['group'], iR2_group_order))

# Up to min_n rows per group (capped at the group size so a small group cannot raise)
plot_df = pd.concat(
    [group_rows.sample(min(min_n, len(group_rows)), random_state=0)
     for _, group_rows in ordered_df.groupby('group', observed=True)],
    ignore_index=True)

# The palette follows the categorical order of `group`: KO+, na, KO-
sns.kdeplot(data=plot_df, x='distance', 
            hue='group', 
            common_norm=False, 
            fill=False, 
            alpha=0.6, linewidth=5, 
            palette = ['#EF5350', '#D3D3D3','#19b2e6'],
            legend=True
)
plt.xlabel('Pairwise euclidean distance')
plt.ylabel('Density')
plt.title('KDE of Distances by Group')
plt.grid(False)
plt.show()
# plt.savefig('../output/WTvKO_ALL/tsukahara_occlusion/Olfr_nostril_distance/iR2_distance_kdeplot.png', dpi=300) 

###### Validation of shuffle

In [None]:
# Validation of the shuffled reference: draw several random sets of Olfrs and compare
# the distributions of their open-vs-close pairwise UMAP distances.

import random

from scipy.spatial import distance   # not imported elsewhere in the visible notebook

shuffled_distances = []

groups = ['shuffled_0', 'shuffled_1', 'shuffled_2', 'shuffled_3']
shuffled_fraction = [0.5, 0.25, 0.1, 0.05]   # unused: the draws below take a fixed count
shuffled_num = [17, 17, 17, 17]              # number of Olfrs drawn for each label

rng = random.Random(0)   # local seeded generator: reproducible draws, global RNG untouched
all_olfrs = list(umap_df['top_Olfr'].unique())

# The loop variable is not called `group`, so it does not overwrite the notebook-level
# `group` used by the plotting cells.
for shuffle_label, n_olfr in zip(groups, shuffled_num):
    for olfr in rng.sample(all_olfrs, n_olfr):
        olfr_data = umap_df[umap_df['top_Olfr'] == olfr]
        open_coords = olfr_data[olfr_data['nostril'] == 'open'][['umap_x', 'umap_y']].values
        close_coords = olfr_data[olfr_data['nostril'] == 'close'][['umap_x', 'umap_y']].values
        if len(open_coords) == 0 or len(close_coords) == 0:
            continue   # no open/close pair to measure for this Olfr

        # All open x close Euclidean distances at once; ravel() keeps the order of the
        # equivalent nested loop (open point outer, close point inner).
        pair_dist = distance.cdist(open_coords, close_coords, metric='euclidean').ravel()
        shuffled_distances.extend(
            {'top_Olfr': olfr, 'distance': dist, 'group': shuffle_label} for dist in pair_dist)

# Create a DataFrame with pairwise distances
shuffled_distances_df = pd.DataFrame(shuffled_distances)
shuffled_distances_df.hist(column = 'distance', by='group', bins = 30)

##### GO 

In [None]:
# Import GO_tools and initialize GO functions. May take a few seconds. 
import go_utils

In [None]:
# Marker table read from the rank_gene_groups output folder; the GO cell below
# expects '<group>_names' and '<group>_pvals_adj' columns in it.
iR2_markers_df = pd.read_csv('../output/Blobel_15045/tsukahara_occlusion/rank_gene_groups/rgg_score_iR2.csv', index_col = 0)

In [None]:
# Run GO enrichment on the top marker genes of every group and stack the results.

go_num = 20  # number of best-ranked (lowest adjusted p-value) genes per group

# Every third column name, minus its last '_'-separated token, names one group.
markers_name = []
for col in iR2_markers_df.columns[::3]:
    markers_name.append('_'.join(col.split('_')[:-1]))

go_dict = {}
for group in markers_name:
    # Columns whose name contains the group label, ranked by adjusted p-value
    group_cols = iR2_markers_df.columns[iR2_markers_df.columns.str.contains(group, regex=False)]
    ranked = iR2_markers_df[group_cols].sort_values(f'{group}_pvals_adj')
    genes = ranked[f'{group}_names'][0:go_num]

    go_terms = go_utils.go_it(genes)
    go_terms['n_genes/n_go'] = go_terms.n_genes / go_terms.n_go
    go_terms['n_genes/n_study'] = go_terms.n_genes / go_terms.n_study
    go_terms['group'] = group
    go_dict[group] = go_terms

go_df = pd.concat(go_dict, ignore_index=True)
go_df.to_csv('../output/Blobel_15045/tsukahara_occlusion/GO/iR2_GOterms_top20.csv')
# go_df.to_csv('../output/Blobel_15045/tsukahara_occlusion/GO/Nostril_iR2_GOterms_top20.csv')

In [None]:
# Visualize GO: horizontal bars of the number of study genes per GO term,
# one bar colour per marker group.

fig = px.bar(go_df, 
            x='n_genes', 
            y='term', 
            orientation='h',
            color = 'group', 
            barmode = 'group', 
            hover_data=['study_genes']).update_layout(
                plot_bgcolor='rgba(0, 0, 0, 0)'
                )
            
# manually assign color (cycled, so more than three groups no longer raises IndexError)
manual_color = ['#EF5350','#66BB6A', '#D3D3D3']
for i, _trace in enumerate(fig.data): 
    _trace['marker']['color'] = manual_color[i % len(manual_color)]

            
fig.update_layout(
    # The y axis carries the 'term' column, so label it as such (it was titled 'FDR').
    yaxis_title='GO term',
    autosize=True,
    template='simple_white'
)
fig.show()
# fig.write_html(f'../output/Blobel_15045/tsukahara_occlusion/GO/iR2_GO_top20.html')
# fig.write_html(f'../output/Blobel_15045/tsukahara_occlusion/GO/Nostril_iR2_GO_top100.html')



#### Compare Rhbdf2 vs S100a5 expression 

In [None]:
# results_file = '../files/Tsukahara_2021/GSE173947_ChronicOccl.h5ad'  # the file that will store the analysis results
# adata = sc.read(results_file)

# Load the chronic-occlusion AnnData object and the per-Olfr iR2 label table.
adata = sc.read('../files/Tsukahara_2021/GSE173947_ChronicOccl_2.h5ad')
iR2 = pd.read_csv('../files/iR2_Olfr.csv', index_col=0)

FDR_cutoff = 0.05

# Every cell starts as 'na'; cells whose top Olfr is listed under a label get
# that label. 'KO+_Olfr' is applied last, so it wins if a symbol has both.
adata.obs['iR2'] = 'na'
for _label in ('KO-_Olfr', 'KO+_Olfr'):
    _label_symbols = iR2.loc[iR2['iR2'] == _label, 'symbol']
    _has_label = adata.obs['top_Olfr'].isin(_label_symbols)
    adata.obs.loc[_has_label, 'iR2'] = _label

In [None]:
# Build `count_df`: one row per cell with the raw-layer values of the genes of
# interest, the cell's labels, and a Rhbdf2 present/absent flag.
# interested_genes = ['Rhbdf2', 'S100a5', 'Kirrel2', 'Lrrc3b']
interested_genes = ['Rhbdf2', 'S100a5']

# Keep the genes in the order they appear in adata.raw so that the column
# names line up with the columns selected from adata.raw.X.
raw_gene_mask = adata.raw.var_names.isin(interested_genes)
interested_genes = list(adata.raw.var_names[raw_gene_mask])
# Create a counts df for interested genes 
count_df = pd.DataFrame(adata.raw.X[:, raw_gene_mask], columns=[i+"_counts" for i in interested_genes])
count_df.insert(0, 'iR2', adata.obs['iR2'].values)
count_df.insert(0, 'fold_diff', adata.obs['fold_diff'].values)
count_df.insert(0, 'nostril', adata.obs['nostril'].values)
count_df.insert(0, 'Olfr', adata.obs['top_Olfr'].values)

# "0" when the rounded value is zero, "1+" otherwise. The comparison is done
# numerically: matching the string '0.0' only worked for float columns and
# labelled every cell "1+" when the values were integers.
count_df['Rhbdf2_present'] = np.where(count_df['Rhbdf2_counts'].round() == 0, '0', '1+')

count_df['iR2'] = pd.Categorical(count_df['iR2'], 
                                 categories=['KO+_Olfr', 'na', 'KO-_Olfr'],
                                 ordered=True)

# Add pseudocount columns so that ratios / logs of zero values stay finite
count_df['Rhbdf2_counts_ps'] = count_df['Rhbdf2_counts'] + 1e-3
count_df['S100a5_counts_ps'] = count_df['S100a5_counts'] + 1e-3

In [None]:
# Reload the plotting helpers so that edits to the module are picked up
# without restarting the kernel.
import importlib
importlib.reload(plot_utils)

In [None]:

# One violin figure per gene: expression split by group label and nostril,
# with every group down-sampled to the size of the smallest one.
plot_groups = ['iR2']
manual_color = {'iR2': ['#EF5350','#D3D3D3','#66BB6A'],
                'fold_diff': ['#990011', '#D3D3D3', '#317773']}
for _group in plot_groups: 
    # Only the plain '<gene>_counts' columns (the '_counts_ps' ones are skipped)
    y_column = list(count_df.columns[count_df.columns.str.endswith('_counts')])
    plot_df = (
        pd.melt(count_df, id_vars=['Olfr', _group, 'nostril'], value_vars=y_column)
        .sort_values(['nostril', _group], ascending=[True, True])
    )
    palette = manual_color[_group]

    for _gene in plot_df.variable.unique():
        fig = go.Figure()
        subset = plot_df[plot_df['variable'] == _gene]

        # Balance the groups: sample the size of the smallest group from each
        min_n = min(subset.value_counts(_group)) 
        subset = subset.groupby(_group).apply(
            lambda x: x.sample(min_n, random_state=10)
        ).reset_index(drop=True)

        grouped_df = subset.groupby([_group, 'nostril', 'variable'])

        # One violin trace per (group, nostril) combination
        for g, data in grouped_df:
            fig.add_trace(go.Violin(y=data['value'], name=f'{g[0]}_{g[1]}', 
                                    opacity=0.8,
                                    points='all', pointpos=0))
        fig.update_traces(meanline_visible=True)

        # Consecutive pairs of traces (the two nostrils of one group) share a colour
        for i, _trace in enumerate(fig.data): 
            _trace['marker'] = {'color': palette[i // 2 % len(palette)]}

        # fig = plot_utils.downsample_fig(fig, max_points=100)
        fig = plot_utils.downsample_fig(fig)
        # fig = plot_utils.add_p_value_annotation(fig, [[0,1], [2,3], [4,5]], 
        #                                         y_padding=False,
        #                                         test_type = 'ranksums', 
        #                                         include_tstat=True)
        fig = plot_utils.add_p_value_annotation(fig, [[1,3], [1,5], [3,5]], 
                                                test_type = 'ranksums', 
                                                include_tstat=True)
        # fig = plot_utils.add_p_value_annotation(fig, [[0,2], [0,4], [2,4]], 
        #                                         test_type = 'ranksums', 
        #                                         include_tstat=True)

        # `_gene` is the 'variable' value of every group plotted above
        gene_label = _gene.split("_")[0]
        fig.update_layout(
            yaxis = {'title' : f'{gene_label} expression'},
            xaxis = {'title' : ''},
            title='Relative Expression of genes to Rhbdf2 in nostril<br>',
            autosize=True,
            template='simple_white',
            margin=dict(l=50,r=50,b=100,t=150,pad=10),
            font=dict(
                size=10,  # Set the font size here
            ), 
            showlegend = False
        )

        fig.show()
        # fig.write_html(f'../output/Blobel_15045/tsukahara_occlusion/S100a5_exp/{_group}_grouped_{_gene}_exp_violin.html') 


In [None]:
# Pairwise comparison of gene counts between open / close nostril for each Olfr:
# log10(close / open) over every (open cell, close cell) pair of the same Olfr.

 
from itertools import product


compare_list = ['Rhbdf2_counts_ps', 'S100a5_counts_ps']
method_list = ['Pairwise_logfc_mean']
for compare in compare_list : 
    
    # ONLY LOOK AT Rhbdf2 + cells
    # grouped = count_df[count_df[compare] > 0].groupby('Olfr')
    grouped = count_df.groupby('Olfr')

    # Collect one record per Olfr and build the frame once. DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    pairwise_records = []
    for olfr, group_df in grouped:
        open_counts = group_df[group_df['nostril'] == 'open'][compare].values
        close_counts = group_df[group_df['nostril'] == 'close'][compare].values
        if len(open_counts) > 0 and len(close_counts) > 0:
            pair_diff = [ np.log10(b/a) for a, b in product(open_counts, close_counts)]
            pairwise_records.append({'symbol': olfr, 
                                     'Pairwise_logfc_mean': np.mean(pair_diff), 
                                     'Pairwise_logfc': pair_diff})
    # Explicit columns keep the merge below valid even when no Olfr qualified.
    pairwise_diffs = pd.DataFrame(pairwise_records, 
                                  columns=['symbol', 'Pairwise_logfc_mean', 'Pairwise_logfc'])
            
    for _method in method_list: 
        # Join with iR2 for labels; Olfr without a label become 'na'
        plot_df = pd.merge(pairwise_diffs, iR2[['symbol', 'iR2']], on='symbol', how='left').replace(np.nan, 'na' )
        plot_df = plot_df.sort_values('iR2')
        fig = go.Figure()
        # One violin per label, always added in this fixed order
        for _group in ['KO+_Olfr', 'na', 'KO-_Olfr']:
            if _method == 'Pairwise_logfc': 
                # Pool every pairwise value of every Olfr in the group
                y_data = []
                for _olfr in plot_df[plot_df['iR2'] == _group].symbol.unique():
                    y_data += plot_df[plot_df.symbol == _olfr][_method].item()
                fig.add_trace(go.Violin(x=[_group]*len(y_data),
                                        y=y_data, 
                                        name=_group, 
                                        meanline_visible=True, 
                                        points = 'all', pointpos=0
                                        ))
            elif _method == 'Pairwise_logfc_mean':
                # One point per Olfr: the mean of its pairwise values
                subset = plot_df[plot_df['iR2'] == _group]
                fig.add_trace(go.Violin(x=subset['iR2'],
                                        y=subset[_method], 
                                        name=_group, 
                                        text = subset['symbol'],
                                        meanline_visible=True, 
                                        points = 'all', pointpos=0
                                        ))
            else: 
                print("ERROR______")
                break
        
        # Colours are applied by position: i-th entry of the colour mapping -> i-th trace
        manual_color = plot_utils.distinct_colors(plot_df['iR2'].unique(), custom_color = ['#EF5350','#D3D3D3', '#66BB6A'])
        for i, _group in enumerate(manual_color): 
            fig.data[i]['marker'] = {'color': manual_color[_group]}


        # Down sample from trace 
        if _method == 'Pairwise_logfc': 
            fig = plot_utils.downsample_fig(fig, max_points_pct = 0.05)  
            # fig = plot_utils.downsample_fig(fig)  
        elif _method == 'Pairwise_logfc_mean': 
            fig = plot_utils.downsample_fig(fig, max_points = 500)  

        fig = plot_utils.add_p_value_annotation(fig, [[0,1], [0,2], [1,2]], 
                                                test_type = 'ranksums', include_tstat=True)
        
        
        fig.update_layout(
            yaxis = {'title' : f'Pair-wise logFC ({compare.split("_")[0]} close/open)'},
            xaxis = {'title' : ''},
            title=f'{compare.split("_")[0]} Pair-wise logFC between nostril<br>',
            autosize=True,
            template='simple_white',
            margin=dict(l=50,r=50,b=100,t=200,pad=10),
            font=dict(
                size=10,  # Set the font size here
            ), 
            showlegend = False
        )
        fig.show()
        # fig.write_html(f'../output/WTvKO_ALL/tsukahara_occlusion/exp_plots/Pairwise_{compare}_{_method}_violin.html') 
        fig.write_html(f'../output/fig_image/Violin/Pairwise_{compare}_{_method}_violin.html') 

In [None]:
# TODO testing... ... .. 
# Per-Olfr change of gene expression between nostrils (close vs open),
# compared across the iR2 groups.
 
group = 'iR2'
# compare_list = ['S100a5_counts_ps', 'Rhbdf2_counts_ps']
# compare_list = ['S100a5_counts', 'Rhbdf2_counts']
compare_list = ['Rhbdf2_counts']
# method_list = ['ratio', 'logFC']
method_list = ['logFC']
# compare = 'Rhbdf2_counts_ps'

for method in method_list: 
    for compare in compare_list: 
        if method in ('ratio', 'difference'):
            # Mean per (Olfr, nostril), then close/open ratio or close-open difference
            # count_df['Rhbdf2_counts_ps'] = count_df['Rhbdf2_counts'] + 1e-6
            mean_df = count_df.groupby(['Olfr', 'nostril'])[compare].mean().unstack().reset_index()
            if method == 'ratio':
                mean_df[method] = mean_df['close'] / mean_df['open']
            else:
                mean_df[method] = mean_df['close'] - mean_df['open']
            plot_df = pd.merge(mean_df, count_df[['Olfr', 'iR2']].drop_duplicates(), on='Olfr', how='left')

        elif method == 'logFC':
            # Natural log of mean(close) / mean(open), only for Olfr seen in both nostrils
            logFC_values = []
            for _olfr, _olfr_cells in count_df.groupby('Olfr'):
                open_values = _olfr_cells.loc[_olfr_cells['nostril'] == 'open', compare].tolist()
                close_values = _olfr_cells.loc[_olfr_cells['nostril'] == 'close', compare].tolist()
                if len(open_values) == 0 or len(close_values) == 0:
                    continue
                logFC = np.log(np.mean(close_values) / np.mean(open_values))
                logFC_values.append({'Olfr': _olfr, 'logFC': logFC})
            plot_df = pd.merge(pd.DataFrame(logFC_values), count_df[['Olfr', 'iR2']].drop_duplicates(), on='Olfr', how='left')
        
        # Drop rows holding nan / inf / -inf, then order by group
        plot_df = plot_df.replace([np.inf, -np.inf],np.nan).dropna().sort_values(group)

        fig = go.Figure()
        # One violin per group label, in sorted order
        for _group in plot_df[group].unique():
            in_group = plot_df[plot_df[group] == _group]
            fig.add_trace(go.Violin(x=in_group.iR2,
                                    y=in_group[method], 
                                    name=_group, 
                                    meanline_visible=True, 
                                    points = 'all', pointpos=0))
                
        # i-th entry of the colour mapping -> i-th trace
        manual_color = plot_utils.distinct_colors(plot_df[group].unique(), custom_color = ['#EF5350','#D3D3D3', '#66BB6A'])
        for i, _group in enumerate(manual_color): 
            fig.data[i]['marker'] = {'color': manual_color[_group]}

        # Down sample from trace 
        fig = plot_utils.downsample_fig(fig, max_points = 300)    
        # Stat test 
        fig = plot_utils.add_p_value_annotation(fig, [[0,1], [0,2], [1,2]], 
                                                test_type = 'ranksums', include_tstat=True)

        gene_label = compare.split("_")[0]
        fig.update_layout(
            yaxis = {'title' : f'{method}({gene_label} close/open)'},
            xaxis = {'title' : ''},
            title=f'{method} of {gene_label} between nostril<br>',
            autosize=True,
            template='simple_white',
            margin=dict(l=50,r=50,b=100,t=100,pad=10),
            font=dict(
                size=10,  # Set the font size here
            ), 
            showlegend = False
        )

        fig.show()
        # fig.write_html(f'../output/WTvKO_ALL/tsukahara_occlusion/exp_plots/{group}_{compare}_{method}_violin.html') 
        # fig.write_html(f'../output/WTvKO_ALL/tsukahara_occlusion/exp_plots/{group}_{compare}_{method}_violin.html') 

##### Old Deprecated plots 

In [None]:
# Deprecated
# Rhbdf2 expression per iR2 group, one violin figure per nostril.
 
group = 'iR2'
plot_df = pd.melt(count_df, id_vars=['Olfr', group, 'Rhbdf2_present', 'nostril'], value_vars=['Rhbdf2_counts'])
plot_df = plot_df.sort_values([group, 'Rhbdf2_present'], ascending=[True, True])

# Colours are looked up by group label. The positional list used before
# assumed alphabetical order (KO+, KO-, na), but count_df['iR2'] is an ordered
# categorical (KO+, na, KO-), so 'na' and 'KO-_Olfr' got each other's colour.
group_color = {'KO+_Olfr': '#EF5350', 'KO-_Olfr': '#66BB6A', 'na': '#D3D3D3'}

for n in plot_df.nostril.unique():
    fig = go.Figure()
    # assign() returns a new frame, avoiding a write into a slice of plot_df
    subset = plot_df[plot_df['nostril'] == n].assign(Rhbdf2_present='all')
    # subset = subset[subset['Rhbdf2_present'] == '1+']
    n_ko_plus = int((subset['iR2'] == 'KO+_Olfr').sum())
    grouped_df = subset.groupby([group, 'variable', 'Rhbdf2_present'])
    trace_colors = []
    # Iterate over each group in the grouped DataFrame
    for g, data in grouped_df:
        # Downsample 'na' to the KO+ group size (seeded; capped at the rows available)
        if g[0] == 'na': 
            data = data.sample(min(len(data), n_ko_plus), random_state=0)
        # Extract the values for the violin plot
        values = data['value']
        group_str = f'{g[0]}_{g[2]}_{g[1].split("_")[0]}'  # Customize the group label

        # fig.add_trace(go.Box(y=values, name=group_str))
        fig.add_trace(go.Violin(y=values, name=group_str, points='all', pointpos=0))
        trace_colors.append(group_color.get(g[0], '#D3D3D3'))
    fig.update_traces(meanline_visible=True)
            
    for _trace, _color in zip(fig.data, trace_colors): 
        _trace['marker'] = {'color': _color}


    fig = plot_utils.add_p_value_annotation(fig, [[0,1], [0,2], [1,2]], 
                                            # y_padding=False,
                                            test_type = 'ranksums')

    fig.update_layout(
        yaxis = {'title' : 'Rhbdf2 expression'},
        xaxis = {'title' : ''},
        title=f'Relative Expression of Rhbdf2 in {n} nostril<br>',
        autosize=True,
        template='simple_white',
        margin=dict(l=50,r=50,b=100,t=100,pad=10),
        font=dict(
            size=10,  # Set the font size here
        ), 
        showlegend = False
    )
    
    fig.show()
    # fig.write_html(f'../output/Blobel_15045/tsukahara_occlusion/Rhbdf2_exp/iR2_{n}_Rhbdf2_expression_violin.html') 



In [None]:
# Deprecated
# Quick plot to see how the other interested genes correlate with Rhbdf2:
# box plots per iR2 group, split by Rhbdf2 absent ("0") / present ("1+").
 
group = 'iR2'
# endswith (not contains): the '*_counts_ps' pseudocount columns must not be
# plotted as extra genes. With them included the figure has more traces than
# the pairs annotated below and several traces share the same label.
y_column = list(count_df.columns[count_df.columns.str.endswith('_counts')].drop(['Rhbdf2_counts']))
plot_df = pd.melt(count_df, id_vars=['Olfr', group, 'Rhbdf2_present', 'nostril'], value_vars=y_column)
plot_df = plot_df.sort_values([group, 'Rhbdf2_present'], ascending=[True, True])

# Colours are looked up by group label. The positional list used before
# assumed alphabetical order (KO+, KO-, na), but count_df['iR2'] is an ordered
# categorical (KO+, na, KO-).
group_color = {'KO+_Olfr': '#EF5350', 'KO-_Olfr': '#66BB6A', 'na': '#D3D3D3'}

for n in plot_df.nostril.unique():
    fig = go.Figure()
    subset = plot_df[plot_df['nostril'] == n]
    grouped_df = subset.groupby([group, 'variable', 'Rhbdf2_present'])
    # Iterate over each group in the grouped DataFrame
    for g, data in grouped_df:
        # Extract the values for the box plot
        values = data['value']
        group_str = f'{g[0]}_{g[2]}_{g[1].split("_")[0]}'  # Customize the group label

        # Create a box plot trace
        fig.add_trace(go.Box(y=values, name=group_str,
                             marker={'color': group_color.get(g[0], '#D3D3D3')}))


    # Pairs are (Rhbdf2 "0", Rhbdf2 "1+") within each group; this assumes both
    # levels exist for every group.
    fig = plot_utils.add_p_value_annotation(fig, [[0,1], [2,3], [4,5]], 
                                            y_padding=False,
                                            test_type = 'ranksums')
    # fig = plot_utils.add_p_value_annotation(fig, [[0,1], [2,3], [4,5], 
    #                                             [6,7], [8,9], [10,11], 
    #                                             [12,13], [14,15], [16,17]], 
    #                                         test_type = 'ranksums', 
    #                                         y_padding=False)


    fig.update_layout(
        yaxis = {'title' : ''},
        xaxis = {'title' : ''},
        title=f'Relative Expression of genes to Rhbdf2 in {n} nostril<br>',
        autosize=True,
        template='simple_white',
        margin=dict(l=50,r=50,b=100,t=100,pad=10),
        font=dict(
            size=10,  # Set the font size here
        ), 
        showlegend = False
    )
    
    fig.show()
    # fig.write_html(f'../output/Blobel_15045/tsukahara_occlusion/S100a5_exp/iR2_{n}_S100a5_expression_box.html') 
    # fig.write_html(f'../output/Blobel_15045/tsukahara_occlusion/S100a5_exp/iR2_{n}_activitygene_expression_box.html')   



In [None]:
# Deprecated 
# Quick plot to see how the other interested genes correlate with Rhbdf2:
# box plots per fold_diff group, split by Rhbdf2 absent ("0") / present ("1+").
 
group = 'fold_diff'
# endswith (not contains): the '*_counts_ps' pseudocount columns must not be
# plotted as extra genes. With them included the figure has more traces than
# the pairs annotated below and several traces share the same label.
y_column = list(count_df.columns[count_df.columns.str.endswith('_counts')].drop(['Rhbdf2_counts']))
plot_df = pd.melt(count_df, id_vars=['Olfr', group, 'Rhbdf2_present', 'nostril'], value_vars=y_column)
plot_df = plot_df.sort_values([group, 'Rhbdf2_present'], ascending=[True, True])



for n in plot_df.nostril.unique():
    fig = go.Figure()
    subset = plot_df[plot_df['nostril'] == n]
    grouped_df = subset.groupby([group, 'variable', 'Rhbdf2_present'])
    # Iterate over each group in the grouped DataFrame
    for g, data in grouped_df:
        # Extract the values for the box plot
        values = data['value']
        group_str = f'{g[0]}_{g[2]}_{g[1].split("_")[0]}'  # Customize the group label

        # Create a box plot trace
        fig.add_trace(go.Box(y=values, name=group_str))
            
    # Positional colours: each consecutive pair of traces (one group) shares a colour.
    # NOTE(review): the order assumes the sorted fold_diff labels match this list - confirm.
    manual_color = ['#990011', '#317773', '#D3D3D3']
    for i in range(len(fig.data)): 
        # fig.data[i]['marker'] = {'color': manual_color[i // 6 % len(manual_color)]}
        fig.data[i]['marker'] = {'color': manual_color[i // 2 % len(manual_color)]}



    fig = plot_utils.add_p_value_annotation(fig, [[0,1], [2,3], [4,5]], 
                                            y_padding=False,
                                            test_type = 'ranksums')
    # fig = plot_utils.add_p_value_annotation(fig, [[0,1], [2,3], [4,5], 
    #                                             [6,7], [8,9], [10,11], 
    #                                             [12,13], [14,15], [16,17]], 
    #                                         test_type = 'ranksums', 
    #                                         y_padding=False)


    fig.update_layout(
        yaxis = {'title' : ''},
        xaxis = {'title' : ''},
        title=f'Relative Expression of genes to Rhbdf2 in {n} nostril<br>',
        autosize=True,
        template='simple_white',
        margin=dict(l=50,r=50,b=100,t=100,pad=10),
        font=dict(
            size=10,  # Set the font size here
        ), 
        showlegend = False
    )
    
    fig.show()
    # fig.write_html(f'../output/Blobel_15045/tsukahara_occlusion/S100a5_exp/Occlu_{n}_S100a5_expression_box.html') 
    # fig.write_html(f'../output/Blobel_15045/tsukahara_occlusion/S100a5_exp/Occlu_{n}_activitygene_expression_box.html')   



In [None]:
# Deprecated
# TODO Attempt to make fold-change violin plots between close / open 
# One point per Olfr: mean Rhbdf2 value in close cells / mean in open cells.
 
group = 'iR2'
plot_df = pd.melt(count_df, id_vars=['Olfr', group, 'Rhbdf2_present', 'nostril'], value_vars=['Rhbdf2_counts'])
plot_df = plot_df.sort_values([group, 'Rhbdf2_present'], ascending=[True, True])

# Generate fold change dataframe between close / open.
# Records are collected in a list and turned into a frame once, instead of
# growing the frame one .loc row at a time.
zero_pad = 1e-6  # added to every value so the ratio stays finite
fold_change_records = []
for _olfr in plot_df.Olfr.unique():
    subset = plot_df[plot_df.Olfr == _olfr]
    open_values = subset.loc[subset.nostril == 'open', 'value']
    close_values = subset.loc[subset.nostril == 'close', 'value']
    # Skip Olfr that are not present in both nostrils
    if (len(open_values) == 0) or (len(close_values) == 0): 
        continue 
    fold_change_records.append({'Olfr': _olfr, 
                                'iR2': subset.iR2.values[0], 
                                'fold_change': (close_values + zero_pad).mean() / (open_values + zero_pad).mean()})
fold_change_df = pd.DataFrame(fold_change_records, columns=['Olfr','iR2', 'fold_change'])
    
    
# Colours are looked up by group label. The positional list used before
# assumed alphabetical order (KO+, KO-, na), but the groups appear in the
# categorical order of count_df['iR2'] (KO+, na, KO-).
group_color = {'KO+_Olfr': '#EF5350', 'KO-_Olfr': '#66BB6A', 'na': '#D3D3D3'}

fig = go.Figure()
# One violin per iR2 group
for _group in fold_change_df.iR2.unique():
    values = fold_change_df[fold_change_df['iR2'] == _group].fold_change
    # group_str = f'{g[0]}_{g[2]}_{g[1].split("_")[0]}'  # Customize the group label

    # fig.add_trace(go.Box(y=values, name=group_str))
    fig.add_trace(go.Violin(y=values, name=_group, points='all', pointpos=0,
                            marker={'color': group_color.get(_group, '#D3D3D3')}))
fig.update_traces(meanline_visible=True)


# fig = plot_utils.add_p_value_annotation(fig, [[0,1], [0,2], [1,2]], 
#                                         # y_padding=False,
#                                         test_type = 'ranksums')

# The title no longer interpolates `n`: that name only existed as a loop
# variable leaked from another cell, and this figure pools both nostrils.
fig.update_layout(
    yaxis = {'title' : 'Rhbdf2 fold change (close / open)'},
    xaxis = {'title' : ''},
    title='Fold change of Rhbdf2 between nostrils (close / open)<br>',
    autosize=True,
    template='simple_white',
    margin=dict(l=50,r=50,b=100,t=100,pad=10),
    font=dict(
        size=10,  # Set the font size here
    ), 
    showlegend = False
)

fig.show()
# fig.write_html('../output/Blobel_15045/tsukahara_occlusion/Rhbdf2_exp/iR2_Rhbdf2_fold_change_violin.html') 



#### Olfr population between nostrils

In [None]:
# Per-cell UMAP table; the next cell reads its 'top_Olfr', 'nostril' and 'iR2' columns.
umap_df = pd.read_csv('../output/WTvKO_ALL/tsukahara_occlusion/umap/umap_df.csv', index_col=0)

In [None]:
# For every Olfr, count the cells found in the close and open nostril and
# derive several close-vs-open change measures (positive = more cells in close).

padding = 1e-6  # keeps the log ratio finite when one of the counts is zero

# Create an empty list to store the results
results = []

# Iterate over each unique 'top_Olfr' value
for _olfr in umap_df['top_Olfr'].unique():
    olfr_df = umap_df[umap_df['top_Olfr'] == _olfr]
    close_count = len(olfr_df[olfr_df['nostril'] == 'close'])
    open_count = len(olfr_df[olfr_df['nostril'] == 'open'])    

    # Calculate logFC
    logFC = np.log((close_count + padding) / (open_count + padding))
    log1pFC = np.log1p(close_count) - np.log1p(open_count)
    
    # Calculate relative change
    if (open_count != 0) & (close_count != 0): 
        relative_change = (close_count - open_count) / (open_count )
    else: 
        # One nostril has no cells: fall back to the signed count difference.
        # The previous fallback (+open_count / -close_count) had the opposite
        # sign of the formula above, reporting a loss of cells in the closed
        # nostril as a positive change and vice versa.
        relative_change = close_count - open_count
    
    # Append the results to the list
    results.append({'Olfr': _olfr, 
                    'Close_Count': close_count, 
                    'Open_Count': open_count, 
                    # .item() raises if one Olfr carries more than one iR2 label
                    'iR2': olfr_df['iR2'].unique().item(), 
                    'logFC': logFC,
                    'log1pFC': log1pFC, 
                    'Relative_Change': relative_change})

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)
# Safety clamp; with a non-zero `padding` the log ratio is never -inf
results_df.loc[results_df.logFC == -np.inf, 'logFC'] = -10

results_df['group'] = pd.Categorical(results_df['iR2'], ['KO+_Olfr', 'na', 'KO-_Olfr'])

# Calculate the proportion of counts compared to the total
results_df['total_counts'] = results_df['Close_Count'] + results_df['Open_Count']
results_df['proportion'] = results_df['total_counts'] / results_df['total_counts'].sum()

# Weight each effect size by the Olfr's share of all counted cells
results_df['logFC_Normalized'] = results_df['logFC'] * results_df['proportion']
results_df['log1pFC_Normalized'] = results_df['log1pFC'] * results_df['proportion']
results_df['Relative_Change_Normalized'] = results_df['Relative_Change'] * results_df['proportion']


# Display the results DataFrame
results_df



In [None]:
# Violin plots of the close/open Olfr population change, one violin per iR2 group

results_df['group'] = pd.Categorical(results_df['iR2'], ['KO+_Olfr', 'na', 'KO-_Olfr'])
plot_df = results_df.sort_values('group')

# Threshold min cell number
# plot_df = plot_df[plot_df['total_counts'] > 5]


# plot = ['logFC', 'logFC_Normalized', 'Relative_Change', 'Relative_Change_Normalized']
plot = [ 'log1pFC_Normalized']

for _plot_by in plot: 
    fig = go.Figure()
    for _group in plot_df.group.unique(): 
        group_rows = plot_df[plot_df.group == _group]
        violin = go.Violin(x=group_rows.group, 
                           # y=group_rows.ratio, 
                           y=group_rows[_plot_by], 
                           opacity=0.8,
                           name=_group,
                           points = 'all', pointpos=0, 
                           meanline_visible=True, 
                           showlegend=False
                           )
        fig.add_trace(violin)
        
    # Colour each trace by its group name
    # manual_color = ['#EF5350', '#D3D3D3','#66BB6A']
    manual_color = plot_utils.distinct_colors(plot_df.group.unique(), 
                                            custom_color=['#EF5350', '#D3D3D3','#19b2e6'])
    for _trace in fig.data: 
        _trace['marker']['color'] = manual_color[_trace['name']]

    fig = plot_utils.downsample_fig(fig, max_points=300, sample_method='linspace')
    # fig = plot_utils.add_p_value_annotation(fig, [[0,1], [0,2], [1,2]], test_type='ranksums', include_tstat=True)
    # Each trace is tested on its own against a population mean of 0
    fig = plot_utils.add_p_value_annotation(fig, [[0,0], [1,1], [2,2]], test_type='ttest_1samp', popmean = 0,  include_tstat=True, y_padding=False)

    fig.update_layout(
        yaxis = {'title' : f'Olfr {_plot_by} (close / open)'},
        xaxis = {'title' : ''},
        # title='Pairwise euclidean distance comparison between nostrils(open/close) for each Olfr<br>\
        # <span style="font-size: 10px;"> </span>',
        autosize=True,
        template='simple_white',
        margin=dict(l=50,r=50,b=50,t=80,pad=10),
        font=dict(
            size=15,  # Set the font size here
        )
    )
    fig.show()
    # fig.write_html(f'../output/WTvKO_ALL/tsukahara_occlusion/Olfr_population_difference/iR2_counts_nostril_{_plot_by}.html')
    fig.write_html(f'../output/fig_image/Violin/tsukahara_occlusion/iR2_counts_nostril_{_plot_by}.html')

In [None]:
# KDE of the count-weighted log1p fold change (close vs open), one curve per iR2 group
plt.figure(figsize=(8, 2.5))
results_df['group'] = pd.Categorical(results_df['iR2'], ['KO+_Olfr', 'na', 'KO-_Olfr'])
sns.kdeplot(data=results_df, x='log1pFC_Normalized', 
            hue='group', 
            common_norm=False, 
            fill=False, 
            alpha=0.6, linewidth=5, 
            palette = ['#EF5350', '#D3D3D3','#19b2e6'],
            legend=True
)
# Labels describe the plotted column; they previously still named the
# pairwise-distance plot this cell was copied from.
plt.xlabel('Olfr log1pFC_Normalized (close / open)')
plt.ylabel('Density')
plt.title('KDE of Olfr log1pFC_Normalized by Group')
plt.grid(False)
plt.show()

### Tsukahara Act-Seq 

#### 24hr Act-Seq

In [None]:
# Read the Act-Seq UMI count matrix and its per-cell metadata table
actseq_dir = '../files/Tsukahara_2021/'
adata = anndata.read_csv(actseq_dir + 'GSE173947_ActSeq_umi_counts.csv')
meta = pd.read_csv(actseq_dir + 'GSE173947_ActSeq_metadata.csv', index_col = 0 )

# Attach the metadata to obs by matching the two indexes
adata.obs = pd.merge(adata.obs, meta, left_index=True, right_index=True)

# Untouched copy taken before any preprocessing
raw_adata = adata.copy()


##### preprocessing ... 

In [None]:
# Global scanpy settings for the Act-Seq analysis, plus the output path used when the object is saved.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

results_file = '../files/Tsukahara_2021/GSE173947_ActSeq.h5ad'  # the file that will store the analysis results

In [None]:
# Gene filter only (min_cells=3, applied in place); the cell filter below is left disabled.
# sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Compute per-cell QC metrics (including the 'mt' gene fraction) and plot them
# to help choose filtering cutoffs.
adata.var['mt'] = adata.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
# Visualize the graphs above to decide the cutoff
# adata = adata[adata.obs.pct_counts_mt < 7, :]
# adata = adata[adata.obs.n_genes_by_counts < 2500, :]

# Normalise to 1e4 counts per cell, log1p-transform, then flag highly variable genes.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# The snapshot is taken after normalize_total and log1p, so `adata.raw` holds
# log-normalised values rather than UMI counts.
adata.raw = adata

# The highly-variable subset and scaling are disabled, so regress_out runs on every gene.
# adata = adata[:, adata.var.highly_variable]
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
# sc.pp.scale(adata, max_value=10)



In [None]:
# PCA -> neighbour graph -> Leiden clusters -> PAGA -> UMAP
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.leiden(adata, resolution = 0.2)
sc.tl.paga(adata)
sc.pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph
# A PAGA-initialised embedding (`sc.tl.umap(adata, init_pos='paga')`) used to
# be computed here and was overwritten straight away by the default call
# below. Only the default-initialised UMAP was ever kept, so the redundant run
# is dropped; restore it in place of the line below to use the PAGA layout.
sc.tl.umap(adata)

# sc.tl.leiden(adata)

In [None]:
# UMAP coloured by cluster, by odor condition, and by the two genes of interest
for _color_by in (['leiden'], ['odor'], ['Rhbdf2', 'S100a5']):
    sc.pl.umap(adata, color=_color_by, size = 5)

In [None]:
# Activation genes from Tsukahara 2021
activation_genes = [
    'Btg2', 'Egr1', 'Fos', 'Fosb', 'Gm13889',
    'Junb', 'Nr4a1', 'Nr4a2', 'Pcdh10', 'Srxn1',
]
sc.pl.umap(adata, color=activation_genes, size = 10)

##### Replicating Tsukahara data 

In [None]:
# Adds Rhbdf2 (iR2) and S100a5 values from adata.raw to obs, plus a version
# scaled by each cell's total_counts. Columns are only (re)computed when they
# are missing or contain NaN.
# The gene mask is built from adata.raw.var_names so that it always lines up
# with the columns of adata.raw.X, even if adata itself is later subset.
raw_var_names = adata.raw.var_names
for _prefix, _gene in (('iR2', 'Rhbdf2'), ('S100a5', 'S100a5')):
    _counts_col = f'{_prefix}_counts'
    if _counts_col not in adata.obs or adata.obs[_counts_col].isna().any():
        adata.obs[_counts_col] = adata.raw.X[:, raw_var_names == _gene].max(1).flatten()
        adata.obs[f'{_counts_col}_norm'] = (
            adata.obs[_counts_col] / adata.obs.total_counts * 1e4
        )

In [None]:
import utils.act as act
import utils.osn.olfr as olfr

# Project helper; its two return values are consumed by the activation cells below.
df_OR, has_OR = olfr.get_OR_info(adata)

# Adds activated column to adata.obs 
# Adjusted Z_thresh based on activated Olfr graph
# adata.obs.ieg_z_scored.hist(by=adata.obs.odor, figsize=(10,10))

act.apply_activation_score(adata, Z_thresh=0)
# Writes the annotated object to results_file, which must already be defined by an earlier cell.
adata.write(results_file)

In [None]:
# Histogram of ieg_z_scored, one panel per odor value.
adata.obs.ieg_z_scored.hist(by=adata.obs.odor, figsize=(10,10))

In [None]:
# Activated-Olfr results for the two odors, with DPG30m passed as the DPG argument
odor_dict = act.get_activated_olfrs(
    adata, df_OR, odors=['ACE2h', 'OCT2h'], DPG='DPG30m'
)

# For each odor, show the ten ORs with the highest activation score
col_show = ["activated", "activation_score", "ieg_z_scored"]
for _odor_name, _odor_result in odor_dict.items():
    print(_odor_name)
    _delta = _odor_result["delta"][col_show]
    display(_delta.nlargest(10, "activation_score"))

In [None]:
# Condition labels and the colours / legend names attached to them
ACETO, OCT, DPG = "ACE2h", "OCT2h", "DPG30m"
odors = [ACETO, OCT]
color_dict = {DPG: plt.cm.tab10(0), ACETO: plt.cm.tab10(2), OCT: plt.cm.Set1(0)}
odor_mapping = {OCT: "Octanal OR", ACETO: "Aceto OR"}

# One boxen panel per odor: ieg_z_scored for DPG vs the odor, split by the is_active flag
fig, axes = plt.subplots(1, 2, figsize=(8, 6), sharey=True)
for ax, _odor in zip(axes, odors):
    names = ["other OR", odor_mapping[_odor]]
    df_mean = odor_dict[_odor]["df"].reset_index()
    df_mean['active'] = df_mean.is_active.map({False: names[0], True: names[1]})
    sns.boxenplot(
        data=df_mean,
        x="odor",
        y="ieg_z_scored",
        hue="active",
        hue_order=names,
        order=(DPG, _odor),
        palette=["0.8", color_dict[_odor]],
        k_depth="proportion",
        ax=ax,
    )
    plot_utils.update_boxen(ax)
    ax.set_title(_odor)
    ax.set_xlabel(None)
    ax.set_ylabel(None)
    ax.legend(frameon=False)
sns.despine()
axes[0].set_ylabel("IEG score")

##### iRhom2 expression change in activated vs not activated mOSNs

In [None]:
import utils.act as act
import utils.osn.olfr as olfr

# Load the saved AnnData from disk; this replaces whatever adata held before.
results_file = '../files/Tsukahara_2021/GSE173947_ActSeq.h5ad'  # path of the .h5ad file read on the next line
adata = sc.read(results_file)

In [None]:
# Get iR2 and S100a5 counts. No need to norm ourself, since adata.raw is already normalized.
# A column is (re)computed only when it is missing or contains NaN.
#
# The column mask is built from adata.raw.var_names because the values are read
# from adata.raw.X; adata.var_names only matches those columns when adata and
# adata.raw hold the same genes, which is not guaranteed for a reloaded file.
for _obs_col, _gene_name in (("iR2_counts", "Rhbdf2"), ("S100a5_counts", "S100a5")):
    if _obs_col not in adata.obs or adata.obs[_obs_col].isna().any():
        adata.obs[_obs_col] = adata.raw.X[:, adata.raw.var_names == _gene_name].max(1).flatten()

###### Benchmarking for ieg_z_score cutoff

In [None]:
sns.set(style="white")

# KDE of ieg_z_scored, one facet per odor value, two facets per row
g = sns.FacetGrid(adata.obs, col='odor', col_wrap=2, sharex=True)
g.map(sns.kdeplot, 'ieg_z_scored', fill=True, common_norm=False)

# Dashed vertical reference line at -0.184 on every facet
# (presumably a candidate z-score cutoff — TODO confirm where the value comes from)
_ref_line = dict(x=-0.184, color='grey', linestyle='--')
for ax in g.axes:
    ax.axvline(**_ref_line)

plt.show()


In [None]:
# Cutoff from X standard deviation of control mean
# `control` is assigned here because its first assignment in this section is in a
# later cell; without it this cell depends on out-of-order execution.
control = 'DPG2h'
top_x_std = 3

# ieg_z_scored of the control cells only
_control_ieg = adata.obs.loc[adata.obs.odor == control, 'ieg_z_scored']
ieg_control_info = {'mean': _control_ieg.mean(),
                    'std': _control_ieg.std()}

_control_ieg.hist(figsize=(5,5), bins=50)
print(ieg_control_info['mean'] + ieg_control_info['std'] * top_x_std)

In [None]:
# Cutoff from top X percentage of OSN in control
# `control` is assigned here because its first assignment in this section is in a
# later cell; without it this cell depends on out-of-order execution.
control = 'DPG2h'
top_x_pct = 5

_control_ieg = adata.obs.loc[adata.obs.odor == control, 'ieg_z_scored']
n_DPG_cells = len(_control_ieg)
# Value at the position that separates the top X percent once sorted from high to low
print(_control_ieg.sort_values(ascending=False).iloc[int(n_DPG_cells * top_x_pct/100)])

In [None]:
# Thresholding parameters
odors = ['ACE2h', 'OCT2h']
control = 'DPG2h'
z_cutoff_std = 2
Olfr_active_pct = 0.7

# Mean / std of ieg_z_scored over the control cells
_control_ieg = adata.obs.loc[adata.obs.odor == control]['ieg_z_scored']
ieg_control_info = {'mean': _control_ieg.mean(), 'std': _control_ieg.std()}

df_OR, has_OR = olfr.get_OR_info(adata, extra_cols=['odor','iR2_counts', 'S100a5_counts'])

# Z threshold = control mean + z_cutoff_std * control std
_z_thresh = ieg_control_info['mean'] + ieg_control_info['std'] * z_cutoff_std
print(f"Thresholding with {z_cutoff_std} std above ieg mean ({round(_z_thresh, 3)}), and {Olfr_active_pct*100}% of Olfr active")
act.apply_activation_score(adata, Z_thresh=_z_thresh)
odor_dict = act.get_activated_olfrs(
    adata, df_OR,
    pct_thresh=Olfr_active_pct,
    odors=odors,
    DPG=control,
)

In [None]:
 # Quick plot to see how the other interested genes correslate with Rhbdf2
control = 'DPG2h'
plot_odor = ['ACE2h', 'OCT2h']
# Per odor: [colour for even-numbered traces, colour for odd-numbered traces]
manual_color = {'ACE2h': ['#D3D3D3', '#2ca02c'], 
                'OCT2h': ['#D3D3D3', '#d62728']}
plot_facet = ['Rhbdf2_counts', 'S100a5_counts']

# Per-cell table: top Olfr, odor, activated flag and the two expression columns
count_df = adata.obs[['top_Olfr', 'odor', 'activated', 'iR2_counts', 'S100a5_counts']].reset_index(drop=True)
count_df = count_df.rename(columns={'top_Olfr': 'Olfr', 'iR2_counts': 'Rhbdf2_counts'})

for f in plot_facet:
    group = 'activated'
    plot_df = pd.melt(count_df, id_vars=['Olfr', group, 'odor'], value_vars=f)
    plot_df = plot_df.sort_values([group], ascending=[True])
    for o in plot_odor:
        fig = go.Figure()
        # Subset for data with query odor and control.
        # .copy() so the columns added below do not write into a slice of plot_df.
        subset = plot_df[plot_df['odor'].isin([o, control])].copy()
        # Binary column: is this cell's Olfr flagged active for the query odor?
        # The active-Olfr index is built once instead of once per row.
        active_flags = odor_dict[o]['active']
        active_olfrs = active_flags.index[active_flags == True]
        subset[f"{o}_Olfr"] = subset['Olfr'].isin(active_olfrs)

        subset['odor'] = pd.Categorical(subset['odor'], [control, o])
        subset = subset.sort_values(['odor', f"{o}_Olfr"], ascending=[True, True])

        # One violin per (odor, active-Olfr flag, gene) group
        for group_key, data in subset.groupby(['odor', f"{o}_Olfr", 'variable']):
            group_str = f'{group_key[0]}_{group_key[1]}_{group_key[2].split("_")[0]}'  # Customize the group label
            fig.add_trace(go.Violin(y=data['value'], name=group_str, scalegroup='Yes'))
        fig.update_traces(meanline_visible=True)

        # Alternate the odor's colours over the traces; the modulus uses the
        # palette length (not the number of odors in manual_color).
        palette = manual_color[o]
        for i, trace in enumerate(fig.data):
            trace['marker'] = {'color': palette[i % len(palette)]}

        fig = plot_utils.add_p_value_annotation(fig, [[0,1], [2,3]], 
                                                y_padding=False,
                                                test_type = 'ranksums')

        fig.update_layout(
            yaxis = {'title' : ''},
            xaxis = {'title' : ''},
            autosize=True,
            template='simple_white',
            margin=dict(l=50,r=50,b=100,t=100,pad=10),
            font=dict(
                size=10,  # Set the font size here
            ), 
            showlegend = False
        )
        
        fig.show()
        # fig.write_html(f'../output/Blobel_15045/tsukahara_active/{f.split("_")[0]}_{o}_box.html')   


##### continue .. 

In [None]:
# Redefining "Activated" ORs.
odors = ['ACE2h', 'OCT2h']
control = 'DPG2h'
z_cutoff_std = 2
Olfr_active_pct = 0.7

# Mean / std of ieg_z_scored over the control cells
_control_ieg = adata.obs.loc[adata.obs.odor == control]['ieg_z_scored']
ieg_control_info = {'mean': _control_ieg.mean(), 'std': _control_ieg.std()}

df_OR, has_OR = olfr.get_OR_info(adata, extra_cols=['odor','iR2_counts', 'S100a5_counts'])

# Z threshold = control mean + z_cutoff_std * control std
_z_thresh = ieg_control_info['mean'] + ieg_control_info['std'] * z_cutoff_std
print(f"Thresholding with {z_cutoff_std} std above ieg mean ({round(_z_thresh, 3)}), and {Olfr_active_pct*100}% of Olfr active")
act.apply_activation_score(adata, Z_thresh=_z_thresh)
odor_dict = act.get_activated_olfrs(
    adata, df_OR,
    pct_thresh=Olfr_active_pct,
    odors=odors,
    DPG=control,
)

In [None]:
# Group and label Rhbdf2 expression groups
count_df = (
    adata.obs[['top_Olfr', 'odor', 'activated', 'iR2_counts', 'S100a5_counts']]
    .reset_index(drop=True)
    .rename(columns={'top_Olfr': 'Olfr', 'iR2_counts': 'Rhbdf2_counts'})
)

# "0" when the rounded value is zero, "1+" otherwise (missing values also land
# in "1+", as before). The comparison is numeric rather than on str(value), so a
# value that rounds to -0.0 is treated as zero instead of being labelled "1+".
count_df['Rhbdf2_present'] = np.where(count_df['Rhbdf2_counts'].round() == 0, "0", "1+")

In [None]:
# Quick plot to see how the other genes of interest correlate with Rhbdf2
# `control` is taken from the thresholding cell above.
# control = 'DPG2h'
plot_odor = ['ACE2h']
# manual_color = plot_utils.distinct_colors(['activated'], category='pastel')
manual_color = {'activated': '#FFBF00', 
                'inactive': '#ededed'}
plot_facet = ['Rhbdf2_counts', 'S100a5_counts']

# Make sure the export directory exists before write_html is called
_violin_dir = '../output/fig_image/Violin/tsukahara_active'
os.makedirs(_violin_dir, exist_ok=True)

for f in plot_facet:
    group = 'activated'
    plot_df = pd.melt(count_df, id_vars=['Olfr', group, 'odor'], value_vars=f)
    plot_df = plot_df.sort_values([group], ascending=[True])
    for _odor in plot_odor:
        fig = go.Figure()
        # Subset for data with query odor and control.
        # .copy() so the columns added below do not write into a slice of plot_df.
        subset = plot_df[plot_df['odor'].isin([_odor, control])].copy()
        # Categorical inactive/activated label for Olfr flagged active for the
        # query odor; the active-Olfr index is built once instead of once per row.
        active_flags = odor_dict[_odor]['active']
        active_olfrs = active_flags.index[active_flags == True]
        olfr_label = subset['Olfr'].isin(active_olfrs).map({False: 'inactive', True: 'activated'})
        subset[f"{_odor}_Olfr"] = pd.Categorical(olfr_label, ['inactive', 'activated'])
        subset['odor'] = pd.Categorical(subset['odor'], [control, _odor])
        subset = subset.sort_values(['odor', f"{_odor}_Olfr"], ascending=[True, True])

        # One violin per (odor, Olfr label, gene) group
        for group_key, data in subset.groupby(['odor', f"{_odor}_Olfr", 'variable']):
            fig.add_trace(go.Violin(y=data['value'], 
                                    name=f'{group_key[0]}_{group_key[1]}',  
                                    scalemode='count', 
                                    points = 'all', pointpos = 0
                                    ))
        fig.update_traces(meanline_visible=True)

        # The melted 'variable' column holds f, so the axis title is derived from f
        # directly instead of from the last group key left over by the loop.
        fig.update_layout(yaxis = {'title' : f'{f.split("_")[0]} expression'},
                          xaxis = {'title' : ''},
                          autosize=True,
                          template='simple_white',
                          margin=dict(l=50,r=50,b=50,t=80,pad=10),
                          font=dict(size=10), 
                          showlegend = False)
        
        # Adjust color: the part of the trace name after the first "_" is the Olfr label
        for trace in fig.data:
            trace['marker'] = {'color': manual_color[trace.name.split('_')[1]]}

        # NOTE(review): the figure is downsampled before the p-value annotation is
        # added — confirm whether the test is meant to run on the downsampled points.
        fig = plot_utils.downsample_fig(fig, sample_method='linspace', max_points = 1000)
        fig = plot_utils.add_p_value_annotation(fig, [[0,1], [2,3], [1,3]], 
                                                # y_padding=False,
                                                test_type = 'ranksums', include_tstat=True)
        fig = plot_utils.downsample_fig(fig, max_points_pct=0.1)

        fig.show()
        # fig.write_html(f'../output/Blobel_15045/tsukahara_active/{f.split("_")[0]}_{_odor}_violin.html')   
        fig.write_html(f'../output/fig_image/Violin/tsukahara_active/{f.split("_")[0]}_{_odor}_violin.html')   
#

In [None]:
control_odor = 'DPG2h'
plot_odor = ['ACE2h', 'OCT2h']
plot_facet = ['Rhbdf2_counts', 'S100a5_counts']
# Per odor: [colour for "other OR", colour for "<odor> OR"].
# Kept local to this cell: the violin cell above rebinds manual_color to
# 'activated'/'inactive' keys, so manual_color[o] raises KeyError here when the
# notebook is run top to bottom.
kde_palette = {'ACE2h': ['#D3D3D3', '#2ca02c'],
               'OCT2h': ['#D3D3D3', '#d62728']}

for f in plot_facet:
    # Mean of f per (Olfr, activated, odor) group; empty groups are dropped
    plot_df = count_df.groupby(['Olfr', 'activated', 'odor'])[f].mean().sort_values().reset_index().dropna()

    fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
    for ax, o in zip(axes, plot_odor):
        hue_col = f"{o}_Olfr"
        hue_names = ["other OR", f"{o} OR"]

        # Label each row by whether its Olfr is flagged active for this odor.
        # .copy() avoids writing into a slice of plot_df; the active-Olfr index is
        # built once instead of once per row.
        active_flags = odor_dict[o]['active']
        active_olfrs = active_flags.index[active_flags == True]
        subset = plot_df[plot_df.odor.isin([control_odor, o])].copy()
        subset[hue_col] = subset['Olfr'].isin(active_olfrs).map(dict(zip([False, True], hue_names)))

        # Control: thin dashed lines. Odor: thick solid lines.
        # hue_order pins palette[0] to "other OR" regardless of row order.
        line_styles = (
            (control_odor, dict(alpha=0.3, linewidth=3, linestyle='--')),
            (o, dict(alpha=0.6, linewidth=5)),
        )
        for condition, style in line_styles:
            sns.kdeplot(data=subset[subset.odor == condition],
                        ax=ax,
                        x=f,
                        hue=hue_col,
                        hue_order=hue_names,
                        common_norm=False,
                        fill=False,
                        palette=kde_palette[o],
                        legend=True,
                        **style)

        ax.grid(False)
        ax.set_title(o)
        ax.set_xlabel(f)
        ax.set_ylabel('Density')
    sns.despine()
    # fig.savefig(f'../output/Blobel_15045/tsukahara_active/{f}_kde.png', dpi=300)


In [None]:
# Boxen plot of iR2_counts: control vs odor on x, split by the is_active flag
fig, axes = plt.subplots(1, 2, figsize=(8, 6), sharey=True)
for ax, _odor in zip(axes, odors):
    names = ["other OR", f"{_odor} OR"]
    df_mean = odor_dict[_odor]["df"].reset_index()
    df_mean['active'] = df_mean.is_active.map({False: names[0], True: names[1]})
    sns.boxenplot(
        data=df_mean,
        x="odor",
        y="iR2_counts",
        hue="active",
        hue_order=names,
        order=(control, _odor),
        palette=["0.8", color_dict[_odor]],
        k_depth="proportion",
        ax=ax,
    )
    plot_utils.update_boxen(ax)
    ax.grid(False)
    ax.set_title(_odor)
    ax.set_xlabel(None)
    ax.set_ylabel(None)
    ax.legend(frameon=False)
sns.despine()
axes[0].set_ylabel("iR2 counts norm")
# fig.savefig('../output/Blobel_15045/tsukahara_active/Rhbdf2_boxen.png', dpi=300)


In [None]:
# Boxen plot of S100a5_counts: control vs odor on x, split by the is_active flag
fig, axes = plt.subplots(1, 2, figsize=(8, 6), sharey=True)
for ax, _odor in zip(axes, odors):
    names = ["other OR", f"{_odor} OR"]
    df_mean = odor_dict[_odor]["df"].reset_index()
    df_mean['active'] = df_mean.is_active.map({False: names[0], True: names[1]})
    sns.boxenplot(
        data=df_mean,
        x="odor",
        y="S100a5_counts",
        hue="active",
        hue_order=names,
        order=(control, _odor),
        palette=["0.8", color_dict[_odor]],
        k_depth="proportion",
        ax=ax,
    )
    plot_utils.update_boxen(ax)
    ax.grid(False)
    ax.set_title(_odor)
    ax.set_xlabel(None)
    ax.set_ylabel(None)
    ax.legend(frameon=False, loc='upper left')
sns.despine()
axes[0].set_ylabel("S100a5 counts norm")
# fig.savefig('../output/Blobel_15045/tsukahara_active/S100a5_boxen.png', dpi=300)


##### Pairwise euclidean distance active vs inactive

In [None]:
# Cell metadata with the two UMAP coordinates attached as umap_x / umap_y
umap_x, umap_y = zip(*adata.obsm['X_umap'])
umap_df = adata.obs.copy()
umap_df = umap_df.assign(umap_x=umap_x, umap_y=umap_y)

# umap_df.to_csv('../output/Blobel_15045/tsukahara_active/umap/umap_df.csv')

In [None]:
# NOTE(review): this reloads umap_df from a CSV whose export is commented out in
# the previous cell, so the file must already exist from an earlier session, and
# it replaces the umap_df just built in memory — confirm this is intended.
umap_df = pd.read_csv('../output/Blobel_15045/tsukahara_active/umap/umap_df.csv', index_col=0)


odors = ['ACE2h', 'OCT2h']
control = 'DPG2h'
# Keep only the cells whose odor is one of the two odors or the control
umap_df = umap_df[umap_df.odor.isin(odors + [control])]

In [None]:
# UMAP coloured by the activated flag, one facet per odor
plot_df = umap_df.sort_values('activated', ascending=False).copy()
fig = px.scatter(
    x=plot_df.umap_x,
    y=plot_df.umap_y,
    color=plot_df.activated,
    facet_col=plot_df.odor,
)
fig.update_traces(marker={'size': 3, 'opacity': 0.5})

# Traces in the 'False' legend group are drawn in light grey
for _trace in fig.data:
    if _trace['legendgroup'] == 'False':
        _trace.marker.color = '#D3D3D3'

# Naked styling: facet titles reduced to the odor name, axes hidden, transparent background
_x_range = [plot_df.umap_x.min(), plot_df.umap_x.max()]
_y_range = [plot_df.umap_y.min(), plot_df.umap_y.max()]
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_xaxes(range=_x_range, showticklabels=False)
fig.update_yaxes(range=_y_range, showticklabels=False)
fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis_visible=False,
    yaxis_visible=False,
)
fig.show()
# fig.write_html('../output/Blobel_15045/tsukahara_active/umap/ActiveOlfr_umap.html')

In [None]:
# UMAP coloured by ieg_z_scored (continuous viridis scale), one facet per odor
plot_df = umap_df.sort_values('activated', ascending=False).copy()
fig = px.scatter(
    x=plot_df.umap_x,
    y=plot_df.umap_y,
    color=plot_df.ieg_z_scored,
    facet_col=plot_df.odor,
    color_continuous_scale='viridis',
)
fig.update_traces(marker={'size': 3, 'opacity': 0.5})

# Naked styling: facet titles reduced to the odor name, axes hidden, transparent background
_x_range = [plot_df.umap_x.min(), plot_df.umap_x.max()]
_y_range = [plot_df.umap_y.min(), plot_df.umap_y.max()]
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_xaxes(range=_x_range, showticklabels=False)
fig.update_yaxes(range=_y_range, showticklabels=False)
fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis_visible=False,
    yaxis_visible=False,
)
fig.show()
# fig.write_html('../output/Blobel_15045/tsukahara_active/umap/ieg_umap.html')

In [None]:
import importlib

# Re-import the plotting helpers so edits to the module are picked up without restarting the kernel.
importlib.reload(plot_utils)


In [None]:
# Generate distance dict for odors
odors = ['ACE2h', 'OCT2h']
control = 'DPG2h'

# distances_dict[odor] = {'df': distance table from plot_utils, 'mean': its per-(top_Olfr, group) mean}
distances_dict = {}
for _odor in odors:
    label_col = f"{_odor}_Olfr"

    # Flag cells whose top Olfr is marked active for this odor.
    # .copy() avoids writing into a slice of umap_df; the active-Olfr index is
    # built once instead of once per row.
    active_flags = odor_dict[_odor]['active']
    active_olfrs = active_flags.index[active_flags == True]
    subset = umap_df[umap_df.odor.isin([_odor]+[control])].copy()
    subset[label_col] = subset['top_Olfr'].isin(active_olfrs)

    # Calculate the distances dataframe
    dist_df = plot_utils.umap_euclidean_distance(subset,
                                                 by=label_col,
                                                 between='odor')
    distances_dict[_odor] = {
        'df': dist_df,
        'mean': dist_df.groupby(['top_Olfr', 'group'], as_index=False).mean().sort_values('distance'),
    }
    

In [None]:
top_x = 10  # number of Olfr panels; the 2 x 5 grid below holds exactly ten
manual_color = {'ACE2h': ['#D3D3D3', '#2ca02c'], 
                'OCT2h': ['#D3D3D3', '#d62728']}


for _odor in odors: 
    # Olfr in the group == True rows, ordered by decreasing mean distance,
    # restricted to those present in umap_df, first top_x kept
    mean_dist = distances_dict[_odor]['mean']
    Olfr_to_plot = mean_dist[mean_dist.group == True].sort_values('distance', ascending=False).top_Olfr.unique()
    Olfr_to_plot = [i for i in Olfr_to_plot if i in umap_df.top_Olfr.values][0:top_x]
    # Fixed seed so the grey background cloud is the same on every run
    background_umap = umap_df.sample(frac=0.1, random_state=0)
    # Cells from the query odor and the control (same for every panel)
    odor_cells = umap_df[umap_df.odor.isin([_odor]+[control])]

    fig, axes = plt.subplots(2, 5, 
                            figsize=(15, 8), 
                            sharex=True, sharey=True)

    # The loop variable is named _olfr, not olfr: `olfr` is the imported
    # utils.osn.olfr module and must not be overwritten by a gene name.
    last_drawn_ax = None
    for ax, _olfr in zip_longest(axes.flatten(), Olfr_to_plot):
        ax.axis("off")
        if _olfr is None:
            continue
        # Plot background scatter
        ax.scatter(background_umap['umap_x'], background_umap['umap_y'], 
            marker='o', alpha=0.3, label=None, 
            s=10, c='#D3D3D3')
        # Plot main scatter with color, one marker shape per odor value
        for _o, shape in zip(odor_cells['odor'].unique(), ['o','x']):
            olfr_cells = odor_cells[(odor_cells['odor'] == _o) & 
                                    (odor_cells['top_Olfr'] == _olfr)]
            ax.scatter(olfr_cells['umap_x'], olfr_cells['umap_y'], 
                    marker=shape, label=_o,
                    s=30,
                    c=manual_color[_odor][1])
        ax.set_title(_olfr)
        ax.set_box_aspect(1)
        last_drawn_ax = ax

    # Legend on the last panel that actually holds data; the last grid cell is
    # empty whenever fewer than ten Olfr are plotted.
    if last_drawn_ax is not None:
        last_drawn_ax.legend(loc="best", frameon=False)
    fig.suptitle(f'{_odor} active Olfr pairwise distance')
    fig.subplots_adjust(wspace=0, hspace=0.1)
    # Show plot
    fig.tight_layout()

    fig.savefig(f'../output/Blobel_15045/tsukahara_active/umap/{_odor}_activeOlfr_distance.png', dpi=300) 

In [None]:
# KDE of pairwise distances per odor: shuffled rows (dashed black) vs the
# remaining rows split into odor Olfr / other Olfr
manual_color = {'ACE2h': ['#D3D3D3', '#2ca02c'], 
                'OCT2h': ['#D3D3D3', '#d62728']}


fig, axes = plt.subplots(2, 1, figsize=(6, 12), sharey=True)
for ax, _odor in zip(axes, odors):
    dist_df = distances_dict[_odor]['df']
    is_shuffled = dist_df.group.isin(['shuffled'])

    # Rows whose group is 'shuffled'
    sns.kdeplot(data=dist_df[is_shuffled].copy(), x='distance', 
                hue='group', 
                common_norm=False, 
                fill=False, 
                alpha=0.3, linewidth=3, 
                palette = ['#000000'],
                linestyle = '--', 
                ax=ax
    )

    # Remaining rows, labelled by the truthiness of `group`
    plot_df = dist_df[~is_shuffled].copy()
    active_name = f'{_odor} Olfr '
    plot_df['active_label'] = plot_df.group.apply(lambda x: active_name if x else 'other Olfr')
    sns.kdeplot(data=plot_df, x='distance', 
                hue='active_label', 
                # pins palette[0] (grey) to 'other Olfr' regardless of row order
                hue_order=['other Olfr', active_name],
                common_norm=False, 
                fill=False, 
                alpha=0.6, linewidth=5, 
                palette = manual_color[_odor],
                legend=True, 
                ax=ax
    )
    ax.set_xlabel('Pairwise euclidean distance')
    ax.set_ylabel(None)

axes[0].set_ylabel("Density")
sns.despine()

# Save before plt.show(): saving through pyplot after the figure has been shown
# writes an empty canvas.
fig.savefig('../output/Blobel_15045/tsukahara_active/active_distance/active_distance_kde.png', dpi=300)
plt.show()


In [None]:
# Euclidean distance box plots, one figure per odor

manual_color = {'ACE2h': ['#2ca02c'], 
                'OCT2h': ['#d62728']}

for _odor in odors: 
    def _label_group(x):
        """Map a `group` value to its box label for the current odor."""
        if x == True:
            return f'{_odor} Olfr'
        if x == 'shuffled':
            return 'shuffled'
        return 'other Olfr'

    plot_df = distances_dict[_odor]['df'].copy()
    plot_df['active_label'] = plot_df.group.apply(_label_group)
    plot_df = plot_df.sort_values('active_label')
    fig = px.box(plot_df, x="active_label", y="distance", color='active_label')

    # manually assign color: first trace gets the odor colour, the rest light grey
    for _idx, _trace in enumerate(fig.data):
        _trace['marker']['color'] = manual_color[_odor][_idx] if _idx == 0 else '#D3D3D3'

    fig = plot_utils.add_p_value_annotation(fig, [[0,1], [1,2], [0,2]], test_type='ranksums')

    fig.update_layout(
        yaxis={'title': 'Pairwise euclidean distance'},
        xaxis={'title': ''},
        autosize=True,
        template='simple_white',
        margin=dict(l=50, r=50, b=100, t=150, pad=10),
        font=dict(size=15),
    )

    fig.show()
    # fig.write_html(f"../output/Blobel_15045/tsukahara_active/active_distance/{_odor}_active_distance_grouped.html")


### Tsukahara All cells MOE

In [None]:
# Count matrix (transposed after reading) and the tab-separated metadata table
adata = anndata.read_csv('../../Chaperone_Analysis/files/Brann/GSE151346_MOE_all_counts.csv').T
meta = pd.read_csv('../../Chaperone_Analysis/files/Brann/GSE151346_MOE_metadata.tsv', index_col = 0 , sep = '\t')

# Combine metadata 
adata = adata[meta.index]
obs_with_meta = pd.merge(adata.obs, meta, left_index = True, right_index = True, how = 'left').dropna()
# dropna() may remove rows, so adata is subset to the surviving cells before the
# table is assigned (obs must have one row per cell); .copy() turns the view
# into a real object before it is modified.
adata = adata[obs_with_meta.index].copy()
adata.obs = obs_with_meta
raw_adata = adata.copy()


##### preprocessing... 

In [None]:
# Global scanpy settings for the cells below.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

results_file = '../files/Brann_2020/GSE151346_MOE_all_counts.h5ad'  # the file that will store the analysis results

In [None]:
# Basic filtering with min_genes=200 for cells and min_cells=3 for genes.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Flag genes whose name starts with 'mt-' (mitochondrial) and compute QC metrics with that flag
adata.var['mt'] = adata.var_names.str.startswith('mt-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# total_counts against each of the two QC metrics
for _qc_metric in ('pct_counts_mt', 'n_genes_by_counts'):
    sc.pl.scatter(adata, x='total_counts', y=_qc_metric)

In [None]:
# Normalisation, highly-variable-gene selection, regression and scaling.
# Visualize the graphs above to decide the cutoff; both QC filters below are
# currently disabled, so no cells are removed here.
# adata = adata[adata.obs.pct_counts_mt < 6, :]
# adata = adata[adata.obs.n_genes_by_counts < 2500, :]

# Scale each cell to a total of 1e4, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# Keep the normalised, log-transformed matrix with all genes in adata.raw.
adata.raw = adata

# From here on adata holds only the genes flagged in adata.var.highly_variable;
# adata.raw still holds every gene.
adata = adata[:, adata.var.highly_variable]
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)


In [None]:
# PCA, neighbourhood graph, Leiden clustering, PAGA and UMAP embedding.
sc.tl.pca(adata, svd_solver='arpack')

# Neighbourhood graph with 10 neighbours on 40 PCs, then Leiden at resolution 0.2 with a fixed seed.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.leiden(adata, resolution = 0.2, random_state=0)
sc.tl.paga(adata)
sc.pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph
sc.tl.umap(adata, init_pos='paga')
# NOTE(review): this second call recomputes UMAP without init_pos and presumably
# replaces the PAGA-initialised embedding computed on the previous line —
# confirm which of the two is intended and drop the other.
sc.tl.umap(adata)

# sc.tl.leiden(adata)

In [None]:
# UMAP coloured by Leiden cluster, drawn without a frame.
sc.pl.umap(adata, color=['leiden'], size = 5, frameon=False)

In [None]:
# UMAP coloured by each of the listed genes, one panel per gene, without a frame.
sc.pl.umap(adata, color=['Rhbdf1','Rhbdf2', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2'], 
           size = 5, 
           frameon=False, )

In [None]:
# Save the processed object to results_file (set in the settings cell above).
adata.write_h5ad(results_file)

#### Umap

In [None]:
results_file = '../files/Brann_2020/GSE151346_MOE_all_counts.h5ad'  # processed AnnData file read on the next line
adata = anndata.read_h5ad(results_file)
# Extract meta with umap coordinates 
umap_df = adata.obs.copy()
x, y = zip(*adata.obsm['X_umap'])
umap_df['umap_x'] = x
umap_df['umap_y'] = y 

# Genes of interest, restricted to those present in adata.raw
interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']
interested_genes = list(adata.raw.var_names[adata.raw.var_names.isin(interested_genes)])
# Add one "<gene>_counts" column per gene, read from adata.raw.X
for _gene in interested_genes:
    gene_values = adata.raw.X[:, adata.raw.var_names == _gene]
    # The slice is 2-D (one column) and may be a sparse matrix; convert it to a
    # flat dense vector before storing it as a DataFrame column.
    if hasattr(gene_values, 'toarray'):
        gene_values = gene_values.toarray()
    umap_df[f'{_gene}_counts'] = np.asarray(gene_values).ravel()

# Make sure the export directory exists before writing
os.makedirs('../output/MOE/umap', exist_ok=True)
umap_df.to_csv('../output/MOE/umap/umap_df.csv')

In [None]:
# Reload the table exported by the previous cell, using the first column as index.
umap_df = pd.read_csv('../output/MOE/umap/umap_df.csv', index_col = 0)


##### umap figs

In [None]:
"""
ALL cell MOE for fig
"""

# UMAP scatter coloured by leiden_name
plot_df = umap_df.copy()
fig = px.scatter(
    x=plot_df.umap_x,
    y=plot_df.umap_y,
    color=plot_df.leiden_name,
)
fig.update_traces(marker={'size': 12})

# One colour per leiden_name, applied to the trace carrying that name
manual_color = plot_utils.distinct_colors(umap_df.leiden_name.unique(), category='pastel', random_state=40)
for _trace in fig.data:
    _trace.marker.color = manual_color[_trace.name]

# Transparent background, hidden axes, 1:1 aspect between y and x
fig.update_layout(
    height=600,
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis_visible=False,
    yaxis_visible=False,
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.show()
# fig.write_html('../output/MOE/umap/MOE/umap_celltype.html')
# fig.write_html('../output/fig_image/umap/MOE/umap_celltype.html')

In [None]:
"""
Individual interested genes umap for Fig
"""

interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']

# Make sure the export directory exists before write_html is called
os.makedirs('../output/fig_image/umap/MOE', exist_ok=True)

for _gene in interested_genes:
    # Prefer the exact "<gene>_counts" column; fall back to the first column whose
    # name contains the gene. Genes without any matching column are skipped
    # instead of raising IndexError.
    _matches = ([_col for _col in umap_df.columns if _col == f'{_gene}_counts']
                or [_col for _col in umap_df.columns if _gene in _col])
    if not _matches:
        print(f'{_gene}: no matching column in umap_df, skipped')
        continue
    _gene_col = _matches[0]

    # Sorted ascending so the highest-expressing cells are drawn last
    plot_df = umap_df.sort_values(_gene_col).copy()
    fig = px.scatter(x = plot_df.umap_x, 
                     y = plot_df.umap_y, 
                     color = plot_df[_gene_col], 
                    #  color_continuous_scale=[(0, '#e1dce4'),(0.5, '#a64ca6'),  (1, '#800080')] # Purple
                     color_continuous_scale=[(0, '#DBE5EB'),(0.5, '#67879B'),  (1, '#073763')] # Dark Blue
                    )

    fig.update_traces(marker={'size': 2})
    fig.update_layout(
                    title = f'{_gene} expression',
                    #   width=500, 
                    height=600,
                    plot_bgcolor='rgba(0,0,0,0)',
                    xaxis_visible=False,  
                    yaxis_visible=False,   
    )
    fig.update_yaxes(
        scaleanchor = "x",
        scaleratio = 1,
    )
    fig.show()
    # fig.write_html(f'../output/MOE/umap/{_gene}_MOE.html')
    fig.write_html(f'../output/fig_image/umap/MOE/{_gene}_MOE.html')

In [None]:
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.cm import ScalarMappable
from matplotlib.ticker import ScalarFormatter



# Three-stop light-to-dark blue colormap (same stops as the plotly panels).
cmap_stops = [(0, '#DBE5EB'), (0.5, '#67879B'), (1, '#073763')]
custom_cmap = LinearSegmentedColormap.from_list('custom_cmap', cmap_stops)

# Genes to draw; the full 8-gene list is kept for reference.
# interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']
interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17']

# Fixed 4 x 2 grid of panels (sharex / sharey are not enabled).
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(12, 20))

# zip_longest pads the gene list with None so every spare axis is visited too.
for ax, gene in zip_longest(axes.flatten(), interested_genes):
    if not gene:
        # No gene left for this panel: blank it out.
        ax.axis("off")
        continue

    # First umap_df column whose name contains the gene symbol.
    gene_col = [col for col in umap_df.columns if gene in col][0]
    plot_df = umap_df.sort_values(gene_col).copy()

    expression = plot_df[gene_col]
    expr_min = expression.min()
    expr_median = expression.median()
    expr_max = expression.max()
    # To make the scale match iR2 / iR1, hard-code the upper bound instead (vmax=3).

    # UMAP scatter coloured by expression, colour range spanning the observed values.
    scatter = ax.scatter(plot_df['umap_x'], plot_df['umap_y'],
                         c=expression, cmap=custom_cmap, s=1,
                         vmin=expr_min, vmax=expr_max)
    ax.set_title(f'{gene} expression')
    ax.axis("off")
    ax.set_aspect('equal')

    # Stand-alone mappable with the same colormap and range, used only for the colorbar.
    sm = ScalarMappable(cmap=custom_cmap,
                        norm=plt.Normalize(vmin=expr_min, vmax=expr_max))
    sm.set_array([])

    # Colorbar ticks at min / median / max, labelled with the values rounded to integers.
    tick_values = [expr_min, expr_median, expr_max]
    cbar = plt.colorbar(sm, ax=ax, label=gene, ticks=tick_values, shrink=0.5)
    cbar.ax.set_yticklabels([f'{tick:.0f}' for tick in tick_values])
    # With the hard-coded scale the last label would be f'{3:.0f}'.

# Final figure-level layout: empty super-title, manual spacing, then tight_layout.
fig.suptitle("")
fig.subplots_adjust(wspace=0, hspace=0.1)
fig.tight_layout()

# plt.savefig('../output/fig_image/umap/MOE/umap_activitygenes.png')
# plt.savefig('../output/fig_image/umap/MOE/umap_activitygenes_2.png')


##### UMAP of GO terms

In [None]:
"""
Generates a dataframe of the cell types in which the n12 GO_terms are enriched
"""

# Cell-local imports so this cell also runs on a fresh kernel.
from ast import literal_eval
from collections import Counter


def _dense_column(matrix, col_idx):
    """Return column ``col_idx`` of ``matrix`` as a 1-D NumPy array.

    A dense ndarray slice is already 1-D and is returned unchanged in value.
    If the slice exposes ``toarray()`` (e.g. a SciPy sparse matrix, whose
    column slice stays 2-D) it is densified first, so ``np.percentile`` and
    in-place addition work for both storage types.
    """
    column = matrix[:, col_idx]
    if hasattr(column, 'toarray'):
        column = column.toarray()
    return np.asarray(column).ravel()


# Input AnnData file; it is only read in this cell, nothing is written back.
results_file = '../files/Brann_2020/GSE151346_MOE_all_counts.h5ad'
adata = anndata.read_h5ad(results_file)

go_df = pd.read_csv('../output/WTvKO_ALL/GO/GO_terms.csv', index_col=0)
# study_genes comes back from the CSV as text; entries containing "[" are parsed into Python lists.
go_df['study_genes'] = go_df['study_genes'].apply(lambda x: literal_eval(x) if "[" in x else x)
go_df = go_df.sort_values(['group','p_corr'], ascending=True).copy()

# First 4 rows of every group after the sort above, i.e. the 4 smallest p_corr per group.
plot_df = go_df.groupby('group').head(4).reset_index(drop=True)

# Look these up once instead of on every gene.
raw_X = adata.raw.X
raw_gene_index = adata.raw.var.index
cell_type_labels = adata.obs['leiden_name'].to_numpy()

# One dict per GO term; the table is built once at the end
# (DataFrame.append no longer exists in pandas >= 2.0).
result_records = []
# Per-cell sum of expression over each term's genes, one column per GO term.
aggregate_expression_df = pd.DataFrame(index=adata.obs.index)

for _row_idx, row in plot_df.iterrows():
    go_term = row['term']
    associated_genes = row['study_genes']
    # A value that was not parsed into a list is still a plain string; treat it as a
    # single gene symbol rather than iterating over its characters.
    if isinstance(associated_genes, str):
        associated_genes = [associated_genes]

    cell_type_counts = Counter()   # cell type -> number of high-expression hits for this term
    genes_in_adata = []            # genes of the term that exist in adata.raw
    expression_scores = np.zeros(len(adata.obs))

    for _gene in associated_genes:
        if _gene not in raw_gene_index:
            continue
        genes_in_adata.append(_gene)

        gene_expression = _dense_column(raw_X, raw_gene_index.get_loc(_gene))

        # Cells strictly above the gene's 90th percentile count as high expressers.
        expression_threshold = np.percentile(gene_expression, 90)
        high_expression_mask = gene_expression > expression_threshold
        cell_type_counts.update(cell_type_labels[high_expression_mask].tolist())

        # Accumulate the aggregated expression of the term.
        expression_scores += gene_expression

    # Three cell types with the most hits; ties keep first-seen order.
    most_abundant_cell_types = [cell_type for cell_type, _count in cell_type_counts.most_common(3)]

    aggregate_expression_df[go_term] = expression_scores

    result_records.append({'GO Term': go_term,
                           'Associated Genes': ', '.join(genes_in_adata),
                           'Most Abundant Cell Type': ', '.join(most_abundant_cell_types)})

result_df = pd.DataFrame(result_records,
                         columns=['GO Term', 'Associated Genes', 'Most Abundant Cell Type'])

    

In [None]:
# Display the per-GO-term summary table (GO Term / Associated Genes / Most Abundant Cell Type).
result_df

In [None]:
"""
GO_term celltype for fig
"""

# Aggregate GO-term scores plus UMAP coordinates and cell-type labels from umap_df
# (columns are aligned on the row index).
plot_df = aggregate_expression_df.assign(
    umap_x=umap_df['umap_x'],
    umap_y=umap_df['umap_y'],
    cell_type=umap_df['leiden_name'],
)

# Light-to-dark blue ramp; purple alternative:
# [(0, '#e1dce4'), (0.5, '#a64ca6'), (1, '#800080')]
go_colorscale = [(0, '#DBE5EB'), (0.5, '#67879B'), (1, '#073763')]

# Layout shared by every GO-term figure: fixed height, transparent background,
# hidden axes, y axis locked to the x axis scale (ratio 1).
go_layout = dict(
    # width=500,
    height=600,
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(visible=False),
    yaxis=dict(visible=False, scaleanchor='x', scaleratio=1),
)

for _term in aggregate_expression_df.columns:
    # Ascending sort puts the highest-scoring cells last in the trace data.
    plot_df = plot_df.sort_values(_term)

    fig = px.scatter(
        x=plot_df['umap_x'],
        y=plot_df['umap_y'],
        color=plot_df[_term],
        color_continuous_scale=go_colorscale,
    )
    fig.update_traces(marker_size=3)
    fig.update_layout(title=f'{_term}', **go_layout)
    fig.show()

    # Disabled exports.
    # NOTE(review): these paths are formatted with `_gene`, not the loop variable `_term`.
    # fig.write_html(f'../output/MOE/umap/{_gene}_MOE.html')
    # fig.write_html(f'../output/fig_image/umap/MOE/{_gene}_MOE.html')

In [None]:
"""
For Fig. Plots each GO term's aggregated expression and labels the cell types

"""

# Aggregate GO-term scores plus UMAP coordinates and cell-type labels from umap_df.
plot_df = aggregate_expression_df.copy()
plot_df['umap_x'] = umap_df['umap_x']
plot_df['umap_y'] = umap_df['umap_y']
plot_df['cell_type'] = umap_df['leiden_name']


# For every GO term: take the "positive" cells (aggregate score strictly above the term's
# 99th percentile) and record what percentage of those cells belongs to each cell type.
# Note this is each cell type's share of the positive cells, not the fraction of a
# cell type that is positive.
percentage_expressing_by_cell_type = pd.DataFrame(index=aggregate_expression_df.columns,
                                                  columns=adata.obs['leiden_name'].unique())
for go_term in aggregate_expression_df.columns:
    term_scores = aggregate_expression_df[go_term]
    positive_cells = term_scores > np.percentile(term_scores, 99)
    positive_cell_counts = adata.obs.loc[positive_cells, 'leiden_name'].value_counts()
    percentage_expressing_by_cell_type.loc[go_term] = positive_cell_counts / positive_cell_counts.sum() * 100
# Transpose so that rows are cell types and columns are GO terms.
percentage_expressing_by_cell_type = percentage_expressing_by_cell_type.T

for _term in aggregate_expression_df.columns:
    # Ascending sort puts the highest-scoring cells last in the trace data.
    plot_df = plot_df.sort_values(_term)

    fig = px.scatter(x=plot_df.umap_x,
                     y=plot_df.umap_y,
                     color=plot_df[_term],
                     color_continuous_scale=[(0, '#DBE5EB'),(0.5, '#67879B'),  (1, '#073763')]
                     )
    fig.update_traces(marker={'size': 4})

    # Label every cell type that contributes more than 10% of the term's positive cells.
    # The index holds each cell type once, so no de-duplication is needed;
    # missing (NaN) shares fail the comparison and are skipped.
    for cell_type, _cell_pct in percentage_expressing_by_cell_type[_term].items():
        if _cell_pct > 10:
            cell_type_cells = adata.obs['leiden_name'] == cell_type
            # plot_df was re-sorted above, so the mask is in a different row order.
            # .loc aligns it on the index labels; plot_df[mask] would emit pandas'
            # "Boolean Series key will be reindexed" warning for every label.
            # NOTE(review): x is min * 1.2, which moves the label outward only when the
            # minimum is negative — confirm this offset is intended.
            label_x = plot_df.loc[cell_type_cells, 'umap_x'].min() * 1.2
            label_y = plot_df.loc[cell_type_cells, 'umap_y'].mean()
            fig.add_annotation(x=label_x, y=label_y, text=cell_type, showarrow=False,
                               font=dict(size=20, color='black', family='Arial'))

    # Title, fixed height, transparent background, hidden axes.
    fig.update_layout(
        title=f'{_term} expression',
        height=600,
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis_visible=False,
        yaxis_visible=False,
    )
    # Lock the y scale to the x scale (ratio 1).
    fig.update_yaxes(
        scaleanchor="x",
        scaleratio=1,
    )
    fig.show()

In [None]:
# Leftover check: mean UMAP x of the rows selected by the `cell_type_cells` mask.
# NOTE(review): depends on `plot_df` and `cell_type_cells` left behind by the labelling loop
# in the previous cell; `cell_type_cells` is only assigned when a label was drawn, so this
# fails on a fresh kernel or when no cell type passed the 10% cut-off.
[plot_df[cell_type_cells].umap_x.mean()]

### ...