In [None]:
import os 
import numpy as np 
import pandas as pd 
import scanpy as sc
import anndata
import copy
import desc
import matplotlib.pyplot as pl
# following line ensures the pl plots inline 
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.metrics import mean_squared_error
from scipy import stats

import utils.DE_plotting_tools as plot_utils

In [None]:
# Development helper: re-import the local plotting module so that edits made to
# utils/DE_plotting_tools.py are picked up without restarting the kernel.
import importlib
importlib.reload(plot_utils)

The goal of this notebook is to investigate the expression of Rhbdf2 in olfactory sensory neurons (OSNs). <br>
More specifically, we ask whether Rhbdf2 expression is biased toward OSNs expressing particular olfactory receptors (ORs).

In [None]:
# Load the home-cage mature-OSN AnnData object.
adata = desc.read_h5ad('../../Chaperone_Analysis/files/Tsukahara/GSE173947_home_cage_matureOSN.h5ad')

# Per-cell top-expressed Olfr: keep the first CSV column (renamed to 'index') as the
# row key and join the 'top_Olfr' annotation onto adata.obs.
meta_mOSN = (
    pd.read_csv('../../Chaperone_Analysis/files/Tsukahara/GSE173947_home_cage_metadata.csv')
    [['Unnamed: 0', 'top_Olfr']]
    .rename(columns={'Unnamed: 0': 'index'})
    .set_index('index')
)
adata.obs = adata.obs.join(meta_mOSN)

In [None]:
# Overview UMAP coloured by Omp, Rhbdf2 and the leiden clustering.
sc.set_figure_params(facecolor='white', color_map='viridis')
overview_keys = ['Omp', 'Rhbdf2', 'leiden']
sc.pl.umap(adata, color=overview_keys, legend_loc='on data', size=30)

Since the expression of Rhbdf2 is sparse and not clustered within the mOSN population, <br> 
we separate the mOSN cells that express Rhbdf2 and look for genes that are differentially expressed between the Rhbdf2-expressing and non-expressing populations. 

In [None]:
# Per-cell metadata together with the two UMAP coordinates.
umap_df = adata.obs.copy()
umap_coords = np.asarray(adata.obsm['X_umap'])
umap_df['umap_x'] = umap_coords[:, 0]
umap_df['umap_y'] = umap_coords[:, 1]

# Genes of interest, restricted to those present in adata.raw (kept in var order).
candidate_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']
interested_genes = list(adata.raw.var_names[adata.raw.var_names.isin(candidate_genes)])

# One '<gene>_counts' column per gene, taken from the raw matrix.
for _gene in interested_genes:
    gene_mask = adata.raw.var_names == _gene
    umap_df[f'{_gene}_counts'] = adata.raw.X[:, gene_mask]

umap_df.to_csv('../output/mOSN_Rhbdf2/umap/umap_df.csv')

In [None]:
# Summarise Rhbdf2 expression per olfactory receptor (one row per 'top_Olfr'):
#   n_osn       - number of cells whose top Olfr is this receptor
#   Rhbdf2_sum  - total Rhbdf2 counts over those cells
#   Rhbdf2_mean - mean Rhbdf2 counts over those cells
# The table is sorted by Rhbdf2_sum (descending) to help pick a cutoff visually.
#
# Aggregations are requested by name ('sum' / 'mean') rather than by passing
# np.sum / np.mean, which pandas 2.x deprecates for groupby.agg, and the
# grouping is built once instead of three times.
rhbdf2_by_olfr = umap_df[['top_Olfr', 'Rhbdf2_counts']].groupby('top_Olfr')
Olfr_Rhbdf2_df = (
    pd.concat(
        [
            rhbdf2_by_olfr.size().rename('n_osn'),
            rhbdf2_by_olfr.agg(
                Rhbdf2_sum=('Rhbdf2_counts', 'sum'),
                Rhbdf2_mean=('Rhbdf2_counts', 'mean'),
            ),
        ],
        axis=1,
    )
    .sort_values('Rhbdf2_sum', ascending=False)
    .reset_index()
)

In [None]:
# Long-format table (one row per receptor and statistic) for plotting; receptors
# are ordered on the x axis by decreasing mean Rhbdf2 counts.
olfr_long = Olfr_Rhbdf2_df.melt(id_vars='top_Olfr')
olfr_by_mean = Olfr_Rhbdf2_df.sort_values('Rhbdf2_mean', ascending=False).top_Olfr

fig = px.scatter(
    olfr_long,
    x='top_Olfr',
    y='value',
    color='variable',
    title="Rhbdf2 expression by receptor",
    category_orders={"top_Olfr": olfr_by_mean},
)
fig.show()

# fig.write_html('./output/plots/Rfhbdf2_ORexpression2.html')

Since the Rhbdf2 expression counts seem to be correlated with the number of cells expressing each OR, we choose a cutoff of 20 Rhbdf2 counts for comparison against the other cells. 

In [None]:
# Attach the per-OR summary statistics (n_osn, Rhbdf2_sum, Rhbdf2_mean) to every
# cell in adata.obs, keyed by the cell's 'top_Olfr'.

# Guarded so the cell can be re-run: after the first run 'top_Olfr' is already the index.
if 'top_Olfr' in Olfr_Rhbdf2_df.columns:
    Olfr_Rhbdf2_df = Olfr_Rhbdf2_df.set_index('top_Olfr')

# Look up each cell's receptor by exact label. A substring / regex match
# (str.contains) would let e.g. 'Olfr1' also hit 'Olfr10' and 'Olfr123', so a cell
# would receive the statistics of whichever matching receptor was processed last.
# Cells with a missing or unknown 'top_Olfr' get NaN.
cell_olfr = adata.obs['top_Olfr'].astype(object)
for _stat in Olfr_Rhbdf2_df.columns:
    adata.obs[_stat] = cell_olfr.map(Olfr_Rhbdf2_df[_stat])
    
# Olfr_Rhbdf2_df.to_csv('./output/Rhbdf2_expression.csv')

In [None]:
adata.obs['Rhbdf2_counts'] = adata.raw.X[:, adata.raw.var_names == 'Rhbdf2']

# Per-cell label: cells with at least one Rhbdf2 count are "positive", the rest "negative".
# (The "Rhbfd2" spelling in the labels is kept as-is because it ends up in the exported tables.)
rhbdf2_absent = adata.obs['Rhbdf2_counts'] < 1
rhbdf2_detected = adata.obs['Rhbdf2_counts'] >= 1
adata.obs.loc[rhbdf2_absent, 'Rhbdf2_cell'] = "Rhbfd2_negative_cells"
adata.obs.loc[rhbdf2_detected, 'Rhbdf2_cell'] = "Rhbfd2_positive_cells"

# Per-receptor label: split cells by the summed Rhbdf2 counts of their OR (cutoff 1).
# Rows whose Rhbdf2_sum is missing match neither mask and stay unlabelled.
olfr_sum_below = adata.obs['Rhbdf2_sum'] < 1
olfr_sum_at_least = adata.obs['Rhbdf2_sum'] >= 1
adata.obs.loc[olfr_sum_below, 'Rhbdf2_Olfr'] = "Rhbfd2_sum_Olfr < 1"
adata.obs.loc[olfr_sum_at_least, 'Rhbdf2_Olfr'] = "Rhbfd2_sum_Olfr >= 1"

In [None]:
# Wilcoxon differential expression between Rhbdf2-positive and Rhbdf2-negative cells.
sc.tl.rank_genes_groups(adata, groupby='Rhbdf2_cell', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# Export gene names and p-values of the latest rank_genes_groups run,
# as '<group>_names' / '<group>_pvals' columns.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

rgg_columns = {}
for group in groups:
    for key in ('names', 'pvals'):
        rgg_columns[group + '_' + key] = result[key][group]

pd.DataFrame(rgg_columns).to_csv('../output/mOSN_Rhbdf2/rgg/Rhbdf2_byExpression.csv')

In [None]:
# Wilcoxon differential expression between the two receptor-level Rhbdf2 groups.
sc.tl.rank_genes_groups(adata, groupby='Rhbdf2_Olfr', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# Export gene names and p-values of the latest rank_genes_groups run,
# as '<group>_names' / '<group>_pvals' columns.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

rgg_columns = {}
for group in groups:
    for key in ('names', 'pvals'):
        rgg_columns[group + '_' + key] = result[key][group]

pd.DataFrame(rgg_columns).to_csv('../output/mOSN_Rhbdf2/rgg/Rhbdf2_byOlfr.csv')

Since S100a5 and Rhbdf2 seem to be negatively correlated with each other, we plot their expression counts in individual cells to visualize the relationship. 

In [None]:
# Select the genes of interest that are present in adata.raw. The resulting list
# follows the order in which the genes are stored in the raw matrix, so it lines
# up with the columns sliced out below.
candidate_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']
gene_mask = adata.raw.var_names.isin(candidate_genes)
interested_genes = list(adata.raw.var_names[gene_mask])

count_df = pd.DataFrame(
    adata.raw.X[:, gene_mask],
    columns=[f'{gene}_counts' for gene in interested_genes],
)
count_df.insert(0, 'Olfr', adata.obs['top_Olfr'].values)

# Save the per-cell counts, then read them back from disk.
count_df.to_csv('../output/mOSN_Rhbdf2/Interested_gene_counts.csv')
count_df = pd.read_csv('../output/mOSN_Rhbdf2/Interested_gene_counts.csv', index_col=0)

In [None]:
# Put 'Rhbdf2' first so that it is the first panel of the UMAP grid.
rhbdf2_pos = interested_genes.index('Rhbdf2')
interested_genes = [
    interested_genes[rhbdf2_pos],
    *interested_genes[:rhbdf2_pos],
    *interested_genes[rhbdf2_pos + 1:],
]

sc.set_figure_params(facecolor='white', color_map='viridis')
sc.pl.umap(adata, color=interested_genes, legend_loc='on data', size=30)

In [None]:
"""
for Fig. 
Rhbdf2 v S100a5 correlation 
"""
# Figure panel: distribution of S100a5 counts for each per-cell Rhbdf2 count
# (violin + overlaid points), with a linear trend line and Pearson correlation.

# Reload the per-cell table written earlier; the first CSV column is the saved
# index, so it is read as the index (as in the later UMAP section).
umap_df = pd.read_csv('../output/mOSN_Rhbdf2/umap/umap_df.csv', index_col=0)
plot_df = umap_df[['top_Olfr', 'Rhbdf2_counts', 'S100a5_counts']].copy()
# Only cells with fewer than 5 Rhbdf2 counts are plotted.
plot_df = plot_df[plot_df['Rhbdf2_counts'] < 5]

fig = go.Figure()
# Colour lookup indexed by Rhbdf2 count value below
# (presumably a mapping returned by the project helper - verify in plot_utils).
manual_color = plot_utils.continuous_colors(
    plot_df['Rhbdf2_counts'].sort_values().unique(),
    custom_color=[(0, '#B3C3CD'), (0.5, '#67879B'), (1, '#073763')],
)
for _rhbdf2 in plot_df['Rhbdf2_counts'].unique():
    subset = plot_df[plot_df['Rhbdf2_counts'] == _rhbdf2]
    # Violin outline of the S100a5 distribution for this Rhbdf2 count.
    fig.add_trace(
        go.Violin(
            x=subset['Rhbdf2_counts'],
            y=subset['S100a5_counts'],
            points=False,
            opacity=0.5,
            fillcolor=manual_color[_rhbdf2],
            meanline_visible=True,
        )
    )
    # Fully transparent box used only to draw every individual point.
    fig.add_trace(
        go.Box(
            x=subset['Rhbdf2_counts'],
            y=subset['S100a5_counts'],
            boxpoints='all',
            pointpos=0,
            opacity=0.5,
            line=dict(color='rgba(0,0,0,0)'),
            fillcolor='rgba(0,0,0,0)',
        )
    )
# fig = plot_utils.downsample_fig(fig, max_points = 500, sample_method='linspace')

# Colour the markers of every trace by the Rhbdf2 count it represents
# (all x values within a trace are identical, so the first one is used).
for i, _data in enumerate(fig.data):
    fig.data[i]['marker'] = dict(color=manual_color[_data['x'][0]])

# Pearson correlation and least-squares line over the plotted cells.
# `stats` is the scipy.stats module imported at the top of the notebook.
p_corr, p = stats.pearsonr(plot_df['Rhbdf2_counts'], plot_df['S100a5_counts'])
coefficients = np.polyfit(plot_df['Rhbdf2_counts'], plot_df['S100a5_counts'], 1)
trend_line = np.poly1d(coefficients)

# Draw the trend line between the smallest and largest Rhbdf2 count.
x_values = min(plot_df['Rhbdf2_counts']), max(plot_df['Rhbdf2_counts'])
y_values = trend_line(x_values)
fig.add_trace(
    go.Scatter(
        x=x_values,
        y=y_values,
        mode='lines',
        opacity=0.5,
        line=dict(color='black', width=5, dash='dot'),
        # The p-value is shown in scientific notation: with two fixed decimals a
        # small p-value would be displayed as "0.00".
        name=f'Trend Line (p_corr={p_corr:.2f})<br>p={p:.2e}',
    )
)

fig.update_layout(
    title='Rhbdf2 vs S100a5 expression',
    template='simple_white',
    xaxis_title='Rhbdf2 expression',
    yaxis_title='S100a5 expression',
)
fig.show()
# fig.write_html('../output/mOSN_Rhbdf2/Rhbdf2_S100a5_correlation.html')
fig.write_html('../output/fig_image/Violin/Rhbdf2_S100a5_correlation.html')

Bins Rhbdf2 counts into box groups. 

In [None]:
# Reload the per-cell counts and derive two grouping columns from Rhbdf2_counts:
#   Rhbdf2_box_groups - "0", "1", "2" or "3+" (integer part of the count)
#   Rhbdf2_present    - "Rhbdf2-" when the integer count is 0, otherwise "Rhbdf2+"
count_df = pd.read_csv('../output/mOSN_Rhbdf2/Interested_gene_counts.csv', index_col=0)

rhbdf2_int = count_df['Rhbdf2_counts'].astype(int)

# Bin numerically: every count of 3 or more goes to "3+". Matching the digits
# 3, 4 or 5 in the string form would leave counts such as 6-12 in bins of their own.
count_df['Rhbdf2_box_groups'] = rhbdf2_int.astype(str).where(rhbdf2_int < 3, "3+")

count_df['Rhbdf2_present'] = np.where(rhbdf2_int == 0, 'Rhbdf2-', 'Rhbdf2+')


In [None]:
"""
plot for Fig. 

activity genes between Rhbdf2+ Rhbdf2- cells
"""
# Builds one split-violin panel per activity gene (Rhbdf2- cells on the negative
# side, Rhbdf2+ cells on the positive side), overlays the individual points,
# annotates each panel with a rank-sum test result and copies all panels into a
# single row of subplots that is written to an HTML file.

interested_genes = ['S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']

# Count columns to plot: every '*_counts' column except Rhbdf2_counts, re-ordered
# to follow `interested_genes` (matched by substring, `_gene in _col`).
y_column = list(count_df.columns[count_df.columns.str.contains('_counts')].drop(['Rhbdf2_counts']))
y_column = [_col for _gene in interested_genes for _col in y_column if _gene in _col ]


# manual_color = plot_utils.continuous_colors(count_df['Rhbdf2_present'].unique(), 
#                                             custom_color=[(0, '#B3C3CD'), (1, '#073763')])
# Colour lookup indexed below by the group labels 'Rhbdf2-' / 'Rhbdf2+'
# (presumably a dict-like returned by the project helper - verify in plot_utils).
manual_color = plot_utils.distinct_colors(count_df['Rhbdf2_present'].unique(), 
                                            custom_color=['#B3C3CD', 
                                                          '#073763'])

# Create subplots
num_rows = 1
num_cols = len(y_column)
fig_combined = make_subplots(rows=num_rows, cols=num_cols)
# Annotations collected from the per-gene figures, re-anchored to each subplot's axes.
combined_annotations = []

for _col_i, _gene in enumerate(y_column, start=1): 
    # Scratch figure for this gene; its traces are copied into fig_combined below.
    fig = go.Figure()
    gene_stats = {}
    for _rhbdf2 in count_df['Rhbdf2_present'].unique(): 
        subset = count_df[(count_df['Rhbdf2_present'] == _rhbdf2)][[_gene, 'Rhbdf2_present']]
        
        # Rhbdf2- is drawn as the negative half of the split violin, Rhbdf2+ as the positive half.
        side = 'negative' if _rhbdf2 == 'Rhbdf2-' else 'positive'
        # group_str =  _gene.split('_')[0] + '_' + _rhbdf2
        # Both halves share the gene name as trace name.
        group_str =  _gene.split('_')[0]
        fig.add_trace(go.Violin(y = subset[_gene], 
                                name = group_str,
                                side=side, 
                                points=False, 
                                opacity =0.95, 
                                fillcolor = manual_color[_rhbdf2],
                                meanline_visible=True
                        )
                    )
        # Fully transparent box (line and fill) used only to draw all points,
        # offset towards the same side as the corresponding violin half.
        _pos = 0.5 if side == 'positive' else -0.5
        fig.add_trace(go.Box(y = subset[_gene], 
                                name = group_str,
                                boxpoints='all', pointpos=_pos,
                                opacity=0.5,
                                line = dict(color = 'rgba(0,0,0,0)'),
                                fillcolor = 'rgba(0,0,0,0)'
                        )
                    )
    # Rank-sum test between 1000 cells sampled from each group (fixed seed), stored in gene_stats.
    # NOTE(review): Series.sample(1000) draws without replacement, so this raises if
    # either group has fewer than 1000 cells - confirm group sizes.
    gene_stats[_gene] = stats.ranksums(
                count_df[(count_df['Rhbdf2_present'] == 'Rhbdf2-')][_gene].sample(1000, random_state = 0),
                count_df[(count_df['Rhbdf2_present'] == 'Rhbdf2+')][_gene].sample(1000, random_state = 0)
            )


    # Annotate the panel with the formatted p-value (index 1) and statistic (index 0)
    # of the rank-sum result; the exact rendering is defined by the plot_utils helpers.
    fig = plot_utils.add_p_value_annotation(fig, [[0,0]], 
                                            just_annotate=[plot_utils.format_pvalue(gene_stats[_gene][1], 
                                                                                    t=gene_stats[_gene][0])])

    fig = plot_utils.downsample_fig(fig, max_points = 1000)
    # Traces were added as alternating Violin (even index) / Box (odd index);
    # this assumes downsample_fig preserves that order - TODO confirm.
    for i, _data in enumerate(fig.data):
        if i % 2 == 0: 
            # Violin: markers in the group's own colour, mean line in the other group's colour.
            fig.data[i]['marker'] = dict(color = manual_color['Rhbdf2-'] if _data['side'] == 'negative' else manual_color['Rhbdf2+'])
            fig.data[i]['meanline'] = dict(color = manual_color['Rhbdf2+'] if _data['side'] == 'negative' else manual_color['Rhbdf2-'])
        else: 
            # Box: the group is recovered from the sign of the point offset set above.
            fig.data[i]['marker'] = dict(color = manual_color['Rhbdf2-'] if _data['pointpos'] < 0 else manual_color['Rhbdf2+'], opacity=0.3)
    
    # Transfer annotations to combined_annotations list, anchored to this subplot's
    # axes; y is placed at 1.3x the largest y value among this figure's traces.
    combined_annotations.extend([dict(xref=f'x{_col_i}', yref=f'y{_col_i}',
                                      text=ann.text, 
                                      showarrow=ann.showarrow, arrowhead=ann.arrowhead, 
                                      x=ann.x, y= 1.3* max([max(_d.y) for _d in fig.data])) for ann in fig.layout.annotations])
    
    # Add traces (legend entries hidden) to column _col_i of the combined figure
    for f in range(len(fig.data)):
        fig.data[f].update(showlegend=False)
        fig_combined.add_trace(fig.data[f], row=1, col=_col_i)
        
# Add transferred annotations to the combined figure
for ann in combined_annotations:
    fig_combined.add_annotation(**ann)
    
fig_combined.update_layout(
                title = f'Activity genes in iRhom2 present / negative cells',
                template='simple_white',
                yaxis_title = 'Activity gene expression',
                # margin=dict(l=50,r=50,b=100,t=100,pad=10)
)
fig_combined.show()
# fig_combined.write_html('../output/mOSN_Rhbdf2/Activitygenes_Rhbdf2Present.html')
fig_combined.write_html('../output/fig_image/Violin/Activitygenes_Rhbdf2Present_2.html')

In [None]:
"""
plot for Fig. 

activity genes between Rhbdf2+ Rhbdf2- cells
"""
# Single-figure variant: all activity genes share one axis, each drawn as a split
# violin (Rhbdf2- on the negative side, Rhbdf2+ on the positive side) with the
# individual points overlaid and a rank-sum annotation per gene.

interested_genes = ['S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']

# Count columns to plot: every '*_counts' column except Rhbdf2_counts, re-ordered
# to follow `interested_genes` (matched by substring).
y_column = list(count_df.columns[count_df.columns.str.contains('_counts')].drop(['Rhbdf2_counts']))
y_column = [_col for _gene in interested_genes for _col in y_column if _gene in _col]


fig = go.Figure()
# Colour lookup indexed below by the group labels 'Rhbdf2-' / 'Rhbdf2+'
# (presumably a dict-like returned by the project helper - verify in plot_utils).
manual_color = plot_utils.continuous_colors(count_df['Rhbdf2_present'].unique(),
                                            custom_color=[(0, '#B3C3CD'), (1, '#073763')])

gene_stats = {}
for _gene in y_column:
    for _rhbdf2 in count_df['Rhbdf2_present'].unique():
        subset = count_df[(count_df['Rhbdf2_present'] == _rhbdf2)][[_gene, 'Rhbdf2_present']]

        side = 'negative' if _rhbdf2 == 'Rhbdf2-' else 'positive'
        # Both halves share the gene name as trace name.
        group_str = _gene.split('_')[0]
        fig.add_trace(go.Violin(y=subset[_gene],
                                name=group_str,
                                side=side,
                                points=False, opacity=0.8,
                                fillcolor=manual_color[_rhbdf2]
                                )
                      )
        # Fully transparent box used only to draw all points, offset towards
        # the same side as the corresponding violin half.
        _pos = 0.5 if side == 'positive' else -0.5
        fig.add_trace(go.Box(y=subset[_gene],
                             name=group_str,
                             boxpoints='all', pointpos=_pos,
                             opacity=0.5,
                             line=dict(color='rgba(0,0,0,0)'),
                             fillcolor='rgba(0,0,0,0)'
                             )
                      )
    # Rank-sum test between 1000 cells sampled from each group (fixed seed).
    # NOTE(review): sample(1000) draws without replacement and raises if a group
    # has fewer than 1000 cells - confirm group sizes.
    gene_stats[_gene] = stats.ranksums(
                count_df[(count_df['Rhbdf2_present'] == 'Rhbdf2+')][_gene].sample(1000, random_state=0),
                count_df[(count_df['Rhbdf2_present'] == 'Rhbdf2-')][_gene].sample(1000, random_state=0)
            )

# Annotate each gene with its formatted p-value (index 1) and statistic (index 0);
# the exact rendering is defined by the plot_utils helpers.
for i, stat in enumerate(gene_stats):
    fig = plot_utils.add_p_value_annotation(fig, [[i, i]],
                                            just_annotate=[plot_utils.format_pvalue(gene_stats[stat][1],
                                                                                    t=gene_stats[stat][0])])

fig = plot_utils.downsample_fig(fig, max_points=1000)
# Traces were added as alternating Violin (even index) / Box (odd index);
# this assumes downsample_fig preserves that order - TODO confirm.
for i, _data in enumerate(fig.data):
    if i % 2 == 0:
        # Compare the whole 'side' value. Indexing it with [0] yields a single
        # character, which never equals 'negative', so every violin would be
        # given the Rhbdf2+ colour.
        fig.data[i]['marker'] = dict(color=manual_color['Rhbdf2-'] if _data['side'] == 'negative' else manual_color['Rhbdf2+'])
    else:
        # Box: the group is recovered from the sign of the point offset set above.
        fig.data[i]['marker'] = dict(color=manual_color['Rhbdf2-'] if _data['pointpos'] < 0 else manual_color['Rhbdf2+'])


fig.update_layout(
                title='Activity genes in Rhbdf2 present / negative cells',
                template='simple_white',
                yaxis_title='Activity gene expression',
                margin=dict(l=50, r=50, b=100, t=200, pad=10)
)
fig.show()
# fig.write_html('../output/mOSN_Rhbdf2/Activitygenes_Rhbdf2Present.html')
# fig.write_html('../output/fig_image/Violin/Activitygenes_Rhbdf2Present.html')

In [None]:
# DEPRECATED

#  # Quick plot to see how the other interested genes correslate with Rhbdf2
# y_column = list(count_df.columns[count_df.columns.str.contains('_counts')].drop(['Rhbdf2_counts']))
# plot_df = pd.melt(count_df, id_vars=['Olfr','Rhbdf2_box_groups'], value_vars=y_column)

# fig = px.box(plot_df, 
#              x = 'variable', 
#              y = 'value', 
#              color = 'Rhbdf2_box_groups',
#              notched = True,
# #              points='all',
#              title = "Relative Expression of genes to Rhbdf2"
#             )
# fig.update_layout(
#         margin=dict(
#             l=50, r=50, b=100, t=100,pad=10
#         ),
#         template='plotly_white'
#     )
# fig.show()
# # fig.write_html('./output/plots/Rhbdf2_grouped_box.html')

#### umap 

In [None]:
# Reload the per-cell UMAP / counts table; the first CSV column holds the saved index.
umap_df = pd.read_csv(
    '../output/mOSN_Rhbdf2/umap/umap_df.csv',
    index_col=0,
)

In [None]:
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.cm import ScalarMappable
from matplotlib.ticker import ScalarFormatter
import matplotlib.pyplot as plt
from itertools import zip_longest

# Grid of UMAP scatter plots, one panel per gene, each coloured by that gene's
# counts with its own colour scale (min / median / max marked on the colour bar).

# custom_cmap = LinearSegmentedColormap.from_list('custom_cmap', [(0, '#DBE5EB'),
#                                                                 # (0.5, '#67879B'),  
#                                                                 (1, '#073763')])
custom_cmap = 'viridis'

interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5', 'Dlg2', 'Lrrc3b', 'Pcp4l1', 'Kirrel2']
# interested_genes = ['Rhbdf1', 'Rhbdf2', 'Adam17', 'S100a5']

fig, axes = plt.subplots(4, 2, figsize=(12, 15))
for ax, gene in zip_longest(axes.flatten(), interested_genes):
    # Exact column name instead of a substring search: a gene without a
    # '<gene>_counts' column (or a spare axis with no gene) yields a blank
    # panel rather than an IndexError.
    gene_col = f'{gene}_counts'
    if gene is None or gene_col not in umap_df.columns:
        ax.axis("off")
        continue

    # Sort so that the highest-expressing cells are drawn last (on top).
    plot_df = umap_df.sort_values(gene_col)
    expr = plot_df[gene_col]
    expr_min, expr_median, expr_max = expr.min(), expr.median(), expr.max()

    ax.scatter(plot_df['umap_x'], plot_df['umap_y'],
               c=expr,
               cmap=custom_cmap, s=5,
               vmin=expr_min,
               vmax=expr_max)
               # vmax=3)  # Hard define so the count on scale matches iR2, iR1
    ax.set_title(f'{gene} expression')
    ax.axis("off")
    ax.set_aspect('equal')

    # Colour bar built from a mappable with the same colormap and value range
    # as the scatter, labelled with the actual min / median / max counts.
    sm = ScalarMappable(cmap=custom_cmap,
                        norm=plt.Normalize(vmin=expr_min, vmax=expr_max))
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax, label=gene,
                        ticks=[expr_min, expr_median, expr_max],
                        shrink=0.5)
    cbar.ax.set_yticklabels([f'{expr_min:.0f}',
                             f'{expr_median:.0f}',
                             f'{expr_max:.0f}'])
                            # f'{3:.0f}']) # Hard define so the count on scale matches iR2, iR1

# tight_layout determines the final spacing between panels.
fig.tight_layout()

# plt.savefig('../output/mOSN_Rhbdf2/umap/umap_activitygenes_2.png')
# plt.savefig('../output/fig_image/umap/mOSN_Rhbdf2/umap_activitygenes_2.png')

#### Deprecated

Transforms count dataframe into log data frame and save. 

In [None]:
# DEPRECATED

# # Log transform counts

# log1p_df = pd.DataFrame(count_df[['Olfr', 'Rhbdf2_box_groups', 'Rhbdf2_present']])

# for i in list(count_df.columns[count_df.columns.str.contains('_counts')].drop(['Rhbdf2_counts'])):
#     log1p_df[i.split('_')[0]+'_log1p'] = np.log1p(count_df[i])
# # Replace -inf values to 0
# # count_df.loc[count_df['log(S100a5_counts)'] == -np.inf,'log(S100a5_counts)'] = 0

# # Save count_df 
# # log1p_df.to_csv('./Interested_gene_log1p.csv')

In [None]:
# DEPRECATED


#  # Quick plot to see how the other interested genes correslate with Rhbdf2
# y_column = list(log1p_df.columns[log1p_df.columns.str.contains('_log1p')])
# plot_df = pd.melt(log1p_df, id_vars=['Olfr','Rhbdf2_present'], value_vars=y_column)

# fig = px.box(plot_df, 
#              x = 'variable', 
#              y = 'value', 
#              color = 'Rhbdf2_present',
#              notched = True,
# #              points='all',
#              animation_group='variable',
# #              boxmode='group',
#              title = "log1p Expression of genes to presence of Rhbdf2"
#             )
# fig.update_layout(
#         margin=dict(
#             l=50, r=50, b=100, t=100,pad=10
#         ),
#         template='plotly_white'
#     )
# fig.show()
# fig.write_html('./output/plots/Rhbdf2_log1p_box.html')

In [None]:
 # Quick plot to see how the other interested genes correslate with Rhbdf2
# Build the log1p-transformed table used below. Its only definition elsewhere in
# the notebook is commented out, so on a fresh kernel this cell would otherwise
# fail with a NameError. One '<gene>_log1p' column is created for every
# '*_counts' column except Rhbdf2_counts.
log1p_df = count_df[['Olfr', 'Rhbdf2_box_groups', 'Rhbdf2_present']].copy()
for _count_col in count_df.columns[count_df.columns.str.contains('_counts')].drop(['Rhbdf2_counts']):
    log1p_df[_count_col.split('_')[0] + '_log1p'] = np.log1p(count_df[_count_col])

# Long format: one row per cell and gene, grouped by the Rhbdf2 count bin.
y_column = list(log1p_df.columns[log1p_df.columns.str.contains('_log1p')])
plot_df = pd.melt(log1p_df, id_vars=['Olfr','Rhbdf2_box_groups'], value_vars=y_column)

fig = px.box(plot_df, 
             x = 'variable', 
             y = 'value', 
             color = 'Rhbdf2_box_groups',
             notched = True,
#              points='all',
             animation_group='variable',
#              boxmode='group',
             title = "log1p Expression of genes to Rhbdf2"
            )
fig.update_layout(
        margin=dict(
            l=50, r=50, b=100, t=100,pad=10
        ),
        template='plotly_white'
    )
fig.show()
# Make sure the target directory exists before writing the figure.
os.makedirs('./output/plots', exist_ok=True)
fig.write_html('./output/plots/Rhbdf2_grouped_log1p_box.html')

In [None]:
# DEPRECATED

# for i in list(log1p_df.columns[log1p_df.columns.str.contains('_log1p')]):
#     fig = px.box(log1p_df, 
#                  x = 'Rhbdf2_box_groups', 
#                  y = i, 
#                  color = 'Rhbdf2_box_groups',
#                  notched = True,
#                  title = "log1p Expression of Rhbdf2 and " + i.split('_')[0]
#                 )
#     fig = add_p_value_annotation(fig, [[0,1], [0,2], [0,3]])
# #     fig = add_p_value_annotation(fig, [[0,1]])
#     fig.update_layout(
#         autosize=False,
#         width=500,
#         height=400,
#         margin=dict(
#             l=50,
#             r=50,
#             b=100,
#             t=100,
#             pad=10
#         ),
#     #     paper_bgcolor='rgba(0,0,0,0)',
#     #     plot_bgcolor='rgba(0,0,0,0)',
#         template='plotly_white'
#     )
#     fig.show()
#     fig.write_html('./output/plots/Individual_box/Rhbdf2_grouped_' + i.split('_')[0] + '.html')

In [None]:
# Pairwise two-sided Mann-Whitney U tests of log1p(S100a5 counts) between the
# Rhbdf2 box groups; pairs with p < 0.05 are collected together with their p-value.
# NOTE(review): the p-values are not corrected for multiple comparisons.

import itertools

box_groups = count_df['Rhbdf2_box_groups'].unique()

# The log1p values are computed here from S100a5_counts: count_df has no
# 'log1p(S100a5_counts)' column, so looking one up raises a KeyError.
s100a5_log1p = np.log1p(count_df['S100a5_counts'])

# Initialise a list of combinations of groups that are significantly different
significant_combinations = []
for combination in itertools.combinations(box_groups, 2):
    data1 = s100a5_log1p[count_df['Rhbdf2_box_groups'] == combination[0]]
    data2 = s100a5_log1p[count_df['Rhbdf2_box_groups'] == combination[1]]
    # Significance
    U, p = stats.mannwhitneyu(data1, data2, alternative='two-sided')
    if p < 0.05:
        significant_combinations.append([combination, p])

significant_combinations