In [None]:
import os 
import numpy as np 
import pandas as pd 
import scanpy as sc
import anndata
import copy
import desc
import matplotlib.pyplot as pl
# following line ensures the pl plots inline 
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import mean_squared_error
from scipy import stats

The goal of this notebook is to investigate the expression of Rhbdf2 in OSNs. <br>
More specifically, we ask whether Rhbdf2 expression is biased across cells expressing different ORs.

In [None]:
# Load the mature-OSN AnnData object (h5ad file) used throughout this notebook.
mature_osn_h5ad = '../Chaperone_Analysis/files/Tsukahara/GSE173947_home_cage_matureOSN.h5ad'
adata = desc.read_h5ad(mature_osn_h5ad)

In [None]:
# Build a metadata table holding the top-expressed Olfr of each cell and attach it
# to adata.obs. The CSV's unnamed first column becomes the index, which is what
# the join below matches against adata.obs' own index.
metadata_csv = '../Chaperone_Analysis/files/Tsukahara/GSE173947_home_cage_metadata.csv'
meta_mOSN = (
    pd.read_csv(metadata_csv)[['Unnamed: 0', 'top_Olfr']]
    .rename(columns={'Unnamed: 0': 'index'})
    .set_index('index')
)
# Drop a 'top_Olfr' column left over from a previous execution of this cell so
# that re-running it does not fail on overlapping column names in join().
adata.obs = adata.obs.drop(columns=['top_Olfr'], errors='ignore').join(meta_mOSN)

In [None]:
# UMAP overview, one panel per colour key.
sc.set_figure_params(facecolor='white', color_map='viridis')
umap_color_keys = ['Omp', 'Rhbdf2', 'leiden']
sc.pl.umap(adata, color=umap_color_keys, legend_loc='on data', size=30)

The expression of Rhbdf2 is sparse and not clustered within the mOSN population. <br>
We therefore separate the mOSN cells by Rhbdf2 expression and look for genes that are differentially expressed between the Rhbdf2-expressing and non-expressing populations.

In [None]:
# Store the per-cell Rhbdf2 value from the raw matrix as a flat obs column.
rhbdf2_mask = adata.raw.var_names.isin(['Rhbdf2'])
if rhbdf2_mask.sum() != 1:
    # Fail with a clear message instead of a shape error from the assignment below.
    raise KeyError(
        "expected exactly one 'Rhbdf2' entry in adata.raw.var_names, found %d"
        % rhbdf2_mask.sum()
    )
rhbdf2_values = adata.raw.X[:, rhbdf2_mask]
# NOTE(review): adata.raw.X may be a scipy sparse matrix depending on how the
# h5ad was written (not visible here); densify before flattening if so.
if hasattr(rhbdf2_values, 'toarray'):
    rhbdf2_values = rhbdf2_values.toarray()
# The slice has shape (n_cells, 1); an obs column should be one-dimensional.
adata.obs['Rhbdf2_counts'] = np.asarray(rhbdf2_values).ravel()

In [None]:
# Per-receptor summary used to decide where to put a cutoff:
#   n_osn       - number of cells whose top Olfr is this receptor
#   Rhbdf2_sum  - summed Rhbdf2 value over those cells
#   Rhbdf2_mean - mean Rhbdf2 value over those cells
# A single groupby with named aggregation replaces three separate passes; the
# string reducers select pandas' own implementations and avoid the warning that
# newer pandas versions emit when NumPy callables are passed to agg().
Olfr_Rhbdf2_df = (
    adata.obs[['top_Olfr', 'Rhbdf2_counts']]
    .groupby('top_Olfr')
    .agg(
        n_osn=('Rhbdf2_counts', 'size'),
        Rhbdf2_sum=('Rhbdf2_counts', 'sum'),
        Rhbdf2_mean=('Rhbdf2_counts', 'mean'),
    )
    .sort_values('Rhbdf2_sum', ascending=False)
    # Bring 'top_Olfr' back as a regular column for melting/plotting.
    .reset_index()
)

In [None]:
# Long format: one row per (receptor, summary statistic), so that every
# statistic in the summary table is drawn as its own colour on a shared y axis.
olfr_stats_long = Olfr_Rhbdf2_df.melt(id_vars='top_Olfr')
# Receptors are ordered along the x axis by decreasing mean Rhbdf2.
olfr_by_mean = Olfr_Rhbdf2_df.sort_values('Rhbdf2_mean', ascending=False).top_Olfr

fig = px.scatter(
    olfr_stats_long,
    x='top_Olfr',
    y='value',
    color='variable',
    title="Rhbdf2 expression by receptor",
    category_orders={"top_Olfr": olfr_by_mean},
)
fig.show()

# Optional export:
# fig.write_html('./output/plots/Rfhbdf2_ORexpression2.html')

Since the Rhbdf2 expression counts seem to be correlated with the number of cells expressing each OR, we'll choose a cutoff of 20 counts of Rhbdf2 expression for comparison against the other cells. 

In [None]:
# Add the per-receptor Rhbdf2 summary to the main adata.obs table.
# Index the summary by receptor; the guard keeps this cell re-runnable once the
# index has already been set.
if 'top_Olfr' in Olfr_Rhbdf2_df.columns:
    Olfr_Rhbdf2_df = Olfr_Rhbdf2_df.set_index('top_Olfr')

# Look each cell's receptor up by exact name. The previous per-receptor loop used
# str.contains(), which is a regex substring match: 'Olfr1' also matched
# 'Olfr10', 'Olfr100', ..., so cells could end up carrying another receptor's
# summary. Cells with a missing top_Olfr simply get NaN here.
summary_columns = list(Olfr_Rhbdf2_df.columns)
adata.obs = (
    adata.obs
    # Remove summary columns from an earlier run so join() sees no overlap.
    .drop(columns=summary_columns, errors='ignore')
    .join(Olfr_Rhbdf2_df, on='top_Olfr')
)

# Optional export:
# Olfr_Rhbdf2_df.to_csv('./output/Rhbdf2_expression.csv')

In [None]:
# Show the summary rows whose receptor name matches any of these patterns
# (regex alternation, matched as a substring of the index labels).
receptor_pattern = 'Olfr910|Olfr912|Olfr1295'
Olfr_Rhbdf2_df.iloc[Olfr_Rhbdf2_df.index.str.contains(receptor_pattern)]

In [None]:
# Per-cell label: cells with an Rhbdf2 value of at least 1 are "positive", the
# rest "negative". Rows matching neither comparison (e.g. NaN) stay unlabelled.
# NOTE(review): the labels spell the gene "Rhbfd2" (sic). They are left as-is
# because they become the group names used by the ranking/export cells.
cell_level_counts = adata.obs['Rhbdf2_counts']
adata.obs.loc[cell_level_counts < 1, 'Rhbdf2_cell'] = "Rhbfd2_negative_cells"
adata.obs.loc[cell_level_counts >= 1, 'Rhbdf2_cell'] = "Rhbfd2_positive_cells"

# Per-receptor label: same threshold, applied to the Rhbdf2 sum over all cells
# that share the cell's top Olfr.
receptor_level_sum = adata.obs['Rhbdf2_sum']
adata.obs.loc[receptor_level_sum < 1, 'Rhbdf2_Olfr'] = "Rhbfd2_sum_Olfr < 1"
adata.obs.loc[receptor_level_sum >= 1, 'Rhbdf2_Olfr'] = "Rhbfd2_sum_Olfr >= 1"

In [None]:
# Rank genes between the per-cell Rhbdf2 groups (Wilcoxon) and plot the top 25
# genes of each group, each panel with its own y axis.
sc.tl.rank_genes_groups(adata, groupby='Rhbdf2_cell', method='wilcoxon')
sc.pl.rank_genes_groups(adata, sharey=False, n_genes=25)

In [None]:
# Export the ranking: for every group, one column of gene names and one column
# of p-values, named '<group>_names' and '<group>_pvals'.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

ranked_columns = {}
for group in groups:
    for key in ('names', 'pvals'):
        ranked_columns[group + '_' + key] = result[key][group]

pd.DataFrame(ranked_columns).to_csv('./Rhbdf2_byExpression.csv')

In [None]:
# Rank genes between the per-receptor Rhbdf2 groups (Wilcoxon) and plot the top
# 25 genes of each group, each panel with its own y axis.
sc.tl.rank_genes_groups(adata, groupby='Rhbdf2_Olfr', method='wilcoxon')
sc.pl.rank_genes_groups(adata, sharey=False, n_genes=25)

In [None]:
# Export the ranking: for every group, one column of gene names and one column
# of p-values, named '<group>_names' and '<group>_pvals'.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

ranked_columns = {}
for group in groups:
    for key in ('names', 'pvals'):
        ranked_columns[group + '_' + key] = result[key][group]

pd.DataFrame(ranked_columns).to_csv('./Rhbdf2_byOlfr.csv')

Since S100a5 and Rhbdf2 seem to be negatively correlated with each other, we plot their expression counts for individual cells to visualize this. 

In [None]:
# Collect the per-cell raw values of the genes of interest that are present in
# adata.raw. Re-deriving the gene list from var_names puts it in the same order
# as the matrix columns, so the column labels line up with the sliced data.
interested_genes = ['Rhbdf2', 'S100a5', 'Lrrc3b', 'Kirrel2','Hbegf', 'Areg'
                   , 'Adam17', 'Tgfa', 'Epgn', 'Cx3cl1', 'Bcl2']
gene_mask = adata.raw.var_names.isin(interested_genes)
interested_genes = list(adata.raw.var_names[gene_mask])

gene_values = adata.raw.X[:, gene_mask]
# NOTE(review): adata.raw.X may be a scipy sparse matrix (not visible here);
# pd.DataFrame needs a dense array, so densify if required.
if hasattr(gene_values, 'toarray'):
    gene_values = gene_values.toarray()

count_df = pd.DataFrame(gene_values,
            columns=[i+"_counts" for i in interested_genes])
count_df.insert(0, 'Olfr', adata.obs['top_Olfr'].values)

# Save the table. Create the target directory first so the export does not
# depend on './output' already existing (e.g. on a fresh checkout).
os.makedirs('./output', exist_ok=True)
count_df.to_csv('./output/Interested_gene_counts.csv')
# Reload from disk so that later cells work from exactly what was saved.
count_df = pd.read_csv('./output/Interested_gene_counts.csv', index_col=0)

In [None]:
# Move 'Rhbdf2' to the front of the list so that its panel is plotted first.
rhbdf2_position = interested_genes.index('Rhbdf2')
interested_genes.insert(0, interested_genes.pop(rhbdf2_position))

# One UMAP panel per gene of interest.
sc.set_figure_params(facecolor='white', color_map='viridis')
sc.pl.umap(adata, color=interested_genes, legend_loc='on data', size=30)

In [None]:
# Per-cell Rhbdf2 vs. S100a5 values, coloured by each cell's top Olfr.
scatter_encoding = dict(x='Rhbdf2_counts', y='S100a5_counts', color='Olfr')
fig = px.scatter(count_df, **scatter_encoding)
fig.show()
# Optional export:
# fig.write_html('./output/Rhbdf2_S100a5_correlation.html')

Bin the Rhbdf2 counts into groups for the box plots. 

In [None]:
# Bin the per-cell Rhbdf2 values (truncated to integers) into string labels.
# The thresholds are compared numerically: matching digits in the stringified
# value with str.contains("3|4|5") / ("1|2|3|4|5") left values such as 6-9
# outside the "3+" and "1+" bins.
rhbdf2_int = count_df['Rhbdf2_counts'].apply(int)

# '0', '1', '2', and everything from 3 upwards collapsed into '3+'.
count_df['Rhbdf2_box_groups'] = rhbdf2_int.apply(str)
count_df.loc[rhbdf2_int >= 3, 'Rhbdf2_box_groups'] = "3+"

# '0' for no Rhbdf2, '1+' for any value of at least 1.
count_df['Rhbdf2_present'] = rhbdf2_int.apply(str)
count_df.loc[rhbdf2_int >= 1, 'Rhbdf2_present'] = "1+"

In [None]:
# Quick plot to see how the other genes of interest correlate with Rhbdf2
# Every '*_counts' column except Rhbdf2 itself goes on the x axis.
count_columns = count_df.columns[count_df.columns.str.contains('_counts')]
y_column = list(count_columns.drop(['Rhbdf2_counts']))
plot_df = pd.melt(count_df, id_vars=['Olfr', 'Rhbdf2_present'], value_vars=y_column)

# One notched box per (gene, Rhbdf2 present/absent).
box_encoding = dict(
    x='variable',
    y='value',
    color='Rhbdf2_present',
    notched=True,
    title="Relative Expression of genes to presence of Rhbdf2",
)
fig = px.box(plot_df, **box_encoding)
fig.update_layout(
    margin={'l': 50, 'r': 50, 'b': 100, 't': 100, 'pad': 10},
    template='plotly_white',
)
fig.show()
# Optional export:
# fig.write_html('./output/plots/Rhbdf2_box.html')

In [None]:
# Quick plot to see how the other genes of interest correlate with Rhbdf2
# Every '*_counts' column except Rhbdf2 itself goes on the x axis.
count_columns = count_df.columns[count_df.columns.str.contains('_counts')]
y_column = list(count_columns.drop(['Rhbdf2_counts']))
plot_df = pd.melt(count_df, id_vars=['Olfr', 'Rhbdf2_box_groups'], value_vars=y_column)

# One notched box per (gene, binned Rhbdf2 level).
fig = px.box(plot_df,
             x='variable',
             y='value',
             color='Rhbdf2_box_groups',
             notched=True,
             title="Relative Expression of genes to Rhbdf2"
            )
fig.update_layout(
    margin=dict(l=50, r=50, b=100, t=100, pad=10),
    template='plotly_white',
)
fig.show()

# Create the target directory first so the export does not depend on
# './output/plots' already existing (e.g. on a fresh checkout).
os.makedirs('./output/plots', exist_ok=True)
fig.write_html('./output/plots/Rhbdf2_grouped_box.html')

Transform the count data frame into a log1p data frame and (optionally) save it. 

In [None]:
# Build a log1p-transformed copy of the count table.
# The receptor and the two Rhbdf2 grouping columns are carried over unchanged.
log1p_df = pd.DataFrame(count_df[['Olfr', 'Rhbdf2_box_groups', 'Rhbdf2_present']])

# Every '*_counts' column except Rhbdf2 itself becomes '<gene>_log1p'.
count_columns = count_df.columns[count_df.columns.str.contains('_counts')]
for count_column in list(count_columns.drop(['Rhbdf2_counts'])):
    gene_name = count_column.split('_')[0]
    log1p_df[gene_name+'_log1p'] = np.log1p(count_df[count_column])

# Optional export:
# log1p_df.to_csv('./Interested_gene_log1p.csv')

In [None]:
# Quick plot to see how the other genes of interest correlate with Rhbdf2
# All log1p-transformed gene columns go on the x axis.
y_column = list(log1p_df.columns[log1p_df.columns.str.contains('_log1p')])
plot_df = pd.melt(log1p_df, id_vars=['Olfr','Rhbdf2_present'], value_vars=y_column)

# One notched box per (gene, Rhbdf2 present/absent).
# NOTE(review): animation_group is passed without animation_frame; presumably it
# has no visible effect here - confirm before removing.
fig = px.box(plot_df,
             x='variable',
             y='value',
             color='Rhbdf2_present',
             notched=True,
             animation_group='variable',
             title="log1p Expression of genes to presence of Rhbdf2"
            )
fig.update_layout(
    margin=dict(l=50, r=50, b=100, t=100, pad=10),
    template='plotly_white',
)
fig.show()

# Create the target directory first so the export does not depend on
# './output/plots' already existing (e.g. on a fresh checkout).
os.makedirs('./output/plots', exist_ok=True)
fig.write_html('./output/plots/Rhbdf2_log1p_box.html')

In [None]:
# Quick plot to see how the other genes of interest correlate with Rhbdf2
# All log1p-transformed gene columns go on the x axis.
y_column = list(log1p_df.columns[log1p_df.columns.str.contains('_log1p')])
plot_df = pd.melt(log1p_df, id_vars=['Olfr','Rhbdf2_box_groups'], value_vars=y_column)

# One notched box per (gene, binned Rhbdf2 level).
# NOTE(review): animation_group is passed without animation_frame; presumably it
# has no visible effect here - confirm before removing.
fig = px.box(plot_df,
             x='variable',
             y='value',
             color='Rhbdf2_box_groups',
             notched=True,
             animation_group='variable',
             title="log1p Expression of genes to Rhbdf2"
            )
fig.update_layout(
    margin=dict(l=50, r=50, b=100, t=100, pad=10),
    template='plotly_white',
)
fig.show()

# Create the target directory first so the export does not depend on
# './output/plots' already existing (e.g. on a fresh checkout).
os.makedirs('./output/plots', exist_ok=True)
fig.write_html('./output/plots/Rhbdf2_grouped_log1p_box.html')

In [None]:
# One annotated box plot per log1p gene column, grouped by binned Rhbdf2 level.
# NOTE: add_p_value_annotation is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be executed before this one.

# Fix the order of the groups. Without it plotly orders traces (and x
# categories) by first appearance in the data, so "trace 0" would not
# necessarily be the '0' group that the comparisons below use as reference.
box_group_order = ['0', '1', '2', '3+']

# Create the target directory first so the export does not depend on it
# already existing (e.g. on a fresh checkout).
os.makedirs('./output/plots/Individual_box', exist_ok=True)

for i in list(log1p_df.columns[log1p_df.columns.str.contains('_log1p')]):
    fig = px.box(log1p_df,
                 x = 'Rhbdf2_box_groups',
                 y = i,
                 color = 'Rhbdf2_box_groups',
                 notched = True,
                 category_orders = {'Rhbdf2_box_groups': box_group_order},
                 title = "log1p Expression of Rhbdf2 and " + i.split('_')[0]
                )
    # Compare the first box against every other box that is actually present,
    # instead of assuming that exactly four groups exist.
    comparison_pairs = [[0, other] for other in range(1, len(fig.data))]
    fig = add_p_value_annotation(fig, comparison_pairs)
    fig.update_layout(
        autosize=False,
        width=500,
        height=400,
        margin=dict(
            l=50,
            r=50,
            b=100,
            t=100,
            pad=10
        ),
        template='plotly_white'
    )
    fig.show()
    fig.write_html('./output/plots/Individual_box/Rhbdf2_grouped_' + i.split('_')[0] + '.html')

In [None]:
# Pairwise two-sided Mann-Whitney U tests of S100a5 log1p expression between the
# Rhbdf2 box groups. Pairs with p < 0.05 are collected; no multiple-comparison
# correction is applied.

import itertools

# The log1p values live in log1p_df under '<gene>_log1p'; count_df has no
# 'log1p(S100a5_counts)' column, so indexing it raised a KeyError.
value_column = 'S100a5_log1p'
box_groups = log1p_df['Rhbdf2_box_groups'].unique()

# List of [(group_a, group_b), p] for the pairs that differ significantly.
significant_combinations = []
for combination in itertools.combinations(box_groups, 2):
    data1 = log1p_df.loc[log1p_df['Rhbdf2_box_groups'] == combination[0], value_column]
    data2 = log1p_df.loc[log1p_df['Rhbdf2_box_groups'] == combination[1], value_column]
    # Significance
    U, p = stats.mannwhitneyu(data1, data2, alternative='two-sided')
    if p < 0.05:
        significant_combinations.append([combination, p])

significant_combinations

In [None]:
def _p_value_symbol(pvalue):
    ''' Maps a p-value to the significance label drawn above a bracket. '''
    # A NaN p-value (e.g. a group too small to test) fails every comparison
    # below and would otherwise fall through to '***'.
    if np.isnan(pvalue):
        return 'n/a'
    if pvalue >= 0.05:
        return 'ns'
    if pvalue >= 0.01:
        return '*'
    if pvalue >= 0.001:
        return '**'
    return '***'


def add_p_value_annotation(fig, array_columns, subplot=None, _format=None):
    ''' Adds significance brackets between pairs of boxes of a box plot.

    For every pair the y-values of the two traces are compared with a
    two-sided t-test that does not assume equal variances
    (``stats.ttest_ind(..., equal_var=False)``), and a bracket labelled
    'ns' (p >= 0.05), '*' (p >= 0.01), '**' (p >= 0.001), '***' (smaller)
    or 'n/a' (p-value is NaN) is drawn above the plot area.

    Parameters:
    ----------
    fig: figure
        plotly boxplot figure; its traces are read from ``fig.data``
    array_columns: np.array
        array of which columns to compare
        e.g.: [[0,1], [1,2]] compares column 0 with 1 and 1 with 2.
        The same numbers are used as the x positions of the bracket ends.
    subplot: None or int
        specifies if the figures has subplots and what subplot to add the notation to
    _format: None or dict
        format characteristics for the lines; any of the keys 'interline',
        'text_height' and 'color'. Missing keys fall back to
        interline=0.07, text_height=1.07, color='black'.

    Returns:
    -------
    fig: figure
        figure with the added notation (the same object that was passed in)
    '''
    # Merge caller overrides into a fresh dict so that no default is shared
    # between calls and partial overrides are accepted.
    style = dict(interline=0.07, text_height=1.07, color='black')
    if _format:
        style.update(_format)

    # y-range (in axis-domain coordinates) of each bracket; successive
    # brackets are stacked 'interline' apart, starting just above the plot area.
    y_range = np.zeros([len(array_columns), 2])
    for i in range(len(array_columns)):
        y_range[i] = [1.01+i*style['interline'], 1.02+i*style['interline']]

    # Read the traces directly from the figure rather than from a to_dict() copy.
    traces = fig.data

    # Get indices if working with subplots: positions of the traces that are
    # drawn on the requested subplot's x axis.
    if subplot:
        subplot_str = '' if subplot == 1 else str(subplot)
        indices = [
            index for index, trace in enumerate(traces)
            # A trace without an explicit xaxis is treated as being on 'x'.
            if (trace.xaxis or 'x') == 'x' + subplot_str
        ]
    else:
        subplot_str = ''
        indices = None

    x_ref = "x" + subplot_str
    y_ref = "y" + subplot_str + " domain"
    line_style = dict(color=style['color'], width=2,)

    for index, column_pair in enumerate(array_columns):
        if subplot:
            data_pair = [indices[column_pair[0]], indices[column_pair[1]]]
        else:
            data_pair = column_pair

        # Get the p-value
        pvalue = stats.ttest_ind(
            traces[data_pair[0]].y,
            traces[data_pair[1]].y,
            equal_var=False,
        )[1]
        symbol = _p_value_symbol(pvalue)

        y_low, y_high = y_range[index][0], y_range[index][1]
        # Left vertical tick of the bracket
        fig.add_shape(type="line",
            xref=x_ref, yref=y_ref,
            x0=column_pair[0], y0=y_low,
            x1=column_pair[0], y1=y_high,
            line=line_style
        )
        # Horizontal line joining the two boxes
        fig.add_shape(type="line",
            xref=x_ref, yref=y_ref,
            x0=column_pair[0], y0=y_high,
            x1=column_pair[1], y1=y_high,
            line=line_style
        )
        # Right vertical tick of the bracket
        fig.add_shape(type="line",
            xref=x_ref, yref=y_ref,
            x0=column_pair[1], y0=y_low,
            x1=column_pair[1], y1=y_high,
            line=line_style
        )
        # Significance label, centred between the two boxes and slightly above
        # the bracket.
        fig.add_annotation(dict(font=dict(color=style['color'],size=14),
            x=(column_pair[0] + column_pair[1])/2,
            y=y_high*style['text_height'],
            showarrow=False,
            text=symbol,
            textangle=0,
            xref=x_ref,
            yref=y_ref
        ))
    return fig