### The goal of this notebook is to compare the different classes of mOSN markers. Presumably, markers that are activated at a later stage of the cell lineage may be regulated by, or participate in, different pathways than early-activated genes.

In [3]:
from importlib import reload

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.stats import pearsonr

import GE_functions
reload(GE_functions)

<module 'GE_functions' from '/data/jlu/Chaperone_Analysis/notebooks/GE_functions.py'>

In [4]:
# Load the mOSN ranking table and order it by descending score, breaking ties
# by descending log fold change. The original index labels are kept.
mOSN_rankgenegroup = pd.read_csv(
    '../output/brann/mOSN_rankgenegroup.csv', index_col=0
).sort_values(by=['mOSN_scores', 'mOSN_logfoldchanges'], ascending=False)

In [5]:
# Read the normalized expression table; the first CSV column becomes the row index.
ge_normalized = pd.read_csv(
    filepath_or_buffer='../expression_csv/ge_normalized_GSE151346_MOE_ALL_OlfrSum.csv',
    index_col=0,
)

In [35]:

# Take the top-ranked markers by POSITION in the sorted table. A label slice
# (`.loc[0:100]`) depends on where index labels 0 and 100 end up after
# sort_values (the index is not reset), so it is not guaranteed to select the
# top rows. 101 rows are kept because a 0..100 label slice is inclusive on both ends.
top_mOSNmarker = mOSN_rankgenegroup.iloc[:101].mOSN_names.to_list()
# Exclude the 'Olfr_sum' entry; filtering (rather than list.remove) does not
# raise when it is absent from the selection.
top_mOSNmarker = [gene for gene in top_mOSNmarker if gene != 'Olfr_sum']
# Keep only markers that exist as columns of ge_normalized
# (np.intersect1d also de-duplicates and sorts the names).
plot_genes = np.intersect1d(ge_normalized.columns,
                            top_mOSNmarker).tolist()


fig = go.Figure()
ge_data = GE_functions.get_ge_data(ge_normalized, plot_genes)

# Background layer: one wide, translucent grey line per marker gene
for g in ge_data['gene'].unique():
    gene_rows = ge_data[ge_data['gene'] == g]  # filter once, reuse for x and y
    fig.add_trace(go.Scatter(x=gene_rows['index'],
                             y=gene_rows['expression'],
                             opacity=0.3,
                             mode='lines',
                             name=g,
                             line=dict(
                                 color='grey',
                                 width=8,
                             )
                             ))

# Add bolded lines for specific genes
ge_data = GE_functions.get_ge_data(ge_normalized, ['Rtp1'])
for g in ge_data['gene'].unique():
    gene_rows = ge_data[ge_data['gene'] == g]
    fig.add_trace(go.Scatter(x=gene_rows['index'],
                             y=gene_rows['expression'],
                             mode='lines',
                             name=g,
                             line=dict(
                                 color='black',
                                 dash='dot',
                                 width=8)
                             ))

# NOTE(review): the subtitle below says "top 50 genes" while the selection
# above takes roughly 100 rows — confirm which number is intended.
fig.update_layout(xaxis_type = 'category',template='simple_white', # update x-axis to category so that it doesn't sort the numbers
                 title='mOSN markers<br>\
                  <span style="font-size: 12px;">genes are identified from top 50 genes <br>positively expressed in mOSN cluster ranked against other cell types in the neuronal lineage</span><br>\
                  <span style="font-size: 10px;">Data from Brann et al. scRNAseq WOM</span>',
                 font=dict(
        size=10,
    ))
fig.show()

fig.write_html("../output/mOSNmarker_analysis/ge_line_mOSNmarker2.html")


ge_data constructed
ge_data constructed


### Give each gene a numeric value for its onset time, using area under the curve (AUC) as the metric. Presumably, the later the onset, the lower the AUC will be across the OSN lineage.

deprecated: Define late-onset genes as genes with lower normalized activation than Omp in the 0.8-0.95 bins. <br>(Omp considered late)

In [15]:
# Select the top-ranked rows by position: after sort_values the index labels
# are no longer in rank order, so a label slice such as `.loc[0:100]` is not
# guaranteed to return the top rows. 101 rows matches an inclusive 0..100 slice.
top_mOSNmarker = mOSN_rankgenegroup.iloc[:101].mOSN_names.to_list()
# filter for 'mt-' genes that are not in our ge_normalized data
top_mOSNmarker = [gene for gene in top_mOSNmarker if 'mt-' not in gene]

In [7]:
"""
DEPRECATED
Classifying late onset vs early onset genes via sum in between 0.8 and 0.95
"""
# temp = top_mOSNmarker
# temp = temp.insert(0,'index')
# temp = ge_normalized[top_mOSNmarker].set_index('index')
# late_marker = []
# early_marker = []
# for gene in temp:
#     if temp[gene][4:6].sum() > temp['Omp'][4:6].sum():
#         early_marker.append(gene)
#     else: 
#         late_marker.append(gene)


'\nDEPRECATED\nClassifying late onset vs early onset genes via sum in between 0.8 and 0.95\n'

In [16]:
"""
Manually create a colormap for plotting in plotly line
"""
# get a list of expression AUC sum for each genes in a list 
top_mOSNmarker_AUC = []
for i in GE_functions.get_ge_data(ge_normalized, top_mOSNmarker)['gene'].unique():
    top_mOSNmarker_AUC.append(ge_normalized[i].sum())
# normalize the AUC values to between 0 and 1. 
top_mOSNmarker_AUC = GE_functions.reverse_scale(top_mOSNmarker_AUC)
top_mOSNmarker_AUC_color = GE_functions.get_color('Rdbu', top_mOSNmarker_AUC)
# Create a colormap dictionary holding gene information with corresponding color 
color_map = {}
for i in enumerate(top_mOSNmarker):
    color_map[i[1]] = [top_mOSNmarker_AUC[i[0]],top_mOSNmarker_AUC_color[i[0]]]

ge_data constructed


In [22]:
fig = go.Figure()

# Layer 1: every marker gene as a wide, half-transparent line whose colour is
# the second element of that gene's color_map entry.
ge_data = GE_functions.get_ge_data(ge_normalized, top_mOSNmarker)
for gene_name in ge_data['gene'].unique():
    gene_rows = ge_data.loc[ge_data['gene'] == gene_name]
    marker_line = {'color': color_map[gene_name][1], 'width': 8}
    fig.add_trace(
        go.Scatter(
            x=gene_rows['index'],
            y=gene_rows['expression'],
            name=gene_name,
            mode='lines',
            opacity=0.5,
            line=marker_line,
        )
    )

# Layer 2: Rtp1 on top as a fully opaque black dotted line
ge_data = GE_functions.get_ge_data(ge_normalized, ['Rtp1'])
for gene_name in ge_data['gene'].unique():
    gene_rows = ge_data.loc[ge_data['gene'] == gene_name]
    fig.add_trace(
        go.Scatter(
            x=gene_rows['index'],
            y=gene_rows['expression'],
            name=gene_name,
            mode='lines',
            line={'color': 'black', 'dash': 'dot', 'width': 8},
        )
    )

# A categorical x-axis keeps the bins in the supplied order instead of sorting them
fig.update_layout(
    template='simple_white',
    xaxis_type='category',
    font={'size': 20},
    title='mOSN markers colored by expression onset time<br>\
    <span style="font-size: 10px;">Data from Brann et al. scRNAseq WOM</span>',
)
fig.show()

fig.write_html("../output/mOSNmarker_analysis/ge_line_mOSNmarker_labeled-Rdbu2.html")

ge_data constructed
ge_data constructed


### Plot to visualize whether there is a relationship between expression abundance and onset latency

In [10]:
"""
DEPRECATED: 
Usage of Early/Late marker is replaced with normalized expression AUC instead 
"""
# early_marker_df = mOSN_rankgenegroup[mOSN_rankgenegroup['mOSN_names'].isin(early_marker)][['mOSN_names','mOSN_logfoldchanges', 'mOSN_scores']]
# late_marker_df = mOSN_rankgenegroup[mOSN_rankgenegroup['mOSN_names'].isin(late_marker)][['mOSN_names','mOSN_logfoldchanges', 'mOSN_scores']]
# early_marker_df['expression'] = 'early'
# late_marker_df['expression'] = 'late'
# marker_df = pd.concat([early_marker_df, late_marker_df])

'\nDEPRECATED: \nUsage of Early/Late marker is replaced with normalized expression AUC instead \n'

In [10]:
"""
Creates a marker_df with scores and logfoldchange information for the marker genes 
normalized_AUC information is also added from previously defined color_map 
"""
marker_df = mOSN_rankgenegroup[mOSN_rankgenegroup['mOSN_names'].isin(top_mOSNmarker)][['mOSN_names','mOSN_logfoldchanges', 'mOSN_scores']]
for gene in color_map: 
    marker_df.loc[marker_df['mOSN_names'] == gene, ['normalized_AUC', 'color']] = color_map[gene]

In [35]:
# Natural log of the log fold change (the "enrichness" on the y-axis); computed
# once and reused for the scatter, the correlation and the trend line.
# NOTE(review): np.log returns NaN for non-positive log fold changes, which
# would propagate into pearsonr/polyfit — confirm all markers are > 0.
log_enrichment = np.log(marker_df['mOSN_logfoldchanges'])

fig = go.Figure()
fig.add_trace(go.Scatter(
                     x = marker_df['normalized_AUC'],
                     y = log_enrichment,
                     mode = 'markers',
                     text = marker_df['mOSN_names'],
                     # carried along so the hover box can show the actual score
                     # (previously "Score" displayed the marker colour string)
                     customdata = marker_df['mOSN_scores'],
                     marker = dict(
                         size = 15,
                         color = marker_df['color']
                              ),
                     hovertemplate = "%{text} <br>Log(LogFC): %{y} <br>Score: %{customdata}",
                    )
             )

# Calculate Pearson Correlation
x = marker_df['normalized_AUC'].astype(float).to_numpy()
y = log_enrichment.to_numpy()
r, r_p = pearsonr(x, y)

# Least-squares trend line through the points. Scaling the y extremes by r
# (as done before) does not give a fitted line; np.polyfit returns the actual
# slope and intercept.
slope, intercept = np.polyfit(x, y, 1)
trendline_x = np.array([x.min(), x.max()])
trendline_y = slope * trendline_x + intercept

fig.add_trace(go.Scatter(x=trendline_x, y=trendline_y,
                         mode='lines',
                         line = dict(
                             dash='dot',
                             width = 5,
                             color = 'rgba(0, 0, 0, 0.5)'
                         ),
                         name='Pearson correlation: {} <br>Pearson p-value: {}'.format(round(r,3),
                                                                                round(r_p, 5)),
                        )
             )



fig.update_layout(
        template='simple_white',
        title="Genes with late expression are more enriched in mOSNs <br>\
    <span style='font-size: 10px;'> pearson r: " + str(np.round(r, 3)) + "</span>",
        yaxis = {'title' : 'Log( Enrichness )'},
        xaxis = {'showticklabels': True,
                 'title' : 'Normalized gene expression AUC'},
        font=dict(
            size=15,
    )
)
fig.show()

# fig.write_html("../output/mOSNmarker_analysis/mOSNmarker_logfc_score-Rdbu.html")

### Now that we have separated mOSN markers into early and late, explore the bins to see if they possess different signatures in different datasets:
#### - covid RNAseq DE volcano 
#### - GO analysis 
#### - Lomvardas: do differently stressed cells express these genes differently?

### Zazhytska

In [27]:
# DESeq2 result table from Zazhytska et al.; the first CSV column becomes the row index.
hOE_DE = pd.read_csv(
    filepath_or_buffer='../files/Zazhytska/hOE_deseq2.csv',
    index_col=0,
)

In [28]:

plot_df = hOE_DE.copy()

fig = go.Figure()
# Background layer: every gene in the DE table. No padj filter is applied, so
# the trace is named accordingly (it was previously labelled "padj > 0.05").
fig.add_trace(go.Scatter(x=plot_df['log2FoldChange'],
                        y=-np.log10(plot_df['padj']),
                        text=plot_df.index,
                        mode='markers',
                        name='all genes',
                        marker=dict(color = 'grey', opacity=0.3)))

# Highlight layer: mOSN markers. Marker names are upper-cased to match the
# symbols used as the DE table index.
marker_symbols = [name.upper() for name in marker_df['mOSN_names'].to_list()]
# .copy() so the column assignments below act on an independent frame
# (this is what triggered the SettingWithCopyWarning).
temp = plot_df[plot_df.index.isin(marker_symbols)].copy()
# color_map entries are [normalized AUC, colour], keyed by the original gene name
auc_by_symbol = {gene.upper(): entry[0] for gene, entry in color_map.items()}
color_by_symbol = {gene.upper(): entry[1] for gene, entry in color_map.items()}
temp['normalized_AUC'] = [auc_by_symbol.get(symbol, np.nan) for symbol in temp.index]
temp['color'] = [color_by_symbol.get(symbol) for symbol in temp.index]
# string form is needed for concatenation into the hover text
temp['normalized_AUC'] = temp['normalized_AUC'].astype('string')

fig.add_trace(go.Scatter(x=temp['log2FoldChange'],
                        y=-np.log10(temp['padj']),
                        text=temp.index + '<br>Normalized onset: ' + temp['normalized_AUC'],
                        textposition='top center',
                        mode = 'markers',
                        name='mOSN markers',
                        marker=dict(color = temp['color'],
                                    opacity= 0.5,
                                    size = 15)))


# Hover labels follow the plotted data: x is log2FoldChange, y is -log10(padj)
# (the labels were previously swapped).
fig.update_traces(
    textposition='top center',
    hovertemplate =
    '<b>%{text}</b>' +
    '<br>LogFC: %{x}'+
    '<br>-log10(padj): %{y}<br>')

fig.update_layout(
    xaxis_range = [-7, 7],

    title='mOSN markers are downregulated in covid infected OE<br>\
    <span style="font-size: 10px;">From Zazhytska et al. hOE_deseq2</span>',
    yaxis = {'title' : '-np.log10(padj)'},
    xaxis = {'title' : 'logfc'},
    template='simple_white',
    font=dict(
        size=10,  # Set the font size here
    )
)
fig.show()
#
# fig.write_html("../output/mOSNmarker_analysis/Zazhytska_mOSNmarker-Rdbu.html")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [29]:
"""
Manually create a sizemap for plotting in plotly line
size map created similar as colormap. But instead of expression scaled between 0 and 1, 
pvalue is used to scale between 0 and 1. 
"""
top_mOSNmarker_pvalue_dict = {}
for i in hOE_DE.iloc[hOE_DE.index.isin([i.upper() for i in top_mOSNmarker])].index:
    top_mOSNmarker_pvalue_dict[i] = hOE_DE.iloc[hOE_DE.index == i].padj.item()

# Scale between 0 and 4 for better visualization.
top_mOSNmarker_pvalue = [pvalue + 4 for pvalue in GE_functions.reverse_scale(list(top_mOSNmarker_pvalue_dict.values()), 
                                                                             factor = 4)]

for i in enumerate(top_mOSNmarker_pvalue_dict.keys()):
    top_mOSNmarker_pvalue_dict[i[1]] = top_mOSNmarker_pvalue[i[0]]

In [36]:
"""
Plotting bar plot of mOSN markers across ER stress logfc to
better visualize the onset phenotype over stressed cells
"""
plot_df = hOE_DE.copy()
plot_df = plot_df[list(plot_df.index.isin([i.upper() for i in marker_df['mOSN_names'].to_list()]))]
# Assign color_map and pvalue_size to the marker plot_df dataframe 
for i in plot_df.index:
    plot_df.loc[plot_df.index == i, ['normalized_AUC','color']] = color_map[i.capitalize()]
    plot_df.loc[plot_df.index == i, ['pvalue_size']] = top_mOSNmarker_pvalue_dict[i]
# Manually adjust significant size to much bigger
plot_df.loc[plot_df.padj < 0.05, 'pvalue_size'] = 8
    
fig = go.Figure()
for gene in plot_df.index:
    fig.add_trace(go.Scatter(
                            x = plot_df.iloc[plot_df.index == gene]['log2FoldChange'], 
                            y = plot_df.iloc[plot_df.index == gene]['normalized_AUC'],
                            text = plot_df.iloc[plot_df.index == gene].index + '<br>padj: ' + 
                                        str(plot_df.iloc[plot_df.index == gene]['padj'].item()),
                            textposition='top center',
                            mode = 'markers',
                            name='early markers',
                            marker=dict(color = plot_df.iloc[plot_df.index == gene]['color'],
#                                         CHANGE *VALUE HERE TO ADJUST SIZE 
                                        size = plot_df.iloc[plot_df.index == gene]['pvalue_size']*5
                                       ),
                            showlegend = False
                            )
                 )
    
# Calculate Pearson Correlation 
temp = plot_df[plot_df.padj < 0.05]
x = temp['log2FoldChange'].values
y = temp['normalized_AUC'].values
r, r_p = pearsonr(x, y)

# fit trendline using linear regression
coeffs = np.polyfit(x, y, 1)
# x coordinates can be adjusted here to elongate or shorten the line
trendline_x = np.array([
#                         x.min(), 
#                         x.max()
                        -6,
                        2
                       ])
trendline_y = coeffs[0] * trendline_x + coeffs[1]
# fit a line to the data using the slope-intercept formula
m = r * (np.std(y) / np.std(x))
b = np.mean(y) - m * np.mean(x)
# Add a line for the correlation coefficient
fig.add_trace(go.Scatter(x = trendline_x,
                         y = trendline_y,
                         mode='lines', 
                         line = dict(
                             dash='dot',
                             width = 5,
                             color = 'rgba(0, 0, 0, 0.3)'
                         ),
                         name='Pearson correlation: {} <br>Pearson p-value: {}'.format(round(r,3), 
                                                                                round(r_p, 5)),
#                          showlegend=False
                        )
             )    
    

fig.update_traces( 
    textposition='top center',
    hovertemplate =
    '<b>%{text}</b>' + 
    '<br>LogFC: %{y}'+
    '<br>FDR: %{x}<br>')

fig.update_layout(
    xaxis_range = [-7, 7],
    yaxis = {'title' : 'Normalized gene AUC'},
    xaxis = {'title' : 'logfc'},
    title='mOSN markers are downregulated in covid infected OE<br>\
    <span style="font-size: 10px;">From Zazhytska et al. hOE_deseq2</span>',
#     autosize=True,
#     width=800,
#     height=800,
    template='simple_white',
    font=dict(
        size=10,  # Set the font size here
    )
)
fig.show()
# fig.write_html("../output/mOSNmarker_analysis/Zazhytska_mOSNmarker_onsetgraph-Rdbu.html")

See how the early vs late mOSN markers differ in GO analysis 

In [33]:
# Import GO_tools and initialize GO functions. May take a few seconds. 
import GO_tools
reload(GO_tools)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2023-04-01) 46,575 Terms
HMS:0:00:04.650689 432,155 annotations, 29,795 genes, 19,200 GOs, 1 taxids READ: gene2go 

Load BP Ontology Enrichment Analysis ...
 63% 17,666 of 28,212 population items found in association

Load CC Ontology Enrichment Analysis ...
 66% 18,525 of 28,212 population items found in association

Load MF Ontology Enrichment Analysis ...
 61% 17,094 of 28,212 population items found in association
  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2023-04-01) 46,575 Terms
HMS:0:00:04.545742 432,155 annotations, 29,795 genes, 19,200 GOs, 1 taxids READ: gene2go 

Load BP Ontology Enrichment Analysis ...
 63% 17,666 of 28,212 population items found in association

Load CC Ontology Enrichment Analysis ...
 66% 18,525 of 28,212 population items found in association

Load MF Ontology Enrichment Analysis ...
 61% 17,094 of 28,212 population items found in association


<module 'GO_tools' from '/data/jlu/Chaperone_Analysis/notebooks/GO_tools.py'>

In [12]:
"""
Conduct GO analysis on all the columns in GeneSets from different anlaysis
into the df dictionary 
"""
# Quickly bin early and late markers based on normalized auc 
early_markers = []
late_markers = []
for i in color_map.keys():
    if color_map[i][0] > 0.5: 
        early_markers.append(i)
    else: 
        late_markers.append(i)

df = {}
df['top_mOSNmarker'] = GO_tools.go_it(top_mOSNmarker)
df['top_mOSNmarker']['n_genes/n_go'] = df['top_mOSNmarker'].n_genes/df['top_mOSNmarker'].n_go
df['top_mOSNmarker']['n_genes/n_study'] = df['top_mOSNmarker'].n_genes/df['top_mOSNmarker'].n_study

input genes: 100
mapped genes: 98

Runing BP Ontology Analysis: current study set of 98 IDs.
 86%     84 of     98 study items found in association
100%     98 of     98 study items found in population(28212)
Calculating 12,720 uncorrected p-values using fisher_scipy_stats
  12,720 terms are associated with 17,666 of 28,212 population items
     647 terms are associated with     84 of     98 study items
  METHOD fdr_bh:
       2 GO terms found significant (< 0.05=alpha) (  2 enriched +   0 purified): statsmodels fdr_bh
       7 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)

Runing CC Ontology Analysis: current study set of 98 IDs.
 94%     92 of     98 study items found in association
100%     98 of     98 study items found in population(28212)
Calculating 1,805 uncorrected p-values using fisher_scipy_stats
   1,805 terms are associated with 18,525 of 28,212 population items
     183 terms are associated wit

In [154]:

# import plotly.express as px

# for i in df.keys():
#     fig = px.bar(df[i], 
# #                  x='n_genes/n_go',
# #                  x='n_genes/n_study', 
#                  x = 'n_genes/n_go',
#                  y='term', orientation='h',
#                 hover_data=['n_genes'],
#                 title = i).update_layout(
# #                                 template='plotly_dark',
#                                 plot_bgcolor='rgba(0, 0, 0, 0)',
# #                                 paper_bgcolor='rgba(0, 0, 0, 0)',
#                             )
#     fig.show()
# #     fig.write_html("../output/mOSNmarker_analysis//GO_"+i+".html")

In [13]:
"""
Assign every go_term with an 'Average' expression AUC. 
Then the all the go_terms' average AUC are then scaled to color 
"""
go_df = df['top_mOSNmarker'].copy()

for i in go_df.index:
    temp = []
    for j in go_df.iloc[i]['study_genes']:
        temp.append(color_map[j][0])
    temp = sum(temp)/len(temp)
    go_df.loc[i,'avg_exp'] = temp
    
go_df = go_df.sort_values('avg_exp')
go_df['color'] = GE_functions.get_color('Rdbu', go_df['avg_exp'])


In [20]:
plot_df = go_df.copy()

# One horizontal bar per GO term: length is n_genes/n_go, the gene count is
# printed outside the bar, and the fill colour comes from the 'color' column.
go_bars = go.Bar(
    x=plot_df['n_genes/n_go'],
    y=plot_df['term'],
    orientation='h',
    text=plot_df['n_genes'],
    textposition='outside',
    marker={'color': plot_df['color']},
    hovertemplate='<b>%{text}</b>' + '<br>%{y}',
)

# Create a figure and plot it
fig = go.Figure()
fig.add_trace(go_bars)

fig.update_layout(
    template='simple_white',
    autosize=True,
    width=1000,
    height=800,
    font={'size': 15},  # Set the font size here
    xaxis={'title': 'n_genes/n_go'},
    yaxis={'title': ''},
    title='mOSN markers are expression reveals differentiated GO terms<br>\
    <span style="font-size: 10px;"> </span>',
)
fig.show()
# fig.write_html("../output/mOSNmarker_analysis/GO_mOSNmarker_colorednumbered-Rdbu.html")

Create another graph focusing on early and late mOSN markers, and overlay data points of each individual gene's expression AUC for visualization

In [18]:
plot_df = go_df.copy()

# Filter by manually defined go_terms
plotting_term = ['intracellular cAMP-activated cation channel activity',
                 'neuronal cell body',
                 'presynaptic membrane',
                 'cAMP binding',
                 'cilium',
                 'neuron projection',
                 'heterotrimeric G-protein complex',
                 'detection of calcium ion'
                ]
plot_df = plot_df[plot_df['term'].isin(plotting_term)]

# Bar data: one bar per selected GO term
bar_x = list(plot_df['n_genes/n_go'])
bar_y = list(plot_df['term'])
bar_color = list(plot_df['color'])
bar_text = list(plot_df['n_genes'])
bar_hover = list(plot_df['study_genes'])

# Scatter data: one point per (GO term, study gene) pair, positioned at the
# gene's normalized AUC. Rows are walked directly rather than re-filtering the
# frame per term; the previous `plot_df[plot_df.term == go_term]...item()`
# raised whenever a term name occurred in more than one row.
scatter_x = []
scatter_y = []
scatter_color = []
scatter_hover = []
for go_term, study_genes in zip(plot_df['term'], plot_df['study_genes']):
    for gene in study_genes:
        scatter_x.append(color_map[gene][0])
        scatter_y.append(go_term)
        scatter_color.append(color_map[gene][1])
        scatter_hover.append(gene)


# Bars use the bottom x-axis (n_genes/n_go); the scatter uses a second,
# overlaid x-axis drawn on top.
# NOTE(review): xaxis2 carries the per-gene normalized AUC but has an empty
# title — consider labelling it.
layout = go.Layout(
    title='mOSN markers are expression reveals differentiated GO terms<br>\
    <span style="font-size: 10px;"> </span>',
    xaxis=dict(title='n_genes/n_go'),
    xaxis2=dict(title='', overlaying='x', side='top'),
    yaxis=dict(autorange="reversed"),
    template='simple_white',
    bargap=0.3,
    font=dict(
        size=15,  # Set the font size here
    )
)

# Create the figure object and add the traces
fig = go.Figure(layout=layout)

# Create a scatter plot with a different x-axis
fig.add_trace(go.Scatter(
                    x=scatter_x,
                    y=scatter_y,
                    mode='markers',
                    hovertext = scatter_hover,
                    marker = dict(
                        color = scatter_color,
                        size = 15,
                        line=dict(width=1.5, color='rgb(0, 0, 0)')
                    ),
                    showlegend=False,
                    xaxis = 'x2')
             )
# Create a horizontal bar chart
fig.add_trace(go.Bar(
                    x=bar_x,
                    y=bar_y,
                    text= bar_text,
                    hovertext=bar_hover,
                    textposition='inside',
                    insidetextanchor="start",
                    insidetextfont=dict(family='Arial', size=15, color='black'),
                    orientation='h',
                    marker=dict(
                        color=bar_color
                          ),
                    showlegend=False)
             )


# Show the plot
fig.show()

# fig.write_html("../output/mOSNmarker_analysis/GO_mOSNmarker_AUCoverlayed-Rdbu.html")
fig.write_html("../output/mOSNmarker_analysis/GO_mOSNmarker_AUCoverlayed-Rdbu2.html")

### Shayya
Investigate early and late mOSN markers in differentially stressed cells.

In [37]:
# Tab-separated DE table from Shayya et al., indexed by gene name.
# Alternative input kept for reference:
# stress_DE = pd.read_csv('../files/Shayya/scrna/DE_iRFPBright_vs_Dim.tsv', sep='\t')
stress_DE = pd.read_csv(
    '../files/Shayya/aracneviper/DE_scRNAseq_Bright_Dim_withinzone.tsv', sep='\t'
).set_index('gene_name')

In [38]:

plot_df = stress_DE.copy()

fig = go.Figure()
# Background layer: every gene in the DE table. The FDR filter is intentionally
# disabled, so the trace is named accordingly (it was labelled "FDR > 0.05").
fig.add_trace(go.Scatter(x=plot_df['log2FoldChange'],
                        y=-np.log10(plot_df['FDR']),
                        text=plot_df.index,
                        mode='markers',
                        name='all genes',
                        marker=dict(color = 'grey', opacity=0.3)))


# Highlight layer: mOSN markers present in the DE table.
# .copy() so the column assignments below act on an independent frame
# (this is what triggered the SettingWithCopyWarning).
temp = plot_df.loc[plot_df.index.isin(marker_df['mOSN_names'].tolist())].copy()
# color_map entries are [normalized AUC, colour]; genes without an entry get NaN / None
temp['normalized_AUC'] = [color_map[gene][0] if gene in color_map else np.nan
                          for gene in temp.index]
temp['color'] = [color_map[gene][1] if gene in color_map else None
                 for gene in temp.index]
# string form is needed for concatenation into the hover text
temp['normalized_AUC'] = temp['normalized_AUC'].astype('string')

fig.add_trace(go.Scatter(x=temp['log2FoldChange'],
                        y=-np.log10(temp['FDR']),
                        text=temp.index + '<br>Normalized onset: ' + temp['normalized_AUC'],
                        textposition='top center',
                        mode = 'markers',
                        name='mOSN markers',
                        marker=dict(color = temp['color'],
                                    opacity = 0.5,
                                    size = 15)))



# Hover labels follow the plotted data: x is log2FoldChange, y is -log10(FDR)
# (the labels were previously swapped).
fig.update_traces(
    textposition='top center',
    hovertemplate =
    '<b>%{text}</b>' +
    '<br>LogFC: %{x}'+
    '<br>-log10(FDR): %{y}<br>')

fig.update_layout(
    xaxis_range = [-1.2, 1.2],
    title='mOSN markers are differentially expressed in cells with different stress <br>\
    <span style="font-size: 8px;">From Shayya et al. DE_scRNAseq_Bright_Dim_withinzone</span>',
    yaxis = {'title' : '-np.log10(FDR)'},
    xaxis = {'title' : 'logfc'},
    template='simple_white',
    font=dict(
        size=10,  # Set the font size here
    )
)
fig.show()

# fig.write_html("../output/mOSNmarker_analysis/Shayya_Stressed_mOSNmarker.html")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [39]:
"""
Manually create a sizemap for plotting in plotly line
size map created similar as colormap. But instead of expression scaled between 0 and 1, 
pvalue is used to scale between 0 and 1. 
"""
top_mOSNmarker_pvalue_dict = {}
for i in stress_DE.iloc[stress_DE.index.isin([i for i in top_mOSNmarker])].index:
    top_mOSNmarker_pvalue_dict[i] = stress_DE.iloc[stress_DE.index == i].FDR.item()

# Scale between 0 and 4 for better visualization.
top_mOSNmarker_pvalue = [pvalue + 4 for pvalue in GE_functions.reverse_scale(list(top_mOSNmarker_pvalue_dict.values()), 
                                                                             factor = 4)]

for i in enumerate(top_mOSNmarker_pvalue_dict.keys()):
    top_mOSNmarker_pvalue_dict[i[1]] = top_mOSNmarker_pvalue[i[0]]

In [40]:
"""
Plotting bar plot of mOSN markers across ER stress logfc to
better visualize the onset phenotype over stressed cells
"""
plot_df = stress_DE.copy()

plot_df = plot_df.loc[plot_df.index.isin(marker_df['mOSN_names'].tolist())]
# Assign color_map to the marker plot_df dataframe 
for i in plot_df.index:
    plot_df.loc[plot_df.index == i, ['normalized_AUC','color']] = color_map[i]
    plot_df.loc[plot_df.index == i, ['FDR_size']] = top_mOSNmarker_pvalue_dict[i]
    
# Manually adjust significant size to much bigger
plot_df.loc[plot_df.FDR < 0.05, 'FDR_size'] = 8

    
fig = go.Figure()
for gene in plot_df.index:
    fig.add_trace(go.Scatter(
                            x = plot_df.iloc[plot_df.index == gene]['log2FoldChange'], 
                            y = plot_df.iloc[plot_df.index == gene]['normalized_AUC'],
                            text = plot_df.iloc[plot_df.index == gene].index + '<br>FDR: ' + 
                                        str(plot_df.iloc[plot_df.index == gene]['FDR'].item()),
                            textposition='top center',
                            mode = 'markers',
                            name='early markers',
                            marker=dict(color = plot_df.iloc[plot_df.index == gene]\
                                        ['color'],
                                        size = plot_df.iloc[plot_df.index == gene]\
#                                         CHANGE *VALUE HERE TO ADJUST SIZE 
                                        ['FDR_size']*5
                                       ),
                            showlegend = False
                            )
                 )

# Calculate Pearson Correlation 
temp = plot_df[plot_df.FDR < 0.05]
x = temp['log2FoldChange'].values
y = temp['normalized_AUC'].values
r, r_p = pearsonr(x, y)

# fit trendline using linear regression
coeffs = np.polyfit(x, y, 1)
# x coordinates can be adjusted here to elongate or shorten the line
trendline_x = np.array([x.min(), 
                        x.max()])
trendline_y = coeffs[0] * trendline_x + coeffs[1]
# fit a line to the data using the slope-intercept formula
m = r * (np.std(y) / np.std(x))
b = np.mean(y) - m * np.mean(x)
# Add a line for the correlation coefficient
fig.add_trace(go.Scatter(x = trendline_x,
                         y = trendline_y,
                         mode='lines', 
                         line = dict(
                             dash='dot',
                             width = 5,
                             color = 'rgba(0, 0, 0, 0.3)'
                         ),
                         name='Pearson correlation: {} <br>Pearson p-value: {}'.format(round(r,3), 
                                                                                round(r_p, 5)),
#                          showlegend=False
                        )
             )    
    

fig.update_traces( 
    textposition='top center',
    hovertemplate =
    '<b>%{text}</b>' + 
    '<br>normalized_AUC: %{y}'+
    '<br>LogFC: %{x}<br>')

fig.update_layout(
    xaxis_range = [-1.2, 1.2],
    yaxis = {'title' : 'Normalized gene onest'},
    xaxis = {'title' : 'logfc'},
    title='mOSN markers are differentially expressed in cells with different stress <br>\
    <span style="font-size: 8px;">From Shayya et al. DE_scRNAseq_Bright_Dim_withinzone</span>',
#     autosize=True,
#     width=800,
#     height=800,
    template='simple_white',
    font=dict(
        size=10,  # Set the font size here
    )
)
fig.show()
# fig.write_html("../output/mOSNmarker_analysis/Shayya_Stressed_mOSNmarker_onsetgraph.html")

Plot pairwise Pearson correlations to see if there's any relationship between genes

In [196]:
# Expression matrix for the correlation: drop the 'dpt_average' column and use
# the 'index' column as row labels. ge_normalized itself is left untouched.
temp = ge_normalized.drop(columns='dpt_average').set_index('index')

In [197]:
# Keep only the marker-gene columns, then compute the pairwise column
# correlation matrix (DataFrame.corr with its default settings).
temp = temp.loc[:, temp.columns.intersection(top_mOSNmarker)].corr()

In [198]:
# Display the gene-by-gene correlation matrix produced by temp.corr() above
temp

Unnamed: 0,1700012B09Rik,Abhd16a,Acbd7,Acsl6,Adcy3,Adipor1,Ak1,Aldoa,Ano2,Aplp1,...,Tmbim6,Tmem64,Tmem74bos,Tspan7,Ttll6,Ttll7,Ubl3,Uckl1os,Umodl1,Olfr_sum
1700012B09Rik,1.000000,0.908564,0.852780,0.685139,0.996434,0.923994,0.858726,0.978941,0.965532,0.516503,...,0.634074,0.990275,0.822655,0.990673,0.897959,0.680482,0.941570,0.754605,0.957726,0.636773
Abhd16a,0.908564,1.000000,0.978865,0.885092,0.933389,0.984099,0.967997,0.964135,0.949417,0.756441,...,0.838102,0.930564,0.970562,0.952505,0.975125,0.839040,0.977609,0.944969,0.979101,0.831110
Acbd7,0.852780,0.978865,1.000000,0.937850,0.888473,0.981500,0.952799,0.918002,0.919588,0.830316,...,0.905808,0.893901,0.997029,0.912581,0.983691,0.900213,0.970886,0.976864,0.955345,0.901344
Acsl6,0.685139,0.885092,0.937850,1.000000,0.743618,0.898347,0.919842,0.783460,0.834989,0.963163,...,0.990335,0.737578,0.935039,0.766716,0.870000,0.971439,0.881452,0.979346,0.819161,0.989413
Adcy3,0.996434,0.933389,0.888473,0.743618,1.000000,0.950355,0.893665,0.987416,0.982840,0.586067,...,0.695951,0.992897,0.859823,0.996599,0.922015,0.738018,0.964697,0.803246,0.971215,0.699208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ttll7,0.680482,0.839040,0.900213,0.971439,0.738018,0.880718,0.913679,0.745296,0.825650,0.968785,...,0.953420,0.716774,0.884745,0.744875,0.852215,1.000000,0.850277,0.928997,0.776901,0.988310
Ubl3,0.941570,0.977609,0.970886,0.881452,0.964697,0.989446,0.942157,0.978560,0.981997,0.744009,...,0.851029,0.966505,0.955514,0.977049,0.966946,0.850277,1.000000,0.923120,0.987440,0.840440
Uckl1os,0.754605,0.944969,0.976864,0.979346,0.803246,0.943398,0.940895,0.847725,0.869114,0.900455,...,0.953200,0.808270,0.977765,0.832283,0.928184,0.928997,0.923120,1.000000,0.885870,0.944165
Umodl1,0.957726,0.979101,0.955345,0.819161,0.971215,0.973863,0.920465,0.989422,0.963196,0.661562,...,0.780777,0.975041,0.943943,0.986376,0.969687,0.776901,0.987440,0.885870,1.000000,0.765842


In [200]:
import plotly.express as px

# Heatmap of the correlation matrix held in `temp` (the result of `temp.corr()`).
# Correlation coefficients are bounded to [-1, 1], so the diverging colour scale
# is pinned to that full range. Without zmin/zmax plotly stretches the scale over
# the observed values only, which moves the neutral midpoint away from 0 and makes
# weaker-but-still-positive correlations look like negative ones.
fig = px.imshow(temp,
                zmin=-1,
                zmax=1,
                color_continuous_scale='Rdbu')


# add labels and colorbar
fig.update_layout(title='Correlation Heatmap',
                  xaxis=dict(title='Features'),
                  yaxis=dict(title='Features'),
                  coloraxis_colorbar=dict(title='Correlation'))

# display plot
fig.show()

In [41]:
# Inspect the long-format frame (columns: index, gene, expression) currently bound
# to `ge_data`; in the saved output it holds only the Rtp1 rows.
ge_data

Unnamed: 0,index,gene,expression
0,0-0.3,Rtp1,0.028835
1,0.3-0.6,Rtp1,0.063968
2,0.6-0.7,Rtp1,0.176199
3,0.7-0.8,Rtp1,0.420254
4,0.8-0.9,Rtp1,0.600026
5,0.9-0.95,Rtp1,0.856352
6,0.95-1,Rtp1,1.0


In [57]:
def _add_gene_traces(fig, expression, genes, **trace_style):
    """Add one line trace per gene to ``fig`` and return the plotted frame.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        Figure that receives the traces (modified in place).
    expression : pandas.DataFrame
        Expression table handed straight to ``GE_functions.get_ge_data``.
    genes : list of str
        Genes to plot.
    **trace_style
        Extra keyword arguments forwarded to ``go.Scatter`` (e.g. ``opacity``,
        ``line``), so each group of genes can be styled independently.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``GE_functions.get_ge_data``; its ``index``,
        ``gene`` and ``expression`` columns are what gets plotted.
    """
    ge_data = GE_functions.get_ge_data(expression, genes)
    for g in ge_data['gene'].unique():
        # Filter once per gene instead of rebuilding the mask for x and for y.
        gene_rows = ge_data[ge_data['gene'] == g]
        fig.add_trace(go.Scatter(x=gene_rows['index'],
                                 y=gene_rows['expression'],
                                 mode='lines',
                                 name=g,
                                 **trace_style))
    return ge_data


# NOTE(review): judging by the file name, `mse` presumably holds each gene's mean
# squared error against the normalized Rtp1 profile - confirm against the script
# that produced the CSV.
mse = pd.read_csv('../expression_csv/mse_normalized_rtp1_GSE151346_MOE_ALL_OlfrSum.csv', index_col=0)
# The 100 genes with the smallest 'mse' value (ascending sort, first 100 labels).
top_100_genes = list(mse.transpose().sort_values(by='mse').index[0:100])


fig = go.Figure()

# Background: the top-100 genes as faint grey lines.
ge_data = _add_gene_traces(fig, ge_normalized, top_100_genes,
                           opacity=0.2,
                           line=dict(color='grey', width=5))

# Add bolded lines for specific genes (default plotly colours, thicker stroke).
ge_data = _add_gene_traces(fig, ge_normalized, ['Omp', 'Gap43', 'Olfr_sum'],
                           line=dict(width=8))

# Rtp1 is drawn last as a thick black dotted line.
# `ge_data` is left bound to the Rtp1 frame, exactly as before this refactor.
ge_data = _add_gene_traces(fig, ge_normalized, ['Rtp1'],
                           line=dict(color='black', dash='dot', width=8))

fig.update_layout(xaxis_type = 'category',template='simple_white', # update x-axis to category so that it doesn't sort the numbers
                 font=dict(
        size=10,
    ))
fig.show()

fig.write_html("../output/mOSNmarker_analysis/ge_line_mOSNmarker3.html")


ge_data constructed
ge_data constructed
ge_data constructed
