In [None]:
# %load 10_2022_load_config.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import plotly.io as pio
import yaml

# Notebook-wide display defaults for seaborn, pandas and matplotlib.
sns.set_context("notebook", font_scale=1.4)

# Show up to 100 rows/columns; floats get thousands separators and 4 decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:,.4f}".format)

plt.rcParams.update(
    {
        "figure.figsize": (16, 12),
        "savefig.dpi": 200,
        "figure.autolayout": False,
        "axes.labelsize": 18,
        "axes.titlesize": 20,
        "font.size": 16,
        "lines.linewidth": 2.0,
        "lines.markersize": 8,
        "legend.fontsize": 14,
    }
)


# Read the analysis settings from the YAML file sitting next to the notebook.
config_file = "10_2022_analysis.yaml"
with open(config_file) as file:
    # FullLoader converts the YAML document into plain Python objects
    # (here: a dictionary of settings).
    # NOTE(review): yaml.safe_load would be sufficient if the config only
    # holds plain scalars/lists/dicts — confirm before switching.
    configs = yaml.load(file, Loader=yaml.FullLoader)

# Run on server:
# All working directories are resolved relative to configs['root'].
root = Path(configs['root'])
scratchDir = root/configs['scratchDir']
figuresDir = root/configs['figuresDir']
libraries = configs['libraries']


# Named colour palette: one grey from plotly's Alphabet scale plus six
# hand-picked hex codes.
alphabetClrs = px.colors.qualitative.Alphabet
clrs = ["#f7ba65", "#bf4713", "#9c002f", "#d73d00", "#008080", "#004c4c"]
colors = {
    "grey": alphabetClrs[8],
    **dict(
        zip(
            ("light_yellow", "darko", "maroon", "brighto", "teal", "darkteal"),
            clrs,
        )
    ),
}

In [None]:
# Directory holding the earlier ("04_22") RRA result files used for comparison.
# NOTE(review): hard-coded absolute path, unlike the other directories which
# come from the YAML config — consider moving it there.
mayResultsDir = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/04_22/results")

In [None]:
# Collect every per-library RRA result file from the current and the
# earlier results directory.
rra_pattern = '*_rra_results.csv'
resultFiles = list((root / configs['resultsDir']).glob(rra_pattern))
mayFiles = list(mayResultsDir.glob(rra_pattern))

In [None]:
# Stack the per-library result tables; the library label is the file stem
# up to '_rra'. The earlier files are read with their first column as index.
current_frames = []
for path in resultFiles:
    current_frames.append(pd.read_csv(path).assign(library=path.stem.split('_rra')[0]))
fdf = pd.concat(current_frames)

may_frames = []
for path in mayFiles:
    may_frames.append(pd.read_csv(path, index_col=0).assign(library=path.stem.split('_rra')[0]))
mayDf = pd.concat(may_frames)

In [None]:
# Five random rows of the current results as a sanity check.
fdf.sample(5)

In [None]:
# Five random rows of the earlier results as a sanity check.
mayDf.sample(5)

In [None]:
# LFC distribution per library for the 'd1' contrast (current results).
df1 = fdf.loc[fdf['contrast'] == 'd1']
fig = px.histogram(
    df1,
    x='LFC',
    color='library',
    facet_col='library',
    facet_col_wrap=2,
    height=1000,
    width=800,
)
fig.add_vline(x=0)  # reference line at no change
fig

In [None]:
# LFC distribution per library for the 'd1' contrast (earlier results).
# df1 is rebound here and is what the following aggregation cell reads.
df1 = mayDf.loc[mayDf['contrast'] == 'd1']
fig = px.histogram(
    df1,
    x='LFC',
    color='library',
    facet_col='library',
    facet_col_wrap=2,
    height=1000,
    width=800,
)
fig.add_vline(x=0)  # reference line at no change
fig

In [None]:
# Per gene: in how many libraries it is a strong negative hit
# (|LFC| > 1 and neg_selection_fdr < 0.05), plus LFC summary statistics.
is_strong = df1['LFC'].abs() > 1
is_neg_significant = df1['neg_selection_fdr'] < 0.05
res = (
    df1[is_strong & is_neg_significant]
    .groupby('Name')
    .agg({'library': ['count'], 'LFC': ['median', 'mean', 'std']})
    .reset_index()
    .sort_values(('library', 'count'), ascending=False)
)

In [None]:
# Genes that pass the hit filter in more than three libraries.
res[res[('library', 'count')] > 3]

# Comparing control vs median normalization

In [None]:
# Result files produced with median normalisation vs. control normalisation.
normDir = root / 'median_norm_analysis'
contDir = root / 'control_norm_analysis'
normResultFiles = list(normDir.glob('*_rra_results.csv'))
contResultFiles = list(contDir.glob('*_rra_results.csv'))

In [None]:
# Stack all median-normalised result tables, labelling rows with the library
# name taken from the file stem (text before '_rra').
df_list = [
    pd.read_csv(path).assign(library=path.stem.split('_rra')[0])
    for path in normResultFiles
]
norm_df = pd.concat(df_list)

In [None]:
# Stack all control-normalised result tables, labelling rows with the library
# name taken from the file stem (text before '_rra').
df_list = [
    pd.read_csv(path).assign(library=path.stem.split('_rra')[0])
    for path in contResultFiles
]
cont_df = pd.concat(df_list)

In [None]:
# Median-normalised LFC distribution per library, 'd1' contrast only.
norm_d1 = norm_df.loc[norm_df['contrast'] == 'd1']
fig = px.histogram(
    norm_d1,
    x='LFC',
    color='contrast',
    facet_col='library',
    facet_col_wrap=2,
    height=1200,
    width=1000,
)
fig.add_vline(x=0)  # reference line at no change
fig

In [None]:
# Control-normalised LFC distribution per library, all contrasts overlaid.
hist_kwargs = dict(
    x='LFC',
    color='contrast',
    facet_col='library',
    facet_col_wrap=2,
    height=1200,
    width=1000,
)
fig = px.histogram(cont_df, **hist_kwargs)
fig.add_vline(x=0)  # reference line at no change
fig

In [None]:
def sig_results(df, th=1, fdr=0.01):
    """Return the rows of ``df`` that pass both an effect-size and an FDR cut-off.

    A row is kept when ``|LFC| > th`` and at least one of
    ``neg_selection_fdr`` / ``pos_selection_fdr`` is below ``fdr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``LFC``, ``neg_selection_fdr`` and
        ``pos_selection_fdr``.
    th : float, default 1
        Strict lower bound on the absolute LFC.
    fdr : float, default 0.01
        Strict upper bound on the FDR (previously hard-coded to 0.01).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` with their original index.
    """
    large_effect = df["LFC"].abs() > th
    significant = (df["neg_selection_fdr"] < fdr) | (df["pos_selection_fdr"] < fdr)
    return df[large_effect & significant]

In [None]:
# Significant hits (default thresholds) for each of the three result sets.
norm_res, cont_res, may_res = (
    sig_results(frame) for frame in (norm_df, cont_df, mayDf)
)

In [None]:
# Ten rows with the smallest negative-selection FDR (control normalisation).
cont_df.sort_values('neg_selection_fdr').head(10)

In [None]:
# Rows ranked 196-200 by negative-selection FDR (median normalisation).
norm_df.sort_values('neg_selection_fdr').head(200).tail()

In [None]:
# (rows, columns) of the significant hits in the earlier results.
may_res.shape

In [None]:
# (rows, columns) of the significant hits under control normalisation.
cont_res.shape

In [None]:
import random

In [None]:
# NOTE(review): this cell is an exact repeat of the earlier `res` cell.
# df1 has not been reassigned in between, so it recomputes the same table
# (per-gene count of libraries with |LFC| > 1 and neg_selection_fdr < 0.05).
res = (df1[(abs(df1.LFC) > 1) & (df1.neg_selection_fdr < 0.05)]
       .groupby('Name')
       .agg({'library':['count'], 'LFC':['median', 'mean', 'std']})
       .reset_index()
      .sort_values(('library', 'count'), ascending=False))

In [None]:
# Names without ':' that are significant with LFC > 1 in contrast 'd4'
# (control normalisation).
cont_keep = (
    ~cont_res.Name.str.contains(":")
    & (cont_res.LFC > 1)
    & (cont_res.contrast == 'd4')
)
set1 = set(cont_res.loc[cont_keep, 'Name'].unique())

In [None]:
# Names without ':' that are significant with LFC > 1 in contrast 'd2'
# (earlier results).
may_keep = (
    ~may_res.Name.str.contains(":")
    & (may_res.LFC > 1)
    & (may_res.contrast == 'd2')
)
set2 = set(may_res.loc[may_keep, 'Name'].unique())

In [None]:
# Names present in set1 but absent from set2.
set1 - set2

In [None]:
# Scatter the LFC of two genes against each other, one point per
# (contrast, library), using the earlier results.
v1, v2 = 'stbA', 'stbB'
pair_cols = ['Name', 'LFC', 'contrast', 'library']
df1 = mayDf.loc[mayDf['Name'] == v1, pair_cols]
df2 = mayDf.loc[mayDf['Name'] == v2, pair_cols]
test = (
    pd.concat([df1, df2])
    .pivot(index=['contrast', 'library'], columns='Name', values='LFC')
    .reset_index()
)
px.scatter(test, x=v1, y=v2, trendline='ols', hover_data=['library', 'contrast'])

In [None]:
# Number of non-null 'contrast' entries per name (names containing ':' excluded).
no_colon = ~cont_df['Name'].str.contains(':')
num_days = cont_df[no_colon].groupby('Name')['contrast'].count().reset_index()

In [None]:
# Distribution of the per-name row counts computed above.
px.histogram(num_days, x='contrast')

In [None]:
# Same gene-pair scatter (v1 vs v2 as set earlier), now on the
# control-normalised results.
pair_cols = ['Name', 'LFC', 'contrast', 'library']
df1 = cont_df.loc[cont_df['Name'] == v1, pair_cols]
df2 = cont_df.loc[cont_df['Name'] == v2, pair_cols]
test = (
    pd.concat([df1, df2])
    .pivot(index=['contrast', 'library'], columns='Name', values='LFC')
    .reset_index()
)
px.scatter(test, x=v1, y=v2, trendline='ols', hover_data=['library', 'contrast'])

# Calculating pairwise correlations between genes

In [None]:
# Wide table: one row per (contrast, library), one column per name, values = LFC.
# Names containing ':' are excluded and only the 'd1' contrast is kept.
gene_rows = cont_df.loc[
    ~cont_df.Name.str.contains(':'), ['Name', 'LFC', 'contrast', 'library']
]
d1_rows = gene_rows[gene_rows.contrast.isin(['d1'])]
test = d1_rows.pivot(index=['contrast', 'library'], columns='Name', values='LFC')

# Numeric day parsed from the contrast label ('d1' -> 1).
# NOTE: with only 'd1' kept this column is constant, so its correlations are
# NaN and are removed by the dropna() below.
test['day'] = [int(contrast.strip('d')) for contrast, _library in test.index]

# Pairwise correlations (at least 8 shared observations), reshaped to long
# form with columns 'Name', 'gene' and 0 (the coefficient).
test_corr = (
    test.corr(min_periods=8)
    .rename_axis(index='gene')
    .unstack()
    .reset_index()
    .dropna()
)

In [None]:
# Random look at strongly correlated pairs (coefficient > 0.9).
# The sample size is capped at the number of matching rows, because
# DataFrame.sample raises ValueError when asked for more rows than exist.
high_corr = test_corr[test_corr[0] > 0.9]
high_corr.sample(min(20, len(high_corr)))

In [None]:
# Entries whose |correlation| with 'invA' exceeds 0.9.
invA_strong = (test_corr['Name'] == 'invA') & (test_corr[0].abs() > 0.9)
to_show = test_corr.loc[invA_strong, 'gene'].unique()

In [None]:
z = y.groupby('library').LFC.mean()

In [None]:
order = y.groupby('library').LFC.mean().sort_values().index.to_list()

In [None]:
y = cont_df[(cont_df.Name.isin(to_show)) & (cont_df.contrast == 'd1')]

px.scatter(y, x='library', y=2**y['LFC'], color='Name', category_orders= {'library': order})

In [None]:
# First rows of the control-normalised table.
cont_df.head()

In [None]:
x = cont_df[cont_df.contrast == 'd1'].copy()
x = x[~x.Name.str.contains(':')]
x=x.pivot(index='library', columns='Name', values='LFC').fillna(0)
x[x != 0] = 1
x = x.loc[:, x.sum().between(2,8)]
x = x.merge(z, left_index=True, right_index=True)

In [None]:
t = x['LFC']
x = x.drop('LFC', axis=1)

In [None]:
# NOTE(review): imports in the middle of the notebook; Lasso is not used in
# any visible cell.
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score
# Ordinary least squares of the per-library mean LFC (t) on the
# presence/absence matrix (x).
# NOTE(review): x has one row per library; if it has more columns than rows
# the fit is underdetermined — TODO confirm.
regr = LinearRegression()
regr.fit(x, t)

In [None]:
# Largest fitted coefficient.
max(regr.coef_)

In [None]:
# Smallest fitted coefficient.
min(regr.coef_)

In [None]:
# Predict on the same rows the model was fitted on and report R^2
# (an in-sample score). The variable name is kept because a later cell
# displays it.
diabetes_y_pred = regr.predict(x)
r2_score(y_true=t, y_pred=diabetes_y_pred)

In [None]:
# Regression target: per-library mean LFC.
t

In [None]:
# Fitted values from the linear model, for comparison with t.
diabetes_y_pred

In [None]:
# FIXME(review): `test_neg` is not defined in any visible cell; on a fresh
# kernel this raises NameError. Presumably a filtered view of test_corr —
# confirm and restore its definition.
test_neg.groupby('Name').gene.count().sort_values()


In [None]:
# Correlation of one gene with the 'day' column.
# NOTE(review): test_corr was built from the 'd1' contrast only, so 'day' is
# constant and its (NaN) correlations were removed by dropna(); this
# selection is therefore empty unless more contrasts are included upstream.
g = 'envR'
test_corr[(test_corr.Name == g) & (test_corr.gene == 'day')]

In [None]:
# Random look at entries with |correlation| > 0.8 to gene `g`.
# The sample size is capped at the number of matching rows, because
# DataFrame.sample raises ValueError when asked for more rows than exist.
g_corr = test_corr[(test_corr.Name == g) & (test_corr[0].abs() > 0.8)]
g_corr.sample(min(20, len(g_corr)))

In [None]:
# Gene-pair scatter for invG vs invA on the control-normalised results,
# coloured by library and with one marker symbol per contrast.
v1, v2 = 'invG', 'invA'
pair_cols = ['Name', 'LFC', 'contrast', 'library']
df1 = cont_df.loc[cont_df['Name'] == v1, pair_cols]
df2 = cont_df.loc[cont_df['Name'] == v2, pair_cols]
test = (
    pd.concat([df1, df2])
    .pivot(index=['contrast', 'library'], columns='Name', values='LFC')
    .reset_index()
)
px.scatter(test, x=v1, y=v2, color='library', symbol='contrast', hover_data=['library', 'contrast'])

In [None]:
# Join the control-normalised results with `old_res` on name/library/contrast;
# shared columns get the default _x (cont_df) and _y (old_res) suffixes.
# FIXME(review): `old_res` is not defined in any visible cell (NameError on a
# fresh kernel). This also rebinds `fdf`, which earlier held the loaded results.
fdf = cont_df.merge(old_res, on=['Name', 'library', 'contrast'])

In [None]:
# Hit flag for the _x side: |LFC| > 1 and either selection FDR < 0.01.
fdr_ok_x = fdf['neg_selection_fdr_x'].lt(0.01) | fdf['pos_selection_fdr_x'].lt(0.01)
fdf['hits_x'] = fdf['LFC_x'].abs().gt(1) & fdr_ok_x

In [None]:
# Hit flag for the _y side: |LFC| > 1 and either selection FDR < 0.01.
fdr_ok_y = fdf['neg_selection_fdr_y'].lt(0.01) | fdf['pos_selection_fdr_y'].lt(0.01)
fdf['hits_y'] = fdf['LFC_y'].abs().gt(1) & fdr_ok_y

In [None]:
# LFC from the two merged result sets against each other, one panel per library.
scatter_kwargs = dict(
    x='LFC_x',
    y='LFC_y',
    color='contrast',
    hover_data=['Name', 'library'],
    facet_col='library',
    facet_col_wrap=2,
    height=1600,
    width=1000,
)
px.scatter(fdf, **scatter_kwargs)

In [None]:
# Number of distinct names in the control-normalised table.
cont_df.Name.nunique()

In [None]:
# Number of hits on each side of the merge, per library and contrast.
fdf.groupby(['library', 'contrast']).agg({'hits_x': ['sum'], 'hits_y': ['sum']})

In [None]:
# Rows of library_12_1 that are hits on the _x side but not on the _y side.
# The sample size is capped at the number of matching rows, because
# DataFrame.sample raises ValueError when asked for more rows than exist.
only_x = fdf[(fdf.library == 'library_12_1') & (fdf.hits_x == True) & (fdf.hits_y != True)]
only_x.sample(min(50, len(only_x)))

In [None]:
# Skewness of the _y-side LFC values.
fdf.LFC_y.skew()

In [None]:
# Display the control-normalised table (pandas truncates long output).
cont_df

In [None]:
def _standardise(values):
    """Centre on the mean and scale by the standard deviation."""
    return (values - values.mean()) / values.std()


# Z-score the LFC within each (library, contrast) group. The column name
# 'LFC_ajd' is kept as spelled because later cells read it.
cont_df['LFC_ajd'] = cont_df.groupby(['library', 'contrast'])['LFC'].transform(_standardise)

In [None]:
# Median LFC of library_10_1 in contrast 'd2'.
cont_df[(cont_df.library == 'library_10_1') & (cont_df.contrast == 'd2')].LFC.median()

In [None]:
# Raw vs. standardised LFC of library_10_1 / 'd3', drawn as two histograms
# in the same cell.
lib10_d3 = cont_df[(cont_df.library == 'library_10_1') & (cont_df.contrast == 'd3')]
lib10_d3['LFC'].hist(bins=100)
lib10_d3['LFC_ajd'].hist(bins=100)

In [None]:
# Raw and standardised LFC side by side.
cont_df[['LFC', 'LFC_ajd']]

In [None]:
# Standardised LFC distribution per library, all contrasts overlaid.
adj_hist_kwargs = dict(
    x='LFC_ajd',
    color='contrast',
    facet_col='library',
    facet_col_wrap=2,
    height=1200,
    width=1000,
)
fig = px.histogram(cont_df, **adj_hist_kwargs)
fig.add_vline(x=0)  # reference line at zero
fig