In [None]:
# %load 10_2022_load_config.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import plotly.io as pio
import yaml

# Notebook-wide display defaults: seaborn context, pandas table limits and number
# format, and matplotlib figure/font/line sizes.
sns.set_context("notebook", font_scale=1.4)

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# Floats are rendered with thousands separators and four decimals.
pd.set_option('display.float_format', '{:,.4f}'.format)

plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
})


config_file = "10_2022_analysis.yaml"

# Parse the YAML analysis config into plain Python objects (dicts, lists, scalars).
# NOTE(review): yaml.FullLoader can build more than plain scalars/collections;
# prefer yaml.safe_load if this file could ever come from an untrusted source.
with open(config_file) as file:
    configs = yaml.load(file, Loader=yaml.FullLoader)

# Run on server: every directory below is resolved against the configured root.
root = Path(configs['root'])
scratchDir = root.joinpath(configs['scratchDir'])
figuresDir = root.joinpath(configs['figuresDir'])
libraries = configs['libraries']


# Named colour lookup: 'grey' is taken from plotly's qualitative Alphabet palette,
# the remaining names map one-to-one (in order) onto the hand-picked hex codes.
alphabetClrs = px.colors.qualitative.Alphabet
clrs = ["#f7ba65", "#bf4713", "#9c002f", "#d73d00", "#008080", "#004c4c"]
colors = {
    'grey': alphabetClrs[8],
    **dict(zip(
        ['light_yellow', 'darko', 'maroon', 'brighto', 'teal', 'darkteal'],
        clrs,
    )),
}

from sklearn.decomposition import PCA

In [None]:
# Show the `libraries` entry that was read from the config file.
libraries

In [None]:
# For each library, check that every sampleID listed in the sample data file has a matching column in the merged count file.

In [None]:
# Directory holding the merged count tables, resolved against the configured root.
countsDir = root/configs['countsDir']
#sampleDir = root/configs['sampleDataDir']
# Path.glob yields entries in arbitrary, filesystem-dependent order; sorting keeps
# countFiles[0] and the concatenation order of the count tables stable between runs.
countFiles = sorted(countsDir.glob("*merged_counts.csv"))

In [None]:
# Sample metadata table; its location comes from the 'sampleDataFile' config entry.
fsdf = pd.read_csv(root.joinpath(configs['sampleDataFile']))

In [None]:
def check_count_merge(cnt_files, fsdf):
    """Report, per library, whether every expected sample has a count column.

    For each merged count file the library name is taken from the file stem
    (everything before ``'_mbarq'``). The sample IDs recorded for that library
    in ``fsdf`` are then looked up among the count file's column names.

    Parameters
    ----------
    cnt_files : iterable of pathlib.Path
        Merged count CSV files. The first two columns are treated as
        identifier columns and are not considered sample columns.
    fsdf : pandas.DataFrame
        Sample data with at least the columns ``library`` and ``sampleID``.

    Returns
    -------
    None
        Results are printed: one ``"<library>: <bool>"`` line per file,
        followed by the missing sample IDs (if any) or a warning when the
        sample data lists no samples for that library, then ``"All Done"``.
    """
    for cnt_file in cnt_files:
        name = cnt_file.stem.split('_mbarq')[0]
        # nrows=0 parses only the header, so large count tables are not loaded
        # just to inspect their column names; the first two columns are the
        # identifier columns and are skipped.
        count_columns = set(pd.read_csv(cnt_file, nrows=0).columns[2:])
        expected = fsdf.loc[fsdf.library == name, 'sampleID'].unique()
        missing = [sample for sample in expected if sample not in count_columns]
        all_samples = not missing
        print(f"{name}: {all_samples}")
        if len(expected) == 0:
            # Without this, an unmatched library name would look like a pass.
            print(f"  warning: no samples listed for library '{name}' in the sample data")
        elif missing:
            print(f"  missing from {cnt_file.name}: {missing}")
    print('All Done')

In [None]:
# Run the sample-ID check on every merged count file against the sample data table.
check_count_merge(countFiles, fsdf)

In [None]:
# Ad-hoc look at the first count file path; `file` is reassigned by the
# `for file in countFiles` loop that builds the combined count table.
file = countFiles[0]
file

In [None]:
# Build one long-format table (barcode, Name, library, sampleID, mbarq_count)
# holding log2 counts-per-million for every merged count file.
df_list = []
for file in countFiles:
    # Library name = file stem up to the '_mbarq' suffix; melt turns the
    # per-sample count columns into (sampleID, mbarq_count) rows.
    df = (
        pd.read_csv(file)
        .assign(library=file.stem.split("_mbarq")[0])
        .melt(id_vars=['barcode', 'Name', 'library'],
              var_name='sampleID', value_name='mbarq_count')
    )
    # Counts per million are scaled by each sample's own total. Dividing by
    # the sum of the whole melted column pooled all samples of the library
    # and left differences in sequencing depth between samples uncorrected.
    # A sample whose counts are all zero yields NaN here (0 / 0).
    sample_totals = df.groupby('sampleID')['mbarq_count'].transform('sum')
    df['mbarq_count'] = np.log2(df['mbarq_count'] / sample_totals * 1000000 + 0.5)
    df_list.append(df)

fdf = pd.concat(df_list)

In [None]:
# Control barcode table: the file has no header row, so the three columns are
# named explicitly after loading.
controls = (
    pd.read_csv(root/"sample_data"/configs['controlFile'], header=None)
    .set_axis(['barcode', 'conc', 'genotype'], axis=1)
)

In [None]:
# Attach the per-sample abundances to each control barcode (left join on barcode).
# NOTE(review): this cell rebinds `controls`, so running it twice merges twice.
controls = pd.merge(controls, fdf, how='left', on='barcode')

In [None]:
# Attach the sample metadata (left join on sampleID). Both inputs carry a
# `library` column, so pandas suffixes them to `library_x` (from the counts)
# and `library_y` (from the sample data).
# The former `copy=False` argument was dropped: it does not affect the merged
# result and the keyword is deprecated in recent pandas releases.
controls = controls.merge(fsdf, on='sampleID', how='left')

In [None]:
# Control rows whose `day` value is 'd1'.
c1 = controls.loc[controls['day'] == 'd1']

In [None]:
# Of those, keep the rows whose genotype is 'hyb'.
h1 = c1.loc[c1['genotype'] == 'hyb']

In [None]:
# Restrict to a single library; `library_x` is the library column that came
# from the count table in the earlier merge.
h1 = h1.loc[h1['library_x'] == 'library_10_1']

In [None]:
# Gene of interest whose abundance is compared against the control barcodes.
goi = 'hha'

# Per-sample abundance rows for that gene in library_10_1, with the value
# column renamed to 'goi' so it can sit next to the control's mbarq_count.
test = (
    fdf.loc[(fdf['Name'] == goi) & (fdf['library'] == 'library_10_1'),
            ['sampleID', 'mbarq_count']]
    .rename(columns={'mbarq_count': 'goi'})
)

In [None]:
# List the distinct values of the `Name` column in the combined count table.
fdf.Name.unique()

In [None]:
# Add the gene-of-interest abundance to each control row (left join on sampleID).
# NOTE(review): this cell rebinds `h1`, so running it twice merges twice.
h1 = pd.merge(h1, test, how='left', on='sampleID')

In [None]:
# Distinct `conc` values present among the selected control rows.
h1.conc.unique()

In [None]:
# Keep only control rows with `conc` strictly above 1.5e-04.
h1 = h1.loc[h1['conc'] > 1.5e-04]

In [None]:
# Display the filtered control table (pandas truncates it at display.max_rows).
h1

In [None]:
# Scatter of the control barcode value (`mbarq_count`) against the
# gene-of-interest value (`goi`) with an ordinary-least-squares trendline.
px.scatter(h1, x='mbarq_count', y='goi', trendline='ols')

In [None]:
# Histogram (25 bins) of the per-mouse median of `mbarq_count`.
# NOTE(review): reset_index() turns `mouse` into a regular column, so
# DataFrame.hist would also draw a panel for it if it is numeric — confirm.
h1.groupby('mouse').mbarq_count.median().reset_index().hist(bins=25)

In [None]:
# Collapse barcodes to one value per (library, sample, gene): rows without a
# `Name` are dropped and the remaining `mbarq_count` values are summed.
# NOTE(review): `mbarq_count` is already log2-transformed at this point, so this
# adds up log values rather than counts — confirm that is intended.
fdf_gene = (
    fdf.dropna(subset=['Name'])
    .groupby(['library', 'sampleID', 'Name'], as_index=False)['mbarq_count']
    .sum()
)

In [None]:
# Drop every row whose `Name` contains a colon.
fdf_gene = fdf_gene.loc[~fdf_gene['Name'].str.contains(":")]

In [None]:
# Spot-check up to ten random rows. The fixed seed makes the displayed rows
# reproducible across runs, and the min() guard avoids the ValueError that
# sample(10) raises when the table has fewer than ten rows.
fdf_gene.sample(min(10, len(fdf_gene)), random_state=42)

In [None]:
# Per-library PCA (3 components) on the 100 highest-variance genes; the sample
# scores of every library are collected in `pca_dfs`.
# NOTE(review): `mbarq_count` in fdf_gene is already log2-transformed upstream,
# yet it is rescaled and log2-transformed again below — confirm this is intended.
pca_dfs = []
for library in libraries:
    pca = PCA(n_components=3)
    lib_df = fdf_gene.loc[fdf_gene['library'] == library]
    # Wide matrix: one row per sample, one column per gene; gaps become 0.
    gene_pca = lib_df.pivot(index=['sampleID'], columns='Name', values='mbarq_count').fillna(0)
    # DataFrame.sum() totals each column, so every gene is divided by its own
    # total across samples before the log2 transform.
    gene_pca = np.log2(gene_pca.div(gene_pca.sum()) * 1000000 + 0.5)
    # Keep the 100 genes with the largest variance across samples.
    genes = gene_pca.var(axis=0).sort_values(ascending=False).head(100).index
    gene_pca = gene_pca[genes]
    pca_df = pd.DataFrame(
        pca.fit_transform(gene_pca),
        index=gene_pca.index,
        columns=['pc1', 'pc2', 'pc3'],
    )
    print(library)
    print(pca.explained_variance_ratio_)
    pca_dfs.append(pca_df)

In [None]:
# Stack the per-library PCA scores and join the sample metadata on sampleID
# (inner join, the merge default).
pc = pd.concat(pca_dfs).merge(fsdf, on='sampleID')

In [None]:
# Display the PCA scores joined with the sample metadata.
pc

In [None]:
# pc1 vs pc2 coloured by day, leaving out library_14_2 and library_10_2.
px.scatter(
    pc.loc[~pc['library'].isin(['library_14_2', 'library_10_2'])],
    x='pc1',
    y='pc2',
    color='day',
    width=1000,
    height=1000,
)

In [None]:
# Per-library, per-day summary of each principal component: mean and standard
# deviation. pc1 was previously aggregated with the median while its column was
# labelled `pc1_mean`; it now uses the mean, matching its label and pc2/pc3.
# Named aggregation also ties each output column to its statistic, so the
# labels can no longer drift from what is computed.
pc_sum = (
    pc.groupby(['library', 'day'])
    .agg(
        pc1_mean=('pc1', 'mean'), pc1_std=('pc1', 'std'),
        pc2_mean=('pc2', 'mean'), pc2_std=('pc2', 'std'),
        pc3_mean=('pc3', 'mean'), pc3_std=('pc3', 'std'),
    )
    .reset_index()
)

# Numeric day for the labels 'd0'..'d4'; other labels are left unchanged.
pc_sum['day_num'] = pc_sum.day.replace({'d0':0, 'd1':1, 'd2':2, 'd3':3, 'd4': 4})

In [None]:
# Line plot of the `pc1_mean` column of pc_sum across days, one line per library.
px.line(pc_sum.sort_values('day'), x='day', y='pc1_mean', color='library', template='plotly_white', width=400, height=400 )

In [None]:
# Box plot of the per-library `pc1_mean` values of pc_sum, grouped by day.
px.box(pc_sum.sort_values('day'), x='day', y='pc1_mean', template='plotly_white', width=400, height=400 )

In [None]:
# Box plot of the per-library `pc2_mean` values of pc_sum, grouped by day.
px.box(pc_sum.sort_values('day'), x='day', y='pc2_mean', template='plotly_white', width=400, height=400 )

In [None]:
# Line plot of the per-sample pc1 scores across days. The frame `pc1` is never
# defined in this notebook (NameError on a fresh kernel); `pc` is the table
# that holds both the `pc1` and `day` columns.
px.line(pc.sort_values('day'), x='day', y='pc1')