In [None]:
# %load 10_2022_load_config.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import plotly.io as pio
import yaml

# Seaborn's context is applied first; the explicit matplotlib settings below
# are applied afterwards so they take precedence for any key both touch.
sns.set_context("notebook", font_scale=1.4)

# pandas display limits and number formatting (thousands separator, 4 decimals).
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# Figure defaults for this notebook.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    "savefig.dpi": 200,
    "figure.autolayout": False,
    "axes.labelsize": 18,
    "axes.titlesize": 20,
    "font.size": 16,
    "lines.linewidth": 2.0,
    "lines.markersize": 8,
    "legend.fontsize": 14,
})
pd.options.display.float_format = '{:,.4f}'.format


# Paths and input file names for this analysis are kept in a YAML file.
config_file = "10_2022_analysis.yaml"
with open(config_file) as file:
    # FullLoader converts the YAML document into plain Python objects;
    # the result is used below as a dictionary of settings.
    # NOTE(review): yaml.load is only appropriate for trusted files —
    # consider yaml.safe_load if the config can come from elsewhere.
    configs = yaml.load(file, Loader=yaml.FullLoader)

# Run on server: base directories come from the config.
root, scratchDir, figuresDir = (
    Path(configs[key]) for key in ('root', 'scratchDir', 'figuresDir')
)

# Named colours for figures: one entry taken from Plotly's qualitative
# "Alphabet" sequence plus six fixed hex codes.
alphabetClrs = px.colors.qualitative.Alphabet
clrs = ["#f7ba65", "#bf4713", "#9c002f", "#d73d00", "#008080", "#004c4c"]
colors = dict(
    grey=alphabetClrs[8],
    light_yellow=clrs[0],
    darko=clrs[1],
    maroon=clrs[2],
    brighto=clrs[3],
    teal=clrs[4],
    darkteal=clrs[5],
)

# Calculate correlations for each/some samples

1. Load sample data
2. Choose samples to analyze (by dnaid)
    - Choosing dnaid2023 and dnaid2016
3. Load counts for that dnaid
4. Load control file
5. Calculate correlation

6. In parallel, run mbarq for the same dnaid so that the correlations it reports can be compared with the ones calculated here.

In [None]:
# Sample table; the file name comes from the YAML config and is resolved relative to root.
sampleData = pd.read_csv(root/configs['sampleDataFile'])

In [None]:
# Number of distinct sampleIDs in each dnaid.
sampleData.groupby('dnaid')['sampleID'].nunique()

In [None]:
# The control file is read without a header row, so the column names are supplied here.
controls = pd.read_csv(root/configs['controlFile'], header=None, names=['barcode', 'conc', 'genotype'])

In [None]:
#controls[['barcode', 'conc', 'genotype']].to_csv(root/configs['controlFile'], header=None, index=False)

In [None]:
# Keep only the control rows whose genotype is 'wt'.
controls = controls.loc[controls['genotype'] == 'wt']

In [None]:
# Inspect the control table after the genotype == 'wt' filter.
controls

In [None]:
def calc_corr(cdf, controls):
    """Correlate each sample's control-barcode abundance with control concentration.

    Every count column is scaled to counts-per-million, a 0.5 pseudo-count is
    added, and the result is log2-transformed. The transformed values are
    left-joined onto ``controls`` by ``barcode``, ``conc`` is log2-transformed,
    and the pairwise correlation of each remaining numeric column with
    ``log2(conc)`` is computed (``DataFrame.corr`` defaults).

    Parameters
    ----------
    cdf : pandas.DataFrame
        Count table with a ``barcode`` column, a ``Name`` column (dropped) and
        numeric count columns.
    controls : pandas.DataFrame
        Control table with at least ``barcode`` and ``conc`` columns. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column other than ``conc`` in the merged table,
        with columns ``R`` (correlation with ``log2(conc)``) and ``R2``
        (``R`` squared).
    """
    counts = cdf.set_index('barcode').drop('Name', axis=1)
    # Counts-per-million per column; the 0.5 pseudo-count keeps zero counts
    # finite after the log transform.
    log_cpm = np.log2(counts / counts.sum() * 1000000 + 0.5).reset_index()

    # Left join: control barcodes absent from the counts become NaN rows.
    merged = controls.merge(log_cpm, how='left', on='barcode')
    merged['conc'] = np.log2(merged['conc'])

    # Select numeric columns explicitly. Text columns such as 'barcode' and
    # 'genotype' make DataFrame.corr() raise on pandas >= 2.0, where
    # non-numeric columns are no longer dropped silently.
    numeric = merged.select_dtypes(include='number')

    # Remove the self-correlation by label instead of relying on 'conc'
    # being the first numeric column.
    r_values = numeric.corr()['conc'].drop('conc')

    result = pd.DataFrame({'R': r_values})
    result['R2'] = result['R'] ** 2
    return result

In [None]:
# Merged count tables for the two chosen dnaids, read from the configured counts directory.
cdf23 = pd.read_csv(root/configs['countsDir']/'dnaid2023_mbarq_merged_counts.csv')
cdf16 = pd.read_csv(root/configs['countsDir']/'dnaid2016_mbarq_merged_counts.csv')



In [None]:
# R and R2 of every count column against the control concentrations, per dnaid.
cdf23_cor = calc_corr(cdf23, controls)
cdf16_cor = calc_corr(cdf16, controls)

In [None]:
# Inspect the correlations calculated in this notebook for dnaid2016.
cdf16_cor

In [None]:
# Correlation tables read from root/tmp for the same two dnaids; the first CSV column becomes the index.
# NOTE(review): the 'tmp/' location is hard-coded here, unlike the other inputs, which come from the YAML config — confirm this is intended.
mcor23 = pd.read_csv(root/'tmp/dnaid2023_mbarq_merged_counts.correlations.csv', index_col=0)
mcor16 = pd.read_csv(root/'tmp/dnaid2016_mbarq_merged_counts.correlations.csv', index_col=0)

In [None]:
# Inspect the correlations loaded from file for dnaid2016.
mcor16

In [None]:
# Align the loaded correlations (left) with the ones calculated here (right)
# on their index. Columns present in both frames get pandas' default
# suffixes: _x for the left frame, _y for the right frame.
df23 = pd.merge(mcor23, cdf23_cor, left_index=True, right_index=True)
df16 = pd.merge(mcor16, cdf16_cor, left_index=True, right_index=True)

In [None]:
# Agreement check for dnaid2023. R_x is the R column of the left frame in the
# merge (the loaded mbarq correlations file), R_y is the R column of the right
# frame (calc_corr output).
px.scatter(
    df23,
    x='R_x',
    y='R_y',
    title='dnaid2023: control correlation, mbarq file vs. calc_corr',
    labels={'R_x': 'R (mbarq correlations file)', 'R_y': 'R (calc_corr)'},
)

In [None]:
# Agreement check for dnaid2016. R_x is the R column of the left frame in the
# merge (the loaded mbarq correlations file), R_y is the R column of the right
# frame (calc_corr output).
px.scatter(
    df16,
    x='R_x',
    y='R_y',
    title='dnaid2016: control correlation, mbarq file vs. calc_corr',
    labels={'R_x': 'R (mbarq correlations file)', 'R_y': 'R (calc_corr)'},
)

In [None]:
# Everything looks good