In [None]:
# %load load_manuscript_data.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import plotly.io as pio
import yaml

# Notebook-wide display defaults: seaborn context first, then explicit
# matplotlib overrides, then pandas table formatting.
sns.set_context("notebook", font_scale=1.4)

plt.rcParams.update({
    "figure.figsize": (16, 12),
    "savefig.dpi": 200,
    "figure.autolayout": False,
    "axes.labelsize": 18,
    "axes.titlesize": 20,
    "font.size": 16,
    "lines.linewidth": 2.0,
    "lines.markersize": 8,
    "legend.fontsize": 14,
})

# Show up to 100 rows/columns and print floats with thousands separators
# and four decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:,.4f}".format)


config_file = "manuscript_config.yaml"
# NOTE(review): yaml.FullLoader can construct more than plain scalars and
# containers; if this config could ever come from an untrusted source,
# switch to yaml.safe_load.
with open(config_file) as file:
    # Parse the YAML document into nested Python dictionaries.
    configs = yaml.load(file, Loader=yaml.FullLoader)

# Select the path set for the machine the notebook runs on.
run_on = "server"
root, scratchDir, figuresDir = (
    Path(configs[section][run_on])
    for section in ("root", "scratchDir", "figuresDir")
)

# Named colour palette: 'grey' comes from plotly's qualitative Alphabet
# palette, the remaining names map one-to-one onto the hex codes in `clrs`.
alphabetClrs = px.colors.qualitative.Alphabet
clrs = ["#f7ba65", "#bf4713", "#9c002f", "#d73d00", "#008080", "#004c4c"]
colors = {
    'grey': alphabetClrs[8],
    **dict(zip(
        ('light_yellow', 'darko', 'maroon', 'brighto', 'teal', 'darkteal'),
        clrs,
    )),
}

# Second named palette, kept as name -> hex code pairs in display order.
sushi_colors = dict([
    ('red', '#C0504D'),
    ('orange', '#F79646'),
    ('medSea', '#4BACC6'),
    ('black', '#000000'),
    ('dgreen', '#00B04E'),
    ('lgreen', '#92D050'),
    ('dblue', '#366092'),
    ('lblue', '#95B3D7'),
])

In [None]:
# Resolve the files listed under the 'nguyen' config section against `root`.
nguyenConfig = configs['nguyen']
(countsFile,
 resultsFile,
 sampleDataFile,
 publishedResultsFile,
 publishedPhenotypesFile) = (
    root / nguyenConfig[key]
    for key in ('countsFile', 'resultsFile', 'sampleDataFile',
                'publishedResultsFile', 'publishedPhenotypesFile')
)
# The 'resultsFileName' entry is currently not read.
# mapFile is nested one level deeper; the 'filtered' variant is used.
mapFile = root / nguyenConfig['mapFile']['filtered']

In [None]:
# Display the resolved results-file path as a sanity check.
resultsFile

In [None]:
# List the files in this hard-coded results directory using IPython's `%ls`.
# NOTE(review): absolute, machine-specific path — consider deriving it from the config.
%ls /nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/lifeer/14_02_23_rerunning_mbarq/data/results

In [None]:
# Sample metadata table; its sampleID column defines the list of samples.
sample_data = pd.read_csv(filepath_or_buffer=sampleDataFile)
samples = [*sample_data['sampleID'].values]

In [None]:
# Control-barcode table; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path — consider deriving it from the config.
controls = pd.read_csv(
    filepath_or_buffer="/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/lifeer/14_02_23_rerunning_mbarq/data/results/nguyen_2020_l50_mbarq_merged_counts_control_counts.csv",
    index_col=0,
)

In [None]:
# Keep the two descriptor columns plus one column per sample in `samples`.
controls = controls.loc[:, ['barcode', 'concentration', *samples]]

In [None]:
# Display the control table (barcode, concentration and the sample columns).
controls

In [None]:
# Correlations table; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path — consider deriving it from the config.
correlations = pd.read_csv(
    filepath_or_buffer="/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/lifeer/14_02_23_rerunning_mbarq/data/results/nguyen_2020_l50_mbarq_merged_counts_correlations.csv",
    index_col=0,
)

In [None]:
 # Keep only the rows of `correlations` labelled by the entries of `samples`,
 # in that order.
 # NOTE(review): this line carries a stray leading space; IPython tolerates it,
 # but it would be an IndentationError if the notebook were exported as a script.
 correlations = correlations.loc[samples]

In [None]:
# Display the sample metadata table.
sample_data

In [None]:
def plot_cor(sample):
    """Scatter one sample's control values against log2 control concentration.

    Reads the notebook-level ``sample_data`` and ``controls`` tables.

    Parameters
    ----------
    sample : str
        A value found in ``sample_data.sampleID`` that is also a column of
        ``controls``.

    Returns
    -------
    The figure returned by ``px.scatter`` (400x400, OLS trendline), titled
    with the space-joined values of the sample's metadata row.

    Raises
    ------
    IndexError
        If ``sample`` has no row in ``sample_data``.
    """
    rows = sample_data[sample_data.sampleID == sample]
    if len(rows) == 0:
        raise IndexError(f"sample {sample!r} not found in sample_data.sampleID")
    # Metadata values may be numeric or NaN; str() keeps the join from
    # raising TypeError on non-string cells.
    t = " ".join(str(value) for value in rows.values[0])
    fig = px.scatter(x=np.log2(controls['concentration']), y=controls[sample],
                     trendline='ols', title=t, width=400, height=400)
    return fig

In [None]:
import plotly.offline as ofl

In [None]:
# Rebuild the sample list from the control-table columns whose name
# contains 'dnaid'.
samples = list(filter(lambda column: 'dnaid' in column, controls.columns))

In [None]:
# Display the current list of sample column names.
samples

In [None]:
 # Keep the samples for which at most 20% of the control rows equal -1.
 # NOTE(review): -1 looks like a sentinel value — confirm its meaning.
 # NOTE(review): this line carries a stray leading space; IPython tolerates it,
 # but it would be an IndentationError if the notebook were exported as a script.
 filter_one = [s for s in samples if (controls[s]==-1).sum()/controls.shape[0] <= 0.2]


In [None]:
# Number of samples that passed the -1 fraction filter.
len(filter_one)

In [None]:
# Hand-picked list of sample IDs (all share the 'dnaid1315_' prefix).
egs = [
    'dnaid1315_10', 'dnaid1315_107', 'dnaid1315_117', 'dnaid1315_124',
    'dnaid1315_128', 'dnaid1315_129', 'dnaid1315_131', 'dnaid1315_136',
    'dnaid1315_17', 'dnaid1315_18', 'dnaid1315_19', 'dnaid1315_20',
    'dnaid1315_28', 'dnaid1315_40', 'dnaid1315_42', 'dnaid1315_50',
    'dnaid1315_52', 'dnaid1315_66', 'dnaid1315_81', 'dnaid1315_90',
    'dnaid1315_92', 'dnaid1315_94',
]

In [None]:
# Number of hand-picked example sample IDs.
len(egs)

In [None]:
# Draw one concentration-vs-sample scatter per sample that passed the filter.
for sample in filter_one:
    fig = plot_cor(sample)
    # Render the figure inline through plotly's offline mode.
    ofl.iplot(fig)

In [None]:
# First metadata row (as an array) for the first entry of `samples`.
sample_data.loc[sample_data['sampleID'] == samples[0]].values[0]

In [None]:
# NOTE(review): these three sample IDs were written as bare names, which
# raises NameError when the cell runs. They are kept here as strings; why
# they were singled out is not recorded in the notebook.
noted_samples = ('dnaid1315_53', 'dnaid1315_137', 'dnaid1315_139')
noted_samples

In [None]:
# Display the correlations table.
correlations

In [None]:
# Display the sample metadata table again for reference.
sample_data

In [None]:
# Load the counts table; low_memory=False makes pandas read the file in one pass.
c_data = pd.read_csv(filepath_or_buffer=countsFile, low_memory=False)

In [None]:
# Restrict the counts table to the identifier columns and five samples.
c_data = c_data.loc[
    :,
    ['barcode', 'locus_tag', 'dnaid1315_96', 'dnaid1315_66',
     *'dnaid1315_10,dnaid1315_107,dnaid1315_81'.split(',')],
]

In [None]:
# Index the counts by (barcode, locus_tag) so only sample columns remain.
# NOTE(review): re-running this cell alone fails, because the two columns
# are no longer present once they have been moved into the index.
c_data = c_data.set_index(keys=['barcode', 'locus_tag'])

In [None]:
# Look up barcode 'TTTTTTTGTAACGCTGC' with a NaN locus_tag in the
# (barcode, locus_tag) index.
# NOTE(review): matching a NaN index level via .loc — confirm it behaves as intended.
c_data.loc[('TTTTTTTGTAACGCTGC', np.nan)]

In [None]:
# Rows whose values, summed across all columns, exceed 100.
hi = c_data.loc[c_data.sum(axis=1).gt(100)]

In [None]:
# Look up barcode 'ACTACAAGACTGGTTAA' with a NaN locus_tag in the
# (barcode, locus_tag) index.
# NOTE(review): matching a NaN index level via .loc — confirm it behaves as intended.
c_data.loc[('ACTACAAGACTGGTTAA', np.nan)]

In [None]:
# Display the rows whose summed values exceed 100.
hi

In [None]:
# Number of rows in `hi`.
hi.shape[0]

In [None]:
# Ratio of the constant 200 to the number of rows in `hi`.
# NOTE(review): the meaning of 200 is not recorded in the notebook.
200/hi.shape[0]

In [None]:
# Rows of `hi` whose dnaid1315_66 value is below 1.
hi.loc[hi['dnaid1315_66'].lt(1)]

In [None]:
# Rows of the counts table whose dnaid1315_96 value is below 1.
c_data.loc[c_data['dnaid1315_96'].lt(1)]

In [None]:
# Scratch check of integer truthiness: 0 is falsy, so 'no bla' is printed.
x = 0
print('bla' if x else 'no bla')

In [None]:
# NOTE(review): this path was pasted as bare code, which raises NameError
# when the cell runs; it is kept here as a string instead.
d4_vs_d0_relpath = "results_filtered_100/nguyen_2020_l50_mbarq_merged_counts_d4_vs_d0.txt"
d4_vs_d0_relpath

In [None]:
# Define the results directory before its first use so this cell also runs
# on a fresh kernel (Restart & Run All).
# NOTE(review): absolute, machine-specific path — consider deriving it from the config.
wdir = Path("/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/lifeer/14_02_23_rerunning_mbarq/data/results_filtered_100")
# Read the tab-separated d4-vs-d0 results table.
df = pd.read_table(wdir/"nguyen_2020_l50_mbarq_merged_counts_d4_vs_d0.txt")

In [None]:
# Preview the first rows of the table currently bound to `df`.
df.head()

In [None]:
# Display the control table again for reference.
controls

In [None]:
# Scratch arithmetic: evaluates to 0.4.
2/5

In [None]:
# Rows of `df` whose barcode also appears in the control table.
df.loc[df['barcode'].isin(controls['barcode'].values)]

In [None]:
# Directory containing the `results_filtered_100` output files.
# NOTE(review): absolute, machine-specific path — consider deriving it from the config.
wdir = Path("/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/lifeer/14_02_23_rerunning_mbarq/data/results_filtered_100")

In [None]:
# Read the tab-separated gene-level summary for d4 vs d0.
# NOTE: this rebinds `df`, replacing whatever table it held before.
df = pd.read_table(
    wdir.joinpath("nguyen_2020_l50_mbarq_merged_counts_d4_vs_d0.gene_summary.txt")
)

In [None]:
# Preview the first rows of the table currently bound to `df`.
df.head()

In [None]:
# Histogram (100 bins) of the 'neg|lfc' column of `df`.
df['neg|lfc'].hist(bins=100)