In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px

# Notebook-wide display defaults: seaborn context, pandas options, matplotlib rcParams.
sns.set_context("notebook", font_scale=1.1)

# pandas: show up to 100 rows/columns; floats render with thousands separators
# and two decimals (e.g. 1,234.57).
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:,.2f}".format)

# matplotlib: set after sns.set_context, same order as before, so these
# explicit values are the last ones written.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    "savefig.dpi": 200,
    "figure.autolayout": False,
    "axes.labelsize": 18,
    "axes.titlesize": 20,
    "font.size": 16,
    "lines.linewidth": 2.0,
    "lines.markersize": 8,
    "legend.fontsize": 14,
    "text.usetex": False,  # True activates latex output in fonts!
    "font.family": "serif",
    "font.serif": "cm",
})

In [None]:
# Directory holding the merged barcode counts and the sample sheet.
# The cluster path remains the default; set the TNSEQ_COUNTS_DIR environment
# variable to run the notebook against a copy of the data elsewhere without
# editing this cell.
dataDir = Path(os.environ.get(
    "TNSEQ_COUNTS_DIR",
    "/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/03_22/counts",
))

# countData: later cells read its `barcode` and `Name` columns plus one column
# per sample ID. sampleData: later cells read its `day` and `sampleID` columns.
countData = pd.read_csv(dataDir / "dnaid1315_mbarq_merged_counts.csv")
sampleData = pd.read_csv(dataDir / "example_sample_data.csv")

In [None]:
# Preview the first rows of the count table loaded above.
countData.head()

In [None]:
# Preview the first rows of the sample sheet loaded above.
sampleData.head()

In [None]:
# Every count-table row whose Name is 'rfaI'.
countData.loc[countData["Name"].eq("rfaI")]

In [None]:
# Sample-sheet rows whose `day` is 'd1'; the frame is shown as the cell output.
day1 = sampleData.loc[sampleData["day"].isin(["d1"])]
day1

In [None]:
# log2-CPM matrix for the day-1 samples, indexed by (barcode, Name).
day1Cnts = (
    countData
    .loc[:, ["barcode", "Name", *day1["sampleID"].tolist()]]
    .set_index(["barcode", "Name"])
    # keep rows with more than 500 counts summed over the day-1 samples
    .loc[lambda cnts: cnts.sum(axis=1) > 500]
    # scale each sample column to counts-per-million of the retained rows,
    # then take log2 with a 0.5 offset so zero counts stay finite
    .pipe(lambda cnts: np.log2(cnts / cnts.sum() * 1000000 + 0.5))
)

In [None]:
# Barcodes of the (up to) 100 rows with the highest variance across samples,
# in ascending order of variance.
varBcs = (
    day1Cnts
    .var(axis=1)
    .sort_values()
    .tail(100)
    .index.get_level_values("barcode")
    .values
)

In [None]:
# Show the selected high-variance barcodes.
varBcs

In [None]:
# log2-CPM rows of the high-variance barcodes, indexed by barcode only.
# drop_duplicates() compares the sample values (not the index), so rows with
# an identical profile collapse to one. NOTE(review): presumably this removes
# barcodes listed under several Names; confirm no distinct barcodes are lost.
df = (
    day1Cnts
    .droplevel("Name")
    .loc[lambda cnts: cnts.index.isin(varBcs)]
    .drop_duplicates()
)

In [None]:
# Pairwise correlation between barcode profiles, reshaped to long form:
# one row per (barcode, barcode2) pair with the coefficient in `value`
# and its square in `r2`.
df2 = (
    df.T.corr()
    # name the column axis so melt() labels the second barcode column 'barcode2'
    .rename_axis(columns="barcode2")
    .reset_index()
    .melt(id_vars=["barcode"])
    .assign(r2=lambda pairs: pairs["value"] ** 2)
)

In [None]:
# Keep pairs with 0.8 < r2 < 0.99. r2 is a square and therefore already
# non-negative, so comparing it directly is equivalent to comparing abs(r2).
# NOTE(review): the upper bound presumably removes self-pairs (r = 1) - confirm.
df2 = df2.loc[df2["r2"].gt(0.8) & df2["r2"].lt(0.99)]

In [None]:
# The ten retained pairs with the lowest correlation coefficient.
(
    df2
    .sort_values(by="value")
    .head(10)
)

In [None]:
# Scatter the per-sample log2-CPM values of two barcodes against each other.
# Axes are labelled with the barcode they show so the figure can be read on
# its own; the trailing ';' suppresses the matplotlib object repr.
bc_x, bc_y = 'GCAAAAGGCCATAAATG', 'CGCAACGTACACAACGC'
fig, ax = plt.subplots()
ax.plot(df.loc[bc_x], df.loc[bc_y], '.')
ax.set_xlabel(f'{bc_x} (log2 CPM)')
ax.set_ylabel(f'{bc_y} (log2 CPM)')
ax.set_title('Barcode vs. barcode abundance across day-1 samples');

In [None]:
# Count-table rows for one specific barcode.
countData.loc[countData["barcode"].eq("AACATGTGAAACGAACA")]

In [None]:
# Annotate both barcodes of every pair with their Name from the count table.
# The two left merges leave pandas' default suffixes in place: *_x belongs to
# `barcode`, *_y to `barcode2`.
bcNames = countData[['barcode', 'Name']]
df3 = (
    df2
    .merge(bcNames, on='barcode', how='left')
    .merge(bcNames, left_on='barcode2', right_on='barcode', how='left')
    .loc[:, ['barcode_x', 'barcode2', 'value', 'r2', 'Name_x', 'Name_y']]
)

In [None]:
# Unique (Name_x, Name_y, value) combinations, lowest correlation first;
# rows with a missing field are dropped and the first 30 are shown.
(
    df3
    .loc[:, ['Name_x', 'Name_y', 'value']]
    .drop_duplicates()
    .sort_values('value')
    .dropna()
    .head(30)
)

In [None]:
# Long-form log2-CPM table restricted to rows named pilR or pilP:
# one row per (barcode, Name, sampleID).
fdf = (
    day1Cnts
    .reset_index()
    .loc[lambda cnts: cnts['Name'].isin(['pilR', 'pilP'])]
    .melt(id_vars=['barcode', 'Name'], var_name='sampleID', value_name='log2CPM')
)

In [None]:
# Show the long-form pilR/pilP table.
fdf


In [None]:
# Barcode pair used to subset the pilR/pilP table (`fdf`).
bcs = [
    'CGTATCCCAGGATCTGT',
    'TATCGAACCACATCATA',
]
# Barcode pair used to subset the full day-1 table.
bcs2 = [
    'AACTATACGGGAACGCC',
    'AAGTAACCAGTCGAAGA',
]
# Alternative pair for bcs2, kept for reference:
#bcs2 = ['AAACAACCGGTACTGAG', 'GGGGTATGAAACTTAAG']


In [None]:
# Wide tables (one row per sample, columns split by Name) for the two barcode pairs.
# NOTE(review): fdf1 is not read by any later cell visible in this notebook.
fdf1 = (
    fdf
    .loc[fdf['barcode'].isin(bcs)]
    .pivot(index=['sampleID'], columns='Name')
    .reset_index()
)
fdf2 = (
    day1Cnts
    .reset_index()
    .melt(id_vars=['barcode', 'Name'], var_name='sampleID', value_name='log2CPM')
    .loc[lambda long_cnts: long_cnts['barcode'].isin(bcs2)]
    .pivot(index=['sampleID'], columns='Name')
    .reset_index()
)
# Flatten the two-level pivot header by position. This assumes the pivot
# produced exactly sampleID + two `barcode` columns + two `log2CPM` columns,
# i.e. the two barcodes carry two distinct Names; otherwise this assignment
# (or the pivot itself) raises.
fdf2.columns = ['sampleID', 'barcode_1', 'barcode_2', 'gene1', 'gene2']

In [None]:
# Show the wide table for the bcs2 barcode pair.
fdf2

In [None]:
# Interactive scatter of the two log2-CPM columns; hovering a point shows its sample ID.
px.scatter(
    data_frame=fdf2,
    x='gene1',
    y='gene2',
    hover_data=['sampleID'],
)

In [None]:
# Display the count table (pandas truncates the view per the display options set above).
countData

In [None]:
# Display the sample sheet (pandas truncates the view per the display options set above).
sampleData