In [None]:
import pandas as pd
import pyranges as pr
import plotly.express as px
from pathlib import Path

In [None]:
# Show the current working directory; the relative file paths read in later cells resolve against it.
%pwd


In [None]:
# Sample metadata table; later cells join it to the counts on 'sample_id'.
# NOTE(review): "metatdata" looks like a misspelling — confirm it matches the file name on disk.
sds = pd.read_csv("rnaseq_metatdata.csv")
# Feature-level read counts; the first CSV column is used as the row index.
df = pd.read_csv("featureCounts.csv", index_col=0)

In [None]:
# Attach the sample metadata columns to every count row (inner join on sample_id).
# NOTE: this overwrites `df`; re-running the cell merges the metadata a second time
# and pandas then suffixes the duplicated columns with _x / _y.
df = pd.merge(df, sds, on='sample_id')

In [None]:
# Total read count for every (sample, genome) pair, returned as a flat table.
sum_df = (
    df.groupby(['sample_id', 'genome'])['read_counts']
    .sum()
    .reset_index()
)

In [None]:
# Bring the sample metadata back onto the per-genome totals (the groupby kept only the keys and the sum).
sum_df = pd.merge(sum_df, sds, on='sample_id')

In [None]:
# Wide layout: one row per genome, one column per sample.
# No `values` argument is given, so the columns come back as (value name, sample_id) pairs.
int_df = (
    sum_df.loc[:, ['sample_id', 'genome', 'read_counts']]
    .pivot(columns='sample_id', index='genome')
)

In [None]:
# Convert each sample column into percentages of that sample's total reads.
int_df = int_df.div(int_df.sum(), axis='columns').mul(100)

In [None]:
# Keep only the second element of each column label (the sample id) as the column name.
int_df.columns = list(map(lambda col: col[1], int_df.columns))

In [None]:
# Melt the percentage table back to long format and attach it to the totals as 'perc_reads'.
sum_df = pd.merge(
    sum_df,
    int_df.reset_index().melt(id_vars=['genome'], var_name='sample_id', value_name='perc_reads'),
    on=['sample_id', 'genome'],
)

In [None]:
# Display the per-sample, per-genome summary (read totals, metadata and perc_reads).
sum_df

In [None]:
# Split the summary into three experiments by the value of the 'Mouse' metadata column;
# each subset is graphed separately in later cells.
exp1 = sum_df.loc[sum_df['Mouse'].eq('invitro')]
exp2 = sum_df.loc[sum_df['Mouse'].eq('Oligo')]
exp3 = sum_df.loc[sum_df['Mouse'].eq('LCM')]

In [None]:
# Display the 'invitro' subset of the summary table.
exp1

In [None]:
# Read totals per sample for the 'invitro' subset, coloured by genome, on a log-scaled y axis.
px.bar(
    exp1,
    x='sample_id',
    y='read_counts',
    log_y=True,
    color='genome',
)

In [None]:
# Display the 'LCM' subset of the summary table.
exp3

In [None]:
# Peek at the first rows of the feature-level table (counts merged with sample metadata).
df.head()

In [None]:
# Restrict the feature table to rows whose genome is SL1344.
sl = df.loc[df['genome'].eq("SL1344")]

# Wide table of read counts: one row per locus_tag, one column per sample.
int_df = (
    sl.loc[:, ['locus_tag', 'read_counts', 'sample_id']]
    .drop_duplicates()
    .pivot(columns='sample_id', index='locus_tag')
)
# pivot() without `values` labels columns as (value name, sample_id); keep the sample id only.
int_df.columns = list(map(lambda col: col[1], int_df.columns))

# Rescale every sample column so that its counts are expressed per million reads of that sample.
# NOTE(review): no gene-length normalisation happens here, so the column named 'TPM'
# is a per-million share of reads — confirm that this is the intended metric.
int_df = int_df.div(int_df.sum(), axis='columns').mul(1000000)

# Back to long format, then attach the normalised value to each SL1344 row.
int_df = int_df.reset_index().melt(id_vars='locus_tag', var_name='sample_id', value_name='TPM')
sl = pd.merge(sl, int_df, on=['locus_tag', 'sample_id'])

In [None]:
# Inspect a single row of `sl` by position. The row number 48884 is hard-coded;
# NOTE(review): this raises IndexError if `sl` has fewer rows than that.
sl.iloc[48884,:]

In [None]:
# Rows of the 'M9_aero' sample (not referenced again in the visible cells).
test = sl.loc[sl['sample_id'].eq('M9_aero')]

# The ten largest TPM values per sample. nlargest() keeps the original row label as the
# second index level, which reset_index() exposes as 'level_1'; it is restored as the index
# so the rows can be matched back to `sl`.
top10 = (
    sl.groupby(['sample_id'])['TPM']
    .nlargest(10)
    .reset_index()
    .set_index('level_1')
)

# Pull the gene annotation and the 'Mouse' label for those rows from `sl` by row label.
top10 = pd.merge(
    top10,
    sl[['locus_tag', 'ID', 'Name', 'Mouse']],
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Sample ids treated as the inoculum group in the cells that follow.
inoc = ['Inoc_1', 'Inoc_2']

In [None]:
# Median TPM per gene Name across the inoculum samples, keeping the 25 highest medians.
inoc_df = (
    sl[sl.sample_id.isin(inoc)]
    .groupby('Name')['TPM']
    .median()
    .reset_index()
    .sort_values('TPM', ascending=False)
    .head(25)
)

In [None]:
# List the distinct sample ids present in the SL1344 table.
sl.sample_id.unique()

In [None]:
# Median TPM per gene Name for a single sample, keeping the 25 highest.
# NOTE(review): the variable is called m9_df but the filter selects 'Inoc_2' — confirm the intended sample.
m9_df = (
    sl[sl.sample_id.isin(['Inoc_2'])]
    .groupby('Name')['TPM']
    .median()
    .reset_index()
    .sort_values('TPM', ascending=False)
    .head(25)
)

In [None]:
# Display the 25 genes with the highest median TPM computed in the previous cell.
m9_df

In [None]:
# Gene names in descending order of median inoculum TPM; used to order the x axis of the plot below.
order = inoc_df['Name'].values

In [None]:
# TPM distribution across the inoculum samples for each of the top genes, in `order` on the x axis.
px.box(
    sl[sl.sample_id.isin(inoc) & sl.Name.isin(order)],
    x='Name',
    y='TPM',
    title='Highest expressed genes in the inoculum samples (Inoc1,2)',
    category_orders={'Name': order},
)

In [None]:
# Reference list of highly expressed genes: the 20 highest-TPM names in Inoc_3
# followed by the names already collected in `order`.
hi = [
    *sl[sl.sample_id.isin(['Inoc_3'])].sort_values('TPM', ascending=False).head(20).Name.values,
    *order,
]

In [None]:
# The 20 highest-TPM rows of sample Inoc_3 as a bar chart.
px.bar(
    sl[sl.sample_id.isin(['Inoc_3'])].sort_values('TPM', ascending=False).head(20),
    x='Name',
    y='TPM',
    title='Highest expressed genes in the inoculum samples (Inoc3)',
)

In [None]:
# Distinct sample ids whose name begins with 'AU'.
au_samples = list(filter(lambda sid: sid.startswith('AU'), sl.sample_id.unique()))
au_samples

In [None]:
# Flag every row whose gene Name appears in the reference list `hi`.
# Series.isin performs the membership test in one vectorised call and already returns
# booleans, replacing the per-row lambda with its redundant `True if ... else False`.
sl['in_top_25'] = sl.Name.isin(hi)

In [None]:
# Distinct sample ids whose name begins with 'AU' (same selection as an earlier cell).
au_samples = [sid for sid in sl['sample_id'].unique() if sid.startswith('AU')]
au_samples

In [None]:
# Distinct sample ids starting with 'AU'. unique() is used (as in the earlier cells) so that
# each id is listed once rather than once per row of `sl`.
au_samples = [c for c in sl.sample_id.unique() if c.startswith('AU')]

# 20 highest-TPM rows of the chosen sample; bars are coloured by the in_top_25 flag,
# with the True group listed first.
sample = 'AU655'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    title=f'Highest expressed genes in the {sample}',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
)

In [None]:
# 20 highest-TPM rows of the chosen sample, coloured by the in_top_25 flag (True group first).
sample = 'AU650'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
    title=f'Highest expressed genes in the {sample}',
)

In [None]:
# 20 highest-TPM rows of the chosen sample, coloured by the in_top_25 flag (True group first).
sample = 'AU654'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
    title=f'Highest expressed genes in the {sample}',
)

In [None]:
# 20 highest-TPM rows of the chosen sample, coloured by the in_top_25 flag (True group first).
sample = 'AU648'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
    title=f'Highest expressed genes in the {sample}',
)

In [None]:
# 20 highest-TPM rows of the chosen sample, coloured by the in_top_25 flag (True group first).
sample = 'AU658'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
    title=f'Highest expressed genes in the {sample}',
)

In [None]:
# List the distinct sample ids again, to pick the next samples to plot.
sl.sample_id.unique()

In [None]:
# 20 highest-TPM rows of the chosen sample, coloured by the in_top_25 flag (True group first).
sample = 'aw599D1'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
    title=f'Highest expressed genes in the {sample}',
)

In [None]:
# 20 highest-TPM rows of the chosen sample, coloured by the in_top_25 flag (True group first).
sample = 'aw137D3'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
    title=f'Highest expressed genes in the {sample}',
)

In [None]:
# 20 highest-TPM rows of the chosen sample, coloured by the in_top_25 flag (True group first).
sample = 'aw597D1'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
    title=f'Highest expressed genes in the {sample}',
)

In [None]:
# 20 highest-TPM rows of the chosen sample, coloured by the in_top_25 flag (True group first).
sample = 'aw143D2'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
    title=f'Highest expressed genes in the {sample}',
)

In [None]:
# 20 highest-TPM rows of the chosen sample, coloured by the in_top_25 flag (True group first).
sample = 'aw933D4'
px.bar(
    sl[sl.sample_id == sample].sort_values("TPM", ascending=False).head(20),
    x='Name',
    y='TPM',
    color='in_top_25',
    category_orders={'in_top_25': [True, False]},
    title=f'Highest expressed genes in the {sample}',
)

In [None]:
# Ten highest-TPM genes per 'invitro' sample, one facet per sample, two facets per row.
px.bar(
    top10[top10.Mouse == 'invitro'],
    x='Name',
    y='TPM',
    facet_col_wrap=2,
    facet_col='sample_id',
)

In [None]:
# px.bar(top10[top10.Mouse == 'LCM'], x='Name', y='TPM', facet_col='sample_id',
#       facet_col_wrap=2, height=2000)

In [None]:
# Graph total number of reads for each sample of the 'Oligo' subset, coloured by genome,
# on a log-scaled y axis.
# The x column previously contained a stray backtick ('sam`ple_id'), which is not a column
# of the table; it is now 'sample_id', matching the other read-count plots.
px.bar(exp2, x='sample_id', y='read_counts', color='genome', log_y=True)

In [None]:
# Mapping summary table; the first CSV column is used as the row index.
# NOTE(review): a later cell merges this table on 'sample_id' and then reads a 'total'
# column, presumably supplied by this file — confirm both are present.
summary = pd.read_csv("05-05-23-mapping-summary.csv", index_col=0)

In [None]:
# Join the mapping summary onto the 'LCM' subset by sample id (inner join).
exp4 = pd.merge(exp3, summary, on='sample_id')

In [None]:
# Reads assigned to each genome as a percentage of the sample's 'total' column.
exp4['genome_over_total'] = exp4['read_counts'].div(exp4['total']).mul(100)

In [None]:
# Sample ids ordered by ascending SL1344 percentage; used as the x-axis order of a later plot.
o = (
    exp4.loc[exp4['genome'].eq('SL1344')]
    .sort_values('genome_over_total')['sample_id']
    .values
)

In [None]:
# Display the SL1344 rows ordered by ascending percentage of total reads.
exp4.loc[exp4['genome'].eq('SL1344')].sort_values('genome_over_total')

In [None]:
# D3 and D4 look like they have significantly fewer bacterial reads.
# Linear y axis; samples appear on the x axis in the order held in `o`.
px.bar(
    exp3,
    x='sample_id',
    y='read_counts',
    color='genome',
    category_orders={'sample_id': o},
    log_y=False,
)

In [None]:
# First rows of the 'LCM' subset, to check which columns are available for plotting.
exp3.head()

In [None]:
# The 50 SL1344 features with the most reads in sample aw933D4.
(
    df.loc[df['genome'].eq('SL1344') & df['sample_id'].eq('aw933D4')]
    .sort_values('read_counts', ascending=False)
    .head(50)
)

In [None]:
# SL1344 read totals per Treatment in the 'LCM' subset; every sample is drawn as a point.
px.box(
    exp3[exp3.genome == 'SL1344'].sort_values('Treatment'),
    x='Treatment',
    y='read_counts',
    hover_data=['sample_id'],
    points='all',
    template='plotly_white',
    width=600,
    height=600,
)

In [None]:
# SL1344 share of reads (perc_reads) per Treatment in the 'LCM' subset; every sample is drawn as a point.
px.box(
    exp3[exp3.genome == 'SL1344'].sort_values('Treatment'),
    x='Treatment',
    y='perc_reads',
    hover_data=['sample_id'],
    points='all',
    template='plotly_white',
    width=600,
    height=600,
)

In [None]:
# YL58 share of reads (perc_reads) per Treatment in the 'LCM' subset; every sample is drawn as a point.
px.box(
    exp3[exp3.genome == 'YL58'].sort_values('Treatment'),
    x='Treatment',
    y='perc_reads',
    hover_data=['sample_id'],
    points='all',
    template='plotly_white',
    width=600,
    height=600,
)

In [None]:
# SL1344 read totals per 'LCM' sample, rows sorted by ascending count, coloured by Treatment.
px.bar(
    exp3[exp3.genome == 'SL1344'].sort_values('read_counts'),
    x='sample_id',
    y='read_counts',
    color='Treatment',
)

In [None]:
# SL1344 share of reads per 'LCM' sample, rows sorted by ascending share, coloured by Treatment;
# the hover box also shows the Treatment and the raw read count.
px.bar(
    exp3[exp3.genome == 'SL1344'].sort_values('perc_reads'),
    x='sample_id',
    y='perc_reads',
    hover_data=['Treatment', 'read_counts'],
    color='Treatment',
)

In [None]:
# Median perc_reads for every (genome, sample) pair of the 'LCM' subset.
exp3.groupby(['genome', 'sample_id'])['perc_reads'].median()

In [None]:
# saturation curve code...

In [None]:
# Feature rows that carry a non-missing 'product' annotation.
andf = df[df['product'].notna()]

In [None]:
# Read the GFF3 annotation with pyranges and convert it to a DataFrame for column-wise filtering.
annot = pr.read_gff3("ASF_Salmonella.gff").as_df()

In [None]:
# Drop annotation rows without a 'product' value, so the string match on 'product' below sees no missing entries.
annot = annot.loc[annot['product'].notna()]

In [None]:
# List the annotation columns available after loading the GFF3 file.
annot.columns

In [None]:
# locus_tag values (missing ones dropped) of every annotation whose product text contains 'ribosomal'.
ribos = (
    annot.loc[annot['product'].str.contains('ribosomal'), 'locus_tag']
    .dropna()
    .values
)

In [None]:
# Display the locus tags selected by the 'ribosomal' product match.
ribos

In [None]:
# Count rows belonging to the selected ribosomal locus tags.
ribo_df = df.loc[df['locus_tag'].isin(ribos)]

In [None]:
# Total reads on the ribosomal features per (sample, genome), stored in a column named 'ribo_counts'.
ribo_df = (
    ribo_df.groupby(['sample_id', 'genome'])['read_counts']
    .sum()
    .reset_index(name='ribo_counts')
)

In [None]:
# Attach the ribosomal totals to the summary. The default inner join keeps only the
# (sample, genome) pairs that are present in both tables.
sum_df = pd.merge(sum_df, ribo_df, on=['sample_id', 'genome'])

In [None]:
# Reads on ribosomal features as a percentage of each genome's reads in the sample.
# The ratio is multiplied by 100 so that the column is on the same 0-100 scale as the other
# percentage columns in this notebook (perc_reads, genome_over_total); it was previously
# stored as a 0-1 fraction despite the 'perc_' name.
sum_df['perc_ribo'] = sum_df['ribo_counts'] / sum_df['read_counts'] * 100

In [None]:
# Display the summary table, now including ribo_counts and perc_ribo.
sum_df

In [None]:
# perc_ribo of SL1344 per Treatment; every sample is drawn as a point.
px.box(
    sum_df[sum_df.genome == 'SL1344'],
    x='Treatment',
    y='perc_ribo',
    hover_data=['sample_id'],
    points='all',
    color='Treatment',
)

In [None]:
# perc_ribo of YL58 per Treatment; every sample is drawn as a point.
px.box(
    sum_df[sum_df.genome == 'YL58'],
    x='Treatment',
    y='perc_ribo',
    hover_data=['sample_id'],
    points='all',
    color='Treatment',
)

In [None]:
# Three FASTA-style records (header line followed by a sequence line).
# A single-quoted string literal cannot continue across physical lines, so the original cell
# was a SyntaxError; the text is assembled from adjacent literals instead, keeping exactly
# one "\n" between consecutive lines.
data = (
    ">Rosalind_6404\n"
    "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG\n"
    ">Rosalind_5959\n"
    "CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC\n"
    ">Rosalind_0808\n"
    "CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT"
)

In [None]:
# The same three FASTA-style records as a triple-quoted literal; this form begins and ends
# with a newline character. The text is printed to check its layout.
data = """
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT
"""
print(data)

In [None]:
# Four short example sequences; the following cells measure and rank their lengths.
a = [
    "GATTACA",
    "TACTACTAC",
    "ATTGAT",
    "GAAGA",
]

In [None]:
# Combined length of all example sequences.
sum(len(seq) for seq in a)

In [None]:
from collections import Counter

In [None]:
# Map every sequence to its length; most_common() then lists the pairs from longest to shortest.
b = Counter(dict(zip(a, map(len, a))))
b.most_common()

In [None]:
# Walk the sequences from longest to shortest, accumulating their lengths, and print the
# length of the sequence at which the running total first reaches 75% of the combined length.
# The threshold is derived from the lengths stored in `b` rather than a hard-coded total
# of 27, so the cell stays correct if the example sequences change.
threshold = sum(b.values()) * .75
t = 0
for s, l in b.most_common():
    t += l
    if t >= threshold:
        print(l)
        break

In [None]:
# Floor division: half of 27 rounded down to an integer (13).
27//2

In [None]:
# True division: half of 27 as a float (13.5).
27/2