# Load settings and configs

In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import yaml
import requests

# Notebook-wide display defaults: seaborn context, pandas table limits and
# matplotlib figure/font sizes.
sns.set_context("notebook", font_scale=1.1)

# Print up to 100 columns/rows before pandas truncates a displayed frame.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Apply all matplotlib defaults in a single update.
plt.rcParams.update(
    {
        "figure.figsize": (16, 12),
        "savefig.dpi": 200,
        "figure.autolayout": False,
        "axes.labelsize": 18,
        "axes.titlesize": 20,
        "font.size": 16,
        "lines.linewidth": 2.0,
        "lines.markersize": 8,
        "legend.fontsize": 14,
    }
)

# Floats are rendered with thousands separators and two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)
import pyranges as pr

import sys
sys.path.append("..")

In [None]:
from snippets.utils import *

In [None]:
# Read the "default" section of config.yaml and derive the project root and
# the output directory (the latter is resolved relative to the root).
with open("config.yaml", "r") as config_file:
    all_sections = yaml.safe_load(config_file)
config_dict = all_sections["default"]

root = Path(config_dict["root"])
out_dir = root / config_dict["output_dir"]

# 16S Data Analysis

In [None]:
# Join the composition table with the second taxonomy table on the sequence
# column, keep both family/genus annotations plus the per-sample columns,
# and save the result.
composition = pd.read_table(config_dict["tax_composition_file"])
taxa = pd.read_csv(out_dir / "17-08-23-LCM-taxa-dada2.csv")
taxa = taxa.rename(columns={"Unnamed: 0": "seq"})
s_data = composition.merge(taxa, on="seq")

# Fields 5 and 6 of the "|"-separated `tax` string become the *_idtaxa
# family and genus columns.
tax_fields = s_data["tax"].str.split("|", expand=True)
s_data["family_idtaxa"] = tax_fields[5]
s_data["genus_idtaxa"] = tax_fields[6]

annotation_columns = ["asv", "family_idtaxa", "genus_idtaxa", "Family", "Genus"]
# Per-sample columns are recognised by "METAB" in their name.
metab_columns = [name for name in s_data.columns if "METAB" in name]
s_data = s_data[annotation_columns + metab_columns]
s_data.to_csv(out_dir / "08-23-LCM-16S-family-genus.csv", index=False)


"""
From inspection of the data and previous information, we arrived at the following taxonomic assignments:
"""

# Manually curated ASV -> taxon labels. asv_0008 and asv_0009 intentionally
# share one label so that they collapse into a single taxon when relative
# abundances are later summed per label.
asv_id = {
    "asv_0001": "Salmonella",
    "asv_0002": "ASF519",
    "asv_0003": "YL58",
    "asv_0004": "YL32",
    "asv_0005": "unclassified Lachnospiraceae",
    "asv_0006": "YL31",
    "asv_0007": "Turicibacter",
    "asv_0008": "Clostridium indolis Y18184 (?)",
    "asv_0009": "Clostridium indolis Y18184 (?)",
    "asv_0012": "Staphylococcus",
}
        

## Normalising to get relative abundance

In [None]:
# Turn the numeric per-sample columns into percentages, label each ASV with
# its curated taxon and reshape to one row per (sample, taxon).
cols = s_data.select_dtypes(include=np.number).columns.tolist()

# Percentages are computed over every ASV, i.e. before the unassigned ones
# are filtered out below.
s_data[cols] = s_data[cols].div(s_data[cols].sum()).mul(100)

# Keep only the part of the ASV identifier before the first ";".
s_data["asv"] = s_data["asv"].str.split(";", expand=True)[0]
s_data["genome"] = s_data["asv"].replace(asv_id)
s_data = s_data[["genome", *cols]]

# Only want to look at identified ASVs: anything still carrying its raw
# "asv..." identifier had no curated label.
still_unlabelled = s_data["genome"].str.startswith("asv")
s_data = s_data[~still_unlabelled]

s_data = s_data.melt(id_vars=["genome"], var_name="sample_id", value_name="RelAb")

# Adding together asv8 and asv9 -> previous analysis (Melanie) suggested they are the same thing
s_data = s_data.groupby(["sample_id", "genome"])["RelAb"].sum().reset_index()

# The sample identifier is the second "_"-separated field of the column
# name; its last two characters are used as the day label.
s_data["sample_id"] = s_data["sample_id"].str.split("_", expand=True)[1]
s_data["day"] = s_data["sample_id"].str[-2:]

## Plot change in relative abundance by day across species

In [None]:
# Average relative abundance per day and taxon, drawn as one line per taxon.
mean_data = s_data.groupby(["day", "genome"])["RelAb"].mean().reset_index()
px.line(
    mean_data,
    x="day",
    y="RelAb",
    color="genome",
    markers=True,
    template="plotly_white",
    color_discrete_map=lcm_colors,
    width=700,
    height=600,
    labels={"RelAb": "Relative abundance (%)"},
)

## Look at relative abundance in each mouse

In [None]:
# One bar per sample, coloured by taxon; rows are ordered by day first.
px.bar(
    s_data.sort_values("day"),
    x="sample_id",
    y="RelAb",
    color="genome",
    template="plotly_white",
    height=600,
    width=1000,
    color_discrete_map=lcm_colors,
    labels={"RelAb": "Relative abundance (%)"},
)

# PCA of Salmonella + LCM transcriptome

In [None]:
# Load the sample sheet and the normalised count matrix, drop rows with a
# total count of 100 or less across samples, then log2(x + 1)-transform.
norm_counts_file = out_dir / config_dict["norm_counts_file"]
sample_data_file = root / config_dict["sample_data_file"]
sd = pd.read_csv(sample_data_file)

norm_counts = pd.read_csv(norm_counts_file, index_col=0).set_index("ID")
passes_count_filter = norm_counts.sum(axis=1) > 100
norm_counts = np.log2(norm_counts.loc[passes_count_filter] + 1)

In [None]:
# NOTE(review): find_pcs is not defined in this notebook (presumably it comes
# from snippets.utils); it is called with num_genes=500 and is assumed to
# return per-sample PC coordinates plus explained variance — confirm.
pc_df, pc_var = find_pcs(norm_counts, num_genes=500)

# Expose the index as `sample_id` so the sample sheet can be joined on it.
pc_df = (
    pc_df.reset_index()
    .rename(columns={"index": "sample_id"})
    .merge(sd, on="sample_id")
)

In [None]:
# PC1 vs PC2 scatter coloured by treatment; sample ids appear on hover.
fig = px.scatter(
    pc_df,
    x="PC1",
    y="PC2",
    color="Treatment",
    width=700,
    height=600,
    template="plotly_white",
    hover_data=["sample_id"],
)
# Larger markers with a thin dark outline, applied to marker-mode traces only.
marker_style = {"size": 12, "line": {"width": 1, "color": "DarkSlateGrey"}}
fig.update_traces(marker=marker_style, selector={"mode": "markers"})

# Differential Expression Analysis

- Combining day 1 and day 2

In [None]:
# Load the two result tables (tagged day1/day2), annotate each row with its
# gene-level GFF record, stack them and derive the filtered tables.
res = pd.read_csv(out_dir / "2023-08-15_lcm-alone-within-taxon-D1_vs_PBS_D1_l0a0.01_results.csv")
res["day"] = "day1"
res2 = pd.read_csv(out_dir / "2023-08-15_lcm-alone-within-taxon-D2_vs_PBS_D1_l0a0.01_results.csv")
res2["day"] = "day2"

# Only "gene" features are used for annotation.
gff = pr.read_gff3(root / config_dict["gff_file"]).as_df()
gff = gff.loc[gff["Feature"] == "gene"]

# Left joins keep every result row even when no GFF record matches its ID.
res = res.merge(gff, on="ID", how="left")
res2 = res2.merge(gff, on="ID", how="left")
res = pd.concat([res, res2])
res["genome"] = res["Chromosome"].replace(genome_map)

columns_to_report = [
    "ID",
    "baseMean",
    "log2FoldChange",
    "lfcSE",
    "stat",
    "pvalue",
    "padj",
    "day",
    "Chromosome",
    "Start",
    "End",
    "Strand",
    "Name",
    "locus_tag",
    "genome",
]
# Remove SL1344 from the analysis, not sure how to interpret these results;
# rows without a fold change or adjusted p-value are dropped as well.
res = (
    res[columns_to_report]
    .query("genome != 'SL1344'")
    .dropna(subset=["log2FoldChange", "padj"])
)

# Significant subset: |log2FC| > 1 and adjusted p-value < 0.05.
sres = res.query("abs(log2FoldChange) > 1 & padj < 0.05")

In [None]:
# Save the full and the significant-only result tables to the output folder.
for table, filename in (
    (res, "08-23_Sal-LCM-d1-d2_all-results.csv"),
    (sres, "08-23_Sal-LCM-d1-d2_significant-results.csv"),
):
    table.to_csv(out_dir / filename)

## YL32

In [None]:
# Run the per-strain processing for YL32 on the stacked day 1 / day 2 results.
# NOTE(review): process_strain is not defined in this notebook (presumably it
# comes from snippets.utils); what the five returned objects contain is only
# suggested by their names — confirm against the helper.
strain = 'YL32'
df32, lup32, ldown32, funcup32, funcdown32 = process_strain(res, strain, ncbi_taxid_map)

## YL58

In [None]:
# Run the per-strain processing for YL58 on the stacked day 1 / day 2 results.
# NOTE(review): process_strain is not defined in this notebook (presumably it
# comes from snippets.utils); what the five returned objects contain is only
# suggested by their names — confirm against the helper.
strain = 'YL58'
df58, lup58, ldown58, funcup58, funcdown58 = process_strain(res, strain, ncbi_taxid_map)

In [None]:
# Draw the YL58 `funcdown58` result with the reversed "Reds" colour scale.
# NOTE(review): func_graph is not defined in this notebook; the meaning of
# its `c` argument is assumed to be a colour-scale name — confirm.
func_graph(funcdown58, c='Reds_r')

# Compare Oligos and LCM mice

In [None]:
# Oligo vs LCM (PBS) results, annotated with the gene-level GFF records.
# The merge uses the default inner join, so rows without a matching GFF ID
# are dropped here.
oligo_vs_lcm_file = out_dir / "2023-07-31_oligo-lcm-within-taxon-Oligo_PBS_vs_LCM_PBS_D1_l0a0.01_results.csv"
df = pd.read_csv(oligo_vs_lcm_file).merge(gff, on="ID")
df["genome"] = df["Chromosome"].replace(genome_map)

In [None]:
# YL32 genes with log2FC > 1 and adjusted p < 0.05 in the Oligo vs LCM table.
# NOTE(review): a positive fold change presumably means higher in Oligo (the
# first-named group of the contrast) — confirm.
upregulated_oligo = (
    df.query("genome == 'YL32' & log2FoldChange > 1 & padj < 0.05")["locus_tag"].values
)
link_to_string(upregulated_oligo, ncbi_taxid_map["YL32"])

In [None]:
# YL32 genes with log2FC < -1 and adjusted p < 0.05 in the Oligo vs LCM table.
# NOTE(review): a negative fold change presumably means lower in Oligo (the
# first-named group of the contrast) — confirm.
downregulated_oligo = (
    df.query("genome == 'YL32' & log2FoldChange < - 1 & padj < 0.05")["locus_tag"].values
)
link_to_string(downregulated_oligo, ncbi_taxid_map["YL32"])

## Load counts normalised using both sets of mice

In [None]:
# Counts normalised across both sets of mice: drop rows with a total count
# of 100 or less, log2(x + 1)-transform, and turn `ID` back into a column.
norm_counts = pd.read_csv(
    out_dir / "2023-08-17_oligo-lcm-within-taxon-norm_cnts.csv",
    index_col=0,
).set_index("ID")
passes_count_filter = norm_counts.sum(axis=1) > 100
norm_counts = np.log2(norm_counts.loc[passes_count_filter] + 1).reset_index()

In [None]:
# Gene ids listed for YL32 fructoselysine genes, built from their locus
# numbers and written one per line to a text file in the output folder.
frs_locus_numbers = [
    "11185",
    "11195",
    "11255",
    "11215",
    "11265",
    "11190",
    "11235",
    "11205",
    "11230",
]
frs_genes = [f"gene-A4V08_{number}" for number in frs_locus_numbers]

with open(out_dir / "YL32_fructoselysine_genes.txt", "w") as gene_list:
    gene_list.writelines(f"{gene}\n" for gene in frs_genes)

# Run `seqtk subseq` with the gene list written above to get the sequences

In [None]:
# Long-format log counts for the listed genes, joined with the sample sheet.
in_gene_list = norm_counts["ID"].isin(frs_genes)
fdf = (
    norm_counts[in_gene_list]
    .melt(id_vars="ID", var_name="sample_id", value_name="log(norm cnts)")
    .merge(sd, on="sample_id")
)

# Strip the "gene-" prefix so only the locus tag is shown.
fdf["ID"] = fdf["ID"].str.replace("gene-", "")

# Keep only Oligo and LCM mice under the PBS / PBS_D1 treatments.
fdf = fdf.query("(Mouse == 'Oligo' | Mouse == 'LCM') & (Treatment == 'PBS_D1' | Treatment == 'PBS')")

In [None]:
# Per-gene box plot of log counts, split by mouse type, with every sample
# drawn as an individual point.
px.box(
    fdf.sort_values("ID"),
    x="ID",
    y="log(norm cnts)",
    points="all",
    color="Mouse",
    template="plotly_white",
)