In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read in the variant analysis results for the isolate sequencing

In [2]:

# Folder the isolate-sequencing variant CSVs are read from.
# The path is relative, so it resolves against the notebook's working directory.
data_path = Path("../../code/isolate_sequencing/results/filtered_variants")


In [3]:
# Collect only the variant tables (`<sample>.filtered.csv`). Matching every
# `*.csv` would also pick up files this notebook itself writes into the same
# folder (e.g. `m4_genes.csv`) the next time it is run from the top.
# Sorting makes the load order independent of the filesystem.
csv_files = sorted(data_path.glob('*.filtered.csv'))
if not csv_files:
    # Fail here rather than with an opaque KeyError in a later cell.
    raise FileNotFoundError(f"No '*.filtered.csv' files found in {data_path}")
# Read each CSV file into a dictionary of DataFrames keyed by the file stem
# (e.g. 'AceE_anc.filtered').
dataframes = {file.stem: pd.read_csv(file) for file in csv_files}


In [4]:

# Map short sample labels to the keys of `dataframes` (the CSV file stems).
# The mapping is spelled out by hand; the commented-out line below is an
# earlier attempt that derived the labels from the dictionary keys instead:
# name_to_file = {x.split('.')[0].replace('_iso','').replace('_anc', ''): x for x in dataframes.keys()}
name_to_file = {
    'sucB-M6': 'SucB_M6_iso.filtered',
    'aceE': 'AceE_anc.filtered',
    'sucB': 'SucB_anc.filtered',
    'aceE-M4': 'AceE_M4_iso.filtered',
    'sucB-M7': 'SucB_M7_iso.filtered',
    'aceE-M3': 'AceE_M3_iso.filtered',
    'all_samples': 'all_samples.filtered',
    'aceE-M2': 'AceE_M2_iso.filtered',
    'sucB-M5': 'SucB_M5_iso.filtered'}
# Labels (keys of name_to_file) of the isolates compared against the
# 'aceE' and 'sucB' ancestor tables respectively in the cells below.
aceE_evolved = ['aceE-M2', 'aceE-M3', 'aceE-M4']
sucB_evolved = ['sucB-M5', 'sucB-M6', 'sucB-M7']

## Remove the variants that are also present in the ancestors

In [None]:
def drop_ancestral_positions(variants, ancestor_positions, label):
    """Return ``variants`` without the rows whose ``pos`` occurs in the ancestor.

    Parameters
    ----------
    variants : pandas.DataFrame
        Variant table with a ``pos`` column.
    ancestor_positions : array-like
        Positions called in the ancestor; any row at one of these positions
        is removed.
    label : str
        Sample label used in the progress message printed for each removed row.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index labels kept) holding only the rows whose
        position is absent from ``ancestor_positions``.

    Notes
    -----
    Matching is by ``pos`` only: a different allele at an ancestral position
    is removed as well.
    """
    in_ancestor = variants['pos'].isin(ancestor_positions)
    for i, pos in variants.loc[in_ancestor, 'pos'].items():
        print(f"Removing {label} row {i} with pos {pos}")
    # .copy() so the result is an independent frame, not a slice of the input.
    return variants.loc[~in_ancestor].copy()


# aceE
aceE_df = dataframes[name_to_file['aceE']]
aceE_locations = aceE_df['pos'].unique()
for isolate in aceE_evolved:
    # Store the filtered table back under the same key so later cells that
    # look the isolate up in `dataframes` see the ancestor-free version.
    dataframes[name_to_file[isolate]] = drop_ancestral_positions(
        dataframes[name_to_file[isolate]], aceE_locations, isolate)

# sucB
sucB_df = dataframes[name_to_file['sucB']]
sucB_locations = sucB_df['pos'].unique()
for isolate in sucB_evolved:
    dataframes[name_to_file[isolate]] = drop_ancestral_positions(
        dataframes[name_to_file[isolate]], sucB_locations, isolate)

    

Removing aceE-M2 row 1 with pos 1622605
Removing aceE-M2 row 2 with pos 1664690
Removing aceE-M2 row 4 with pos 2719426
Removing aceE-M3 row 1 with pos 1622605
Removing aceE-M3 row 2 with pos 1664690
Removing aceE-M3 row 3 with pos 2719426
Removing aceE-M4 row 96 with pos 1622605
Removing aceE-M4 row 100 with pos 1664690
Removing aceE-M4 row 162 with pos 2719426
Removing sucB-M5 row 0 with pos 447145
Removing sucB-M5 row 2 with pos 2719426
Removing sucB-M6 row 0 with pos 447145
Removing sucB-M6 row 1 with pos 2719426
Removing sucB-M7 row 0 with pos 447145
Removing sucB-M7 row 3 with pos 2719426


In [6]:
# Look up the aceE-M4 isolate table, report its (rows, columns) shape and
# save the distinct values of its `gene` column as a one-column CSV.
# NOTE(review): the CSV is written into the same folder the input tables are
# read from; consider a separate output folder.
ace_M4_df = dataframes[name_to_file['aceE-M4']]
print(ace_M4_df.shape)
m4_genes = ace_M4_df['gene'].unique()
pd.Series(m4_genes).to_csv(data_path.joinpath('m4_genes.csv'), index=False)

(267, 15)


In [49]:
# Inspect the sucB-M6 isolate variant table currently stored in `dataframes`.
dataframes[name_to_file['sucB-M6']]

Unnamed: 0,chrom,pos,qual,depth,freq,alt,alt_count,ref,type,len,eff,gene,product,linegroup,sample
2,CP009273,3081560,6450.74,186,1.0,"Substitution(type_='SNV', value='A')",186,C,snp,1,,,,CP009273.3081560.SucB_M6_iso,SucB_M6_iso
3,CP009273,3782776,6329.78,179,1.0,"Substitution(type_='SNV', value='A')",179,C,snp,1,synonymous_variant,waaH,LPS(HepIII)-glucuronic acid glycosyltransferase,CP009273.3782776.SucB_M6_iso,SucB_M6_iso
4,CP009273,4209467,7773.93,220,1.0,"Substitution(type_='SNV', value='A')",220,C,snp,1,missense_variant,aceK,isocitrate dehydrogenase kinase/phosphatase,CP009273.4209467.SucB_M6_iso,SucB_M6_iso
5,CP009273,4538904,7974.41,235,0.995745,"Substitution(type_='SNV', value='T')",234,G,snp,1,missense_variant,fimH,minor component of type 1 fimbriae,CP009273.4538904.SucB_M6_iso,SucB_M6_iso


In [8]:
# Inspect the sucB-M7 isolate variant table currently stored in `dataframes`.
dataframes[name_to_file['sucB-M7']]

Unnamed: 0,chrom,pos,qual,depth,freq,alt,alt_count,ref,type,len,eff,gene,product,linegroup,sample
1,CP009273,752345,6971.83,198,1.0,"Substitution(type_='SNV', value='T')",198,C,snp,1,missense_variant,sdhA,"succinate dehydrogenase, flavoprotein subunit",CP009273.752345.SucB_M7_iso,SucB_M7_iso
2,CP009273,2234612,3502.85,112,1.0,"Substitution(type_='INDEL', value='AG')",112,ATACGTTGATG,del,9,disruptive_inframe_deletion,galS,galactose- and fucose-inducible galactose regu...,CP009273.2234612.SucB_M7_iso,SucB_M7_iso
4,CP009273,3782776,8405.95,238,1.0,"Substitution(type_='SNV', value='A')",238,C,snp,1,synonymous_variant,waaH,LPS(HepIII)-glucuronic acid glycosyltransferase,CP009273.3782776.SucB_M7_iso,SucB_M7_iso
5,CP009273,4209467,7904.26,224,1.0,"Substitution(type_='SNV', value='A')",224,C,snp,1,missense_variant,aceK,isocitrate dehydrogenase kinase/phosphatase,CP009273.4209467.SucB_M7_iso,SucB_M7_iso


# Export all isolate data

In [59]:
# Build the combined aceE isolate table before previewing it, so this cell
# runs on a fresh kernel instead of relying on a later cell having defined
# `aceE_isolates` first.
aceE_isolates = pd.concat([dataframes[name_to_file[isolate]] for isolate in aceE_evolved])
aceE_isolates

Unnamed: 0,chrom,pos,qual,depth,freq,alt,alt_count,ref,type,len,eff,gene,product,linegroup,sample
0,CP009273,904878,7494.39,214,0.995327,"Substitution(type_='SNV', value='A')",213,G,snp,1,missense_variant,poxB,"pyruvate dehydrogenase (pyruvate oxidase), thi...",CP009273.904878.AceE_M2_iso,AceE_M2_iso
3,CP009273,2235029,5943.62,168,1.000000,"Substitution(type_='SNV', value='C')",168,A,snp,1,missense_variant,galS,galactose- and fucose-inducible galactose regu...,CP009273.2235029.AceE_M2_iso,AceE_M2_iso
5,CP009273,3528650,6552.52,186,1.000000,"Substitution(type_='SNV', value='A')",186,G,snp,1,missense_variant,envZ,sensory histidine kinase in two-component regu...,CP009273.3528650.AceE_M2_iso,AceE_M2_iso
6,CP009273,3574489,6674.87,189,1.000000,"Substitution(type_='SNV', value='T')",189,G,snp,1,,,,CP009273.3574489.AceE_M2_iso,AceE_M2_iso
0,CP009273,696432,7277.51,209,1.000000,"Substitution(type_='SNV', value='A')",209,C,snp,1,missense_variant,nagC,N-acetylglucosamine-inducible nag divergent op...,CP009273.696432.AceE_M3_iso,AceE_M3_iso
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,CP009273,4532543,7297.24,206,1.000000,"Substitution(type_='SNV', value='T')",206,C,snp,1,,,,CP009273.4532543.AceE_M4_iso,AceE_M4_iso
266,CP009273,4538883,8557.67,246,0.995935,"Substitution(type_='SNV', value='T')",245,G,snp,1,missense_variant,fimH,minor component of type 1 fimbriae,CP009273.4538883.AceE_M4_iso,AceE_M4_iso
267,CP009273,4559788,8907.80,258,1.000000,"Substitution(type_='SNV', value='T')",258,C,snp,1,,,,CP009273.4559788.AceE_M4_iso,AceE_M4_iso
268,CP009273,4579955,9584.67,275,0.992727,"Substitution(type_='SNV', value='C')",273,A,snp,1,missense_variant,yjiY,putative transporter,CP009273.4579955.AceE_M4_iso,AceE_M4_iso


In [65]:
# Stack the per-isolate tables of each lineage into a single frame.
aceE_frames = [dataframes[name_to_file[isolate]] for isolate in aceE_evolved]
sucB_frames = [dataframes[name_to_file[isolate]] for isolate in sucB_evolved]
aceE_isolates = pd.concat(aceE_frames)
sucB_isolates = pd.concat(sucB_frames)

# Columns kept in the exported spreadsheets, in output order.
export_columns = ['sample', 'pos', 'chrom', 'gene', 'freq', 'type', 'len', 'eff', 'product']

# Written one level above the input folder. The row index is written as the
# first CSV column because `index=False` is not passed.
# NOTE(review): index labels come from the per-isolate tables and repeat after
# concatenation; confirm whether that column is wanted.
aceE_isolates[export_columns].to_csv(data_path.parent / 'aceE_variants.csv')
sucB_isolates[export_columns].to_csv(data_path.parent / 'sucB_variants.csv')

# Read in the variants from the meta sequencing

In [9]:
# Folder the meta-sequencing variant CSVs are read from (relative path,
# resolved against the notebook's working directory).
meta_folder = Path('../../code/meta_sequencing/results/fixed_filtered_variants')

In [34]:
# Collect only the variant tables (`<sample>.filtered.csv`). Matching every
# `*.csv` would also load the gene lists this notebook writes into the same
# folder (`sucB_genes.csv`, `aceE_genes.csv`) when it is re-run.
# Sorting makes the load order independent of the filesystem.
meta_csv_files = sorted(meta_folder.glob('*.filtered.csv'))
if not meta_csv_files:
    # Fail here rather than with an opaque KeyError in a later cell.
    raise FileNotFoundError(f"No '*.filtered.csv' files found in {meta_folder}")
# Read each CSV file into a dictionary of DataFrames keyed by the file stem
# (e.g. 'all_samples.filtered').
meta_dataframes = {file.stem: pd.read_csv(file) for file in meta_csv_files}


In [35]:
# List which meta-sequencing tables were loaded (keys are the CSV file stems).
meta_dataframes.keys()

dict_keys(['SucB_M6.filtered', 'AceE_M4_D44.filtered', 'SucB_M7.filtered', 'AceE_M3_D44.filtered', 'SucB_M5.filtered', 'all_samples.filtered', 'AceE_M2_D44.filtered'])

In [36]:
# Use the combined table (file stem 'all_samples.filtered') for the steps
# below; the lineage subsets are selected from its `sample` column.
all_meta_df = meta_dataframes['all_samples.filtered']

In [37]:
# Split the combined meta-sequencing table by lineage using the sample name.
# The subsets are modified further down, so take explicit copies: this keeps
# `all_meta_df` untouched and avoids pandas' SettingWithCopyWarning when rows
# are later removed from the subsets.
aceE_all = all_meta_df.loc[all_meta_df['sample'].str.contains('AceE'), :].copy()
sucB_all = all_meta_df.loc[all_meta_df['sample'].str.contains('SucB'), :].copy()

# Remove variants also present in the ancestors

In [39]:
# Drop meta-sequencing variants whose position is also called in the matching
# ancestor isolate. Matching is by `pos` only. The subsets are rebuilt from a
# boolean mask rather than dropped row by row in place, so the frame is never
# mutated while it is being iterated and re-running the cell is harmless.

#aceE
aceE_df = dataframes[name_to_file['aceE']]
aceE_locations = aceE_df['pos'].unique()
aceE_in_ancestor = aceE_all['pos'].isin(aceE_locations)
# Report each row that is about to be removed.
for i, row in aceE_all.loc[aceE_in_ancestor].iterrows():
    print(f"Removing {row['sample']} row {i} with pos {row['pos']}, gene {row['gene']}")
aceE_all = aceE_all.loc[~aceE_in_ancestor].copy()

#sucB
sucB_df = dataframes[name_to_file['sucB']]
sucB_locations = sucB_df['pos'].unique()
sucB_in_ancestor = sucB_all['pos'].isin(sucB_locations)
for i, row in sucB_all.loc[sucB_in_ancestor].iterrows():
    print(f"Removing {row['sample']} row {i} with pos {row['pos']}, gene {row['gene']}")
sucB_all = sucB_all.loc[~sucB_in_ancestor].copy()

        

    

# Export spreadsheet with the most relevant data


In [64]:
# Columns kept in the reduced spreadsheets, in output order.
reduced_columns = ['sample', 'pos', 'chrom', 'gene', 'freq', 'ref', 'alt', 'len', 'type', 'eff', 'product']

# Trim each lineage table to those columns and write it one level above the
# meta-sequencing input folder, without the row index.
sucB_all_reduced = sucB_all[reduced_columns]
aceE_all_reduced = aceE_all[reduced_columns]
sucB_all_reduced.to_csv(meta_folder.parent / 'sucB_all_fixed_filtered.csv', index=False)
aceE_all_reduced.to_csv(meta_folder.parent / 'aceE_all_fixed_filtered.csv', index=False)

In [44]:
# Rows of the reduced sucB table whose sample name contains 'M5'.
sucB_all_reduced[sucB_all_reduced['sample'].str.contains('M5')]

Unnamed: 0,sample,pos,gene,ref,alt,eff,product


In [40]:

# Gene names that occur on more than one row of the combined meta-sequencing
# table. Missing gene names are compared as equal, so NaN can appear here too.
unique_duplicated_genes = all_meta_df.loc[all_meta_df['gene'].duplicated(), 'gene'].unique()

# Show the repeated gene names
print(unique_duplicated_genes)

[nan 'ydbD' 'galS' 'waaH' 'aceK' 'ldhA' 'ynfM' 'ilvG']


In [26]:
# Up to 40 sucB meta-sequencing rows annotated with the gene waaH.
sucB_all[sucB_all['gene'] == 'waaH'].head(40)

Unnamed: 0,chrom,pos,qual,depth,freq,alt,alt_count,ref,type,len,eff,gene,product,linegroup,sample
103,CP009273,3782776,69202.2,1980,0.991414,"Substitution(type_='SNV', value='A')",1963,C,snp,1,synonymous_variant,waaH,LPS(HepIII)-glucuronic acid glycosyltransferase,CP009273.3782776.SucB_M6,SucB_M6
134,CP009273,3782776,92280.5,2659,0.987965,"Substitution(type_='SNV', value='A')",2627,C,snp,1,synonymous_variant,waaH,LPS(HepIII)-glucuronic acid glycosyltransferase,CP009273.3782776.SucB_M7,SucB_M7


In [22]:
# Gene names that occur on more than one row of the sucB table (NaN included).
sucB_all.loc[sucB_all['gene'].duplicated(), 'gene'].unique()

array([nan, 'ydbD', 'galS', 'waaH', 'aceK'], dtype=object)

In [108]:
# Save the distinct values of the `gene` column for each lineage as
# one-column CSVs.
# NOTE(review): these files are written into the same folder the input tables
# are read from; consider a separate output folder.
pd.Series(sucB_all['gene'].unique()).to_csv(meta_folder.joinpath('sucB_genes.csv'), index=False)
pd.Series(aceE_all['gene'].unique()).to_csv(meta_folder.joinpath('aceE_genes.csv'), index=False)

In [106]:
# All rows of the combined meta-sequencing table annotated with the gene ydbD.
all_meta_df[all_meta_df['gene'] == 'ydbD']

Unnamed: 0,chrom,pos,qual,depth,freq,alt,alt_count,ref,type,len,eff,gene,product,linegroup,sample
47,CP009273,1470490,57737.3,1705,0.980059,"Substitution(type_='SNV', value='A')",1671,C,snp,1,missense_variant,ydbD,PF10971 family putative periplasmic methylglyo...,CP009273.1470490.SucB_M6,SucB_M6
48,CP009273,1471105,62035.6,1818,0.981848,"Substitution(type_='SNV', value='C')",1785,A,snp,1,missense_variant,ydbD,PF10971 family putative periplasmic methylglyo...,CP009273.1471105.SucB_M6,SucB_M6
