# Process bulk counts table

1. Annotate columns (samples) so that each sample name encodes its time point and replicate
2. Remove rRNA genes
3. Perform TPM normalization and, as an alternative, log2(x + 1)-transform the rRNA-depleted raw counts

In [1]:
# Use miniconda environment Jupyter_new for running this notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from tools import *
import warnings
warnings.filterwarnings('ignore')

## 1 Load the dataset and annotation

In [2]:
bulkPath = '../nf_output/countData/countData.tsv'
metaPath = '../Leskinen_SraRunTable.csv' # metadata from SRA
gffPath = '../nf_output/alignments/dualGenome.gff3' # output from nf pipeline

In [3]:
# Load data
df_initial = pd.read_csv(bulkPath, sep = '\t', comment='#', index_col=0)
metadata = pd.read_csv(metaPath)

In [4]:
df_initial

Unnamed: 0_level_0,Chr,Start,End,Strand,Length,SRR3110979_sorted.bam,SRR3110978_sorted.bam,SRR3110971_sorted.bam,SRR3110976_sorted.bam,SRR3110977_sorted.bam,SRR3110975_sorted.bam,SRR3110972_sorted.bam,SRR3110970_sorted.bam,SRR3110973_sorted.bam,SRR3110974_sorted.bam
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
gene-Y11_RS00005,NC_017564.1,13,720,+,708,942,1014,1126,1027,1287,896,1260,2219,1088,999
gene-Y11_RS00010,NC_017564.1,768,1121,-,354,895,1079,1602,1332,1548,1274,1803,2681,2071,2085
gene-Y11_RS00015,NC_017564.1,1133,2659,-,1527,3419,3304,5608,4173,4676,4367,6104,9865,6212,6030
gene-Y11_RS00020,NC_017564.1,3012,4076,+,1065,2199,2200,928,2031,2447,1959,1232,1273,1901,2277
gene-Y11_RS00025,NC_017564.1,4441,4911,-,471,4178,4353,8626,5700,5432,6333,8221,11799,9405,8311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gene-phiR1-37_gp362,NC_016163.1,261054,261311,+,258,3290,2994,1180,3710,4073,4639,2823,0,4960,4371
gene-phiR1-37_gp363,NC_016163.1,261308,261562,+,255,2988,2660,1044,3280,3658,4427,2572,1,4878,4247
gene-phiR1-37_gp364,NC_016163.1,261543,261836,+,294,4393,4207,1202,4068,4775,4911,2808,3,5375,4340
gene-phiR1-37_gp365,NC_016163.1,261848,262075,+,228,4530,5228,1192,4216,5638,4281,3014,5,5399,4312


In [5]:
metadata

Unnamed: 0,Run,Assay Type,AvgSpotLen,bacterial_strain,Bases,BioProject,BioSample,Bytes,Center Name,Consent,...,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study
0,SRR3110970,RNA-Seq,202,rough derivative of the serotype O:3,7260348236,PRJNA309384,SAMN04432267,4573838725,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-22T19:38:00Z,1,GSM2043432,bacterium and bacteriophage,SRP068705
1,SRR3110971,RNA-Seq,202,rough derivative of the serotype O:3,5928925028,PRJNA309384,SAMN04432268,4173003752,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-22T19:37:00Z,1,GSM2043433,bacterium and bacteriophage,SRP068705
2,SRR3110972,RNA-Seq,202,rough derivative of the serotype O:3,6895537650,PRJNA309384,SAMN04432269,4848605427,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:25:00Z,1,GSM2043434,bacterium and bacteriophage,SRP068705
3,SRR3110973,RNA-Seq,202,rough derivative of the serotype O:3,8593758720,PRJNA309384,SAMN04432270,6045938494,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:29:00Z,1,GSM2043435,bacterium and bacteriophage,SRP068705
4,SRR3110974,RNA-Seq,202,rough derivative of the serotype O:3,9674268436,PRJNA309384,SAMN04432271,6790618356,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:54:00Z,1,GSM2043436,bacterium and bacteriophage,SRP068705
5,SRR3110975,RNA-Seq,202,rough derivative of the serotype O:3,7106122852,PRJNA309384,SAMN04432272,4992084132,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:16:00Z,1,GSM2043437,bacterium and bacteriophage,SRP068705
6,SRR3110976,RNA-Seq,186,rough derivative of the serotype O:3,6104684052,PRJNA309384,SAMN04432273,4020768584,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:23:00Z,1,GSM2043438,bacterium and bacteriophage,SRP068705
7,SRR3110977,RNA-Seq,186,rough derivative of the serotype O:3,6473153772,PRJNA309384,SAMN04432274,4254941362,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:27:00Z,1,GSM2043439,bacterium and bacteriophage,SRP068705
8,SRR3110978,RNA-Seq,186,rough derivative of the serotype O:3,5044442388,PRJNA309384,SAMN04432275,3328426264,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:15:00Z,1,GSM2043440,bacterium and bacteriophage,SRP068705
9,SRR3110979,RNA-Seq,186,rough derivative of the serotype O:3,4690130616,PRJNA309384,SAMN04432276,3086419707,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:22:00Z,1,GSM2043441,bacterium and bacteriophage,SRP068705


## 2 Format the dataset

### 2.1 Annotate sample names

The metadata from SRA does not contain descriptive sample names. They are therefore added manually as an additional sample-name column in the metadata.

In [6]:
# Match GSM IDs and SampleNames inferred from GEO

sampleDict = {'GSM2043432': '0_R1',  'GSM2043433': '2_R1',  'GSM2043434': '5_R1',  'GSM2043435': '10_R1',
              'GSM2043436': '15_R1', 'GSM2043437': '21_R1', 'GSM2043438': '28_R1', 'GSM2043439': '35_R1',
              'GSM2043440': '42_R1', 'GSM2043441': '49_R1'}

In [7]:
metadata.head()

Unnamed: 0,Run,Assay Type,AvgSpotLen,bacterial_strain,Bases,BioProject,BioSample,Bytes,Center Name,Consent,...,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study
0,SRR3110970,RNA-Seq,202,rough derivative of the serotype O:3,7260348236,PRJNA309384,SAMN04432267,4573838725,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-22T19:38:00Z,1,GSM2043432,bacterium and bacteriophage,SRP068705
1,SRR3110971,RNA-Seq,202,rough derivative of the serotype O:3,5928925028,PRJNA309384,SAMN04432268,4173003752,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-22T19:37:00Z,1,GSM2043433,bacterium and bacteriophage,SRP068705
2,SRR3110972,RNA-Seq,202,rough derivative of the serotype O:3,6895537650,PRJNA309384,SAMN04432269,4848605427,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:25:00Z,1,GSM2043434,bacterium and bacteriophage,SRP068705
3,SRR3110973,RNA-Seq,202,rough derivative of the serotype O:3,8593758720,PRJNA309384,SAMN04432270,6045938494,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:29:00Z,1,GSM2043435,bacterium and bacteriophage,SRP068705
4,SRR3110974,RNA-Seq,202,rough derivative of the serotype O:3,9674268436,PRJNA309384,SAMN04432271,6790618356,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:54:00Z,1,GSM2043436,bacterium and bacteriophage,SRP068705


In [8]:
metadataFull = annotateData(metadata, sampleDict)

In [9]:
metadata.head(1)

Unnamed: 0,Run,Assay Type,AvgSpotLen,bacterial_strain,Bases,BioProject,BioSample,Bytes,Center Name,Consent,...,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study
0,SRR3110970,RNA-Seq,202,rough derivative of the serotype O:3,7260348236,PRJNA309384,SAMN04432267,4573838725,GEO,public,...,cDNA,TRANSCRIPTOMIC,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-22T19:38:00Z,1,GSM2043432,bacterium and bacteriophage,SRP068705


In [10]:
metadataFull.head(3)

Unnamed: 0_level_0,Run,Assay Type,AvgSpotLen,bacterial_strain,Bases,BioProject,BioSample,Bytes,Center Name,Consent,...,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study,SampleID,SampleNames
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR3110970_sorted.bam,SRR3110970,RNA-Seq,202,rough derivative of the serotype O:3,7260348236,PRJNA309384,SAMN04432267,4573838725,GEO,public,...,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-22T19:38:00Z,1,GSM2043432,bacterium and bacteriophage,SRP068705,SRR3110970_sorted.bam,0_R1
SRR3110971_sorted.bam,SRR3110971,RNA-Seq,202,rough derivative of the serotype O:3,5928925028,PRJNA309384,SAMN04432268,4173003752,GEO,public,...,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-22T19:37:00Z,1,GSM2043433,bacterium and bacteriophage,SRP068705,SRR3110971_sorted.bam,2_R1
SRR3110972_sorted.bam,SRR3110972,RNA-Seq,202,rough derivative of the serotype O:3,6895537650,PRJNA309384,SAMN04432269,4848605427,GEO,public,...,Yersinia enterocolitica,ILLUMINA,2016-01-24T00:00:00Z,2016-01-21T09:25:00Z,1,GSM2043434,bacterium and bacteriophage,SRP068705,SRR3110972_sorted.bam,5_R1


Add correct sample names.

In [11]:
# The first five columns of the count table (Chr, Start, End, Strand, Length) are
# annotation; everything from position 5 onwards is one count column per sample.
sample_order = ['0_R1', '2_R1', '5_R1', '10_R1', '15_R1', '21_R1', '28_R1', '35_R1', '42_R1', '49_R1']
counts_only = df_initial.iloc[:, 5:]

# Rename the BAM-file columns to sample names and put them in chronological order.
df = changeColnames(counts_only, metadataFull)[sample_order]
df

SampleNames,0_R1,2_R1,5_R1,10_R1,15_R1,21_R1,28_R1,35_R1,42_R1,49_R1
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
gene-Y11_RS00005,2219,1126,1260,1088,999,896,1027,1287,1014,942
gene-Y11_RS00010,2681,1602,1803,2071,2085,1274,1332,1548,1079,895
gene-Y11_RS00015,9865,5608,6104,6212,6030,4367,4173,4676,3304,3419
gene-Y11_RS00020,1273,928,1232,1901,2277,1959,2031,2447,2200,2199
gene-Y11_RS00025,11799,8626,8221,9405,8311,6333,5700,5432,4353,4178
...,...,...,...,...,...,...,...,...,...,...
gene-phiR1-37_gp362,0,1180,2823,4960,4371,4639,3710,4073,2994,3290
gene-phiR1-37_gp363,1,1044,2572,4878,4247,4427,3280,3658,2660,2988
gene-phiR1-37_gp364,3,1202,2808,5375,4340,4911,4068,4775,4207,4393
gene-phiR1-37_gp365,5,1192,3014,5399,4312,4281,4216,5638,5228,4530


### 2.2 Remove rRNA genes

Consult gff3 file to get rRNA geneids.

In [12]:
# Load gff3 and extract the gene features.
# The first 5 lines of the file are skipped (presumably header/pragma lines -- TODO confirm).
gff3 = pd.read_csv(gffPath, sep='\t', header = None, skiprows = 5)
# Column names follow the GFF3 column order: score is column 6, phase is column 8.
# (Both are '.' for gene rows, so the gene table values are unaffected by the naming.)
gff3.columns=["seq_id", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
gff3_genes = gff3.loc[gff3["type"] == 'gene']

# Column formatting for genes: parse the "key=value;key=value" attribute string
# into one dict per gene and keep a fixed set of keys as columns.
gff3_genes = gff3_genes.reset_index(drop=True)
dct_genes = gff3_genes["attributes"].str.split(';').apply(lambda items: dict(item.split('=', 1) for item in items if '=' in item))
cols_to_keep = ['ID', 'Name', 'gbkey', 'gene_biotype', 'locus_tag', 'gene']
gff3_genes = pd.concat([gff3_genes, pd.json_normalize(dct_genes)[cols_to_keep]], axis=1)

# Generate locus_tag -> product dictionary over all different feature types.
# Split on the first '=' only, so a value that itself contains '=' is kept whole
# (same parsing rule as for the gene rows above).
attrs = gff3["attributes"].str.split(";", expand=True)
attrs_dicts = attrs.apply(lambda row: dict(item.split("=", 1) for item in row if "=" in str(item)), axis=1)
attrs_df = pd.json_normalize(attrs_dicts)
attrs_df = attrs_df.dropna(subset=["locus_tag", "product"])
locus_product_dict = dict(zip(attrs_df["locus_tag"], attrs_df["product"]))

# Generate Parent -> product dictionary over all different feature types
attrs_parent_df = attrs_df.dropna(subset=["Parent", "product"])
parent_product_dict = dict(zip(attrs_parent_df["Parent"], attrs_parent_df["product"]))

# Add gene product. The Parent-based lookup (child feature -> gene ID) stays the primary
# source; the locus_tag-based lookup is only used where the Parent lookup found nothing.
# Previously the locus_tag result was computed and then overwritten unconditionally.
product_by_parent = gff3_genes["ID"].map(parent_product_dict)
product_by_locus_tag = gff3_genes["locus_tag"].map(locus_product_dict)
gff3_genes["product"] = product_by_parent.fillna(product_by_locus_tag)
# If no product is stated in the gff3 at all, fall back to the gene_biotype
gff3_genes["product"] = gff3_genes["product"].fillna(gff3_genes["gene_biotype"])

# If gene = NA, take from ID column
gff3_genes["gene"] = gff3_genes["gene"].fillna(gff3_genes["ID"])

# Drop attributes column (its content now lives in the parsed columns)
gff3_genes = gff3_genes.drop(["attributes"], axis=1)

In [13]:
pharokka_path = "../../../2025-12_reannotation_phage_genomes/Pharokka_proteins_phages_out/NC_016163.1_out/pharokka_proteins_full_merged_output.tsv"
gff3_genes = add_pharokka(gff3_genes, pharokka_path)
gff3_genes.loc[gff3_genes['seq_id'] == "NC_016163.1"]

Unnamed: 0,seq_id,source,type,start,end,phase,strand,score,ID,Name,gbkey,gene_biotype,locus_tag,gene,product,annot,PHROG,category
4003,NC_016163.1,RefSeq,gene,2.0,262389.0,.,+,.,gene-phiR1-37_gp367,g367,Gene,protein_coding,phiR1-37_gp367,g367,hypothetical protein,hypothetical protein,No_PHROG,unknown function
4004,NC_016163.1,RefSeq,gene,103.0,375.0,.,+,.,gene-phiR1-37_gp001,g001,Gene,protein_coding,phiR1-37_gp001,g001,hypothetical protein,hypothetical protein,No_PHROG,unknown function
4005,NC_016163.1,RefSeq,gene,326.0,577.0,.,+,.,gene-phiR1-37_gp002,g002,Gene,protein_coding,phiR1-37_gp002,g002,hypothetical protein,hypothetical protein,No_PHROG,unknown function
4006,NC_016163.1,RefSeq,gene,564.0,812.0,.,+,.,gene-phiR1-37_gp003,g003,Gene,protein_coding,phiR1-37_gp003,g003,hypothetical protein,hypothetical protein,No_PHROG,unknown function
4007,NC_016163.1,RefSeq,gene,784.0,1002.0,.,+,.,gene-phiR1-37_gp004,g004,Gene,protein_coding,phiR1-37_gp004,g004,hypothetical protein,hypothetical protein,No_PHROG,unknown function
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4370,NC_016163.1,RefSeq,gene,261054.0,261311.0,.,+,.,gene-phiR1-37_gp362,g362,Gene,protein_coding,phiR1-37_gp362,g362,hypothetical protein,hypothetical protein,No_PHROG,unknown function
4371,NC_016163.1,RefSeq,gene,261308.0,261562.0,.,+,.,gene-phiR1-37_gp363,g363,Gene,protein_coding,phiR1-37_gp363,g363,hypothetical protein,hypothetical protein,No_PHROG,unknown function
4372,NC_016163.1,RefSeq,gene,261543.0,261836.0,.,+,.,gene-phiR1-37_gp364,g364,Gene,protein_coding,phiR1-37_gp364,g364,hypothetical protein,hypothetical protein,No_PHROG,unknown function
4373,NC_016163.1,RefSeq,gene,261848.0,262075.0,.,+,.,gene-phiR1-37_gp365,g365,Gene,protein_coding,phiR1-37_gp365,g365,hypothetical protein,hypothetical protein,No_PHROG,unknown function


In [14]:
attrs_parent_df["Parent"]

2          gene-Y11_RS00005
4          gene-Y11_RS00010
6          gene-Y11_RS00015
8          gene-Y11_RS00020
10         gene-Y11_RS00025
               ...         
9881    gene-phiR1-37_gp362
9884    gene-phiR1-37_gp363
9887    gene-phiR1-37_gp364
9889    gene-phiR1-37_gp365
9892    gene-phiR1-37_gp366
Name: Parent, Length: 4722, dtype: object

In [15]:
gff3_genes[gff3_genes["seq_id"] == "NC_017564.1"]

Unnamed: 0,seq_id,source,type,start,end,phase,strand,score,ID,Name,gbkey,gene_biotype,locus_tag,gene,product,annot,PHROG,category
0,NC_017564.1,RefSeq,gene,13.0,720.0,.,+,.,gene-Y11_RS00005,hda,Gene,protein_coding,Y11_RS00005,hda,DnaA inactivator Hda,DnaA inactivator Hda,No_PHROG,protein_coding
1,NC_017564.1,RefSeq,gene,768.0,1121.0,.,-,.,gene-Y11_RS00010,arsC,Gene,protein_coding,Y11_RS00010,arsC,arsenate reductase (glutaredoxin),arsenate reductase (glutaredoxin),No_PHROG,protein_coding
2,NC_017564.1,RefSeq,gene,1133.0,2659.0,.,-,.,gene-Y11_RS00015,Y11_RS00015,Gene,protein_coding,Y11_RS00015,gene-Y11_RS00015,tetratricopeptide repeat protein,tetratricopeptide repeat protein,No_PHROG,protein_coding
3,NC_017564.1,RefSeq,gene,3012.0,4076.0,.,+,.,gene-Y11_RS00020,Y11_RS00020,Gene,protein_coding,Y11_RS00020,gene-Y11_RS00020,AI-2E family transporter,AI-2E family transporter,No_PHROG,protein_coding
4,NC_017564.1,RefSeq,gene,4441.0,4911.0,.,-,.,gene-Y11_RS00025,bcp,Gene,protein_coding,Y11_RS00025,bcp,thioredoxin-dependent thiol peroxidase,thioredoxin-dependent thiol peroxidase,No_PHROG,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3928,NC_017564.1,RefSeq,gene,4548674.0,4549219.0,.,-,.,gene-Y11_RS20670,speG,Gene,protein_coding,Y11_RS20670,speG,spermidine N1-acetyltransferase,spermidine N1-acetyltransferase,No_PHROG,protein_coding
3929,NC_017564.1,RefSeq,gene,4549355.0,4549993.0,.,-,.,gene-Y11_RS20675,purN,Gene,protein_coding,Y11_RS20675,purN,phosphoribosylglycinamide formyltransferase,phosphoribosylglycinamide formyltransferase,No_PHROG,protein_coding
3930,NC_017564.1,RefSeq,gene,4550034.0,4551077.0,.,-,.,gene-Y11_RS20680,purM,Gene,protein_coding,Y11_RS20680,purM,phosphoribosylformylglycinamidine cyclo-ligase,phosphoribosylformylglycinamidine cyclo-ligase,No_PHROG,protein_coding
3931,NC_017564.1,RefSeq,gene,4551311.0,4551937.0,.,+,.,gene-Y11_RS20685,upp,Gene,protein_coding,Y11_RS20685,upp,uracil phosphoribosyltransferase,uracil phosphoribosyltransferase,No_PHROG,protein_coding


In [16]:
# Load gff3 file and keep only the gene features (column 2 holds the feature type)

gff3 = pd.read_csv(gffPath, sep='\t', header = None, skiprows = 5)
# .copy() makes the filtered table an independent frame, so the column assignments
# below do not write into a slice of the freshly loaded table.
gff3 = gff3.loc[gff3.iloc[:,2] == 'gene'].copy()


# Format some new columns: pull single attributes out of the attribute string (column 8)
# by splitting on "<key>=" and cutting the remainder at the next ';'
gff3['ID'] = pd.DataFrame(gff3.iloc[:,8].str.split('ID=', expand = True)).iloc[:,1].str.split(';', expand = True).iloc[:,0]
gff3['GeneType'] = pd.DataFrame(gff3.iloc[:,8].str.split('gene_biotype=', expand = True)).iloc[:,1].str.split(';', expand = True).iloc[:,0]
gff3['Symbol'] = pd.DataFrame(gff3.iloc[:,8].str.split('gene=', expand = True)).iloc[:,1].str.split(';', expand = True).iloc[:,0]

# Add entity host and phage, keyed by the sequence ID in column 0.
# NOTE(review): 'NC_017565.1' follows the accession pattern of the other two keys; the
# original key 'NC_017565.1.1' is kept as well in case the pipeline really emits that ID.
# Sequence IDs that match no key get a missing Entity -- confirm against the gff3.
entity = {'NC_017564.1' : 'host', 'NC_017565.1' : 'host', 'NC_017565.1.1' : 'host', 'NC_016163.1' : 'phage'}
gff3['Entity'] = gff3[0].map(entity)
gff3.index = gff3['ID']
# Gene IDs annotated with gene_biotype rRNA; used for the in silico rRNA depletion
rRNAs = gff3.loc[gff3['GeneType'] == 'rRNA', 'ID'].tolist()

Perform in silico rRNA depletion.

In [17]:
df_norRNAs = rRNAdepletion(df,rRNAs)
df_norRNAs

SampleNames,0_R1,2_R1,5_R1,10_R1,15_R1,21_R1,28_R1,35_R1,42_R1,49_R1
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
gene-Y11_RS20515,395,106,156,363,264,316,420,483,522,358
gene-Y11_RS20385,5794,3861,4110,5568,5851,4211,3894,3775,2735,2599
gene-phiR1-37_gp074,6,1167,1965,5133,6251,6902,9850,11519,10874,12482
gene-Y11_RS01175,4398,2145,3540,3302,3122,2468,2025,2056,1931,1635
gene-Y11_RS09305,959,1021,1166,1317,1788,1367,1327,1404,1167,1202
...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,32810,23304,26234,35456,33127,21715,22206,21446,17168,15148
gene-Y11_RS15790,2691,1561,1894,1903,1841,1217,1224,1144,919,782
gene-Y11_RS19470,3138,2253,2257,2804,2669,2032,2246,2181,1825,1719
gene-Y11_RS12990,378,166,192,466,716,622,734,896,473,642


### 2.3 Read count normalization

Important note: gene symbols not available for most genes.

In [18]:
# Function to fill in missing symbols by geneid.

def fillSymbols(df):
    """Return a copy of ``df`` in which missing values of the last column are
    replaced by the corresponding row's index label.

    In this notebook the last column is ``Symbol`` and the index holds the gene
    IDs, so genes without a symbol get their gene ID as symbol.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column may contain missing entries. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the last column filled.

    Notes
    -----
    Both ``None`` and ``NaN`` count as missing. The previous ``== None`` check
    only caught ``None``, so symbols that pandas represents as ``NaN`` were
    silently left unfilled.
    """
    df_new = df.copy()
    if df_new.shape[1] == 0:
        # No column to fill.
        return df_new

    # Work on a writable object array so index labels of any type can be stored.
    symbols = df_new.iloc[:, -1].to_numpy(dtype=object, copy=True)
    missing = pd.isna(symbols)
    if missing.any():
        symbols[missing] = df_new.index.to_numpy()[missing]
        df_new.iloc[:, -1] = symbols
    return df_new

Convert counts to TPM.

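However, the gff3 file contains a duplicated gene name, so the standard entity/symbol mapping is not possible. One row would be removed from the gff3 file, as it is not relevant for the mapping; note that the de-duplication code in the next cell is currently commented out.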

In [19]:
#gff3 = gff3.loc[~gff3.index.duplicated(keep='first')]
#gff3.shape[0]

In [20]:
# Convert the rRNA-depleted counts to TPM. TPM comes from the star-imported tools module.
# NOTE(review): df_initial is presumably passed for the gene lengths, and the meaning of
# the third argument (0.5) is not visible in this notebook -- confirm both in tools.
tpms = TPM(df_norRNAs, df_initial, 0.5)

# Attach entity (host/phage) and gene symbol from the gff3 gene table, which is indexed by gene ID.
# Column assignment aligns on the index, so the sorted() lookup order does not change
# which gene receives which value.
tpms['Entity'] = gff3.loc[sorted(tpms.index.to_list()), 'Entity']
tpms['Symbol'] = gff3.loc[sorted(tpms.index.to_list()), 'Symbol']

# Symbol must be the last column here: fillSymbols fills missing entries of the last column
# with the row's index label (the gene ID).
tpms = fillSymbols(tpms)
# make_unique_with_index is defined in tools; presumably it makes the symbols unique -- TODO confirm
tpms = make_unique_with_index(tpms)
tpms

SampleNames,0_R1,2_R1,5_R1,10_R1,15_R1,21_R1,28_R1,35_R1,42_R1,49_R1,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gene-Y11_RS20515,11.096691,3.328399,4.196941,7.459205,4.759018,7.452089,10.404549,11.329048,15.861196,11.490066,host,gene-Y11_RS20515
gene-Y11_RS20385,103.808535,77.056956,70.385506,72.961950,67.224717,63.315671,61.528884,56.486089,53.022042,53.197750,host,sseB
gene-phiR1-37_gp074,0.208900,41.794628,60.376704,120.664507,128.840907,186.160952,279.185832,309.177891,378.126221,458.260882,phage,g074
gene-Y11_RS01175,48.592827,26.401877,37.385540,26.684036,22.121506,22.885374,19.733770,18.973446,23.086862,20.639752,host,katA
gene-Y11_RS09305,22.218682,26.348167,25.818414,22.313365,26.558708,26.574064,27.109287,27.160953,29.250470,31.808615,host,ybbA
...,...,...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,530.454100,419.675356,405.395800,419.248909,343.453782,294.619921,316.610501,289.561840,300.310108,279.763430,host,pfkA
gene-Y11_RS15790,165.960501,107.248459,111.654558,85.842894,72.815990,62.999433,66.585457,58.935380,61.342959,55.116426,host,ilvM
gene-Y11_RS19470,149.936704,119.917373,103.082532,97.990142,81.782509,81.484254,94.646049,87.034409,94.356169,93.837053,host,gene-Y11_RS19470
gene-Y11_RS12990,18.414002,9.022674,8.951267,16.598738,22.353379,25.414348,31.512608,36.423573,24.923264,35.706048,host,gene-Y11_RS12990


In [21]:
# Check gene names unique
len(tpms['Symbol'].unique())

4353

log2(x + 1) transformation of the rRNA-depleted raw counts.

In [22]:
# Log-transform the rRNA-depleted raw counts (logNorm is defined in tools)
logs = logNorm(df_norRNAs)
# Attach entity (host/phage) and gene symbol from the gff3 gene table (indexed by gene ID);
# the assignment aligns on the index, so the sorted() lookup order is irrelevant for the result.
logs['Entity'] = gff3.loc[sorted(logs.index.to_list()), 'Entity']
logs['Symbol'] = gff3.loc[sorted(logs.index.to_list()), 'Symbol']
# Symbol is the last column at this point; missing symbols are replaced by the gene ID
logs = fillSymbols(logs)
# Make gene names unique
logs = make_unique_with_index(logs)
logs

SampleNames,0_R1,2_R1,5_R1,10_R1,15_R1,21_R1,28_R1,35_R1,42_R1,49_R1,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gene-Y11_RS20515,8.629357,6.741467,7.294621,8.507795,8.049849,8.308339,8.717676,8.918863,9.030667,8.487840,host,gene-Y11_RS20515
gene-Y11_RS20385,12.500593,11.915132,12.005274,12.443203,12.514714,12.040290,11.927408,11.882643,11.417853,11.344296,host,sseB
gene-phiR1-37_gp074,2.807355,10.189825,10.941048,12.325868,12.610102,12.753008,13.266054,13.491853,13.408728,13.607677,phage,g074
gene-Y11_RS01175,12.102960,11.067434,11.789941,11.689561,11.608717,11.269711,10.984418,11.006326,10.915879,10.675957,host,katA
gene-Y11_RS09305,9.906891,9.997179,10.188589,10.364135,10.804938,10.417853,10.375039,10.456354,10.189825,10.232421,host,ybbA
...,...,...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,15.001892,14.508352,14.679205,15.113783,15.015763,14.406471,14.438727,14.388488,14.067518,13.886935,host,pfkA
gene-Y11_RS15790,11.394463,10.609179,10.887982,10.894818,10.847057,10.250298,10.258566,10.161132,9.845490,9.612868,host,ilvM
gene-Y11_RS19470,11.616089,11.138272,11.140830,11.453785,11.382624,10.989394,11.133784,11.091435,10.834471,10.748193,host,gene-Y11_RS19470
gene-Y11_RS12990,8.566054,7.383704,7.592457,8.867279,9.485829,9.283088,9.521600,9.808964,8.888743,9.328675,host,gene-Y11_RS12990


In [23]:
# Log-transform only the TPM value columns; the last two columns of tpms (Entity, Symbol)
# are excluded from the transformation ...
logTPMs = logNorm(tpms.iloc[:, :-2])
# ... and re-attached afterwards via an index join
logTPMs = logTPMs.join(tpms.iloc[:, -2:])
# Symbol is the last column again; fill any missing symbol with the gene ID
logTPMs = fillSymbols(logTPMs)
# Make gene names unique
logTPMs = make_unique_with_index(logTPMs)
logTPMs

SampleNames,0_R1,2_R1,5_R1,10_R1,15_R1,21_R1,28_R1,35_R1,42_R1,49_R1,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gene-Y11_RS20515,3.596541,2.113833,2.377663,3.080522,2.525823,3.079308,3.511537,3.623990,4.075635,3.642709,host,gene-Y11_RS20515
gene-Y11_RS20385,6.711612,6.286455,6.157559,6.208711,6.092223,6.007098,5.966451,5.845141,5.755476,5.760161,host,sseB
gene-phiR1-37_gp074,0.273695,5.419358,5.939619,6.926765,7.020601,7.548136,8.130240,8.276952,8.566534,8.843170,phage,g074
gene-Y11_RS01175,5.632060,4.776203,5.262491,4.790982,4.531163,4.578056,4.373911,4.320011,4.590175,4.435612,host,katA
gene-Y11_RS09305,4.537214,4.773372,4.745152,4.543085,4.784436,4.785240,4.812975,4.815624,4.918886,5.036003,host,ybbA
...,...,...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,9.053801,8.716563,8.666742,8.715100,8.428167,8.207600,8.311115,8.182701,8.235105,8.133211,host,pfkA
gene-Y11_RS15790,7.383363,6.758203,6.815762,6.440336,6.205861,5.999987,6.078641,5.905336,5.962155,5.810351,host,ilvM
gene-Y11_RS19470,7.237800,6.917878,6.701584,6.629213,6.371254,6.366047,6.579633,6.459996,6.575254,6.567379,host,gene-Y11_RS19470
gene-Y11_RS12990,4.279026,3.325196,3.314880,4.137400,4.545559,4.723250,5.022927,5.225875,4.696176,5.197946,host,gene-Y11_RS12990


## 3 Filter samples, if necessary

Outlier detection by PCA not required, since no replicates were used.

## 4. Final grouping

There are no replicates, so each mean equals the single measured value and each standard deviation is 0. The grouping is performed anyway.

In [24]:
columnOrder = ['0_R1', '2_R1', '5_R1', '10_R1', '15_R1', '21_R1', '28_R1', '35_R1', '42_R1', '49_R1']

In [25]:
tpms

SampleNames,0_R1,2_R1,5_R1,10_R1,15_R1,21_R1,28_R1,35_R1,42_R1,49_R1,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gene-Y11_RS20515,11.096691,3.328399,4.196941,7.459205,4.759018,7.452089,10.404549,11.329048,15.861196,11.490066,host,gene-Y11_RS20515
gene-Y11_RS20385,103.808535,77.056956,70.385506,72.961950,67.224717,63.315671,61.528884,56.486089,53.022042,53.197750,host,sseB
gene-phiR1-37_gp074,0.208900,41.794628,60.376704,120.664507,128.840907,186.160952,279.185832,309.177891,378.126221,458.260882,phage,g074
gene-Y11_RS01175,48.592827,26.401877,37.385540,26.684036,22.121506,22.885374,19.733770,18.973446,23.086862,20.639752,host,katA
gene-Y11_RS09305,22.218682,26.348167,25.818414,22.313365,26.558708,26.574064,27.109287,27.160953,29.250470,31.808615,host,ybbA
...,...,...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,530.454100,419.675356,405.395800,419.248909,343.453782,294.619921,316.610501,289.561840,300.310108,279.763430,host,pfkA
gene-Y11_RS15790,165.960501,107.248459,111.654558,85.842894,72.815990,62.999433,66.585457,58.935380,61.342959,55.116426,host,ilvM
gene-Y11_RS19470,149.936704,119.917373,103.082532,97.990142,81.782509,81.484254,94.646049,87.034409,94.356169,93.837053,host,gene-Y11_RS19470
gene-Y11_RS12990,18.414002,9.022674,8.951267,16.598738,22.353379,25.414348,31.512608,36.423573,24.923264,35.706048,host,gene-Y11_RS12990


In [26]:
# Per-time-point means and standard deviations of the TPM values (getMeanSD is defined in tools)
TPMmeans, TPMsds = getMeanSD(tpms[columnOrder])

# Order the time-point columns chronologically and append the gene annotation by index
meanTimeLabels = ['0', '2', '5', '10', '15', '21', '28', '35', '42', '49']
TPMmeans = TPMmeans[meanTimeLabels].assign(Entity=tpms['Entity'], Symbol=tpms['Symbol'])
TPMmeans

Unnamed: 0_level_0,0,2,5,10,15,21,28,35,42,49,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gene-Y11_RS20515,11.096691,3.328399,4.196941,7.459205,4.759018,7.452089,10.404549,11.329048,15.861196,11.490066,host,gene-Y11_RS20515
gene-Y11_RS20385,103.808535,77.056956,70.385506,72.961950,67.224717,63.315671,61.528884,56.486089,53.022042,53.197750,host,sseB
gene-phiR1-37_gp074,0.208900,41.794628,60.376704,120.664507,128.840907,186.160952,279.185832,309.177891,378.126221,458.260882,phage,g074
gene-Y11_RS01175,48.592827,26.401877,37.385540,26.684036,22.121506,22.885374,19.733770,18.973446,23.086862,20.639752,host,katA
gene-Y11_RS09305,22.218682,26.348167,25.818414,22.313365,26.558708,26.574064,27.109287,27.160953,29.250470,31.808615,host,ybbA
...,...,...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,530.454100,419.675356,405.395800,419.248909,343.453782,294.619921,316.610501,289.561840,300.310108,279.763430,host,pfkA
gene-Y11_RS15790,165.960501,107.248459,111.654558,85.842894,72.815990,62.999433,66.585457,58.935380,61.342959,55.116426,host,ilvM
gene-Y11_RS19470,149.936704,119.917373,103.082532,97.990142,81.782509,81.484254,94.646049,87.034409,94.356169,93.837053,host,gene-Y11_RS19470
gene-Y11_RS12990,18.414002,9.022674,8.951267,16.598738,22.353379,25.414348,31.512608,36.423573,24.923264,35.706048,host,gene-Y11_RS12990


In [27]:
# Same layout for the standard deviations: chronological time-point columns plus gene annotation
sdTimeLabels = ['0', '2', '5', '10', '15', '21', '28', '35', '42', '49']
TPMsds = TPMsds[sdTimeLabels].assign(Entity=tpms['Entity'], Symbol=tpms['Symbol'])
TPMsds

Unnamed: 0_level_0,0,2,5,10,15,21,28,35,42,49,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gene-Y11_RS20515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,host,gene-Y11_RS20515
gene-Y11_RS20385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,host,sseB
gene-phiR1-37_gp074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,phage,g074
gene-Y11_RS01175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,host,katA
gene-Y11_RS09305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,host,ybbA
...,...,...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,host,pfkA
gene-Y11_RS15790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,host,ilvM
gene-Y11_RS19470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,host,gene-Y11_RS19470
gene-Y11_RS12990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,host,gene-Y11_RS12990


In [28]:
# Proportional expression of the mean TPMs (proportionalExp is defined in tools),
# computed on the time-point columns only; the gene annotation is appended afterwards by index
propTimeLabels = ['0', '2', '5', '10', '15', '21', '28', '35', '42', '49']
propExp = proportionalExp(TPMmeans[propTimeLabels]).assign(Entity=tpms['Entity'], Symbol=tpms['Symbol'])
propExp

Unnamed: 0_level_0,0,2,5,10,15,21,28,35,42,49,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gene-Y11_RS20515,0.699612,0.209845,0.264604,0.470280,0.300042,0.469831,0.655975,0.714262,1.000000,0.724414,host,gene-Y11_RS20515
gene-Y11_RS20385,1.000000,0.742299,0.678032,0.702851,0.647584,0.609927,0.592715,0.544137,0.510768,0.512460,host,sseB
gene-phiR1-37_gp074,0.000456,0.091203,0.131752,0.263310,0.281152,0.406234,0.609229,0.674677,0.825133,1.000000,phage,g074
gene-Y11_RS01175,1.000000,0.543329,0.769363,0.549135,0.455242,0.470962,0.406105,0.390458,0.475108,0.424749,host,katA
gene-Y11_RS09305,0.698511,0.828334,0.811680,0.701488,0.834953,0.835436,0.852262,0.853887,0.919577,1.000000,host,ybbA
...,...,...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,1.000000,0.791162,0.764243,0.790359,0.647471,0.555411,0.596867,0.545875,0.566138,0.527404,host,pfkA
gene-Y11_RS15790,1.000000,0.646229,0.672778,0.517249,0.438755,0.379605,0.401213,0.355117,0.369624,0.332106,host,ilvM
gene-Y11_RS19470,1.000000,0.799787,0.687507,0.653543,0.545447,0.543458,0.631240,0.580474,0.629307,0.625844,host,gene-Y11_RS19470
gene-Y11_RS12990,0.505552,0.247715,0.245755,0.455714,0.613706,0.697745,0.865171,1.000000,0.684262,0.980301,host,gene-Y11_RS12990


## 5. Phage gene classification

Leskinen et al. classified early, middle, late phage genes based on 2-5, 10-21, 28-49 min time intervals.

In [29]:
# Add a classification label based on the first time point at which a gene
# reaches a given fraction (default 20 %) of its maximal expression

def classLabelThreshold(tpm, threshold=0.2, nAnnotationCols=2):
    """Label every gene by the phase in which it first reaches a fraction of its peak.

    For each row, the first time-point column whose value is at least
    ``threshold`` times the row maximum is located. The column position is
    then translated into a phase label:

        position 0 -> 'None', 1-2 -> 'early', 3-5 -> 'middle', 6-9 -> 'late'

    The mapping is purely positional; it matches the column order
    0, 2, 5, 10, 15, 21, 28, 35, 42, 49 min used in this notebook.
    Rows whose ``Entity`` is 'host' are always labelled 'None'. Rows in which
    no value reaches the cutoff (for example rows consisting only of NaN) are
    labelled 'None' as well.

    Parameters
    ----------
    tpm : pandas.DataFrame
        Expression columns first, followed by ``nAnnotationCols`` trailing
        non-expression columns. Must contain an ``Entity`` column.
    threshold : float, default 0.2
        Fraction of the per-gene maximum that has to be reached.
    nAnnotationCols : int, default 2
        Number of trailing columns that are not expression values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``tpm`` with a ``ClassThreshold`` column; ``tpm`` itself is
        not modified.

    Raises
    ------
    ValueError
        If a gene first reaches the cutoff at a column position that has no
        phase label (position > 9).
    """
    # Phase label per time-point column position
    phaseByPosition = ('None',
                       'early', 'early',
                       'middle', 'middle', 'middle',
                       'late', 'late', 'late', 'late')

    # Everything in front of the trailing annotation columns is a time point
    timeCourse = tpm.iloc[:, 0:(tpm.shape[1] - nAnnotationCols)]
    values = timeCourse.to_numpy(dtype=float)

    # Per-gene cutoff; max() skips NaN, an all-NaN row yields a NaN cutoff
    cutoff = timeCourse.max(axis=1).to_numpy(dtype=float) * threshold

    # NaN never satisfies >=, so rows without usable values match nothing
    with np.errstate(invalid='ignore'):
        reached = values >= cutoff[:, np.newaxis]

    # argmax gives the first True per row, and 0 if the row has no True at all
    firstPosition = reached.argmax(axis=1)

    # Fail loudly instead of silently skipping a label for unknown positions
    if (firstPosition >= len(phaseByPosition)).any():
        raise ValueError(
            'classLabelThreshold: time point position beyond the '
            + str(len(phaseByPosition)) + ' positions that have a phase label'
        )

    labels = [phaseByPosition[position] for position in firstPosition]

    tpmOut = tpm.copy()
    tpmOut['ClassThreshold'] = labels
    # Host genes are not classified into phage expression phases
    tpmOut.loc[tpmOut['Entity'] == 'host', 'ClassThreshold'] = 'None'

    return tpmOut

In [30]:
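# # Export fractional expressions with T=0.6 + Host/Phage Annotation for Orthofinder Analysis
# # NOTE(review): this cell is disabled. classLabelThreshold uses a 0.2 cutoff, so confirm what
# # "T=0.6" refers to, and replace the absolute /ceph path with a configurable one, before re-enabling.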
# t = classLabelThreshold(propExp)
# t = t[t["Entity"] == "phage"]
# t["Host"] = "Yersinia_enterocolitica_Y11"
# t["Phage"] = "Yersinia_phage_phiR1-37"
# t.to_csv("/ceph/ibmi/studenten/waffen/Cross_Analysis/Orthofinder_Clustering/Fractional_Expressions/T060/Leskinen_fractional_expression_T=0.6.tsv", sep="\t")

In [31]:
# Add a classification label based on the time point of maximal expression

def classLabelMax(tpm, nAnnotationCols=3):
    """Label every gene by the phase in which its expression peaks.

    For each row, the first time-point column holding the row maximum is
    located. The column position is then translated into a phase label:

        position 0 -> 'None', 1-2 -> 'early', 3-5 -> 'middle', 6-9 -> 'late'

    The mapping is purely positional; it matches the column order
    0, 2, 5, 10, 15, 21, 28, 35, 42, 49 min used in this notebook.
    Rows whose ``Entity`` is 'host' are always labelled 'None'. Rows without
    any usable value (only NaN) are labelled 'None' as well.

    Parameters
    ----------
    tpm : pandas.DataFrame
        Expression columns first, followed by ``nAnnotationCols`` trailing
        non-expression columns. Must contain an ``Entity`` column.
    nAnnotationCols : int, default 3
        Number of trailing columns that are not expression values. The
        default of 3 fits a table that carries ``Entity``, ``Symbol`` and
        ``ClassThreshold``; pass 2 for a table without ``ClassThreshold``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``tpm`` with a ``ClassMax`` column; ``tpm`` itself is not
        modified.

    Raises
    ------
    ValueError
        If a gene peaks at a column position that has no phase label
        (position > 9).
    """
    # Phase label per time-point column position
    phaseByPosition = ('None',
                       'early', 'early',
                       'middle', 'middle', 'middle',
                       'late', 'late', 'late', 'late')

    # Everything in front of the trailing annotation columns is a time point
    timeCourse = tpm.iloc[:, 0:(tpm.shape[1] - nAnnotationCols)]
    values = timeCourse.to_numpy(dtype=float)

    # Per-gene peak; max() skips NaN, an all-NaN row yields a NaN peak
    peak = timeCourse.max(axis=1).to_numpy(dtype=float)

    # NaN never compares equal, so rows without usable values match nothing
    with np.errstate(invalid='ignore'):
        atPeak = values == peak[:, np.newaxis]

    # argmax gives the first True per row, and 0 if the row has no True at all
    peakPosition = atPeak.argmax(axis=1)

    # Fail loudly instead of silently skipping a label for unknown positions
    if (peakPosition >= len(phaseByPosition)).any():
        raise ValueError(
            'classLabelMax: time point position beyond the '
            + str(len(phaseByPosition)) + ' positions that have a phase label'
        )

    labels = [phaseByPosition[position] for position in peakPosition]

    tpmOut = tpm.copy()
    tpmOut['ClassMax'] = labels
    # Host genes are not classified into phage expression phases
    tpmOut.loc[tpmOut['Entity'] == 'host', 'ClassMax'] = 'None'

    return tpmOut

In [32]:
# Attach both class labels to the mean TPM table. The threshold label has to
# be added first: classLabelMax skips three trailing annotation columns and
# therefore expects 'ClassThreshold' to be present already.
TPMmeans = classLabelMax(classLabelThreshold(TPMmeans))

In [33]:
# Inspect the mean TPM table with the two classification columns attached
TPMmeans

Unnamed: 0_level_0,0,2,5,10,15,21,28,35,42,49,Entity,Symbol,ClassThreshold,ClassMax
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
gene-Y11_RS20515,11.096691,3.328399,4.196941,7.459205,4.759018,7.452089,10.404549,11.329048,15.861196,11.490066,host,gene-Y11_RS20515,,
gene-Y11_RS20385,103.808535,77.056956,70.385506,72.961950,67.224717,63.315671,61.528884,56.486089,53.022042,53.197750,host,sseB,,
gene-phiR1-37_gp074,0.208900,41.794628,60.376704,120.664507,128.840907,186.160952,279.185832,309.177891,378.126221,458.260882,phage,g074,middle,late
gene-Y11_RS01175,48.592827,26.401877,37.385540,26.684036,22.121506,22.885374,19.733770,18.973446,23.086862,20.639752,host,katA,,
gene-Y11_RS09305,22.218682,26.348167,25.818414,22.313365,26.558708,26.574064,27.109287,27.160953,29.250470,31.808615,host,ybbA,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,530.454100,419.675356,405.395800,419.248909,343.453782,294.619921,316.610501,289.561840,300.310108,279.763430,host,pfkA,,
gene-Y11_RS15790,165.960501,107.248459,111.654558,85.842894,72.815990,62.999433,66.585457,58.935380,61.342959,55.116426,host,ilvM,,
gene-Y11_RS19470,149.936704,119.917373,103.082532,97.990142,81.782509,81.484254,94.646049,87.034409,94.356169,93.837053,host,gene-Y11_RS19470,,
gene-Y11_RS12990,18.414002,9.022674,8.951267,16.598738,22.353379,25.414348,31.512608,36.423573,24.923264,35.706048,host,gene-Y11_RS12990,,


As the Leskinen et al. paper already stated, the classification into early, middle, and late genes is not straightforward ("Yersinia enterocolitica. RNA sequencing reveals that the gene expression of φR1-37 does not follow a pattern typical observed in other lytic bacteriophages, as only selected genes could be classified as typically early, middle or late genes. The majority of the genes appear to be expressed constitutively throughout infection").

Add the class labels to the other data frames.

In [34]:
# List the sequence IDs present in the annotation; the next cell uses them to
# split the records into host and phage tables
gff3_genes["seq_id"].unique()

array(['NC_017564.1', 'NC_017565.1', 'NC_016163.1'], dtype=object)

In [35]:
# Propagate both class labels from the mean TPM table to the other expression tables
TPMsds[['ClassThreshold', 'ClassMax']] = TPMmeans.loc[:, ['ClassThreshold', 'ClassMax']]
tpms[['ClassThreshold', 'ClassMax']] = TPMmeans.loc[:, ['ClassThreshold', 'ClassMax']]
logs[['ClassThreshold', 'ClassMax']] = TPMmeans.loc[:, ['ClassThreshold', 'ClassMax']]
propExp[['ClassThreshold', 'ClassMax']] = TPMmeans.loc[:, ['ClassThreshold', 'ClassMax']]

# gff3: attach the labels to the gene annotation; the left join keeps every annotated gene
gff3_final = gff3_genes.merge(
    TPMmeans.loc[:, ["ClassThreshold", "ClassMax"]],
    how="left",
    left_on="ID",
    right_index=True,
)
# Split by sequence ID: NC_017564.1 and NC_017565.1 go to the host table, NC_016163.1 to the phage table
gff3_host  = gff3_final[gff3_final['seq_id'].isin(['NC_017564.1', 'NC_017565.1'])]
gff3_phage = gff3_final[gff3_final['seq_id'] == 'NC_016163.1']

## 6. Add variance to all dataframes

The variance is computed on the TPM table (`tpms`) and then copied to all other data frames.

In [36]:
def stabilizedVariance(df):
    """Append the variance-to-mean ratio of every row as a 'Variance' column.

    The last four columns of ``df`` are skipped; all columns in front of them
    are used as expression values. For each row the variance (``np.var`` with
    its default settings) is divided by the mean (``np.mean``).

    Returns a copy of ``df`` with the additional 'Variance' column; ``df``
    itself is left unchanged.
    """
    nExpressionCols = df.shape[1] - 4

    def _varianceOverMean(rowIdx):
        # Expression values of one gene across all time points
        rowValues = np.array(list(df.iloc[rowIdx, 0:nExpressionCols]))
        return np.var(rowValues) / np.mean(rowValues)

    result = df.copy()
    result['Variance'] = [_varianceOverMean(rowIdx) for rowIdx in range(df.shape[0])]

    return result

In [37]:
# Append the variance-to-mean ratio of every gene to the TPM table.
# NOTE(review): this rebinds `tpms`. Re-running the cell adds a fifth trailing
# column, so the fixed four-column offset inside stabilizedVariance would then
# treat an annotation column as expression values.
tpms = stabilizedVariance(tpms)
tpms

SampleNames,0_R1,2_R1,5_R1,10_R1,15_R1,21_R1,28_R1,35_R1,42_R1,49_R1,Entity,Symbol,ClassThreshold,ClassMax,Variance
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
gene-Y11_RS20515,11.096691,3.328399,4.196941,7.459205,4.759018,7.452089,10.404549,11.329048,15.861196,11.490066,host,gene-Y11_RS20515,,,1.629422
gene-Y11_RS20385,103.808535,77.056956,70.385506,72.961950,67.224717,63.315671,61.528884,56.486089,53.022042,53.197750,host,sseB,,,2.996995
gene-phiR1-37_gp074,0.208900,41.794628,60.376704,120.664507,128.840907,186.160952,279.185832,309.177891,378.126221,458.260882,phage,g074,middle,late,108.247942
gene-Y11_RS01175,48.592827,26.401877,37.385540,26.684036,22.121506,22.885374,19.733770,18.973446,23.086862,20.639752,host,katA,,,2.953283
gene-Y11_RS09305,22.218682,26.348167,25.818414,22.313365,26.558708,26.574064,27.109287,27.160953,29.250470,31.808615,host,ybbA,,,0.274951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gene-Y11_RS13415,530.454100,419.675356,405.395800,419.248909,343.453782,294.619921,316.610501,289.561840,300.310108,279.763430,host,pfkA,,,16.554218
gene-Y11_RS15790,165.960501,107.248459,111.654558,85.842894,72.815990,62.999433,66.585457,58.935380,61.342959,55.116426,host,ilvM,,,12.803937
gene-Y11_RS19470,149.936704,119.917373,103.082532,97.990142,81.782509,81.484254,94.646049,87.034409,94.356169,93.837053,host,gene-Y11_RS19470,,,3.828010
gene-Y11_RS12990,18.414002,9.022674,8.951267,16.598738,22.353379,25.414348,31.512608,36.423573,24.923264,35.706048,host,gene-Y11_RS12990,,,3.831941


In [38]:
# Copy the per-gene 'Variance' column computed on `tpms` to the other tables
# (pandas aligns the assigned Series on the row index)
logs['Variance'] = tpms['Variance']
TPMmeans['Variance'] = tpms['Variance']
TPMsds['Variance'] = tpms['Variance']
propExp['Variance'] = tpms['Variance']

## 7. Write data to output

In [39]:
# Attach the Entity and Symbol annotation columns from `tpms` to the raw-count
# table before it is written out
df_norRNAs[['Entity', 'Symbol']] = tpms[['Entity', 'Symbol']]

In [40]:
# Full TPM table
tpms.to_csv('Leskinen_full_TPM.tsv', sep = '\t')
# Full raw counts table
df_norRNAs.to_csv('Leskinen_full_raw_counts.tsv', sep = '\t')
# Summarized (time point means) TPM table
TPMmeans.to_csv('Leskinen_TPM_means.tsv', sep = '\t')
# Summarized (time point) TPM standard deviation
TPMsds.to_csv('Leskinen_TPM_std.tsv', sep = '\t')
# Proportional expression per gene and time point
propExp.to_csv('Leskinen_fractional_expression.tsv', sep = '\t')
# Processed gff3 file
gff3_host.to_csv('Leskinen_host_gff3.tsv', sep='\t')
gff3_phage.to_csv('Leskinen_phage_gff3.tsv', sep='\t')