# Process bulk counts table

1. Annotate the columns (samples) with their time point and replicate
2. Remove rRNA genes
3. Normalize the counts to TPM and, alternatively, keep the raw counts

In [1]:
# Use miniconda environment Jupyter_new for running this notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from tools import *
import warnings
warnings.filterwarnings('ignore')

## 1 Load the dataset and annotation

In [2]:
# Input files, given relative to the directory the notebook is run from.
# Count table: read below as tab-separated with '#' comment lines skipped.
bulkPath = '../nf_output/countData/countData.tsv'
metaPath = '../SraRunTable.csv' # metadata from SRA
gffPath = '../nf_output/alignments/dualGenome.gff3' # output from nf pipeline

In [3]:
# Count table: first column becomes the index, '#' comment lines are skipped.
# Two runs are removed right away (presumably the uninfected controls that are
# also filtered out of the metadata below - TODO confirm).
dropped_runs = ['SRR11805720_sorted.bam', 'SRR11805721_sorted.bam']
df_initial = pd.read_csv(bulkPath, sep = '\t', comment='#', index_col=0).drop(columns=dropped_runs)

# SRA run table: keep every row whose 'infection' value is not 'uninfected control'
metadata = pd.read_csv(metaPath)
is_infected = metadata['infection'] != 'uninfected control'
metadata = metadata[is_infected]

In [4]:
df_initial.columns

Index(['Chr', 'Start', 'End', 'Strand', 'Length', 'SRR11805719_sorted.bam',
       'SRR11805717_sorted.bam', 'SRR11805716_sorted.bam',
       'SRR11805718_sorted.bam', 'SRR11805715_sorted.bam'],
      dtype='object')

In [5]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', 20)
metadata

Unnamed: 0,Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,GEO_Accession (exp),infection,Instrument,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study,time,HOST
0,SRR11805715,RNA-Seq,300,3432119400,PRJNA633474,SAMN14944071,1427216801,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357147,GSM4557357,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:30:00Z,1,GSM4557357,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,15 min,WH7803
1,SRR11805716,RNA-Seq,300,2546526900,PRJNA633474,SAMN14944078,1065080875,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357148,GSM4557358,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:28:00Z,1,GSM4557358,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,1 h,WH7803
2,SRR11805717,RNA-Seq,300,2423201400,PRJNA633474,SAMN14944077,1000411109,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357149,GSM4557359,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:27:00Z,1,GSM4557359,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,3 h,WH7803
3,SRR11805718,RNA-Seq,300,2709969300,PRJNA633474,SAMN14944076,1130072843,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357150,GSM4557360,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:29:00Z,1,GSM4557360,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,5 h,WH7803
4,SRR11805719,RNA-Seq,300,2005603200,PRJNA633474,SAMN14944075,843155373,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357151,GSM4557361,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:27:00Z,1,GSM4557361,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,7 h,WH7803


## 2 Format the dataset

### 2.1 Annotate sample names

The metadata does not annotate the sample names properly. They are therefore added manually as an additional sample-name column in the metadata.

In [6]:
# Match GSM IDs and SampleNames inferred from GEO

# GSM accession -> sample name in the form '<minutes>_R<replicate>'
sampleDict = {
    'GSM4557357': '15_R1',
    'GSM4557358': '60_R1',
    'GSM4557359': '180_R1',
    'GSM4557360': '300_R1',
    'GSM4557361': '420_R1',
}

In [7]:
metadataFull = annotateData(metadata, sampleDict)

In [8]:
metadata.head(1)

Unnamed: 0,Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,GEO_Accession (exp),infection,Instrument,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study,time,HOST
0,SRR11805715,RNA-Seq,300,3432119400,PRJNA633474,SAMN14944071,1427216801,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357147,GSM4557357,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:30:00Z,1,GSM4557357,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,15 min,WH7803


In [9]:
metadataFull.head()

Unnamed: 0_level_0,Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,GEO_Accession (exp),infection,Instrument,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study,time,HOST,SampleID,SampleNames
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
SRR11805715_sorted.bam,SRR11805715,RNA-Seq,300,3432119400,PRJNA633474,SAMN14944071,1427216801,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357147,GSM4557357,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:30:00Z,1,GSM4557357,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,15 min,WH7803,SRR11805715_sorted.bam,15_R1
SRR11805716_sorted.bam,SRR11805716,RNA-Seq,300,2546526900,PRJNA633474,SAMN14944078,1065080875,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357148,GSM4557358,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:28:00Z,1,GSM4557358,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,1 h,WH7803,SRR11805716_sorted.bam,60_R1
SRR11805717_sorted.bam,SRR11805717,RNA-Seq,300,2423201400,PRJNA633474,SAMN14944077,1000411109,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357149,GSM4557359,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:27:00Z,1,GSM4557359,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,3 h,WH7803,SRR11805717_sorted.bam,180_R1
SRR11805718_sorted.bam,SRR11805718,RNA-Seq,300,2709969300,PRJNA633474,SAMN14944076,1130072843,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357150,GSM4557360,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:29:00Z,1,GSM4557360,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,5 h,WH7803,SRR11805718_sorted.bam,300_R1
SRR11805719_sorted.bam,SRR11805719,RNA-Seq,300,2005603200,PRJNA633474,SAMN14944075,843155373,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX8357151,GSM4557361,phage S-SBP1,Illumina HiSeq 4000,PAIRED,cDNA,TRANSCRIPTOMIC,Synechococcus sp. WH 7803,ILLUMINA,2020-12-07T00:00:00Z,2020-05-18T08:27:00Z,1,GSM4557361,Synechococcus sp. strain WH7803 infected by S-...,SRP262107,7 h,WH7803,SRR11805719_sorted.bam,420_R1


Add correct sample names.

In [10]:
# The first five columns (Chr, Start, End, Strand, Length) are annotation;
# everything after them is one count column per sample.
sample_counts = df_initial.iloc[:, 5:]
df = changeColnames(sample_counts, metadataFull)

# Put the samples in chronological order
df = df[['15_R1', '60_R1', '180_R1', '300_R1', '420_R1']]
df.head()

SampleNames,15_R1,60_R1,180_R1,300_R1,420_R1
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gene-SynWH7803_0001,1296,452,610,445,254
gene-SynWH7803_0002,358,140,193,171,82
gene-SynWH7803_0003,1321,345,759,316,183
gene-SynWH7803_0004,1308,472,650,371,170
gene-SynWH7803_0005,1571,613,638,524,258


### 2.2 Remove rRNA genes

Consult gff3 file to get rRNA geneids.

In [11]:
def _parse_gff3_attributes(attributes):
    """Turn a GFF3 attribute string ('key=value;key=value') into a dict.

    Only the first '=' of each field separates key from value, so values that
    themselves contain '=' are kept intact. Fields without '=' are skipped and
    non-string input (e.g. a missing attribute field) gives an empty dict.
    """
    if not isinstance(attributes, str):
        return {}
    return dict(field.split('=', 1) for field in attributes.split(';') if '=' in field)


# Load gff3 and extract the 'gene' features into their own df
gff3 = pd.read_csv(gffPath, sep='\t', header = None, skiprows = 5)
gff3.columns=["seq_id", "source", "type", "start", "end", "phase", "strand", "score", "attributes"]
gff3_genes = gff3.loc[gff3["type"] == 'gene']

# Column formatting for genes: expand selected attributes into their own columns
gff3_genes = gff3_genes.reset_index(drop=True)
dct_genes = gff3_genes["attributes"].apply(_parse_gff3_attributes)
cols_to_keep = ['ID', 'Name', 'gbkey', 'gene_biotype', 'locus_tag', 'gene']
gff3_genes = pd.concat([gff3_genes, pd.json_normalize(dct_genes)[cols_to_keep]], axis=1)

# Generate locus_tag -> product dictionary over all different feature types
# (same parser as above, so values containing '=' are no longer truncated)
attrs_dicts = gff3["attributes"].apply(_parse_gff3_attributes)
attrs_df = pd.json_normalize(attrs_dicts)
attrs_df = attrs_df.dropna(subset=["locus_tag", "product"])
locus_product_dict = dict(zip(attrs_df["locus_tag"], attrs_df["product"]))

# Add gene product, if not stated in gff3, fill with gene_biotype
gff3_genes["product"] = gff3_genes["locus_tag"].map(locus_product_dict)
gff3_genes["product"] = gff3_genes["product"].fillna(gff3_genes["gene_biotype"])

# If gene = NA, take from ID column
gff3_genes["gene"] = gff3_genes["gene"].fillna(gff3_genes["ID"])

# Drop attributes column
gff3_genes = gff3_genes.drop(["attributes"], axis=1)

In [12]:
# Pharokka output for the phage genome MT424636.1.
# NOTE(review): this relative path climbs three directories above the notebook,
# so the file lives outside this project tree - confirm it is available wherever
# the notebook is re-run.
pharokka_path = "../../../2025-12_reannotation_phage_genomes/Pharokka_proteins_phages_out/MT424636.1_out/pharokka_proteins_full_merged_output.tsv"
# add_pharokka is not defined in this notebook (presumably provided by the
# `from tools import *` import); the displayed result gains the columns
# 'annot', 'PHROG' and 'category'.
gff3_genes = add_pharokka(gff3_genes, pharokka_path)
# Show only the phage genes
gff3_genes.loc[gff3_genes['seq_id'] == "MT424636.1"]

Unnamed: 0,seq_id,source,type,start,end,phase,strand,score,ID,Name,gbkey,gene_biotype,locus_tag,gene,product,annot,PHROG,category
2586,MT424636.1,Genbank,gene,649.0,924.0,.,+,.,gene-SSBP1_gp01,SSBP1_gp01,Gene,protein_coding,SSBP1_gp01,gene-SSBP1_gp01,hypothetical protein,hypothetical protein,7365,unknown function
2587,MT424636.1,Genbank,gene,1019.0,1330.0,.,+,.,gene-SSBP1_gp02,SSBP1_gp02,Gene,protein_coding,SSBP1_gp02,gene-SSBP1_gp02,hypothetical protein,hypothetical protein,2543,unknown function
2588,MT424636.1,Genbank,gene,1405.0,1566.0,.,+,.,gene-SSBP1_gp03,SSBP1_gp03,Gene,protein_coding,SSBP1_gp03,gene-SSBP1_gp03,hypothetical protein,hypothetical protein,No_PHROG,unknown function
2589,MT424636.1,Genbank,gene,1566.0,1910.0,.,+,.,gene-SSBP1_gp04,SSBP1_gp04,Gene,protein_coding,SSBP1_gp04,gene-SSBP1_gp04,hypothetical protein,hypothetical protein,554,unknown function
2590,MT424636.1,Genbank,gene,1907.0,2095.0,.,+,.,gene-SSBP1_gp05,SSBP1_gp05,Gene,protein_coding,SSBP1_gp05,gene-SSBP1_gp05,hypothetical protein,hypothetical protein,No_PHROG,unknown function
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2636,MT424636.1,Genbank,gene,41158.0,41403.0,.,+,.,gene-SSBP1_gp51,SSBP1_gp51,Gene,protein_coding,SSBP1_gp51,gene-SSBP1_gp51,hypothetical protein,terminase small subunit,891,head and packaging
2637,MT424636.1,Genbank,gene,41390.0,41647.0,.,+,.,gene-SSBP1_gp52,SSBP1_gp52,Gene,protein_coding,SSBP1_gp52,gene-SSBP1_gp52,hypothetical protein,hypothetical protein,6437,unknown function
2638,MT424636.1,Genbank,gene,41974.0,42744.0,.,+,.,gene-SSBP1_gp53,SSBP1_gp53,Gene,protein_coding,SSBP1_gp53,gene-SSBP1_gp53,hypothetical protein,hypothetical protein,No_PHROG,unknown function
2639,MT424636.1,Genbank,gene,42741.0,44471.0,.,+,.,gene-SSBP1_gp54,SSBP1_gp54,Gene,protein_coding,SSBP1_gp54,gene-SSBP1_gp54,terminase large subunit,terminase large subunit,17672,head and packaging


In [13]:
# Load gff3 file again with positional column labels (0..8) and keep only the
# 'gene' features. The explicit copy makes the result an independent frame, so
# the column assignments below do not act on a slice of the raw table.

gff3 = pd.read_csv(gffPath, sep='\t', header = None, skiprows = 5)
gff3 = gff3.loc[gff3.iloc[:,2] == 'gene'].copy()

# Format some new columns: take the text between '<key>=' and the next ';'
# from the attribute column (column 8). The split-based extraction is kept on
# purpose so that genes lacking an attribute keep the same missing-value
# marker that the downstream symbol filling expects.
attributes = gff3.iloc[:,8]
gff3['ID'] = attributes.str.split('ID=', expand = True).iloc[:,1].str.split(';', expand = True).iloc[:,0]
gff3['GeneType'] = attributes.str.split('gene_biotype=', expand = True).iloc[:,1].str.split(';', expand = True).iloc[:,0]
gff3['Symbol'] = attributes.str.split('gene=', expand = True).iloc[:,1].str.split(';', expand = True).iloc[:,0]

# Add entity host and phage: sequence CT971583.1 is labelled host, every other sequence phage
gff3['Entity'] = np.where(gff3[0] == 'CT971583.1', 'host', 'phage')
gff3.index = gff3['ID']

# Gene IDs of all rRNA genes, used for the in silico depletion
rRNAs = gff3.loc[gff3['GeneType'] == 'rRNA', 'ID'].tolist()

In [14]:
gff3

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,ID,GeneType,Symbol,Entity
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
gene-SynWH7803_0001,CT971583.1,EMBL,gene,174.0,1343.0,.,+,.,ID=gene-SynWH7803_0001;Name=dnaN;gbkey=Gene;ge...,gene-SynWH7803_0001,protein_coding,dnaN,host
gene-SynWH7803_0002,CT971583.1,EMBL,gene,1347.0,2096.0,.,+,.,ID=gene-SynWH7803_0002;Name=SynWH7803_0002;gbk...,gene-SynWH7803_0002,protein_coding,,host
gene-SynWH7803_0003,CT971583.1,EMBL,gene,2187.0,4484.0,.,+,.,ID=gene-SynWH7803_0003;Name=purL;gbkey=Gene;ge...,gene-SynWH7803_0003,protein_coding,purL,host
gene-SynWH7803_0004,CT971583.1,EMBL,gene,4523.0,5989.0,.,+,.,ID=gene-SynWH7803_0004;Name=purF;gbkey=Gene;ge...,gene-SynWH7803_0004,protein_coding,purF,host
gene-SynWH7803_0005,CT971583.1,EMBL,gene,5992.0,8457.0,.,-,.,ID=gene-SynWH7803_0005;Name=gyrA;gbkey=Gene;ge...,gene-SynWH7803_0005,protein_coding,gyrA,host
...,...,...,...,...,...,...,...,...,...,...,...,...,...
gene-SSBP1_gp51,MT424636.1,Genbank,gene,41158.0,41403.0,.,+,.,ID=gene-SSBP1_gp51;Name=SSBP1_gp51;gbkey=Gene;...,gene-SSBP1_gp51,protein_coding,,phage
gene-SSBP1_gp52,MT424636.1,Genbank,gene,41390.0,41647.0,.,+,.,ID=gene-SSBP1_gp52;Name=SSBP1_gp52;gbkey=Gene;...,gene-SSBP1_gp52,protein_coding,,phage
gene-SSBP1_gp53,MT424636.1,Genbank,gene,41974.0,42744.0,.,+,.,ID=gene-SSBP1_gp53;Name=SSBP1_gp53;gbkey=Gene;...,gene-SSBP1_gp53,protein_coding,,phage
gene-SSBP1_gp54,MT424636.1,Genbank,gene,42741.0,44471.0,.,+,.,ID=gene-SSBP1_gp54;Name=SSBP1_gp54;gbkey=Gene;...,gene-SSBP1_gp54,protein_coding,,phage


Perform in silico rRNA depletion.

In [15]:
df_norRNAs = rRNAdepletion(df,rRNAs)
df_norRNAs.head()

SampleNames,15_R1,60_R1,180_R1,300_R1,420_R1
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gene-SynWH7803_1101,3221,705,1057,720,351
gene-SynWH7803_0607,478,122,287,157,111
gene-SynWH7803_1992,241,125,111,156,87
gene-SynWH7803_1479,148,123,127,198,94
gene-SynWH7803_0059,2015,712,1124,374,146


### 2.3 Read count normalization

Important note: gene symbols not available for most genes.

In [16]:
# Function to fill in missing symbols by geneid.

def fillSymbols(df):
    """Fill missing entries of the last column with the row's index label.

    Any missing marker recognised by pandas (None, NaN, pd.NA) counts as
    missing; the previous `== None` comparison silently skipped NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the gene symbols.

    Returns
    -------
    pandas.DataFrame
        A copy of `df`; the input is left unchanged.
    """
    df_new = df.copy()
    if df_new.shape[0] == 0 or df_new.shape[1] == 0:
        return df_new

    # Positional access on purpose: the symbol column is identified by
    # position, not by name.
    missing = df_new.iloc[:, -1].isna().to_numpy()
    if missing.any():
        df_new.iloc[missing, -1] = df_new.index.to_numpy(dtype=object)[missing]
    return df_new

Convert counts to TPM.

In [17]:
# TPM is called with the depleted counts, the initial table and 0.5; the meaning
# of the third argument is defined by TPM itself - TODO confirm and document.
tpms = TPM(df_norRNAs, df_initial, 0.5)

# Column assignment aligns on the Geneid index, so the sorted lookup order does
# not change which row receives which annotation.
lookup_order = sorted(tpms.index.to_list())
for annotation in ('Entity', 'Symbol'):
    tpms[annotation] = gff3.loc[lookup_order, annotation]

# Symbol is the last column here, which is the one fillSymbols operates on
tpms = make_unique_with_index(fillSymbols(tpms))
tpms

SampleNames,15_R1,60_R1,180_R1,300_R1,420_R1,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gene-SynWH7803_1101,303.932493,227.370819,197.506038,68.764616,32.391487,host,tldD
gene-SynWH7803_0607,127.996783,111.936549,152.242521,42.619634,29.132552,host,gene-SynWH7803_0607
gene-SynWH7803_1992,38.943418,69.132039,35.593672,25.529559,13.781977,host,gene-SynWH7803_1992
gene-SynWH7803_1479,122.780614,348.810070,208.686587,166.025921,76.317074,host,gene-SynWH7803_1479
gene-SynWH7803_0059,1388.688701,1676.971492,1533.778213,261.027319,98.593045,host,rpmI
...,...,...,...,...,...,...,...
gene-SynWH7803_0681,111.345738,151.743438,121.135499,48.961116,18.170732,host,csoS3
gene-SynWH7803_2080,134.293957,177.739805,142.787172,45.743705,25.702171,host,gene-SynWH7803_2080
gene-SynWH7803_1212,115.572453,174.014430,142.406612,52.021574,21.783400,host,gene-SynWH7803_1212
gene-SynWH7803_0726,30.737259,35.958453,20.838348,23.256640,16.533123,host,gene-SynWH7803_0726


In [18]:
# Check gene names unique
len(tpms['Symbol'].unique())

2635

log2(x + 1) transformation of the raw counts

In [19]:
logs = logNorm(df_norRNAs)

# Attach the annotation columns (aligned on the Geneid index)
lookup_order = sorted(logs.index.to_list())
for annotation in ('Entity', 'Symbol'):
    logs[annotation] = gff3.loc[lookup_order, annotation]

# Fill missing symbols with the gene ID, then make gene names unique
logs = make_unique_with_index(fillSymbols(logs))
logs

SampleNames,15_R1,60_R1,180_R1,300_R1,420_R1,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gene-SynWH7803_1101,11.653741,9.463524,10.047124,9.493855,8.459432,host,tldD
gene-SynWH7803_0607,8.903882,6.942515,8.169925,7.303781,6.807355,host,gene-SynWH7803_0607
gene-SynWH7803_1992,7.918863,6.977280,6.807355,7.294621,6.459432,host,gene-SynWH7803_1992
gene-SynWH7803_1479,7.219169,6.954196,7.000000,7.636625,6.569856,host,gene-SynWH7803_1479
gene-SynWH7803_0059,10.977280,9.477758,10.135709,8.550747,7.199672,host,rpmI
...,...,...,...,...,...,...,...
gene-SynWH7803_0681,10.479780,9.154818,9.616549,9.278449,7.900867,host,csoS3
gene-SynWH7803_2080,9.623881,8.257388,8.727920,8.055282,7.276124,host,gene-SynWH7803_2080
gene-SynWH7803_1212,10.222795,9.041659,9.539159,9.055282,7.851749,host,gene-SynWH7803_1212
gene-SynWH7803_0726,7.515700,5.977280,5.977280,7.098032,6.658211,host,gene-SynWH7803_0726


log2(x + 1) transformation of the TPM values

In [20]:
# Split the TPM table into expression values and the two trailing annotation
# columns (Entity, Symbol); only the expression part is transformed.
expression_part = tpms.iloc[:, :-2]
annotation_part = tpms.iloc[:, -2:]
logTPMs = logNorm(expression_part).join(annotation_part)

# Fill missing symbols with the gene ID, then make gene names unique
logTPMs = make_unique_with_index(fillSymbols(logTPMs))
logTPMs

SampleNames,15_R1,60_R1,180_R1,300_R1,420_R1,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gene-SynWH7803_1101,8.252346,7.835235,7.633039,6.124424,5.061408,host,tldD
gene-SynWH7803_0607,7.011191,6.819369,7.259673,5.446906,4.913251,host,gene-SynWH7803_0607
gene-SynWH7803_1992,5.319886,6.132002,5.193522,4.729529,3.885767,host,gene-SynWH7803_1992
gene-SynWH7803_1479,6.951642,8.450428,7.712091,7.383928,6.272715,host,gene-SynWH7803_1479
gene-SynWH7803_0059,10.440546,10.712502,10.583814,8.033573,6.637973,host,rpmI
...,...,...,...,...,...,...,...
gene-SynWH7803_0681,6.811802,7.254967,6.932339,5.642734,4.260834,host,csoS3
gene-SynWH7803_2080,7.079954,7.481717,7.167791,5.546700,4.738885,host,gene-SynWH7803_2080
gene-SynWH7803_1212,6.865083,7.451330,7.163968,5.728508,4.509911,host,gene-SynWH7803_1212
gene-SynWH7803_0726,4.988106,5.207832,4.448792,4.600308,4.132011,host,gene-SynWH7803_0726


## 3 Filter samples, if necessary

One replicate, not necessary.

## 4. Final grouping

Summarize time points with mean and standard deviation for TPM-normalized data.

In [21]:
columnOrder = ['15_R1', '60_R1', '180_R1', '300_R1', '420_R1']

In [22]:
# Per-time-point mean and standard deviation of the TPM values
TPMmeans, TPMsds = getMeanSD(tpms[columnOrder])

# Fix the time-point order, then attach the annotation columns
TPMmeans = TPMmeans.loc[:, ['15', '60', '180', '300', '420']]
for annotation in ('Entity', 'Symbol'):
    TPMmeans[annotation] = tpms[annotation]
TPMmeans

Unnamed: 0_level_0,15,60,180,300,420,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gene-SynWH7803_1101,303.932493,227.370819,197.506038,68.764616,32.391487,host,tldD
gene-SynWH7803_0607,127.996783,111.936549,152.242521,42.619634,29.132552,host,gene-SynWH7803_0607
gene-SynWH7803_1992,38.943418,69.132039,35.593672,25.529559,13.781977,host,gene-SynWH7803_1992
gene-SynWH7803_1479,122.780614,348.810070,208.686587,166.025921,76.317074,host,gene-SynWH7803_1479
gene-SynWH7803_0059,1388.688701,1676.971492,1533.778213,261.027319,98.593045,host,rpmI
...,...,...,...,...,...,...,...
gene-SynWH7803_0681,111.345738,151.743438,121.135499,48.961116,18.170732,host,csoS3
gene-SynWH7803_2080,134.293957,177.739805,142.787172,45.743705,25.702171,host,gene-SynWH7803_2080
gene-SynWH7803_1212,115.572453,174.014430,142.406612,52.021574,21.783400,host,gene-SynWH7803_1212
gene-SynWH7803_0726,30.737259,35.958453,20.838348,23.256640,16.533123,host,gene-SynWH7803_0726


In [23]:
# Same column order and annotation as for the means
TPMsds = TPMsds.loc[:, ['15', '60', '180', '300', '420']]
for annotation in ('Entity', 'Symbol'):
    TPMsds[annotation] = tpms[annotation]
TPMsds

Unnamed: 0_level_0,15,60,180,300,420,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gene-SynWH7803_1101,0.0,0.0,0.0,0.0,0.0,host,tldD
gene-SynWH7803_0607,0.0,0.0,0.0,0.0,0.0,host,gene-SynWH7803_0607
gene-SynWH7803_1992,0.0,0.0,0.0,0.0,0.0,host,gene-SynWH7803_1992
gene-SynWH7803_1479,0.0,0.0,0.0,0.0,0.0,host,gene-SynWH7803_1479
gene-SynWH7803_0059,0.0,0.0,0.0,0.0,0.0,host,rpmI
...,...,...,...,...,...,...,...
gene-SynWH7803_0681,0.0,0.0,0.0,0.0,0.0,host,csoS3
gene-SynWH7803_2080,0.0,0.0,0.0,0.0,0.0,host,gene-SynWH7803_2080
gene-SynWH7803_1212,0.0,0.0,0.0,0.0,0.0,host,gene-SynWH7803_1212
gene-SynWH7803_0726,0.0,0.0,0.0,0.0,0.0,host,gene-SynWH7803_0726


In [24]:
# Proportional expression is computed from the time-point means only
time_point_means = TPMmeans.loc[:, ['15', '60', '180', '300', '420']]
propExp = proportionalExp(time_point_means)
for annotation in ('Entity', 'Symbol'):
    propExp[annotation] = TPMmeans[annotation]
propExp

Unnamed: 0_level_0,15,60,180,300,420,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gene-SynWH7803_1101,1.000000,0.748096,0.649835,0.226250,0.106575,host,tldD
gene-SynWH7803_0607,0.840743,0.735252,1.000000,0.279946,0.191356,host,gene-SynWH7803_0607
gene-SynWH7803_1992,0.563319,1.000000,0.514865,0.369287,0.199357,host,gene-SynWH7803_1992
gene-SynWH7803_1479,0.351998,1.000000,0.598281,0.475978,0.218793,host,gene-SynWH7803_1479
gene-SynWH7803_0059,0.828093,1.000000,0.914612,0.155654,0.058792,host,rpmI
...,...,...,...,...,...,...,...
gene-SynWH7803_0681,0.733776,1.000000,0.798292,0.322657,0.119746,host,csoS3
gene-SynWH7803_2080,0.755565,1.000000,0.803349,0.257363,0.144606,host,gene-SynWH7803_2080
gene-SynWH7803_1212,0.664154,1.000000,0.818361,0.298950,0.125182,host,gene-SynWH7803_1212
gene-SynWH7803_0726,0.854799,1.000000,0.579512,0.646764,0.459784,host,gene-SynWH7803_0726


## 5. Phage gene classification

In [25]:
# Add a classification label based on exceeding 20 % of maximal expression

def classLabelThreshold(tpm, fraction=0.2, n_meta_cols=2,
                        classes=('early', 'early', 'early', 'middle', 'late')):
    """Label each gene by the first time point reaching a fraction of its maximum.

    For every row the first time point whose value is >= `fraction` times the
    row maximum is located, and the class stored at that position in `classes`
    becomes the label. Rows whose 'Entity' is 'host' get the string 'None'.

    Parameters
    ----------
    tpm : pandas.DataFrame
        Time-point columns first (in chronological order), followed by
        `n_meta_cols` non-expression columns; must contain an 'Entity' column.
    fraction : float, default 0.2
        Share of the row maximum that has to be reached.
    n_meta_cols : int, default 2
        Number of trailing columns that are not expression values.
    classes : sequence of str
        Class label for each time-point position.

    Returns
    -------
    pandas.DataFrame
        Copy of `tpm` with an added 'ClassThreshold' column.

    Raises
    ------
    ValueError
        If there is no time-point column, if a row has no value reaching the
        threshold (e.g. all values missing), or if the first hit lies at a
        position without an entry in `classes`.
    """
    n_time_points = tpm.shape[1] - n_meta_cols
    if n_time_points <= 0:
        raise ValueError('tpm has no time-point columns in front of the metadata columns')

    expression = tpm.iloc[:, :n_time_points].astype(float)
    values = expression.to_numpy()
    # Row maximum, ignoring missing values
    row_max = expression.max(axis=1).to_numpy()
    reached = values >= (row_max * fraction)[:, None]

    if not reached.any(axis=1).all():
        raise ValueError('at least one gene has no time point reaching the threshold')

    # argmax on a boolean row returns the position of the first True
    first_hit = reached.argmax(axis=1)
    if first_hit.size and first_hit.max() >= len(classes):
        raise ValueError('classes has no label for time-point position %d' % first_hit.max())

    tpmOut = tpm.copy()
    tpmOut['ClassThreshold'] = [classes[position] for position in first_hit]
    tpmOut.loc[tpmOut['Entity'] == 'host', 'ClassThreshold'] = 'None'

    return tpmOut

In [26]:
# Add a classification label based on the time point of maximal expression

def classLabelMax(tpm, n_meta_cols=3,
                  classes=('early', 'early', 'early', 'middle', 'late')):
    """Label each gene by the time point at which its expression is maximal.

    For every row the first time point holding the row maximum is located and
    the class stored at that position in `classes` becomes the label. Rows
    whose 'Entity' is 'host' get the string 'None'.

    Parameters
    ----------
    tpm : pandas.DataFrame
        Time-point columns first (in chronological order), followed by
        `n_meta_cols` non-expression columns; must contain an 'Entity' column.
    n_meta_cols : int, default 3
        Number of trailing columns that are not expression values. The default
        of 3 matches a table that already carries a third trailing column next
        to Entity and Symbol (e.g. 'ClassThreshold'); pass 2 when it does not.
    classes : sequence of str
        Class label for each time-point position.

    Returns
    -------
    pandas.DataFrame
        Copy of `tpm` with an added 'ClassMax' column.

    Raises
    ------
    ValueError
        If there is no time-point column, if a row has no maximum (all values
        missing), or if the maximum lies at a position without an entry in
        `classes`.
    """
    n_time_points = tpm.shape[1] - n_meta_cols
    if n_time_points <= 0:
        raise ValueError('tpm has no time-point columns in front of the metadata columns')

    expression = tpm.iloc[:, :n_time_points].astype(float)
    values = expression.to_numpy()
    # Row maximum, ignoring missing values
    row_max = expression.max(axis=1).to_numpy()
    at_max = values == row_max[:, None]

    if not at_max.any(axis=1).all():
        raise ValueError('at least one gene has no maximal time point')

    # argmax on a boolean row returns the position of the first True
    first_max = at_max.argmax(axis=1)
    if first_max.size and first_max.max() >= len(classes):
        raise ValueError('classes has no label for time-point position %d' % first_max.max())

    tpmOut = tpm.copy()
    tpmOut['ClassMax'] = [classes[position] for position in first_max]
    tpmOut.loc[tpmOut['Entity'] == 'host', 'ClassMax'] = 'None'

    return tpmOut

In [27]:
TPMmeans

Unnamed: 0_level_0,15,60,180,300,420,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gene-SynWH7803_1101,303.932493,227.370819,197.506038,68.764616,32.391487,host,tldD
gene-SynWH7803_0607,127.996783,111.936549,152.242521,42.619634,29.132552,host,gene-SynWH7803_0607
gene-SynWH7803_1992,38.943418,69.132039,35.593672,25.529559,13.781977,host,gene-SynWH7803_1992
gene-SynWH7803_1479,122.780614,348.810070,208.686587,166.025921,76.317074,host,gene-SynWH7803_1479
gene-SynWH7803_0059,1388.688701,1676.971492,1533.778213,261.027319,98.593045,host,rpmI
...,...,...,...,...,...,...,...
gene-SynWH7803_0681,111.345738,151.743438,121.135499,48.961116,18.170732,host,csoS3
gene-SynWH7803_2080,134.293957,177.739805,142.787172,45.743705,25.702171,host,gene-SynWH7803_2080
gene-SynWH7803_1212,115.572453,174.014430,142.406612,52.021574,21.783400,host,gene-SynWH7803_1212
gene-SynWH7803_0726,30.737259,35.958453,20.838348,23.256640,16.533123,host,gene-SynWH7803_0726


In [28]:
# Both label functions locate the expression columns by position, so columns
# added by an earlier run of this notebook would be mistaken for expression
# values. Dropping them first keeps this cell re-runnable; 'Variance' is added
# back by the variance section further down.
derived_columns = ['ClassThreshold', 'ClassMax', 'Variance']
TPMmeans = TPMmeans.drop(columns=derived_columns, errors='ignore')

# Order matters: classLabelMax expects the ClassThreshold column to be present
TPMmeans = classLabelThreshold(TPMmeans)
TPMmeans = classLabelMax(TPMmeans)

In [29]:
pd.set_option('display.max_rows', 20)
TPMmeans[TPMmeans['Entity'] == 'phage'].sort_index()

Unnamed: 0_level_0,15,60,180,300,420,Entity,Symbol,ClassThreshold,ClassMax
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
gene-SSBP1_gp01,78.838598,1008.869119,7955.664628,28979.100789,16655.780025,phage,gene-SSBP1_gp01,early,middle
gene-SSBP1_gp02,60.996779,332.339087,2509.786703,6927.082504,3906.377330,phage,gene-SSBP1_gp02,early,middle
gene-SSBP1_gp03,136.001846,884.577945,6659.102594,15050.794148,9630.749613,phage,gene-SSBP1_gp03,early,middle
gene-SSBP1_gp04,329.194399,1545.975842,6416.196310,15109.304283,9417.828885,phage,gene-SSBP1_gp04,early,middle
gene-SSBP1_gp05,520.067208,2385.586513,8367.013624,20939.372862,13468.678450,phage,gene-SSBP1_gp05,early,middle
...,...,...,...,...,...,...,...,...,...
gene-SSBP1_gp51,139.473010,512.434267,1023.721981,8845.595242,14609.785482,phage,gene-SSBP1_gp51,middle,late
gene-SSBP1_gp52,145.676395,427.186422,934.236463,8962.661958,14002.567881,phage,gene-SSBP1_gp52,middle,late
gene-SSBP1_gp53,112.093268,360.547154,696.879775,5291.052077,8472.565273,phage,gene-SSBP1_gp53,middle,late
gene-SSBP1_gp54,81.845861,265.586517,409.309944,3263.401682,5795.693671,phage,gene-SSBP1_gp54,middle,late


In [30]:
TPMmeans[TPMmeans['Entity'] == 'phage']['ClassMax'].value_counts()

ClassMax
late      42
middle    13
Name: count, dtype: int64

Add classes to other dfs.

In [31]:
gff3_genes["seq_id"].unique()

array(['CT971583.1', 'MT424636.1'], dtype=object)

In [32]:
# Copy both class labels from the time-point means onto the other expression tables
class_columns = ['ClassThreshold', 'ClassMax']
for table in (TPMsds, tpms, logs, propExp):
    table[class_columns] = TPMmeans[class_columns]

# gff3: attach the class labels via the gene ID and split by sequence
gff3_final = gff3_genes.merge(TPMmeans[class_columns], right_index=True, left_on="ID", how="left")
on_host_sequence = gff3_final['seq_id'] == 'CT971583.1'
on_phage_sequence = gff3_final['seq_id'] == 'MT424636.1'
gff3_host  = gff3_final[on_host_sequence]
gff3_phage = gff3_final[on_phage_sequence]

## 6. Add variance to all dataframes

Compute the variance call from the TPM table (`tpms`) and copy it to the other data frames.

In [33]:
def stabilizedVariance(df, n_meta_cols=4):
    """Add a per-gene variance-to-mean ratio of the expression columns.

    For every row the population variance (ddof=0, as numpy.var computes it by
    default) of the expression values is divided by their mean. A row with
    mean 0 yields NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Expression columns first, followed by `n_meta_cols` other columns.
    n_meta_cols : int, default 4
        Number of trailing columns that are not expression values.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with an added 'Variance' column.

    Raises
    ------
    ValueError
        If no expression column is left in front of the metadata columns.
    """
    n_expression = df.shape[1] - n_meta_cols
    if n_expression <= 0:
        raise ValueError('df has no expression columns in front of the metadata columns')

    values = df.iloc[:, :n_expression].to_numpy(dtype=float)

    # 0 / 0 is reported as NaN without emitting a runtime warning
    with np.errstate(divide='ignore', invalid='ignore'):
        stabilized = values.var(axis=1) / values.mean(axis=1)

    tpmOut = df.copy()
    tpmOut['Variance'] = stabilized

    return tpmOut

In [34]:
tpms = stabilizedVariance(tpms)
tpms

SampleNames,15_R1,60_R1,180_R1,300_R1,420_R1,Entity,Symbol,ClassThreshold,ClassMax,Variance
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
gene-SynWH7803_1101,303.932493,227.370819,197.506038,68.764616,32.391487,host,tldD,,,61.557183
gene-SynWH7803_0607,127.996783,111.936549,152.242521,42.619634,29.132552,host,gene-SynWH7803_0607,,,25.241084
gene-SynWH7803_1992,38.943418,69.132039,35.593672,25.529559,13.781977,host,gene-SynWH7803_1992,,,9.334618
gene-SynWH7803_1479,122.780614,348.810070,208.686587,166.025921,76.317074,host,gene-SynWH7803_1479,,,47.079940
gene-SynWH7803_0059,1388.688701,1676.971492,1533.778213,261.027319,98.593045,host,rpmI,,,454.233047
...,...,...,...,...,...,...,...,...,...,...
gene-SynWH7803_0681,111.345738,151.743438,121.135499,48.961116,18.170732,host,csoS3,,,26.765045
gene-SynWH7803_2080,134.293957,177.739805,142.787172,45.743705,25.702171,host,gene-SynWH7803_2080,,,33.017888
gene-SynWH7803_1212,115.572453,174.014430,142.406612,52.021574,21.783400,host,gene-SynWH7803_1212,,,31.498710
gene-SynWH7803_0726,30.737259,35.958453,20.838348,23.256640,16.533123,host,gene-SynWH7803_0726,,,1.916142


In [35]:
# Copy the TPM-based variance onto every other table (aligned on the Geneid index)
for table in (logs, TPMmeans, TPMsds, propExp):
    table['Variance'] = tpms['Variance']

## 7. Write data to output

In [36]:
# Attach the Entity and Symbol columns of the TPM table to the raw-count table
# (assignment aligns on the Geneid index).
# NOTE(review): this adds non-numeric columns to df_norRNAs in place, so the
# earlier cells that pass df_norRNAs to TPM() / logNorm() will see them if they
# are re-run without restarting the kernel - confirm whether that matters.
df_norRNAs[['Entity', 'Symbol']] = tpms[['Entity', 'Symbol']]
df_norRNAs

SampleNames,15_R1,60_R1,180_R1,300_R1,420_R1,Entity,Symbol
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gene-SynWH7803_1101,3221,705,1057,720,351,host,tldD
gene-SynWH7803_0607,478,122,287,157,111,host,gene-SynWH7803_0607
gene-SynWH7803_1992,241,125,111,156,87,host,gene-SynWH7803_1992
gene-SynWH7803_1479,148,123,127,198,94,host,gene-SynWH7803_1479
gene-SynWH7803_0059,2015,712,1124,374,146,host,rpmI
...,...,...,...,...,...,...,...
gene-SynWH7803_0681,1427,569,784,620,238,host,csoS3
gene-SynWH7803_2080,788,305,423,265,154,host,gene-SynWH7803_2080
gene-SynWH7803_1212,1194,526,743,531,230,host,gene-SynWH7803_1212
gene-SynWH7803_0726,182,62,62,136,100,host,gene-SynWH7803_0726


In [37]:
# Every table is written as a tab-separated file into the working directory,
# in the order listed here.
output_tables = [
    ('Huang_infection_full_TPM.tsv', tpms),                    # Full TPM table
    ('Huang_infection_full_raw_counts.tsv', df_norRNAs),       # Full raw_counts table
    ('Huang_infection_TPM_means.tsv', TPMmeans),               # Summarized (time point means) TPM table
    ('Huang_infection_TPM_std.tsv', TPMsds),                   # Summarized (time point) TPM standard deviation
    ('Huang_infection_fractional_expression.tsv', propExp),    # Proportional expression per gene and time point
    ('Huang_infection_host_gff3.tsv', gff3_host),              # Processed gff3 file (host genes)
    ('Huang_infection_phage_gff3.tsv', gff3_phage),            # Processed gff3 file (phage genes)
]
for file_name, table in output_tables:
    table.to_csv(file_name, sep='\t')