In [1]:
#imports
import pandas as pd
import os
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

#### Read in Kraken reports

In [30]:

def read_kraken2_reports(krakenDir):
    """
    Read and process Kraken2 reports for multiple samples and consolidate the data into a single DataFrame.

    Parameters:
    krakenDir (str): The directory containing Kraken2 report files for multiple samples.

    Returns:
    pd.DataFrame: A DataFrame containing consolidated Kraken2 report data.
    """

    kraken_df = pd.DataFrame()  # Initialize an empty DataFrame to store all samples' Kraken2 report data

    # Iterate through files in the krakenDir
    for file in os.listdir(krakenDir):
        if file.endswith("kraken2_report"):
            sample = file.split(".")[0]  # Extract the sample name from the file name

            # Read Kraken2 report per sample
            sample_kraken_df = pd.read_csv(os.path.join(krakenDir, file), sep="\t", header=None)
            sample_kraken_df.rename(columns={
                0: "coverage",
                1: "clade_fragments",
                2: "taxon_fragments",
                3: "tax_code",
                4: "taxid",
                5: "scname"
            }, inplace=True)

            # Add a 'sample' column for tracking the sample name
            sample_kraken_df['sample'] = sample

            # Update the consolidated DataFrame with sample data
            kraken_df = pd.concat([kraken_df, sample_kraken_df])

    return kraken_df


In [31]:
krakenDir = "/lustre/shared/wfsr-mcfa/projects/internships/luka/viral_metagenomics_pipeline/results/kraken2/results/"

In [32]:
kraken_df = read_kraken2_reports(krakenDir)
kraken_df.head()

Unnamed: 0,coverage,clade_fragments,taxon_fragments,tax_code,taxid,scname,sample
0,32.87,21574288,21574288,U,0,unclassified,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
1,67.13,44065934,15973,R,1,root,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
2,65.37,42905852,76410,R1,131567,cellular organisms,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
3,64.4,42270732,135326,D,2759,Eukaryota,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
4,63.7,41814544,0,D1,33154,Opisthokonta,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...


### link kraken viruses with host-db

In [33]:
#load host db viruses
hostDbPath = "/lustre/shared/wfsr-mcfa/projects/internships/luka/viral_metagenomics_pipeline/scripts/virushostdb.tsv"
hostdb_viruses = pd.read_csv(hostDbPath, sep="\t", usecols=['virus tax id', "refseq id", "virus lineage", "host name", "DISEASE"])

#select only viruses with human host
hostdb_viruses.query(" `host name` == 'Homo sapiens'", inplace= True)
human_viruses_taxids = [x for x in hostdb_viruses['virus tax id'] ]

In [35]:
kraken_human_viruses = kraken_df.query("taxid.isin(@human_viruses_taxids)")
kraken_human_viruses.head(40)

Unnamed: 0,coverage,clade_fragments,taxon_fragments,tax_code,taxid,scname,sample
1605,0.0,3,3,S,1960046,Hom-1 vesivirus,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
1619,0.0,519,0,S,11983,Norwalk virus,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
1620,0.0,475,49,S1,122929,Norovirus GII,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
1621,0.0,376,376,S2,490039,Norovirus GII.2,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
1622,0.0,50,50,S2,552592,Norovirus GII.17,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
1681,0.0,497,497,S,194965,Aichivirus B,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
1753,0.0,1,1,S,145856,Human picobirnavirus,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
2017,0.0,24,24,S,11676,Human immunodeficiency vir...,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
2579,0.0,52,52,S,37296,Human gammaherpesvirus 8,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...
2591,0.0,96,96,S,10298,Human alphaherpesvirus 1,H7NKYDSX3_104762-001-003_81_GCCGTCGA-GTGATTAA_...


In [36]:
#how many unique viruses were obtained
unique_viruses = list(set(kraken_human_viruses['scname']))
len(unique_viruses)

93

In [37]:
unique_viruses

['                    Cowpox virus',
 '        Torque teno midi virus 12',
 '                        Adeno-associated virus - 3',
 '                    Human orthopneumovirus',
 '                      Human papillomavirus type 32',
 '                    Cercopithecine betaherpesvirus 5',
 '                    Husavirus sp.',
 '                      H-1 parvovirus',
 '                    Monkeypox virus',
 '        Torque teno midi virus 6',
 '                    Parvovirus NIH-CQV',
 '              Hepeviridae',
 '                      Human papillomavirus type 49',
 '                  Salivirus A',
 '                      Human papillomavirus 5',
 '                      Punta Toro virus',
 '                    Vaccinia virus',
 '                    Norovirus GII',
 '                    Aichi virus 1',
 '                  Human picobirnavirus',
 '                    Hom-1 vesivirus',
 '                    Papiine alphaherpesvirus 2',
 '                      Human papillomavirus 175',
 

### write out the reports to file (supplementary data)

In [38]:
#kraken all viruses
kraken_df.to_csv("../supplimentary_data/kraken_all_viruses.csv", columns=kraken_df.columns)

#kraken human viruses
kraken_human_viruses.to_csv("../supplimentary_data/kraken_human_viruses.csv",columns=kraken_human_viruses)

#kraken unique human viruses
unique_viruses_path = "../supplimentary_data/kraken_list_of_unique_human_viruses.txt"
with open(unique_viruses_path, "w") as fn:
    for index, vir in enumerate(unique_viruses):
        fn.write(f"{vir}\n")