# Get read counts of fastq files

In this notebook we are going to count the number of reads assigned to each sample. To do that we are going to create a dataframe that contains all the information.

For convenience, we are going to create two dataframes: the first one holds the read counts generated during the preprocessing (host-mapping) step, and the second one holds the reads mapped by the different profilers.

In [2]:
import subprocess
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('')

In [3]:
from list_vars import LIST_PROFILERS, DATA_DIR, RESULTS_DIR, POOLS, CONTROLS, DPI

In [4]:
# Every table produced by this notebook is written under <RESULTS_DIR>/counts,
# so make sure that folder exists (a no-op when it is already there).
os.makedirs('{}/counts'.format(RESULTS_DIR), exist_ok=True)

## Getting counts of fastq files (host mapping)

In [5]:
def count_reads_in_fastq(file_path, script_path="./sh_funcs/count_reads.sh"):
    """Count the reads in a FASTQ file by delegating to an external script.

    Parameters
    ----------
    file_path : str
        Path of the FASTQ file; passed to the script as its only argument.
    script_path : str, optional
        Executable that prints the number of reads on stdout. Defaults to
        ``./sh_funcs/count_reads.sh``, resolved relative to the working
        directory of the kernel.

    Returns
    -------
    int or None
        The read count printed by the script. ``None`` is returned, after a
        diagnostic is printed, when the script exits with a non-zero status
        or when its output cannot be parsed as an integer.
    """
    try:
        result = subprocess.run(
            [script_path, file_path],
            text=True,
            capture_output=True,
            check=True,  # turn a non-zero exit status into CalledProcessError
        )
    except subprocess.CalledProcessError as e:
        print(f"Error occurred: {e.stderr}")
        return None

    # The script's output is expected to be just the number of reads.
    raw_output = result.stdout.strip()
    try:
        return int(raw_output)
    except ValueError:
        # Report unparseable output the same way as a failed script run,
        # instead of surfacing a bare ValueError from int().
        print(
            f"Error occurred: unexpected output from {script_path} "
            f"for {file_path}: {raw_output!r}"
        )
        return None

In [6]:
# (Re)create the host-mapping count table with only its header line.
# Opening in 'w' mode truncates the file, so rows appended by a previous run
# are discarded before the cells below append the new ones.
mapping_counts_header = 'SAMPLE\traw\t1st_unmapped\t1st_mapped\t2nd_unmapped\t2nd_mapped\n'
with open(f'{RESULTS_DIR}/counts/mapping_counts.txt', 'w') as file:
    file.write(mapping_counts_header)

In [7]:
# Accumulator filled by the cells below: sample name -> {count column -> read count}.
dict_dict_reads = dict()

In [None]:
# Read counts of the artificial sample along the two host-mapping passes.
# FASTQ holding the reads left unmapped after each pass, in pipeline order.
artificial_unmapped_by_pass = (
    ('1st', f'{RESULTS_DIR}/1stmap/artificial/ARTIFICIAL.unmapped_1.fastq.gz'),
    ('2nd', f'{RESULTS_DIR}/2ndmap/artificial/ARTIFICIAL.unmapped.fastq.1.gz'),
)

reads_in = count_reads_in_fastq(f'{DATA_DIR}/artificial_v2/artificial_reads_R1.fastq.gz')
dict_reads_ARTIFICIAL = {'raw': reads_in}
for pass_label, unmapped_fastq in artificial_unmapped_by_pass:
    reads_left = count_reads_in_fastq(unmapped_fastq)
    dict_reads_ARTIFICIAL[f'{pass_label}_unmapped'] = reads_left
    # Reads mapped in a pass = reads entering the pass - reads left unmapped.
    dict_reads_ARTIFICIAL[f'{pass_label}_mapped'] = reads_in - reads_left
    # The unmapped reads of this pass are the input of the next one.
    reads_in = reads_left

# Append the sample's row, with the columns in the same order as the header.
artificial_columns = ('raw', '1st_unmapped', '1st_mapped', '2nd_unmapped', '2nd_mapped')
artificial_row = '\t'.join(
    ['ARTIFICIAL'] + [str(dict_reads_ARTIFICIAL[column]) for column in artificial_columns]
)
with open(f'{RESULTS_DIR}/counts/mapping_counts.txt', 'a') as file:
    file.write(artificial_row + '\n')

dict_dict_reads['ARTIFICIAL'] = dict_reads_ARTIFICIAL

In [None]:
# Same bookkeeping as for the artificial sample, repeated for every control.
for control in CONTROLS:
    # FASTQ holding the reads left unmapped after each pass, in pipeline order.
    control_unmapped_by_pass = (
        ('1st', f'{RESULTS_DIR}/1stmap/controls/{control}.unmapped_1.fastq.gz'),
        ('2nd', f'{RESULTS_DIR}/2ndmap/controls/{control}.unmapped.fastq.1.gz'),
    )

    reads_in = count_reads_in_fastq(f'{DATA_DIR}/EM_EVPools/control_sample/{control}.fastq.gz')
    dict_reads_CONTROLS_i = {'raw': reads_in}
    for pass_label, unmapped_fastq in control_unmapped_by_pass:
        reads_left = count_reads_in_fastq(unmapped_fastq)
        dict_reads_CONTROLS_i[f'{pass_label}_unmapped'] = reads_left
        # Reads mapped in a pass = reads entering the pass - reads left unmapped.
        dict_reads_CONTROLS_i[f'{pass_label}_mapped'] = reads_in - reads_left
        # The unmapped reads of this pass are the input of the next one.
        reads_in = reads_left

    # Append the sample's row, with the columns in the same order as the header.
    control_columns = ('raw', '1st_unmapped', '1st_mapped', '2nd_unmapped', '2nd_mapped')
    control_row = '\t'.join(
        [f'{control}'] + [str(dict_reads_CONTROLS_i[column]) for column in control_columns]
    )
    with open(f'{RESULTS_DIR}/counts/mapping_counts.txt', 'a') as file:
        file.write(control_row + '\n')

    dict_dict_reads[control] = dict_reads_CONTROLS_i

In [None]:
# Same bookkeeping as for the controls, repeated for every pool.
for pool in POOLS:
    # FASTQ holding the reads left unmapped after each pass, in pipeline order.
    pool_unmapped_by_pass = (
        ('1st', f'{RESULTS_DIR}/1stmap/pools/{pool}.unmapped_1.fastq.gz'),
        ('2nd', f'{RESULTS_DIR}/2ndmap/pools/{pool}.unmapped.fastq.1.gz'),
    )

    # NOTE(review): the raw pool reads are looked up under 'control_sample/',
    # the same folder used for the controls — confirm this is the intended
    # location and not a copy-paste leftover.
    reads_in = count_reads_in_fastq(f'{DATA_DIR}/EM_EVPools/control_sample/{pool}.fastq.gz')
    dict_reads_POOL_i = {'raw': reads_in}
    for pass_label, unmapped_fastq in pool_unmapped_by_pass:
        reads_left = count_reads_in_fastq(unmapped_fastq)
        dict_reads_POOL_i[f'{pass_label}_unmapped'] = reads_left
        # Reads mapped in a pass = reads entering the pass - reads left unmapped.
        dict_reads_POOL_i[f'{pass_label}_mapped'] = reads_in - reads_left
        # The unmapped reads of this pass are the input of the next one.
        reads_in = reads_left

    # Append the sample's row, with the columns in the same order as the header.
    pool_columns = ('raw', '1st_unmapped', '1st_mapped', '2nd_unmapped', '2nd_mapped')
    pool_row = '\t'.join(
        [f'{pool}'] + [str(dict_reads_POOL_i[column]) for column in pool_columns]
    )
    with open(f'{RESULTS_DIR}/counts/mapping_counts.txt', 'a') as file:
        file.write(pool_row + '\n')

    dict_dict_reads[pool] = dict_reads_POOL_i

## Get counts of profiling results

In this case we are going to build a dataframe with the following columns:
- SAMPLE
- PASS
- MODE
- PROFILER

For each profiler we are going to read the standardised report and extract the number of reads mapped in total, the number mapped to Homo sapiens, and lastly the number of reads that were left unmapped.

In [None]:
# Reload the host-mapping table written above, keyed by sample name so that
# a sample's counts can be fetched with .loc[sample, column].
mapping_counts_table = pd.read_csv(f'{RESULTS_DIR}/counts/mapping_counts.txt', sep='\t')
df_host_map_info = mapping_counts_table.set_index('SAMPLE')
df_host_map_info

In [11]:
def extract_read_info(taxpasta_file, basename, passn, host_map_info=None):
    """Split a sample's reads into human, other-taxa and unmapped counts.

    Parameters
    ----------
    taxpasta_file : str
        Tab-separated report with at least the columns ``taxonomy_id`` and
        ``count``.
    basename : str
        Sample name; used as the row label in the host-mapping table.
    passn : int
        Profiling pass. ``0`` compares against the ``raw`` column of the
        host-mapping table, ``2`` against its ``2nd_unmapped`` column.
    host_map_info : pandas.DataFrame, optional
        Host-mapping table indexed by sample name. When omitted, the
        notebook-level ``df_host_map_info`` is used.

    Returns
    -------
    tuple of int
        ``(counts_human, counts_others, counts_unmapped)``, where
        ``counts_unmapped`` is the reference count for the pass minus every
        read listed in the report.

    Raises
    ------
    ValueError
        If ``passn`` is neither 0 nor 2.
    """
    # Column of the host-mapping table holding the reads fed to each pass.
    reference_column_by_pass = {0: 'raw', 2: '2nd_unmapped'}
    if passn not in reference_column_by_pass:
        # Fail early and clearly; otherwise the reference count would be undefined.
        raise ValueError(
            f"Unsupported pass {passn!r}: expected one of {sorted(reference_column_by_pass)}"
        )

    if host_map_info is None:
        host_map_info = df_host_map_info

    df = pd.read_csv(taxpasta_file, sep='\t').set_index('taxonomy_id')

    human_taxid = 9606  # NCBI taxonomy ID of Homo sapiens
    counts_human = int(df.loc[human_taxid, 'count']) if human_taxid in df.index else 0
    counts_others = int(df['count'].sum()) - counts_human

    max_counts = host_map_info.loc[basename, reference_column_by_pass[passn]]
    counts_unmapped = int(max_counts) - (counts_human + counts_others)

    return counts_human, counts_others, counts_unmapped

In [None]:
# Quick check of extract_read_info on a single report (kraken2, pass 0, mode 3)
# before looping over every profiler and mode.
example_report = f'{RESULTS_DIR}/profiling/kraken2/pass0/ARTIFICIAL_mode3/ARTIFICIAL_mode3.report.standardised.species'
counts_human, counts_others, counts_unmapped = extract_read_info(example_report, 'ARTIFICIAL', 0)
(counts_human, counts_others, counts_unmapped)

In [8]:
sample = 'ARTIFICIAL'

# Suffix (placed before '.standardised.<rank>') of the standardised report of each profiler.
REPORT_SUFFIX_BY_PROFILER = {
    'kaiju': 'results',
    'krakenuniq': 'report',
    'kraken2': 'report',
    'centrifuge': 'kreport',
    'kmcp': 'profile',
    'ganon': 'report',
}

# Open the table once: header first, then one row per report that exists.
with open(f'{RESULTS_DIR}/counts/profiling_counts_{sample}.txt', 'w') as file:
    file.write('SAMPLE\tpass\tmode\tprofiler\tmapped_human\tmapped_others\tunmapped\n')

    for profiler in LIST_PROFILERS:
        report_suffix = REPORT_SUFFIX_BY_PROFILER.get(profiler)
        if report_suffix is None:
            # Without this guard an unlisted profiler would reuse the path
            # built for the previous one and report that profiler's counts.
            print(f"No report suffix known for profiler '{profiler}'; skipping!")
            continue

        for passn in [0, 2]:
            for mode in range(1, 10):
                run_name = f'{sample}_mode{mode}'
                taxpasta_file = f'{RESULTS_DIR}/profiling/{profiler}/pass{passn}/{run_name}/{run_name}.{report_suffix}.standardised.species'

                if not os.path.isfile(taxpasta_file):
                    print(f"{taxpasta_file} not present!")
                    continue

                counts_human, counts_others, counts_unmapped = extract_read_info(taxpasta_file, sample, passn)
                file.write(f'{sample}\t{passn}\t{mode}\t{profiler}\t{counts_human}\t{counts_others}\t{counts_unmapped}\n')


In [9]:
# Suffix (placed before '.standardised.<rank>') of the standardised report of each profiler.
REPORT_SUFFIX_BY_PROFILER = {
    'kaiju': 'results',
    'krakenuniq': 'report',
    'kraken2': 'report',
    'centrifuge': 'kreport',
    'kmcp': 'profile',
    'ganon': 'report',
}

# One table per control/pool sample, built from the genus-level reports of
# pass 2 and modes 3, 5 and 7.
for sample in CONTROLS + POOLS:
    # Open the table once per sample: header first, then one row per report that exists.
    with open(f'{RESULTS_DIR}/counts/profiling_counts_{sample}.txt', 'w') as file:
        file.write('SAMPLE\tpass\tmode\tprofiler\tmapped_human\tmapped_others\tunmapped\n')

        for profiler in LIST_PROFILERS:
            report_suffix = REPORT_SUFFIX_BY_PROFILER.get(profiler)
            if report_suffix is None:
                # Without this guard an unlisted profiler would reuse the path
                # built for the previous one and report that profiler's counts.
                print(f"No report suffix known for profiler '{profiler}'; skipping!")
                continue

            for passn in [2]:
                for mode in [3, 5, 7]:
                    run_name = f'{sample}_mode{mode}'
                    taxpasta_file = f'{RESULTS_DIR}/profiling/{profiler}/pass{passn}/{run_name}/{run_name}.{report_suffix}.standardised.genus'

                    if not os.path.isfile(taxpasta_file):
                        print(f"{taxpasta_file} not present!")
                        continue

                    counts_human, counts_others, counts_unmapped = extract_read_info(taxpasta_file, sample, passn)
                    file.write(f'{sample}\t{passn}\t{mode}\t{profiler}\t{counts_human}\t{counts_others}\t{counts_unmapped}\n')