# Get read counts of fastq files

In this notebook we are going to count the number of reads assigned to each sample. To do that we are going to create a dataframe that contains all the information.

For convenience, we are going to create two dataframes: the first one with the reads generated during the preprocessing step, and the second one with the reads mapped by the different profilers.

In [None]:
import subprocess
import os
import pandas as pd

In [None]:
# Root folders for inputs and outputs, given as relative paths.
DATA_DIR = '../../data'
RESULTS_DIR = '../../results'

# Sample identifiers: POOL1 .. POOL12 plus the two control names.
POOLS = ['POOL{}'.format(number) for number in range(1, 13)]
CONTROLS = ['ACIDOLA', 'BLACTIS']

In [None]:
# Make sure the output folder for the count tables exists (no error if it already does).
os.makedirs(os.path.join(RESULTS_DIR, 'counts'), exist_ok=True)

## Getting counts of fastq files (host mapping)

In [None]:
def count_reads_in_fastq(file_path, script_path="./sh_funcs/count_reads.sh"):
    """Return the number of reads in a FASTQ file, as reported by a helper script.

    The script is invoked as ``script_path file_path`` and is expected to print
    a single integer on standard output.

    Parameters
    ----------
    file_path : str
        Path of the FASTQ file handed to the script.
    script_path : str, optional
        Executable to run. Defaults to the Bash script shipped next to the
        notebook.

    Returns
    -------
    int or None
        The read count, or ``None`` when the script cannot be started, exits
        with a non-zero status, or prints something that is not an integer.
        In every failure case a message is printed.
    """
    try:
        result = subprocess.run(
            [script_path, file_path],
            text=True,
            capture_output=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # The script ran but reported a failure; show what it wrote to stderr.
        print(f"Error occurred: {e.stderr}")
        return None
    except OSError as e:
        # The script itself is missing or not executable.
        print(f"Error occurred: could not run {script_path}: {e}")
        return None

    # The script's output is the number of reads.
    try:
        return int(result.stdout.strip())
    except ValueError:
        print(f"Error occurred: unexpected output for {file_path}: {result.stdout!r}")
        return None

In [None]:
# Start the host-mapping table from scratch: 'w' truncates any previous file
# and only the header line is written here.
with open(f'{RESULTS_DIR}/counts/mapping_counts.txt', 'w') as file:
    file.write('SAMPLE\traw\t1st_unmapped\t1st_mapped\t2nd_unmapped\t2nd_mapped\n')

In [None]:
# Per-sample read counts, keyed by sample name; filled in by the cells below.
dict_dict_reads = dict()

In [None]:
# Read counts for the ARTIFICIAL sample at each stage of the host mapping.
dict_reads_ARTIFICIAL = {}

dict_reads_ARTIFICIAL['raw'] = count_reads_in_fastq(f'{DATA_DIR}/artificial_v2/artificial_reads_R1.fastq.gz')
dict_reads_ARTIFICIAL['1st_unmapped'] = count_reads_in_fastq(f'{RESULTS_DIR}/1stmap/artificial/ARTIFICIAL.unmapped_1.fastq.gz')
# Mapped reads are derived: reads entering a stage minus reads left unmapped by it.
dict_reads_ARTIFICIAL['1st_mapped'] = dict_reads_ARTIFICIAL['raw'] - dict_reads_ARTIFICIAL['1st_unmapped']
dict_reads_ARTIFICIAL['2nd_unmapped'] = count_reads_in_fastq(f'{RESULTS_DIR}/2ndmap/artificial/ARTIFICIAL.unmapped.fastq.1.gz')
dict_reads_ARTIFICIAL['2nd_mapped'] = dict_reads_ARTIFICIAL['1st_unmapped'] - dict_reads_ARTIFICIAL['2nd_unmapped']

# The path must be an f-string: without the prefix the literal text
# "{RESULTS_DIR}" is used as a directory name and the row never reaches the
# table whose header was written under RESULTS_DIR.
with open(f'{RESULTS_DIR}/counts/mapping_counts.txt', 'a') as file:
    file.write(f"ARTIFICIAL\t{dict_reads_ARTIFICIAL['raw']}\t{dict_reads_ARTIFICIAL['1st_unmapped']}\t{dict_reads_ARTIFICIAL['1st_mapped']}\t{dict_reads_ARTIFICIAL['2nd_unmapped']}\t{dict_reads_ARTIFICIAL['2nd_mapped']}\n")

dict_dict_reads['ARTIFICIAL'] = dict_reads_ARTIFICIAL

In [None]:
# Same bookkeeping as for the ARTIFICIAL sample, repeated for every control.
for control in CONTROLS:
    dict_reads_CONTROLS_i = {}

    dict_reads_CONTROLS_i['raw'] = count_reads_in_fastq(f'{DATA_DIR}/EM_EVPools/control_sample/{control}.fastq.gz')
    dict_reads_CONTROLS_i['1st_unmapped'] = count_reads_in_fastq(f'{RESULTS_DIR}/1stmap/controls/{control}.unmapped_1.fastq.gz')
    # Mapped reads are derived: reads entering a stage minus reads left unmapped by it.
    dict_reads_CONTROLS_i['1st_mapped'] = dict_reads_CONTROLS_i['raw'] - dict_reads_CONTROLS_i['1st_unmapped']
    dict_reads_CONTROLS_i['2nd_unmapped'] = count_reads_in_fastq(f'{RESULTS_DIR}/2ndmap/controls/{control}.unmapped.fastq.1.gz')
    dict_reads_CONTROLS_i['2nd_mapped'] = dict_reads_CONTROLS_i['1st_unmapped'] - dict_reads_CONTROLS_i['2nd_unmapped']

    # The path must be an f-string: without the prefix the literal text
    # "{RESULTS_DIR}" is used as a directory name and the row never reaches
    # the table whose header was written under RESULTS_DIR.
    with open(f'{RESULTS_DIR}/counts/mapping_counts.txt', 'a') as file:
        file.write(f"{control}\t{dict_reads_CONTROLS_i['raw']}\t{dict_reads_CONTROLS_i['1st_unmapped']}\t{dict_reads_CONTROLS_i['1st_mapped']}\t{dict_reads_CONTROLS_i['2nd_unmapped']}\t{dict_reads_CONTROLS_i['2nd_mapped']}\n")

    dict_dict_reads[control] = dict_reads_CONTROLS_i

In [None]:
# Collect the read counts of every pool at each host-mapping stage, append one
# row per pool to the mapping table and keep the counts in dict_dict_reads.
for pool in POOLS:
    dict_reads_POOL_i = {}

    # NOTE(review): the raw pool files are read from the 'control_sample'
    # folder, the same one used for the controls — confirm this is the
    # intended location and not a copy-paste leftover.
    dict_reads_POOL_i['raw'] = count_reads_in_fastq(f'{DATA_DIR}/EM_EVPools/control_sample/{pool}.fastq.gz')
    dict_reads_POOL_i['1st_unmapped'] = count_reads_in_fastq(f'{RESULTS_DIR}/1stmap/pools/{pool}.unmapped_1.fastq.gz')
    # Mapped reads are derived: reads entering a stage minus reads left unmapped by it.
    dict_reads_POOL_i['1st_mapped'] = dict_reads_POOL_i['raw'] - dict_reads_POOL_i['1st_unmapped']
    dict_reads_POOL_i['2nd_unmapped'] = count_reads_in_fastq(f'{RESULTS_DIR}/2ndmap/pools/{pool}.unmapped.fastq.1.gz')
    dict_reads_POOL_i['2nd_mapped'] = dict_reads_POOL_i['1st_unmapped'] - dict_reads_POOL_i['2nd_unmapped']

    # Append (mode 'a') so the header and earlier rows are preserved.
    with open(f'{RESULTS_DIR}/counts/mapping_counts.txt', 'a') as file:
        file.write(f"{pool}\t{dict_reads_POOL_i['raw']}\t{dict_reads_POOL_i['1st_unmapped']}\t{dict_reads_POOL_i['1st_mapped']}\t{dict_reads_POOL_i['2nd_unmapped']}\t{dict_reads_POOL_i['2nd_mapped']}\n")  

    dict_dict_reads[pool] = dict_reads_POOL_i                                                                                             

## Get counts of profiling results

In this case we are going to build a dataframe with the following columns:
- SAMPLE
- PASS
- MODE
- PROFILER

For each profiler we are going to read the standardized report and extract the total number of mapped reads, the number of reads mapped to *Homo sapiens*, and finally the number of reads that were left unmapped.

In [None]:
# Profilers whose standardised reports are summarised below.
LIST_PROFILERS = [
    'centrifuge',
    'ganon',
    'kaiju',
    'kmcp',
    'kraken2',
    'krakenuniq',
]

In [None]:
# Reload the host-mapping table written above and index it by sample name so
# that counts can be looked up with .loc[sample, column].
df_host_map_info = pd.read_csv(f'{RESULTS_DIR}/counts/mapping_counts.txt', sep='\t')
df_host_map_info = df_host_map_info.set_index('SAMPLE')
df_host_map_info

In [None]:
def extract_read_info(taxpasta_file, basename, passn, host_map_info=None):
    """Summarise a standardised profiler report into human / other / unmapped reads.

    Parameters
    ----------
    taxpasta_file : str
        Tab-separated report with at least the columns ``taxonomy_id`` and
        ``count``.
    basename : str
        Sample name, used as the row label in the host-mapping table.
    passn : int
        Profiling pass. ``0`` compares against the ``raw`` read count and
        ``2`` against the ``2nd_unmapped`` read count of the sample.
    host_map_info : pandas.DataFrame, optional
        Host-mapping table indexed by sample name. Defaults to the
        notebook-level ``df_host_map_info``.

    Returns
    -------
    tuple of int
        ``(counts_human, counts_others, counts_unmapped)``.

    Raises
    ------
    ValueError
        If ``passn`` is neither 0 nor 2.
    """
    # Column of the host-mapping table holding the reference read total for each pass.
    input_column_by_pass = {0: 'raw', 2: '2nd_unmapped'}
    if passn not in input_column_by_pass:
        raise ValueError(
            f"Unsupported pass {passn!r}; expected one of {sorted(input_column_by_pass)}"
        )

    if host_map_info is None:
        host_map_info = df_host_map_info

    df = pd.read_csv(taxpasta_file, sep='\t').set_index('taxonomy_id')

    # 9606 is the NCBI taxonomy id of Homo sapiens; absent means no human reads.
    if 9606 in df.index:
        counts_human = int(df.loc[9606, 'count'])
    else:
        counts_human = 0

    counts_others = int(df['count'].sum()) - counts_human

    # Whatever the report does not account for is treated as unmapped.
    max_counts = host_map_info.loc[basename, input_column_by_pass[passn]]
    counts_unmapped = int(max_counts) - (counts_human + counts_others)

    return counts_human, counts_others, counts_unmapped

In [None]:
# Quick check of extract_read_info on a single kraken2 report (pass 0, mode 3).
counts_human, counts_others, counts_unmapped = extract_read_info(
    f'{RESULTS_DIR}/profiling/kraken2/pass0/ARTIFICIAL_mode3/ARTIFICIAL_mode3.report.standardised.species',
    'ARTIFICIAL',
    0,
)
counts_human, counts_others, counts_unmapped

In [None]:
# Start the profiling table from scratch: 'w' truncates any previous file and
# only the header line is written here.
with open(f'{RESULTS_DIR}/counts/profiling_counts.txt', 'w') as file:
    file.write('SAMPLE\tpass\tmode\tprofiler\tmapped_human\tmapped_others\tunmapped\n')

In [None]:
sample = 'ARTIFICIAL'

# File-name infix of the standardised species report produced for each profiler.
# An explicit table makes an unknown profiler fail loudly (KeyError) instead of
# silently reusing the path built for the previous profiler.
report_infix_by_profiler = {
    'kaiju': 'results',
    'krakenuniq': 'report',
    'kraken2': 'report',
    'centrifuge': 'kreport',
    'kmcp': 'profile',
    'ganon': 'rep',
}

for profiler in LIST_PROFILERS:
    report_infix = report_infix_by_profiler[profiler]
    for passn in [0, 2]:
        for mode in range(1, 10):
            taxpasta_file = f'{RESULTS_DIR}/profiling/{profiler}/pass{passn}/{sample}_mode{mode}/{sample}_mode{mode}.{report_infix}.standardised.species'

            # Not every profiler / pass / mode combination has a report; report and skip.
            if os.path.isfile(taxpasta_file):
                counts_human, counts_others, counts_unmapped = extract_read_info(taxpasta_file, sample, passn)

                with open(f'{RESULTS_DIR}/counts/profiling_counts.txt', 'a') as file:
                    file.write(f'{sample}\t{passn}\t{mode}\t{profiler}\t{counts_human}\t{counts_others}\t{counts_unmapped}\n')
            else:
                print(f"{taxpasta_file} not present!")


# How many reads are incorrectly mapped if we do not perform a host mapping step?

It has been reported that not mapping to human databases before profiling increases the number of reads assigned to other organisms. Here we check this with the *in silico* dataset, for which we know the true number of human reads.

In [None]:
# Table describing the in-silico dataset; the file is read without a header
# row and the three columns are named here.
artificial_taxid_counts = pd.read_csv(
    'table_artificial_taxid.csv',
    sep=';',
    names=['species', 'taxid', 'reads'],
)
# Halve the listed read numbers and truncate to integers.
# NOTE(review): presumably the table counts both mates of each pair — confirm.
artificial_taxid_counts['reads_true'] = artificial_taxid_counts['reads'].div(2).astype(int)

# The first row is taken as the human entry — assumes the table lists human first; verify.
n_true_human_reads = int(artificial_taxid_counts['reads_true'].iloc[0])
n_true_human_reads

In [None]:
# Reads removed by the two host-mapping rounds for the ARTIFICIAL sample.
artificial_host_counts = df_host_map_info.loc['ARTIFICIAL']
n_mapped_reads_1and2_maps = artificial_host_counts['1st_mapped'] + artificial_host_counts['2nd_mapped']

# Compare against the known number of human reads in the in-silico dataset.
print(f'There is a total of {n_mapped_reads_1and2_maps} reads mapped to human during the 1st and 2nd map, which represents around {100 * n_mapped_reads_1and2_maps/n_true_human_reads} %.')
print(f'There is a total of {n_true_human_reads - n_mapped_reads_1and2_maps} reads remaining to be mapped.')

In [None]:
df_host_profile_info = pd.read_csv(f'{RESULTS_DIR}/counts/profiling_counts.txt', sep='\t')
# Take an explicit copy: the columns added below must go into an independent
# frame, not into a filtered slice of df_host_profile_info (which triggers
# SettingWithCopyWarning and may not store the values reliably).
df_host_profile_info_artificial = df_host_profile_info[df_host_profile_info['SAMPLE'] == 'ARTIFICIAL'].copy()

# Reads already removed by host mapping only count for pass 2; pass 0 profiles the raw reads.
df_host_profile_info_artificial['mapped_human_1_2_maps'] = 0
df_host_profile_info_artificial.loc[df_host_profile_info_artificial['pass'] == 2, 'mapped_human_1_2_maps'] = n_mapped_reads_1and2_maps

df_host_profile_info_artificial['mapped_human_total'] = df_host_profile_info_artificial['mapped_human_1_2_maps'] + df_host_profile_info_artificial['mapped_human']
df_host_profile_info_artificial['total_reads'] = df_host_profile_info_artificial['mapped_human_total'] + df_host_profile_info_artificial['mapped_others'] + df_host_profile_info_artificial['unmapped']

# Observed proportions, relative to the total number of reads of each row.
df_host_profile_info_artificial['observed_human_prop'] = df_host_profile_info_artificial['mapped_human_total'] / df_host_profile_info_artificial['total_reads']
df_host_profile_info_artificial['observed_others_prop'] = df_host_profile_info_artificial['mapped_others'] / df_host_profile_info_artificial['total_reads']
df_host_profile_info_artificial['observed_unmapped_prop'] = df_host_profile_info_artificial['unmapped'] / df_host_profile_info_artificial['total_reads']

# Expected human share according to the dataset description (noted as 0.8 by the author).
df_host_profile_info_artificial['expected_human_prop'] = n_true_human_reads / artificial_taxid_counts['reads_true'].sum() # 0.8
# Human reads that were not recovered, and the part of the unmapped fraction they do not explain.
df_host_profile_info_artificial['calculated_unmapped_human_prop'] = df_host_profile_info_artificial['expected_human_prop'] - df_host_profile_info_artificial['observed_human_prop']
df_host_profile_info_artificial['calculated_unmapped_others_prop'] = df_host_profile_info_artificial['observed_unmapped_prop'] - df_host_profile_info_artificial['calculated_unmapped_human_prop']

df_host_profile_info_artificial[df_host_profile_info_artificial['profiler'] == 'kaiju']


In [None]:
# TODO: think about how to plot this