# Extract HiFi QC Data<a class="tocSkip">

**This notebook reads the outputs of the NTSM and ReadStats WDLs (stored in the workspace data tables) as part of the HiFi QC process.**

**Below are the steps taken in this notebook:**
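1. Import Statements & Global Variable Definitions
2. Extract NTSM Data (read the `ntsm` data table, then each NTSM output file)
3. Extract ReadStats Data (read the `readstats` data table, then each ReadStats report)
4. Sum coverage by sample and list the samples below the coverage threshold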

# Import Statements & Global Variable Definitions

## Installs

In [1]:
%%capture
%pip install gcsfs
## NOTE: %%capture is a cell magic and must be the very first line of the cell,
## which is why these comments sit below the code instead of above it.
## gcsfs is installed for reading CSVs stored in Google Cloud (without downloading them first).
## The kernel may need to be restarted after the install.

In [2]:
%%capture
%pip install --upgrade --no-cache-dir --force-reinstall terra-pandas
%pip install --upgrade --no-cache-dir  --force-reinstall git+https://github.com/DataBiosphere/terra-notebook-utils
## NOTE: %%capture is a cell magic and must be the very first line of the cell,
## which is why these comments sit below the code instead of above it.
## These packages are installed for reading/writing data tables as pandas data frames.
## The kernel may need to be restarted after the install.

## Import Statements

In [3]:
from firecloud import fiss
import pandas as pd 
import numpy as np
import terra_pandas as tp
import os                 
import subprocess       
import re                 
import io
import gcsfs

from typing import Any, Callable, List, Optional
from terra_notebook_utils import table, WORKSPACE_NAME, WORKSPACE_GOOGLE_PROJECT


## Global Variable Declarations

In [4]:
# Get the Google billing project name, workspace name and storage bucket for the current workspace
PROJECT = os.environ['WORKSPACE_NAMESPACE']

# Prefer the WORKSPACE_NAME environment variable when the runtime provides it.
# Deriving the name from the working directory only gives the right answer while
# the notebook runs from its default location, so it is kept purely as a fallback.
WORKSPACE = os.environ.get('WORKSPACE_NAME') or os.path.basename(os.path.dirname(os.getcwd()))

# Always end the bucket path with exactly one "/" so file names can be appended directly
bucket = os.environ['WORKSPACE_BUCKET'].rstrip("/") + "/"


# Verify that we've captured the environment variables
print("Billing project: " + PROJECT)
print("Workspace: " + WORKSPACE)
print("Workspace storage bucket: " + bucket)

Billing project: human-pangenome-ucsc
Workspace: HPRC_WRANGLING_WUSTL_HPRC_HiFi_Year2
Workspace storage bucket: gs://fc-089d894b-6682-450d-8031-f17732e5ece5/


# Extract NTSM Data

## Read in NTSM Data Table

In [5]:
## Load the "ntsm" data table of this workspace into a data frame
ntsm_df = tp.table_to_dataframe(
    "ntsm",
    workspace_namespace=PROJECT,
    workspace=WORKSPACE,
)

## Preview the first rows
ntsm_df.head()

Unnamed: 0_level_0,ntsv_count_2,read_2_fastq,read_1_fastq,sample,ntsv_count_1,hifi,ntsm_eval_out
ntsm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00423,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...
1,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00423,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...
10,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00609,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...
11,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00609,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...
12,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,gs://fc-47de7dae-e8e6-429c-b760-b4ba49136eee/1...,HG00738,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...


## Read NTSM Output & Write To DataFrame

In [44]:
## Add the columns that will hold the values parsed from each NTSM output file.
## 'result' stores text, so it is created with object dtype: filling a float
## NaN column with strings is an incompatible-dtype assignment that recent
## pandas versions deprecate.
ntsm_df['ntsm_score'] = np.nan
ntsm_df['result']     = pd.Series(np.nan, index=ntsm_df.index, dtype=object)

for index, row in ntsm_df.iterrows():

    ## Each NTSM output is read as a headerless, tab-separated table
    sample_ntsm_fp = row['ntsm_eval_out']
    sample_ntsm_df = pd.read_csv(sample_ntsm_fp, header=None, sep='\t')

    ## First row of the file: third column is stored as the score,
    ## fourth column (as text) is stored as the result
    ntsm_df.loc[index, 'ntsm_score'] = sample_ntsm_df.iloc[0, 2]
    ntsm_df.loc[index, 'result'] = str(sample_ntsm_df.iloc[0, 3])



In [42]:
## Number of rows whose NTSM result is anything other than 'Similar' (should be 0)
(ntsm_df['result'] != 'Similar').sum()

0

# Extract ReadStats Data

## Read in ReadStats Data Table

In [43]:
## Load the "readstats" data table of this workspace into a data frame
readstats_df = tp.table_to_dataframe(
    "readstats",
    workspace_namespace=PROJECT,
    workspace=WORKSPACE,
)

## Preview the first rows
readstats_df.head()

Unnamed: 0_level_0,ReadStatsTarball,hifi,ReadStatsReport,sample
readstats_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,HG00423
1,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,HG00423
10,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,HG00609
11,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,HG00609
12,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-089d894b-6682-450d-8031-f17732e5ece5/s...,HG00738


## Read ReadStats Output & Write To DataFrame

In [51]:
## Divisor used to turn total sequenced Gbp into fold coverage.
## NOTE(review): presumably the approximate human genome size in Gbp -- confirm
## this is the intended reference size.
GENOME_SIZE_GBP = 3.1

readstats_df['output'] = np.nan

for index, row in readstats_df.iterrows():

    ## Each ReadStats report is read as a headerless, tab-separated table
    sample_readstats_fp = row['ReadStatsReport']
    sample_readstats_df = pd.read_csv(sample_readstats_fp, header=None, sep='\t')

    ## Just look at sample-level metrics
    sample_readstats_df = sample_readstats_df[sample_readstats_df[0] == 'sample.fastq']

    ## Get rid of extra row (the first 'sample.fastq' row is dropped)
    sample_readstats_df = sample_readstats_df.iloc[1:, :]

    ## Pull the total_Gbp value; fail with the offending path rather than a bare
    ## IndexError when the report does not contain that metric
    sample_coverage = sample_readstats_df.loc[sample_readstats_df[1] == 'total_Gbp', 2]
    if sample_coverage.empty:
        raise ValueError("No 'total_Gbp' row for 'sample.fastq' in " + str(sample_readstats_fp))
    readstats_df.loc[index, 'output'] = float(sample_coverage.iloc[0])


readstats_df['coverage'] = readstats_df['output'] / GENOME_SIZE_GBP

In [56]:
## Only the last expression of a cell is rendered, so the shape has to be
## printed explicitly or it is silently discarded.
print(readstats_df.shape)

## Number of distinct samples (rendered as the cell output)
len(readstats_df['sample'].unique())

25

In [55]:
# Add up the coverage of every row belonging to a sample and report the
# samples whose total is below 35
for sample in readstats_df['sample'].unique():
    belongs_to_sample = readstats_df['sample'] == sample
    total_coverage = readstats_df.loc[belongs_to_sample, 'coverage'].sum()
    if not total_coverage < 35:
        continue
    print(sample, round(total_coverage, 2))
# NOTE: this is expected to print nothing, but currently reports 16 samples

HG00609 34.46
HG00738 34.06
HG01099 32.71
HG01255 32.52
HG01496 33.37
HG01934 32.09
HG01943 31.76
HG01981 34.0
HG01993 34.08
HG02280 32.41
HG02523 31.58
HG02602 32.55
HG02615 31.93
HG02698 31.44
HG03710 32.76
HG03831 30.86


# TODO: put these in a .csv