# Extract HiFi QC Data<a class="tocSkip">

**This notebook reads in data from NTSM and ReadStats WDLS (stored in data tables). This is part of the HiFi QC process.**

**Below are the steps taken in this notebook:**
1. Import Statements & Global Variable Definitions
2. Define Functions
3. Read In Sample Names
4. Create Dataframe Of Files
5. Write data frame to data tables

# Import Statements & Global Variable Definitions

## Installs

In [1]:
%%capture
%pip install gcsfs
## capture CANNOT have comments above it
## For reading CSVs stored in Google Cloud (without downloading them first)
## May need to restart kernel after install 

In [2]:
%%capture
%pip install --upgrade --no-cache-dir --force-reinstall terra-pandas
%pip install --upgrade --no-cache-dir  --force-reinstall git+https://github.com/DataBiosphere/terra-notebook-utils
## For reading/writing data tables into pandas data frames
## May need to restart kernel after install 

## Import Statements

In [3]:
from firecloud import fiss
import pandas as pd 
import numpy as np
import terra_pandas as tp
import os                 
import subprocess       
import re                 
import io
import gcsfs

from typing import Any, Callable, List, Optional
from terra_notebook_utils import table, WORKSPACE_NAME, WORKSPACE_GOOGLE_PROJECT


## Global Variable Declarations

In [4]:
# Get the Google billing project name and workspace name for current workspace
PROJECT = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE =os.path.basename(os.path.dirname(os.getcwd()))
bucket = os.environ['WORKSPACE_BUCKET'] + "/"

# Verify that we've captured the environment variables
print("Billing project: " + PROJECT)
print("Workspace: " + WORKSPACE)
print("Workspace storage bucket: " + bucket)

Billing project: human-pangenome-ucsc
Workspace: HPRC_WRANGLING_WUSTL_HPRC_HiFi_Year1
Workspace storage bucket: gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/


# Extract NTSM Data

## Read in NTSM Data Table

In [5]:
ntsm_df = tp.table_to_dataframe("ntsm", workspace=WORKSPACE, workspace_namespace=PROJECT)

ntsm_df.head()

Unnamed: 0_level_0,ntsv_count_2,sample,ntsv_count_1,hifi,1000g_cram,ntsm_eval_out
ntsm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00438,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/C...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...
1,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00438,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/C...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...
10,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00673,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/C...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...
11,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00673,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/C...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...
12,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00735,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-56ac46ea-efc4-4683-b6d5-6d95bed41c5e/C...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...


## Read NTSM Output & Write To DataFrame

In [6]:
ntsm_df['ntsm_score'] = np.nan
ntsm_df['result']     = np.nan

for index, row in ntsm_df.iterrows():

        sample_ntsm_fp = row['ntsm_eval_out']
        sample_ntsm_df = pd.read_csv(sample_ntsm_fp, header=None, sep='\t')

        ntsm_df.loc[index,'ntsm_score'] = sample_ntsm_df.iloc[0][2]
        ntsm_df.loc[index,'result'] = sample_ntsm_df[3].astype('str')[0]



In [7]:
## How many rows don't match??? (Should be 0)
sum(ntsm_df['result'] != 'Similar')

0

# Extract ReadStats Data

## Read in ReadStats Data Table

In [8]:
readstats_df = tp.table_to_dataframe("readstats", workspace=WORKSPACE, workspace_namespace=PROJECT)

readstats_df.head()

Unnamed: 0_level_0,ReadStatsTarball,hifi,ReadStatsReport,sample
readstats_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00438
1,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00438
10,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00673
11,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00673
12,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/s...,gs://fc-888bd569-6e57-439d-b69d-943f184cf8cb/s...,HG00735


## Read ReadStats Output & Write To DataFrame

In [9]:
readstats_df['output']   = np.nan

for index, row in readstats_df.iterrows():

        sample_readstats_fp = row['ReadStatsReport']
        #sample_readstats_fn = os.path.basename(sample_readstats_fp)

        #! gsutil cp {sample_readstats_fp} .
        
        sample_readstats_df = pd.read_csv(sample_readstats_fp, header=None, sep='\t')

        ## Just look at sample-level metrics
        sample_readstats_df = sample_readstats_df[sample_readstats_df[0]=='sample.fastq']

        ## Get rid of extra row
        sample_readstats_df = sample_readstats_df.iloc[1: , :]


        sample_coverage = sample_readstats_df[sample_readstats_df[1] == 'total_Gbp'][2]
        readstats_df.loc[index,'output'] = float(sample_coverage.values[0])

        
readstats_df['coverage'] = readstats_df['output']/3.1

In [10]:
readstats_df.shape
len(readstats_df['sample'].unique())

20

In [11]:
# sum coverage by sample
for sample in (readstats_df['sample'].unique()):
    total_coverage = readstats_df.loc[readstats_df['sample'] == sample, 'coverage'].sum()
    if total_coverage < 35:
        print(sample, round(total_coverage,2))
# this should output nothing and does