# Genomic Data Processing Pipeline
This notebook processes genomic VCF data using Hail for machine learning preparation.
## Setup and Requirements
- Hail
- Pandas
- Python 3.9+
## Pipeline Overview
1. Data Loading
2. Quality Control
3. Feature Extraction
## 1. Initial Setup
First, we import the project helpers (Hail initialization, VCF loading, and logging), start Hail, and load the VCF file.

## Initialize Hail for local processing

In [None]:
import os

from src.utils.hail_utils import initialize_hail
from src.preprocessing.vcf_loader import load_vcf
from src.utils.logging_utils import log_step

# Start the Hail session through the project helper before any Hail work.
initialize_hail()

# Location of the input VCF.
# Set the GENOME_VCF_PATH environment variable to point at your own copy of
# the file. The fallback is the path this notebook was originally run with,
# kept so that existing runs behave exactly as before.
DEFAULT_VCF_PATH = r'C:\Users\jmcdonald\Desktop\Projects\The Genome Project\gnomad.exomes.v4.1.sites.chr13.vcf.bgz'
vcf_path = os.environ.get('GENOME_VCF_PATH', DEFAULT_VCF_PATH)

# Load the VCF through the project loader; later cells refer to the result as `mt`.
mt = load_vcf(vcf_path)

# Continue with analysis...

Initializing Hail with default parameters...
Running on Apache Spark version 3.5.4
SparkUI available at http://LT2058-20241028.SHTC.com:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.133-4c60fddb171a
LOGGING: writing to c:\Users\jmcdonald\source\PersonalRepos\genome-analysis-ml\notebooks\hail-20250114-1513-0.2.133-4c60fddb171a.log


Hail is already initialized


In [16]:
# NOTE: Every statement in this cell is commented out; the output rendered
# beneath it is left over from an earlier run in which the code was active.
# The cell checks that the VCF exists, prints its size, imports it with
# hl.import_vcf, and retries with an explicit header_file if that fails.
# NOTE(review): the code uses `os` and `hl`, neither of which is imported in
# the setup cell shown above — confirm they are imported before re-enabling.
# # Configure your VCF file path
# vcf_path = r'C:\Users\jmcdonald\Desktop\Projects\The Genome Project\gnomad.exomes.v4.1.sites.chr13.vcf.bgz'
# print(f"Checking if file exists: {os.path.exists(vcf_path)}")

# # Check file existence and size
# print(f"Checking file details:")
# print(f"File exists: {os.path.exists(vcf_path)}")
# if os.path.exists(vcf_path):
#     print(f"File size: {os.path.getsize(vcf_path) / (1024*1024):.2f} MB")

# # Try loading with force_bgz and debug flags
# log_step("Loading VCF file with debug options...")
# try:
#     mt = hl.import_vcf(
#         vcf_path,
#         force_bgz=True,
#         reference_genome='GRCh38',
#         skip_invalid_loci=True,
#         min_partitions=8
#     )
    
#     # Check if loading worked
#     print("\nMatrix table schema:")
#     mt.describe()
    
#     # Try counting variants
#     n_variants = mt.count()
#     print(f"\nLoaded variants: {n_variants[0]:,}")
#     print(f"Number of samples: {n_variants[1]:,}")
    
# except Exception as e:
#     print(f"Error loading VCF: {e}")
    
#     # If that fails, try with different options
#     print("\nTrying alternative loading options...")
#     try:
#         mt = hl.import_vcf(
#             vcf_path,
#             force_bgz=True,
#             reference_genome='GRCh38',
#             skip_invalid_loci=True,
#             header_file=vcf_path  # Explicitly specify header file
#         )
#         print("Alternative loading succeeded")
#         n_variants = mt.count()
#         print(f"Loaded variants: {n_variants[0]:,}")
#     except Exception as e2:
#         print(f"Alternative loading also failed: {e2}")

Checking if file exists: True
Checking file details:
File exists: True
File size: 3376.23 MB

[13:03:00] Loading VCF file with debug options...

Matrix table schema:
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AN: int32, 
        AF: array<float64>, 
        grpmax: array<str>, 
        fafmax_faf95_max: array<float64>, 
        fafmax_faf95_max_gen_anc: array<str>, 
        AC_XX: array<int32>, 
        AF_XX: array<float64>, 
        AN_XX: int32, 
        nhomalt_XX: array<int32>, 
        AC_XY: array<int32>, 
        AF_XY: array<float64>, 
        AN_XY: int32, 
        nhomalt_XY: array<int32>, 
        nhomalt: array<int32>, 
        AC_afr_XX: array<int32>, 
        AF_a

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,
info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info
locus,alleles,rsid,qual,filters,AC,AN,AF,grpmax,fafmax_faf95_max,fafmax_faf95_max_gen_anc,AC_XX,AF_XX,AN_XX,nhomalt_XX,AC_XY,AF_XY,AN_XY,nhomalt_XY,nhomalt,AC_afr_XX,AF_afr_XX,AN_afr_XX,nhomalt_afr_XX,AC_afr_XY,AF_afr_XY,AN_afr_XY,nhomalt_afr_XY,AC_afr,AF_afr,AN_afr,nhomalt_afr,AC_amr_XX,AF_amr_XX,AN_amr_XX,nhomalt_amr_XX,AC_amr_XY,AF_amr_XY,AN_amr_XY,nhomalt_amr_XY,AC_amr,AF_amr,AN_amr,nhomalt_amr,AC_asj_XX,AF_asj_XX,AN_asj_XX,nhomalt_asj_XX,AC_asj_XY,AF_asj_XY,AN_asj_XY,nhomalt_asj_XY,AC_asj,AF_asj,AN_asj,nhomalt_asj,AC_eas_XX,AF_eas_XX,AN_eas_XX,nhomalt_eas_XX,AC_eas_XY,AF_eas_XY,AN_eas_XY,nhomalt_eas_XY,AC_eas,AF_eas,AN_eas,nhomalt_eas,AC_fin_XX,AF_fin_XX,AN_fin_XX,nhomalt_fin_XX,AC_fin_XY,AF_fin_XY,AN_fin_XY,nhomalt_fin_XY,AC_fin,AF_fin,AN_fin,nhomalt_fin,AC_mid_XX,AF_mid_XX,AN_mid_XX,nhomalt_mid_XX,AC_mid_XY,AF_mid_XY,AN_mid_XY,nhomalt_mid_XY,AC_mid,AF_mid,AN_mid,nhomalt_mid,AC_nfe_XX,AF_nfe_XX,AN_nfe_XX,nhomalt_nfe_XX,AC_nfe_XY,AF_nfe_XY,AN_nfe_XY,nhomalt_nfe_XY,AC_nfe,AF_nfe,AN_nfe,nhomalt_nfe,AC_non_ukb_XX,AF_non_ukb_XX,AN_non_ukb_XX,nhomalt_non_ukb_XX,AC_non_ukb_XY,AF_non_ukb_XY,AN_non_ukb_XY,nhomalt_non_ukb_XY,AC_non_ukb,AF_non_ukb,AN_non_ukb,nhomalt_non_ukb,AC_non_ukb_afr_XX,AF_non_ukb_afr_XX,AN_non_ukb_afr_XX,nhomalt_non_ukb_afr_XX,AC_non_ukb_afr_XY,AF_non_ukb_afr_XY,AN_non_ukb_afr_XY,nhomalt_non_ukb_afr_XY,AC_non_ukb_afr,AF_non_ukb_afr,AN_non_ukb_afr,nhomalt_non_ukb_afr,AC_non_ukb_amr_XX,AF_non_ukb_amr_XX,AN_non_ukb_amr_XX,nhomalt_non_ukb_amr_XX,AC_non_ukb_amr_XY,AF_non_ukb_amr_XY,AN_non_ukb_amr_XY,nhomalt_non_ukb_amr_XY,AC_non_ukb_amr,AF_non_ukb_amr,AN_non_ukb_amr,nhomalt_non_ukb_amr,AC_non_ukb_asj_XX,AF_non_ukb_asj_XX,AN_non_ukb_asj_XX,nhomalt_non_ukb_asj_XX,AC_non_ukb_asj_XY,AF_non_ukb_asj_XY,AN_non_ukb_asj_XY,nhomalt_non_ukb_asj_XY,AC_non_ukb_asj,AF_non_ukb_asj,AN_non_ukb_asj,nhomalt_non_ukb_asj,AC_non_ukb_eas_XX,AF_non_ukb_eas_XX,AN_non_ukb_eas_XX,nhomalt_non_ukb_eas_XX,AC_non_ukb_eas_XY,AF_non_ukb_eas_XY,AN_non_ukb_eas_XY,nhomalt_non_ukb_eas_XY,A
C_non_ukb_eas,AF_non_ukb_eas,AN_non_ukb_eas,nhomalt_non_ukb_eas,AC_non_ukb_fin_XX,AF_non_ukb_fin_XX,AN_non_ukb_fin_XX,nhomalt_non_ukb_fin_XX,AC_non_ukb_fin_XY,AF_non_ukb_fin_XY,AN_non_ukb_fin_XY,nhomalt_non_ukb_fin_XY,AC_non_ukb_fin,AF_non_ukb_fin,AN_non_ukb_fin,nhomalt_non_ukb_fin,AC_non_ukb_mid_XX,AF_non_ukb_mid_XX,AN_non_ukb_mid_XX,nhomalt_non_ukb_mid_XX,AC_non_ukb_mid_XY,AF_non_ukb_mid_XY,AN_non_ukb_mid_XY,nhomalt_non_ukb_mid_XY,AC_non_ukb_mid,AF_non_ukb_mid,AN_non_ukb_mid,nhomalt_non_ukb_mid,AC_non_ukb_nfe_XX,AF_non_ukb_nfe_XX,AN_non_ukb_nfe_XX,nhomalt_non_ukb_nfe_XX,AC_non_ukb_nfe_XY,AF_non_ukb_nfe_XY,AN_non_ukb_nfe_XY,nhomalt_non_ukb_nfe_XY,AC_non_ukb_nfe,AF_non_ukb_nfe,AN_non_ukb_nfe,nhomalt_non_ukb_nfe,AC_non_ukb_raw,AF_non_ukb_raw,AN_non_ukb_raw,nhomalt_non_ukb_raw,AC_non_ukb_remaining_XX,AF_non_ukb_remaining_XX,AN_non_ukb_remaining_XX,nhomalt_non_ukb_remaining_XX,AC_non_ukb_remaining_XY,AF_non_ukb_remaining_XY,AN_non_ukb_remaining_XY,nhomalt_non_ukb_remaining_XY,AC_non_ukb_remaining,AF_non_ukb_remaining,AN_non_ukb_remaining,nhomalt_non_ukb_remaining,AC_non_ukb_sas_XX,AF_non_ukb_sas_XX,AN_non_ukb_sas_XX,nhomalt_non_ukb_sas_XX,AC_non_ukb_sas_XY,AF_non_ukb_sas_XY,AN_non_ukb_sas_XY,nhomalt_non_ukb_sas_XY,AC_non_ukb_sas,AF_non_ukb_sas,AN_non_ukb_sas,nhomalt_non_ukb_sas,AC_raw,AF_raw,AN_raw,nhomalt_raw,AC_remaining_XX,AF_remaining_XX,AN_remaining_XX,nhomalt_remaining_XX,AC_remaining_XY,AF_remaining_XY,AN_remaining_XY,nhomalt_remaining_XY,AC_remaining,AF_remaining,AN_remaining,nhomalt_remaining,AC_sas_XX,AF_sas_XX,AN_sas_XX,nhomalt_sas_XX,AC_sas_XY,AF_sas_XY,AN_sas_XY,nhomalt_sas_XY,AC_sas,AF_sas,AN_sas,nhomalt_sas,AC_grpmax,AF_grpmax,AN_grpmax,nhomalt_grpmax,grpmax_non_ukb,AC_grpmax_non_ukb,AF_grpmax_non_ukb,AN_grpmax_non_ukb,nhomalt_grpmax_non_ukb,faf95_XX,faf95_XY,faf95,faf95_afr_XX,faf95_afr_XY,faf95_afr,faf95_amr_XX,faf95_amr_XY,faf95_amr,faf95_eas_XX,faf95_eas_XY,faf95_eas,faf95_mid_XX,faf95_mid_XY,faf95_mid,faf95_nfe_XX,faf95_nfe_XY,faf95_nfe,faf95_non_uk
b_XX,faf95_non_ukb_XY,faf95_non_ukb,faf95_non_ukb_afr_XX,faf95_non_ukb_afr_XY,faf95_non_ukb_afr,faf95_non_ukb_amr_XX,faf95_non_ukb_amr_XY,faf95_non_ukb_amr,faf95_non_ukb_eas_XX,faf95_non_ukb_eas_XY,faf95_non_ukb_eas,faf95_non_ukb_mid_XX,faf95_non_ukb_mid_XY,faf95_non_ukb_mid,faf95_non_ukb_nfe_XX,faf95_non_ukb_nfe_XY,faf95_non_ukb_nfe,faf95_non_ukb_sas_XX,faf95_non_ukb_sas_XY,faf95_non_ukb_sas,faf95_sas_XX,faf95_sas_XY,faf95_sas,faf99_XX,faf99_XY,faf99,faf99_afr_XX,faf99_afr_XY,faf99_afr,faf99_amr_XX,faf99_amr_XY,faf99_amr,faf99_eas_XX,faf99_eas_XY,faf99_eas,faf99_mid_XX,faf99_mid_XY,faf99_mid,faf99_nfe_XX,faf99_nfe_XY,faf99_nfe,faf99_non_ukb_XX,faf99_non_ukb_XY,faf99_non_ukb,faf99_non_ukb_afr_XX,faf99_non_ukb_afr_XY,faf99_non_ukb_afr,faf99_non_ukb_amr_XX,faf99_non_ukb_amr_XY,faf99_non_ukb_amr,faf99_non_ukb_eas_XX,faf99_non_ukb_eas_XY,faf99_non_ukb_eas,faf99_non_ukb_mid_XX,faf99_non_ukb_mid_XY,faf99_non_ukb_mid,faf99_non_ukb_nfe_XX,faf99_non_ukb_nfe_XY,faf99_non_ukb_nfe,faf99_non_ukb_sas_XX,faf99_non_ukb_sas_XY,faf99_non_ukb_sas,faf99_sas_XX,faf99_sas_XY,faf99_sas,fafmax_faf99_max,fafmax_faf99_max_gen_anc,fafmax_faf95_max_non_ukb,fafmax_faf95_max_gen_anc_non_ukb,fafmax_faf99_max_non_ukb,fafmax_faf99_max_gen_anc_non_ukb,age_hist_het_bin_freq,age_hist_het_n_smaller,age_hist_het_n_larger,age_hist_hom_bin_freq,age_hist_hom_n_smaller,age_hist_hom_n_larger,FS,MQ,MQRankSum,QUALapprox,QD,ReadPosRankSum,SOR,VarDP,monoallelic,only_het,transmitted_singleton,sibling_singleton,AS_FS,AS_MQ,AS_MQRankSum,AS_pab_max,AS_QUALapprox,AS_QD,AS_ReadPosRankSum,AS_SB_TABLE,AS_SOR,AS_VarDP,inbreeding_coeff,AS_culprit,AS_VQSLOD,negative_train_site,positive_train_site,allele_type,n_alt_alleles,variant_type,was_mixed,lcr,non_par,segdup,fail_interval_qc,outside_ukb_capture_region,outside_broad_capture_region,gq_hist_alt_bin_freq,gq_hist_all_bin_freq,dp_hist_alt_bin_freq,dp_hist_alt_n_larger,dp_hist_all_bin_freq,dp_hist_all_n_larger,ab_hist_alt_bin_freq,cadd_raw_score,cadd_phred,revel_max,spliceai
_ds_max,pangolin_largest_ds,phylop,sift_max,polyphen_max,VRS_Allele_IDs,VRS_Starts,VRS_Ends,VRS_States,vep
locus<GRCh38>,array<str>,str,float64,set<str>,array<int32>,int32,array<float64>,array<str>,array<float64>,array<str>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,arr
ay<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,int32,array<int32>,array<int32>,array<float64>,array<int32>,array<int32>,array<str>,array<int32>,array<float64>,array<int32>,array<int32>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>
,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<str>,array<float64>,array<str>,array<float64>,array<str>,array<str>,array<int32>,array<int32>,array<str>,array<int32>,array<int32>,float64,float64,float64,int32,float64,float64,float64,int32,bool,bool,bool,bool,array<float64>,array<float64>,array<float64>,array<float64>,array<int32>,array<float64>,array<float64>,array<str>,array<float64>,array<int32>,array<float64>,array<str>,array<float64>,bool,bool,str,int32,str,bool,bool,bool,bool,bool,bool,bool,array<str>,array<str>,array<str>,array<int32>,array<str>,array<int32>,array<str>,float64,float64,float64,float64,float64,float64,float64,float64,array<str>,array<int32>,array<int32>,array<str>,array<str>
chr13:18173860,"[""A"",""C""]",,,"{""AS_VQSR""}",[1],875186,[1.14e-06],"[""nfe""]",,,[1],[2.34e-06],426726,[0],[0],[0.00e+00],448460,[0],[0],[0],[0.00e+00],11142,[0],[0],[0.00e+00],8692,[0],[0],[0.00e+00],19834,[0],[0],[0.00e+00],14468,[0],[0],[0.00e+00],11974,[0],[0],[0.00e+00],26442,[0],[0],[0.00e+00],9156,[0],[0],[0.00e+00],10256,[0],[0],[0.00e+00],19412,[0],[0],[0.00e+00],16930,[0],[0],[0.00e+00],16202,[0],[0],[0.00e+00],33132,[0],[0],[0.00e+00],18970,[0],[0],[0.00e+00],17246,[0],[0],[0.00e+00],36216,[0],[0],[0.00e+00],1156,[0],[0],[0.00e+00],1668,[0],[0],[0.00e+00],2824,[0],[1],[3.13e-06],319082,[0],[0],[0.00e+00],314692,[0],[1],[1.58e-06],633774,[0],[0],[0.00e+00],232774,[0],[0],[0.00e+00],274308,[0],[0],[0.00e+00],507082,[0],[0],[0.00e+00],7634,[0],[0],[0.00e+00],5888,[0],[0],[0.00e+00],13522,[0],[0],[0.00e+00],14228,[0],[0],[0.00e+00],11814,[0],[0],[0.00e+00],26042,[0],[0],[0.00e+00],8040,[0],[0],[0.00e+00],9250,[0],[0],[0.00e+00],17290,[0],[0],[0.00e+00],15952,[0],[0],[0.00e+00],15684,[0],[0],[0.00e+00],31636,[0],[0],[0.00e+00],18864,[0],[0],[0.00e+00],17232,[0],[0],[0.00e+00],36096,[0],[0],[0.00e+00],924,[0],[0],[0.00e+00],1214,[0],[0],[0.00e+00],2138,[0],[0],[0.00e+00],141082,[0],[0],[0.00e+00],155202,[0],[0],[0.00e+00],296284,[0],[1],[1.59e-06],628780,[0],[0],[0.00e+00],14336,[0],[0],[0.00e+00],13612,[0],[0],[0.00e+00],27948,[0],[0],[0.00e+00],11714,[0],[0],[0.00e+00],44412,[0],[0],[0.00e+00],56126,[0],[4],[2.90e-06],1380426,[0],[0],[0.00e+00],20742,[0],[0],[0.00e+00],19184,[0],[0],[0.00e+00],39926,[0],[0],[0.00e+00],15080,[0],[0],[0.00e+00],48546,[0],[0],[0.00e+00],63626,[0],[1],[1.58e-06],[633774],[0],,,,,,,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,
,[0.00e+00],,,,,,,"[""0|0|0|0|0|1|0|0|0|0""]",[0],[0],"[""0|0|0|0|0|0|0|0|0|0""]",[0],[0],0.0,32.0,-0.253,369,7.85,0.619,0.299,47,False,False,False,False,[0.00e+00],[2.87e+01],[-2.53e-01],[1.00e+00],[159],[5.48e+00],[9.67e-01],"[""32"",""2|8"",""0""]",[5.12e-01],[29],[-2.90e-06],"[""AS_MQRankSum""]",[-3.12e+00],True,False,"""snv""",3,"""multi-snv""",False,False,False,True,False,True,True,"[""0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0""]","[""0|0|0|0|26958|0|110943|0|268219|0|18209|1|13263|0|0|0|0|0|0|0""]","[""0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",[0],"[""0|0|134723|73328|39075|27443|23598|21588|19794|17322|14397|12204|9984|8450|6907|5922|4821|3863|3199|2831""]",[8144],"[""0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",0.498,6.49,,,,0.776,,,"[""ga4gh:VA.uK1rN4IzDI0Zvh80br25DeDGBN-anEAz"",""ga4gh:VA.cwoNG1rC6KFqovc5ui-MtNKiwGolUNME""]","[18173859,18173859]","[18173860,18173860]","[""A"",""C""]","[""C|downstream_gene_variant|MODIFIER||ENSG00000279924|Transcript|ENST00000623078|unprocessed_pseudogene||||||||||1|150|-1||SNV|||YES||||||||Ensembl|||||||||||||""]"
chr13:18173860,"[""A"",""T""]",,,"{""AC0"",""AS_VQSR""}",[0],875186,[0.00e+00],,,,[0],[0.00e+00],426726,[0],[0],[0.00e+00],448460,[0],[0],[0],[0.00e+00],11142,[0],[0],[0.00e+00],8692,[0],[0],[0.00e+00],19834,[0],[0],[0.00e+00],14468,[0],[0],[0.00e+00],11974,[0],[0],[0.00e+00],26442,[0],[0],[0.00e+00],9156,[0],[0],[0.00e+00],10256,[0],[0],[0.00e+00],19412,[0],[0],[0.00e+00],16930,[0],[0],[0.00e+00],16202,[0],[0],[0.00e+00],33132,[0],[0],[0.00e+00],18970,[0],[0],[0.00e+00],17246,[0],[0],[0.00e+00],36216,[0],[0],[0.00e+00],1156,[0],[0],[0.00e+00],1668,[0],[0],[0.00e+00],2824,[0],[0],[0.00e+00],319082,[0],[0],[0.00e+00],314692,[0],[0],[0.00e+00],633774,[0],[0],[0.00e+00],232774,[0],[0],[0.00e+00],274308,[0],[0],[0.00e+00],507082,[0],[0],[0.00e+00],7634,[0],[0],[0.00e+00],5888,[0],[0],[0.00e+00],13522,[0],[0],[0.00e+00],14228,[0],[0],[0.00e+00],11814,[0],[0],[0.00e+00],26042,[0],[0],[0.00e+00],8040,[0],[0],[0.00e+00],9250,[0],[0],[0.00e+00],17290,[0],[0],[0.00e+00],15952,[0],[0],[0.00e+00],15684,[0],[0],[0.00e+00],31636,[0],[0],[0.00e+00],18864,[0],[0],[0.00e+00],17232,[0],[0],[0.00e+00],36096,[0],[0],[0.00e+00],924,[0],[0],[0.00e+00],1214,[0],[0],[0.00e+00],2138,[0],[0],[0.00e+00],141082,[0],[0],[0.00e+00],155202,[0],[0],[0.00e+00],296284,[0],[0],[0.00e+00],628780,[0],[0],[0.00e+00],14336,[0],[0],[0.00e+00],13612,[0],[0],[0.00e+00],27948,[0],[0],[0.00e+00],11714,[0],[0],[0.00e+00],44412,[0],[0],[0.00e+00],56126,[0],[2],[1.45e-06],1380426,[0],[0],[0.00e+00],20742,[0],[0],[0.00e+00],19184,[0],[0],[0.00e+00],39926,[0],[0],[0.00e+00],15080,[0],[0],[0.00e+00],48546,[0],[0],[0.00e+00],63626,[0],,,,,,,,,,,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,,,,,"[""0|0|0
|0|0|0|0|0|0|0""]",[0],[0],"[""0|0|0|0|0|0|0|0|0|0""]",[0],[0],0.0,32.0,-0.253,369,7.85,0.619,0.299,47,False,False,False,False,[0.00e+00],[2.94e+01],[0.00e+00],[1.00e+00],[185],[1.85e+01],[-2.10e-01],"[""32"",""2|3"",""0""]",[1.24e-01],[10],[-1.45e-06],"[""AS_MQ""]",[-2.01e+00],True,False,"""snv""",3,"""multi-snv""",False,False,False,True,False,True,True,"[""0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]","[""0|0|0|0|26958|0|110943|0|268219|0|18209|1|13263|0|0|0|0|0|0|0""]","[""0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",[0],"[""0|0|134723|73328|39075|27443|23598|21588|19794|17322|14397|12204|9984|8450|6907|5922|4821|3863|3199|2831""]",[8144],"[""0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",0.485,6.34,,,,0.776,,,"[""ga4gh:VA.uK1rN4IzDI0Zvh80br25DeDGBN-anEAz"",""ga4gh:VA.p7PSy61y9TItWInofKjCWHPVJHB4VBBz""]","[18173859,18173859]","[18173860,18173860]","[""A"",""T""]","[""T|downstream_gene_variant|MODIFIER||ENSG00000279924|Transcript|ENST00000623078|unprocessed_pseudogene||||||||||1|150|-1||SNV|||YES||||||||Ensembl|||||||||||||""]"
chr13:18173861,"[""A"",""C""]","""rs1872524921""",,"{""AC0"",""AS_VQSR""}",[0],885180,[0.00e+00],,,,[0],[0.00e+00],432102,[0],[0],[0.00e+00],453078,[0],[0],[0],[0.00e+00],11240,[0],[0],[0.00e+00],8748,[0],[0],[0.00e+00],19988,[0],[0],[0.00e+00],14550,[0],[0],[0.00e+00],12018,[0],[0],[0.00e+00],26568,[0],[0],[0.00e+00],9214,[0],[0],[0.00e+00],10320,[0],[0],[0.00e+00],19534,[0],[0],[0.00e+00],16934,[0],[0],[0.00e+00],16216,[0],[0],[0.00e+00],33150,[0],[0],[0.00e+00],19042,[0],[0],[0.00e+00],17290,[0],[0],[0.00e+00],36332,[0],[0],[0.00e+00],1154,[0],[0],[0.00e+00],1682,[0],[0],[0.00e+00],2836,[0],[0],[0.00e+00],323858,[0],[0],[0.00e+00],318782,[0],[0],[0.00e+00],642640,[0],[0],[0.00e+00],233162,[0],[0],[0.00e+00],274654,[0],[0],[0.00e+00],507816,[0],[0],[0.00e+00],7654,[0],[0],[0.00e+00],5898,[0],[0],[0.00e+00],13552,[0],[0],[0.00e+00],14306,[0],[0],[0.00e+00],11852,[0],[0],[0.00e+00],26158,[0],[0],[0.00e+00],8072,[0],[0],[0.00e+00],9274,[0],[0],[0.00e+00],17346,[0],[0],[0.00e+00],15950,[0],[0],[0.00e+00],15686,[0],[0],[0.00e+00],31636,[0],[0],[0.00e+00],18934,[0],[0],[0.00e+00],17276,[0],[0],[0.00e+00],36210,[0],[0],[0.00e+00],922,[0],[0],[0.00e+00],1220,[0],[0],[0.00e+00],2142,[0],[0],[0.00e+00],141232,[0],[0],[0.00e+00],155350,[0],[0],[0.00e+00],296582,[0],[1],[1.59e-06],628784,[0],[0],[0.00e+00],14348,[0],[0],[0.00e+00],13628,[0],[0],[0.00e+00],27976,[0],[0],[0.00e+00],11744,[0],[0],[0.00e+00],44470,[0],[0],[0.00e+00],56214,[0],[1],[7.22e-07],1384884,[0],[0],[0.00e+00],20936,[0],[0],[0.00e+00],19320,[0],[0],[0.00e+00],40256,[0],[0],[0.00e+00],15174,[0],[0],[0.00e+00],48702,[0],[0],[0.00e+00],63876,[0],,,,,,,,,,,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+0
0],,,,,,,"[""0|0|0|0|0|0|0|0|0|0""]",[0],[0],"[""0|0|0|0|0|0|0|0|0|0""]",[0],[0],9.64e-16,43.0,-0.313,395,2.78,0.802,0.388,142,False,False,False,False,[0.00e+00],[4.98e+01],[-3.13e-01],[3.13e-02],[100],[2.86e+00],[7.46e-01],"[""108"",""8|6"",""0""]",[2.88e-01],[35],[-7.22e-07],"[""AS_MQRankSum""]",[-2.78e+00],True,False,"""snv""",3,"""multi-snv""",False,False,False,True,False,True,True,"[""0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]","[""0|0|0|0|25166|0|112932|0|270679|0|19335|1|14476|0|0|0|0|0|0|1""]","[""0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",[0],"[""0|0|137171|75368|39526|27489|23610|21587|19793|17322|14398|12205|9983|8450|6907|5922|4822|3863|3199|2831""]",[8144],"[""0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",0.496,6.47,,,,0.776,,,"[""ga4gh:VA.kzra9ynnIaT0lqYyq206ll__8QjlGZ3x"",""ga4gh:VA.TiyvilJAEXSnB4k19Uyi4Ee3QugHO6oY""]","[18173860,18173860]","[18173861,18173861]","[""A"",""C""]","[""C|downstream_gene_variant|MODIFIER||ENSG00000279924|Transcript|ENST00000623078|unprocessed_pseudogene||||||||||1|149|-1||SNV|||YES||||||||Ensembl|||||||||||||""]"
chr13:18173861,"[""A"",""G""]",,,"{""AS_VQSR""}",[2],885182,[2.26e-06],"[""nfe""]",[5.20e-07],"[""nfe""]",[0],[0.00e+00],432104,[0],[2],[4.41e-06],453078,[0],[0],[0],[0.00e+00],11242,[0],[0],[0.00e+00],8748,[0],[0],[0.00e+00],19990,[0],[0],[0.00e+00],14550,[0],[0],[0.00e+00],12018,[0],[0],[0.00e+00],26568,[0],[0],[0.00e+00],9214,[0],[0],[0.00e+00],10320,[0],[0],[0.00e+00],19534,[0],[0],[0.00e+00],16934,[0],[0],[0.00e+00],16216,[0],[0],[0.00e+00],33150,[0],[0],[0.00e+00],19042,[0],[0],[0.00e+00],17290,[0],[0],[0.00e+00],36332,[0],[0],[0.00e+00],1154,[0],[0],[0.00e+00],1682,[0],[0],[0.00e+00],2836,[0],[0],[0.00e+00],323858,[0],[2],[6.27e-06],318782,[0],[2],[3.11e-06],642640,[0],[0],[0.00e+00],233164,[0],[1],[3.64e-06],274654,[0],[1],[1.97e-06],507818,[0],[0],[0.00e+00],7656,[0],[0],[0.00e+00],5898,[0],[0],[0.00e+00],13554,[0],[0],[0.00e+00],14306,[0],[0],[0.00e+00],11852,[0],[0],[0.00e+00],26158,[0],[0],[0.00e+00],8072,[0],[0],[0.00e+00],9274,[0],[0],[0.00e+00],17346,[0],[0],[0.00e+00],15950,[0],[0],[0.00e+00],15686,[0],[0],[0.00e+00],31636,[0],[0],[0.00e+00],18934,[0],[0],[0.00e+00],17276,[0],[0],[0.00e+00],36210,[0],[0],[0.00e+00],922,[0],[0],[0.00e+00],1220,[0],[0],[0.00e+00],2142,[0],[0],[0.00e+00],141232,[0],[1],[6.44e-06],155350,[0],[1],[3.37e-06],296582,[0],[1],[1.59e-06],628784,[0],[0],[0.00e+00],14348,[0],[0],[0.00e+00],13628,[0],[0],[0.00e+00],27976,[0],[0],[0.00e+00],11744,[0],[0],[0.00e+00],44470,[0],[0],[0.00e+00],56214,[0],[2],[1.44e-06],1384884,[0],[0],[0.00e+00],20936,[0],[0],[0.00e+00],19320,[0],[0],[0.00e+00],40256,[0],[0],[0.00e+00],15174,[0],[0],[0.00e+00],48702,[0],[0],[0.00e+00],63876,[0],[2],[3.11e-06],[642640],[0],"[""nfe""]",[1],[3.37e-06],[296582],[0],,,[3.80e-07],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[5.20e-07],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[1.40e-07],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[1.90e-07],,,[0.00e+00],,,[0.00e+00],,,[0.00e+0
0],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],[1.90e-07],"[""nfe""]",,,,,"[""0|0|0|0|0|1|0|0|0|0""]",[0],[0],"[""0|0|0|0|0|0|0|0|0|0""]",[0],[0],9.64e-16,43.0,-0.313,395,2.78,0.802,0.388,142,False,False,False,False,[3.35e+00],[4.09e+01],[-1.29e+00],[6.88e-01],[143],[4.47e+00],[0.00e+00],"[""108"",""8|7"",""1""]",[1.04e-01],[32],[-1.44e-06],"[""AS_MQRankSum""]",[-2.10e+00],True,False,"""snv""",3,"""multi-snv""",False,False,False,True,False,True,True,"[""0|0|0|0|0|0|0|0|0|0|1|1|0|0|0|0|0|0|0|0""]","[""0|0|0|0|25166|0|112932|0|270679|0|19335|1|14476|0|0|0|0|0|0|2""]","[""0|0|2|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",[0],"[""0|0|137171|75368|39526|27489|23610|21588|19793|17322|14398|12205|9983|8450|6907|5922|4822|3863|3199|2831""]",[8144],"[""0|0|0|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",0.526,6.77,,,,0.776,,,"[""ga4gh:VA.kzra9ynnIaT0lqYyq206ll__8QjlGZ3x"",""ga4gh:VA.TfQYE5bvKpFy6Aoq0WOabnESOsYVZ3DJ""]","[18173860,18173860]","[18173861,18173861]","[""A"",""G""]","[""G|downstream_gene_variant|MODIFIER||ENSG00000279924|Transcript|ENST00000623078|unprocessed_pseudogene||||||||||1|149|-1||SNV|||YES||||||||Ensembl|||||||||||||""]"
chr13:18173861,"[""A"",""T""]",,,"{""AC0"",""AS_VQSR""}",[0],885180,[0.00e+00],,,,[0],[0.00e+00],432102,[0],[0],[0.00e+00],453078,[0],[0],[0],[0.00e+00],11242,[0],[0],[0.00e+00],8748,[0],[0],[0.00e+00],19990,[0],[0],[0.00e+00],14550,[0],[0],[0.00e+00],12018,[0],[0],[0.00e+00],26568,[0],[0],[0.00e+00],9214,[0],[0],[0.00e+00],10320,[0],[0],[0.00e+00],19534,[0],[0],[0.00e+00],16934,[0],[0],[0.00e+00],16216,[0],[0],[0.00e+00],33150,[0],[0],[0.00e+00],19042,[0],[0],[0.00e+00],17290,[0],[0],[0.00e+00],36332,[0],[0],[0.00e+00],1154,[0],[0],[0.00e+00],1682,[0],[0],[0.00e+00],2836,[0],[0],[0.00e+00],323858,[0],[0],[0.00e+00],318782,[0],[0],[0.00e+00],642640,[0],[0],[0.00e+00],233162,[0],[0],[0.00e+00],274654,[0],[0],[0.00e+00],507816,[0],[0],[0.00e+00],7656,[0],[0],[0.00e+00],5898,[0],[0],[0.00e+00],13554,[0],[0],[0.00e+00],14306,[0],[0],[0.00e+00],11852,[0],[0],[0.00e+00],26158,[0],[0],[0.00e+00],8072,[0],[0],[0.00e+00],9274,[0],[0],[0.00e+00],17346,[0],[0],[0.00e+00],15950,[0],[0],[0.00e+00],15686,[0],[0],[0.00e+00],31636,[0],[0],[0.00e+00],18934,[0],[0],[0.00e+00],17276,[0],[0],[0.00e+00],36210,[0],[0],[0.00e+00],922,[0],[0],[0.00e+00],1220,[0],[0],[0.00e+00],2142,[0],[0],[0.00e+00],141232,[0],[0],[0.00e+00],155350,[0],[0],[0.00e+00],296582,[0],[2],[3.18e-06],628784,[0],[0],[0.00e+00],14348,[0],[0],[0.00e+00],13628,[0],[0],[0.00e+00],27976,[0],[0],[0.00e+00],11742,[0],[0],[0.00e+00],44470,[0],[0],[0.00e+00],56212,[0],[2],[1.44e-06],1384884,[0],[0],[0.00e+00],20936,[0],[0],[0.00e+00],19320,[0],[0],[0.00e+00],40256,[0],[0],[0.00e+00],15172,[0],[0],[0.00e+00],48702,[0],[0],[0.00e+00],63874,[0],,,,,,,,,,,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,[0.00e+00],,,,,,,"[""0|0|0
|0|0|0|0|0|0|0""]",[0],[0],"[""0|0|0|0|0|0|0|0|0|0""]",[0],[0],9.64e-16,43.0,-0.313,395,2.78,0.802,0.388,142,False,False,False,False,[0.00e+00],[4.04e+01],[6.90e-02],[3.13e-02],[152],[2.03e+00],[9.53e-01],"[""108"",""8|11"",""1""]",[2.19e-01],[75],[-1.44e-06],"[""AS_MQRankSum""]",[-3.33e+00],True,False,"""snv""",3,"""multi-snv""",False,False,False,True,False,True,True,"[""0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]","[""0|0|0|0|25166|0|112932|0|270679|0|19335|1|14476|0|0|0|0|0|0|1""]","[""0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",[0],"[""0|0|137171|75368|39526|27489|23610|21588|19793|17322|14397|12205|9983|8450|6907|5922|4822|3863|3199|2831""]",[8144],"[""0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0""]",0.483,6.33,,,,0.776,,,"[""ga4gh:VA.kzra9ynnIaT0lqYyq206ll__8QjlGZ3x"",""ga4gh:VA.Vr_RVtpmVgy0KXozG5P6yI3ktQtvZN1-""]","[18173860,18173860]","[18173861,18173861]","[""A"",""T""]","[""T|downstream_gene_variant|MODIFIER||ENSG00000279924|Transcript|ENST00000623078|unprocessed_pseudogene||||||||||1|149|-1||SNV|||YES||||||||Ensembl|||||||||||||""]"



## 3. Quality Control
Perform basic QC steps:
- Filter to biallelic sites
- Remove missing genotypes (skipped in this run: the input is a sites-only VCF, so a QUAL-score filter is applied instead)
- Track the number of variants at each step

In [18]:
# NOTE(review): This quality-control cell is currently disabled (every line is
# commented out). It is kept as a record of the attempted QC steps:
#   1. count the rows of `mt`,
#   2. keep only rows with exactly two alleles,
#   3. keep only rows with `qual >= 20.0`,
#   4. print the row count after each step.
# The output recorded under this cell shows that step 2 removed nothing
# (3,549,140 -> 3,549,140) and step 3 removed every variant (-> 0).
# Presumably QUAL is missing or holds a placeholder value in this sites-only
# VCF -- TODO confirm by inspecting the distribution of `mt.qual` before
# re-enabling the filter.
# Also note that `mt` is reassigned in place twice, so re-running the cell
# filters an already-filtered table rather than the originally loaded one.

# log_step("Performing quality control...")

# # Store original counts
# original_rows = mt.count_rows()

# # Filter to biallelic sites
# mt = mt.filter_rows(hl.len(mt.alleles) == 2)
# biallelic_rows = mt.count_rows()

# # Skip the genotype filter since this is a sites-only VCF
# # Instead, let's add some other relevant filters for sites VCFs

# # Filter by quality score (if available)
# # NOTE(review): the recorded run shows this threshold leaves 0 variants.
# mt = mt.filter_rows(mt.qual >= 20.0)
# quality_filtered_rows = mt.count_rows()

# # Display filtering results
# print(f"Filtering results:")
# print(f"- Original variants: {original_rows:,}")
# print(f"- After biallelic filter: {biallelic_rows:,}")
# print(f"- After quality filter: {quality_filtered_rows:,}")
# print(f"- Total variants removed: {original_rows - quality_filtered_rows:,}")


[13:06:18] Performing quality control...
Filtering results:
- Original variants: 3,549,140
- After biallelic filter: 3,549,140
- After quality filter: 0
- Total variants removed: 3,549,140


In [21]:
# NOTE(review): This feature-extraction cell is currently disabled (every line
# is commented out). It attempts to:
#   1. build `features_table` from the rows of `mt`, selecting rsid, qual,
#      filters and the first element of info.AC / info.AF plus info.AN,
#   2. count the resulting rows,
#   3. aggregate summary statistics (mean, stdev, min, max) of qual and AF,
#   4. print those statistics and describe the final table.
# The output recorded under this cell shows step 1 failing with:
#   'Table.select': cannot overwrite key field 'locus' with annotate, select
#   or drop; use key_by to modify keys.
# The failure comes from the `**{k: ...}` line, which re-selects the key
# fields (`locus`, `alleles`). Presumably the key fields are retained by
# `select` without being listed, so that line can simply be removed -- TODO
# confirm against the Hail `Table.select` documentation.
# NOTE(review): `mt.rows()` is called separately for every selected field;
# it looks like the expressions should all come from one table bound once
# (e.g. `rows = mt.rows()`) -- confirm before re-enabling.
# The `except Exception` handler prints the error and debug information
# instead of re-raising, so a failure does not stop the notebook.

# log_step("Extracting features from loaded VCF...")

# # First get row data and explicitly preserve keys
# try:
#     features_table = mt.rows().select(
#         # Keep keys
#         # NOTE(review): this line triggers the recorded "cannot overwrite key field" error.
#         **{k: mt.rows()[k] for k in mt.rows().key},
#         # Add other fields
#         rsid = mt.rows().rsid,
#         qual = mt.rows().qual,
#         filters = mt.rows().filters,
#         # Add specific info fields
#         AC = mt.rows().info.AC[0],
#         AN = mt.rows().info.AN,
#         AF = mt.rows().info.AF[0]
#     )

#     print("\nInitial feature extraction successful")
#     count = features_table.count()
#     print(f"Number of variants: {count:,}")

#     # Calculate some basic statistics
#     # Despite the 'mean_' prefix, each entry is read below for mean, stdev, min and max.
#     stats = features_table.aggregate({
#         'mean_qual': hl.agg.stats(features_table.qual),
#         'mean_AF': hl.agg.stats(features_table.AF)
#     })

#     print("\nBasic Statistics:")
#     print(f"Total variants: {count:,}")
    
#     print(f"\nQuality Score Statistics:")
#     print(f"  Mean: {stats['mean_qual'].mean:.2f}")
#     print(f"  Std Dev: {stats['mean_qual'].stdev:.2f}")
#     print(f"  Min: {stats['mean_qual'].min:.2f}")
#     print(f"  Max: {stats['mean_qual'].max:.2f}")

#     print(f"\nAllele Frequency Statistics:")
#     print(f"  Mean: {stats['mean_AF'].mean:.6f}")
#     print(f"  Std Dev: {stats['mean_AF'].stdev:.6f}")
#     print(f"  Min: {stats['mean_AF'].min:.6f}")
#     print(f"  Max: {stats['mean_AF'].max:.6f}")

#     # Show the schema of our final table
#     print("\nFinal table structure:")
#     features_table.describe()

# # On any failure, report the error and dump the row keys/schema for debugging.
# except Exception as e:
#     print(f"Error in processing: {e}")
#     print("\nDebug info:")
#     print("Keys:", mt.rows().key)
#     print("\nAvailable fields:")
#     print(mt.rows().describe())


[13:09:23] Extracting features from loaded VCF...
Error in processing: 'Table.select': cannot overwrite key field 'locus' with annotate, select or drop; use key_by to modify keys.

Debug info:
Keys: <StructExpression of type struct{locus: locus<GRCh38>, alleles: array<str>}>

Available fields:
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'filters': set<str> 
    'info': struct {
        AC: array<int32>, 
        AN: int32, 
        AF: array<float64>, 
        grpmax: array<str>, 
        fafmax_faf95_max: array<float64>, 
        fafmax_faf95_max_gen_anc: array<str>, 
        AC_XX: array<int32>, 
        AF_XX: array<float64>, 
        AN_XX: int32, 
        nhomalt_XX: array<int32>, 
        AC_XY: array<int32>, 
        AF_XY: array<float64>, 
        AN_XY: int32, 
        nhomalt_XY: array<int32>, 
        nhom