# Converting VCF files to .Zarr files on Kodiak

## Data preperation for genome-wide selection scans

Installing necessary packages - scikit-allel

Importing packages

In [2]:
import allel
import sys
print(allel.__version__)

1.3.8


Reading VCF files (Use fields=* to read all the fields of the VCF file)

In [8]:
callset = allel.read_vcf('/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/YmEthInd_Combined/filtered_VCFs/YmEthInd_allChrs_fltpass.vcf', fileds = '*')

Checking the fields

In [9]:
sorted(callset.keys())

['calldata/GT',
 'samples',
 'variants/ALT',
 'variants/CHROM',
 'variants/FILTER_PASS',
 'variants/ID',
 'variants/POS',
 'variants/QUAL',
 'variants/REF']

Getting the sample names

In [10]:
callset['samples']

array(['SRR15257906', 'SRR15257907', 'SRR15257908', 'SRR15257909',
       'SRR15257910', 'SRR15257911', 'SRR15257912', 'SRR15257913',
       'SRR15257914', 'SRR15257915', 'SRR15293885', 'SRR15293886',
       'SRR15293887', 'SRR15293888', 'SRR15293889', 'SRR15293890',
       'SRR15293891', 'SRR15293892', 'SRR15293893', 'SRR15293894',
       'X1296', 'X1307', 'X1402', 'X1403', 'X1404', 'X1408', 'X1409',
       'X1410', 'X1415', 'X1416', 'X1417', 'X1419', 'X1420', 'X1421',
       'X1423', 'X1424', 'X1425', 'X1580', 'X1581', 'X1583', 'X1585',
       'X1586', 'X1587', 'X1604', 'X1605', 'X1673', 'X1676', 'X1679',
       'X1680', 'X1735', 'X1736', 'X1738', 'X1740', 'X1742', 'X1743',
       'X1747'], dtype=object)

Getting genotypes

In [14]:
gt = allel.GenotypeArray(callset['calldata/GT'])
gt

Unnamed: 0,0,1,2,3,4,...,51,52,53,54,55,Unnamed: 12
0,./.,./.,./.,1/1,./.,...,./.,1/1,./.,./.,./.,
1,./.,./.,./.,./.,./.,...,./.,0/0,./.,./.,./.,
2,./.,./.,./.,./.,0/0,...,0/0,./.,./.,0/0,./.,
...,...,...,...,...,...,...,...,...,...,...,...,...
12860508,./.,./.,0/0,0/0,./.,...,./.,./.,./.,./.,./.,
12860509,./.,./.,0/1,0/0,./.,...,./.,./.,./.,./.,./.,
12860510,./.,./.,0/1,0/0,./.,...,./.,./.,./.,./.,./.,


In [16]:
gt.is_het()
gt.count_het(axis=1)

array([3, 2, 0, ..., 0, 1, 1])

In [17]:
ac = gt.count_alleles()
ac

Unnamed: 0,0,1,2,3,4,5,6,Unnamed: 8
0,13,21,0,0,0,0,0,
1,42,2,0,0,0,0,0,
2,46,2,0,0,0,0,0,
...,...,...,...,...,...,...,...,...
12860508,12,2,0,0,0,0,0,
12860509,9,3,0,0,0,0,0,
12860510,9,3,0,0,0,0,0,


## Reading VCF files to Zarr files
Defining paths for input and output

Installing Zarr Package

In [31]:
!pip install zarr
!pip install numcodecs

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/cm/local/apps/python39/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/cm/local/apps/python39/bin/python3 -m pip install --upgrade pip' command.[0m


Importing Zarr library

In [32]:
import zarr
import numcodecs

In [27]:
vcf_path = '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/YmEthInd_Combined/filtered_VCFs/YmEthInd_allChrs_fltpass.vcf'
zarr_path = '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_allChrs_fltpass.zarr'

Running VCF to Zarr conversion

In [28]:
allel.vcf_to_zarr(vcf_path, zarr_path, fields='*', overwrite =True)

In [33]:
callset_zarr = zarr.open_group('YmEthInd_allChrs_fltpass.zarr', mode='r')
callset_zarr

<zarr.hierarchy.Group '/' read-only>

Exploring the hierarchy of groups and arrays

In [34]:
callset_zarr.tree(expand=True)

ImportError: No module named 'ipytree': Run `pip install zarr[jupyter]` or `conda install ipytree`to get the required ipytree dependency for displaying the tree widget. If using jupyterlab<3, you also need to run `jupyter labextension install ipytree`

/
 ├── calldata
 │   ├── AD (12860511, 56, 4) int16
 │   ├── DP (12860511, 56) int16
 │   ├── GQ (12860511, 56) int8
 │   ├── GT (12860511, 56, 2) int8
 │   ├── MIN_DP (12860511, 56) int32
 │   ├── PGT (12860511, 56) object
 │   ├── PID (12860511, 56) object
 │   ├── PL (12860511, 56, 3) int32
 │   ├── PS (12860511, 56) int32
 │   ├── RGQ (12860511, 56) int32
 │   └── SB (12860511, 56, 4) int32
 ├── samples (56,) object
 └── variants
     ├── AC (12860511, 3) int32
     ├── AF (12860511, 3) float32
     ├── ALT (12860511, 3) object
     ├── AN (12860511,) int32
     ├── AS_BaseQRankSum (12860511, 3) float32
     ├── AS_FS (12860511, 3) float32
     ├── AS_InbreedingCoeff (12860511, 3) float32
     ├── AS_MQ (12860511, 3) float32
     ├── AS_MQRankSum (12860511, 3) float32
     ├── AS_QD (12860511, 3) float32
     ├── AS_ReadPosRankSum (12860511, 3) float32
     ├── AS_SOR (12860511, 3) float32
     ├── BaseQRankSum (12860511,) float32
     ├── CHROM (12860511,) object
     ├── DP (1286

# Importing phased biallelic variants for each chomosome

In [3]:
# Chromosome X
chrx_vcf_path = '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/YmEthInd_Combined/biallelic_phased_VCFs/YmEthInd_NC_050201.1_BA_Phased.vcf.gz.vcf.gz'
chrx_zarr_path = '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_Chrx_BAP.zarr'
allel.vcf_to_zarr(chrx_vcf_path, chrx_zarr_path, fields='*', overwrite =True)

# Chromosome 2
chr2_vcf_path = '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/YmEthInd_Combined/biallelic_phased_VCFs/YmEthInd_NC_050202.1_BA_Phased.vcf.gz.vcf.gz'
chr2_zarr_path = '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_Chr2_BAP.zarr'
allel.vcf_to_zarr(chr2_vcf_path, chr2_zarr_path, fields='*', overwrite =True)

# Chromosome 3
chr3_vcf_path = '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/YmEthInd_Combined/biallelic_phased_VCFs/YmEthInd_NC_050203.1_BA_Phased.vcf.gz.vcf.gz'
chr3_zarr_path = '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_Chr3_BAP.zarr'
allel.vcf_to_zarr(chr3_vcf_path, chr3_zarr_path, fields='*', overwrite =True)

