# Extracting whole-genome genotype data

This is a script to extract genotypes from whole genomes of *An. gambiae* collected during the LLINEUP trial, which was conducted in Uganda from 2017 to 2019.



In [1]:
# Load packages
import malariagen_data
import os
import numpy as np
import pandas as pd
import allel
import xarray as xr
import glob

In [None]:

# Handle to the MalariaGEN Ag3 data resource; extract_and_filter_snps() reads
# SNP calls through this module-level object.
# NOTE(review): `pre=True` presumably enables access to pre-release sample
# sets — confirm against the malariagen_data documentation.
ag3 = malariagen_data.Ag3(pre = True)

In [None]:

# Function to extract per-sample non-reference allele counts at filtered SNPs

def extract_and_filter_snps(region, maf_threshold, output_filename):
    """Extract filtered SNP genotypes for one region and write them to CSV.

    SNP calls are fetched through the module-level ``ag3`` handle for the two
    sample sets hard-coded below, restricted to samples matching
    ``aim_species == 'gambiae'`` and using the ``gamb_colu`` site mask.

    A site is kept only when (a) no sample has a missing call and (b) the
    frequency of its most common allele is below ``1 - maf_threshold``.

    Each retained genotype is recoded as the number of alleles with index > 0
    (i.e. non-reference alleles) in the call.  Sites with more than one
    alternate allele are not removed; their alternate alleles are simply
    counted together.

    Parameters
    ----------
    region
        Passed unchanged to ``ag3.snp_calls(region=...)``.
    maf_threshold : float
        Sites whose most common allele has frequency >= ``1 - maf_threshold``
        are discarded.
    output_filename
        Destination of the CSV written with ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per retained site, indexed by ``"<contig>:<position>"``, and
        one column per sample (labelled with the sample IDs).
    """
    array_snps = ag3.snp_calls(
        region=region,
        sample_sets=["1288-VO-UG-DONNELLY-VMF00168", "1288-VO-UG-DONNELLY-VMF00219"],
        sample_query="aim_species == 'gambiae'",
        site_mask='gamb_colu',
    )

    gt = allel.GenotypeArray(array_snps['call_genotype'])

    # Per-site filters: no missing calls across samples, and the most common
    # allele must be rarer than 1 - maf_threshold.
    no_missing = gt.count_missing(1) == 0
    gt_freq = gt.count_alleles().to_frequencies()
    which_pos = (np.max(gt_freq, 1) < (1 - maf_threshold)) & no_missing
    gt_filtered = gt[which_pos, :]

    # Collapse the last (allele) axis: count alleles with index > 0 per call.
    alt_allele_counts = np.sum(gt_filtered > 0, 2)

    pos = array_snps['variant_position'][which_pos]
    chrom = np.array(array_snps.contigs)[array_snps['variant_contig']][which_pos]

    # Build "<contig>:<position>" labels with a plain comprehension.
    # np.apply_along_axis raises ValueError when no site survives the filters
    # (an iteration dimension of 0); this form simply yields an empty index.
    snp_id = pd.Index(
        [f"{contig}:{position}" for contig, position in zip(chrom, pos.values)],
        dtype=object,
    )

    df_gt = pd.DataFrame(alt_allele_counts)
    df_gt.index = snp_id
    df_gt.columns = array_snps.sample_id

    # Save DataFrame to CSV file
    df_gt.to_csv(output_filename)
    return df_gt
    



In [None]:
output_directory = "/llineup_publication/Data"

regions = ['2L', '2R', '3L', '3R', 'X']

# Run the extraction once per region, writing gt_<region>.csv into the output
# directory and keeping each returned DataFrame keyed by its region.
gt = {}
for region in regions:
    region_csv = os.path.join(output_directory, f'gt_{region}.csv')
    gt[region] = extract_and_filter_snps(region, 0.02, region_csv)


In [None]:
# Combine the per-region genotype CSVs into a single file

path = "/llineup_publication/Data"
combined_csv = os.path.join(path, "gt_glm.csv")

# Search only inside `path` and in sorted order so the row order of the
# combined file is reproducible.  The combined output is excluded explicitly:
# its own name matches "*gt_*.csv", so without this a re-run of the cell would
# concatenate the previous result into the new one.
all_csv = sorted(
    filename
    for filename in glob.glob(os.path.join(path, "*gt_*.csv"))
    if os.path.basename(filename) != os.path.basename(combined_csv)
)
if not all_csv:
    raise FileNotFoundError(f"No genotype CSV files matching '*gt_*.csv' found in {path}")

# index_col=None: the first column of each input file is read as ordinary
# data rather than as the index; the combined frame gets a fresh integer index.
df_list = [pd.read_csv(filename, index_col=None) for filename in all_csv]
df_gt = pd.concat(df_list, axis=0, ignore_index=True)
df_gt.to_csv(combined_csv)