# HLA-LA dsub

In [None]:
## Package Import
import sys
import os 
import numpy as np
import pandas as pd
from datetime import datetime
import re
import matplotlib.pyplot as plt

In [None]:
## Defining necessary pathways
# Workspace bucket URI taken from the environment; raises KeyError if
# WORKSPACE_BUCKET is not set.
my_bucket = os.environ['WORKSPACE_BUCKET']

# gs:// prefix of the pooled v7_delta WGS CRAMs (not referenced again in the
# visible cells).
cram_paths = "gs://fc-aou-datasets-controlled/pooled/wgs/cram/v7_delta/"

In [None]:
#this list contains the subset of samples we have typed for our analysis
# Read without a header row, so the columns are labelled 0, 1, ...
# NOTE(review): the manifest filter further down uses column 1 as the person_id
# list — confirm the file really has a second comma-separated column.
samples_to_type=pd.read_csv(f'{my_bucket}/data/hla_type_test/cram_lists/typed_samples.txt',header=None)

In [None]:
## Ensuring dsub is up to date (upgrades the dsub package in the notebook environment)
!pip3 install --upgrade dsub

In [None]:
#manifest file to find cram names per sample id
# Copies manifest.csv into the current directory; -u passes $GOOGLE_PROJECT as
# the user project for the request.
!gsutil -u $GOOGLE_PROJECT cp gs://fc-aou-datasets-controlled/v7/wgs/cram/manifest.csv .

In [None]:
# Load the manifest copied above; later cells read its 'person_id' and 'cram_uri' columns.
cram_manifest = pd.read_csv('manifest.csv')

In [None]:
#filter for the sample list
# Keep only manifest rows whose person_id appears in column 1 of samples_to_type.
cram_manifest_v7 = cram_manifest[cram_manifest['person_id'].isin(samples_to_type[1])]

In [None]:
# Renumber the filtered manifest rows from zero.
cram_manifest_v7 = cram_manifest_v7.reset_index(drop=True)

# Write the CRAM URIs one per line (no index, no header) for the dsub cells.
cram_uris = cram_manifest_v7['cram_uri']
cram_uris.to_csv('AoU_test_crams.txt', header=None, index=False)

In [None]:
## Setting for running dsub jobs
# NOTE(review): 0 is presumably meant as "do not truncate column text" — confirm
# that the installed pandas version interprets it that way.
pd.set_option('display.max_colwidth', 0)

In [None]:
# Short user name: the part of OWNER_EMAIL before '@', with dots replaced by dashes.
# os.getenv returns None when OWNER_EMAIL is unset, in which case .split raises
# AttributeError.
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')

# Save this Python variable as an environment variable so that it's easier to use within %%bash cells.
%env USER_NAME={USER_NAME}

In [None]:
%%writefile ~/HLA_typing.sh

set -o pipefail
set -o errexit


# ---------Required Inputs---------
# aou_crams_1 .. aou_crams_5 - paths to the CRAM files to type in this job, as
#   passed by the dsub submission. Variables that are unset or empty are
#   skipped, so a job may carry fewer than five samples.
# OUTPUT_PATH - directory that receives the per-sample *_output* files.

# Count the CRAM inputs that were actually supplied.
aou_crams_len=0
for cram in "${aou_crams_1:-}" "${aou_crams_2:-}" "${aou_crams_3:-}" "${aou_crams_4:-}" "${aou_crams_5:-}"; do
    if [ -n "${cram}" ]; then
        aou_crams_len=$((aou_crams_len + 1))
    fi
done
echo "Samples in cramlist: ${aou_crams_len}"

# ---------Required Output---------
# ${aou_cram_reads_prefix}_output* files copied into OUTPUT_PATH

echo "GOOGLE_PROJECT: ${GOOGLE_PROJECT}"
echo "OUTPUT_PATH: ${OUTPUT_PATH}"
# NOTE(review): the dsub cells visible in this notebook do not pass ref_dict,
# ref_fai or ref_fasta, so these three lines are expected to print empty values.
echo "ref_dict: ${ref_dict}"
echo "ref_fai: ${ref_fai}"
echo "ref_fasta: ${ref_fasta}"

# Type each supplied sample in turn.
for cram in "${aou_crams_1:-}" "${aou_crams_2:-}" "${aou_crams_3:-}" "${aou_crams_4:-}" "${aou_crams_5:-}"; do
    if [ -z "${cram}" ]; then
        continue
    fi

    # These change per iteration
    export aou_cram_reads_name="$(basename "${cram}")"         # file_name.cram
    export aou_cram_reads_prefix="${aou_cram_reads_name%.*}"  # file_name
    echo "aou_cram_reads: ${cram}"
    echo "aou_cram_reads_name: ${aou_cram_reads_name}"
    echo "aou_cram_reads_prefix: ${aou_cram_reads_prefix}"

    # Run as a plain command (not the left side of an && list) so that errexit
    # stops the job at the typing step itself when it fails.
    type_hla.sh 8 "${cram}" /usr/local/bin/Homo_sapiens_assembly38.fasta /usr/local/bin/HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT
    ls "/usr/local/bin/HLA-LA/working/${aou_cram_reads_prefix}"
    ls
    cp "${aou_cram_reads_prefix}"_output* "${OUTPUT_PATH}/"
done

In [None]:
# Upload the job script to the workspace bucket.
# NOTE(review): assumes the ~ used by %%writefile above resolves to /home/jupyter.
!gsutil cp /home/jupyter/HLA_typing.sh {my_bucket}/dsub/scripts/


In [None]:
%%bash
# Copy the CRAM URI list written earlier in this notebook to the workspace bucket.
gsutil cp AoU_test_crams.txt $WORKSPACE_BUCKET/data/hla_type_test/


In [None]:
# Copy aou_dsub.bash from the workspace bucket into the home directory
# (the visible cells do not source or call it afterwards).
!gsutil cp $WORKSPACE_BUCKET/aou_dsub.bash ~

dsub: each of the following submission cells launches one job that types 5 samples.

In [None]:
%%bash

# Submit one dsub job that types one batch of CRAMs taken from AoU_test_crams.txt.

# Get a shorter username to leave more characters for the job name.
DSUB_USER_NAME="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"

# For AoU RWB projects network name is "network".
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork

# Read every CRAM URI (one per line), skipping blank lines.
bashArray=()
while IFS= read -r line || [[ -n "${line}" ]]; do
  if [[ -n "${line}" ]]; then
    bashArray+=("${line}")
  fi
done < AoU_test_crams.txt

## ------------------------------------------------ MAKE CHANGES HERE ------------------------------------------
# Zero-based offset of this batch within the CRAM list.
BATCH_START=185
# Samples per job; the job script iterates aou_crams_1 .. aou_crams_5, so keep this at 5 or below.
BATCH_SIZE=5
# NOTE(review): hard-coded bucket — confirm it is the workspace bucket the script was uploaded to.
BASH_SCRIPT="gs://fc-secure-5d2c6afb-811e-4186-af87-8d68408d1816/dsub/scripts/HLA_typing.sh"
## -------------------------------------------------------------------------------------------------------------

array_2=("${bashArray[@]:${BATCH_START}:${BATCH_SIZE}}")
echo "${array_2[@]}"

if (( ${#array_2[@]} == 0 )); then
  echo "No CRAMs at offset ${BATCH_START} (list has ${#bashArray[@]} entries); nothing to submit." >&2
  exit 1
fi

# One CRAM/index input pair per sample actually present, so a short final batch
# does not pass empty --input values.
input_args=()
for idx in "${!array_2[@]}"; do
  n=$((idx + 1))
  input_args+=(--input "aou_crams_${n}=${array_2[idx]}")
  input_args+=(--input "aou_cram_index_${n}=${array_2[idx]}.crai")
done

# JOB_NAME is not defined in this cell; use it when the environment provides it
# and fall back to a fixed name otherwise.
dsub \
        --provider google-cls-v2 \
        --user-project "${GOOGLE_PROJECT}" \
        --project "${GOOGLE_PROJECT}" \
        --network "${AOU_NETWORK}" \
        --subnetwork "${AOU_SUBNETWORK}" \
        --service-account "$(gcloud config get-value account)" \
        --user "${DSUB_USER_NAME}" \
        --regions us-central1 \
        --logging "${WORKSPACE_BUCKET}/data/hla_type/logging/" \
        --min-ram 64 \
        --min-cores 8 \
        --boot-disk-size 65 \
        --disk-size 160 \
        --name "${JOB_NAME:-hla-la}_v1" \
        --script "${BASH_SCRIPT}" \
        --image 'gcr.io/hla-la/mdaya-hla-la:latest' \
        --output-recursive OUTPUT_PATH="${WORKSPACE_BUCKET}/data/hla_type_test/" \
        --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
        "${input_args[@]}"

In [None]:
%%bash

# Submit one dsub job that types one batch of CRAMs taken from AoU_test_crams.txt.

# Get a shorter username to leave more characters for the job name.
DSUB_USER_NAME="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"

# For AoU RWB projects network name is "network".
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork

# Read every CRAM URI (one per line), skipping blank lines.
bashArray=()
while IFS= read -r line || [[ -n "${line}" ]]; do
  if [[ -n "${line}" ]]; then
    bashArray+=("${line}")
  fi
done < AoU_test_crams.txt

## ------------------------------------------------ MAKE CHANGES HERE ------------------------------------------
# Zero-based offset of this batch within the CRAM list.
BATCH_START=190
# Samples per job; the job script iterates aou_crams_1 .. aou_crams_5, so keep this at 5 or below.
BATCH_SIZE=5
# NOTE(review): hard-coded bucket — confirm it is the workspace bucket the script was uploaded to.
BASH_SCRIPT="gs://fc-secure-5d2c6afb-811e-4186-af87-8d68408d1816/dsub/scripts/HLA_typing.sh"
## -------------------------------------------------------------------------------------------------------------

array_2=("${bashArray[@]:${BATCH_START}:${BATCH_SIZE}}")
echo "${array_2[@]}"

if (( ${#array_2[@]} == 0 )); then
  echo "No CRAMs at offset ${BATCH_START} (list has ${#bashArray[@]} entries); nothing to submit." >&2
  exit 1
fi

# One CRAM/index input pair per sample actually present, so a short final batch
# does not pass empty --input values.
input_args=()
for idx in "${!array_2[@]}"; do
  n=$((idx + 1))
  input_args+=(--input "aou_crams_${n}=${array_2[idx]}")
  input_args+=(--input "aou_cram_index_${n}=${array_2[idx]}.crai")
done

# JOB_NAME is not defined in this cell; use it when the environment provides it
# and fall back to a fixed name otherwise.
dsub \
        --provider google-cls-v2 \
        --user-project "${GOOGLE_PROJECT}" \
        --project "${GOOGLE_PROJECT}" \
        --network "${AOU_NETWORK}" \
        --subnetwork "${AOU_SUBNETWORK}" \
        --service-account "$(gcloud config get-value account)" \
        --user "${DSUB_USER_NAME}" \
        --regions us-central1 \
        --logging "${WORKSPACE_BUCKET}/data/hla_type/logging/" \
        --min-ram 64 \
        --min-cores 8 \
        --boot-disk-size 65 \
        --disk-size 160 \
        --name "${JOB_NAME:-hla-la}_v1" \
        --script "${BASH_SCRIPT}" \
        --image 'gcr.io/hla-la/mdaya-hla-la:latest' \
        --output-recursive OUTPUT_PATH="${WORKSPACE_BUCKET}/data/hla_type_test/" \
        --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
        "${input_args[@]}"

In [None]:
%%bash

# Submit one dsub job that types one batch of CRAMs taken from AoU_test_crams.txt.

# Get a shorter username to leave more characters for the job name.
DSUB_USER_NAME="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"

# For AoU RWB projects network name is "network".
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork

# Read every CRAM URI (one per line), skipping blank lines.
bashArray=()
while IFS= read -r line || [[ -n "${line}" ]]; do
  if [[ -n "${line}" ]]; then
    bashArray+=("${line}")
  fi
done < AoU_test_crams.txt

## ------------------------------------------------ MAKE CHANGES HERE ------------------------------------------
# Zero-based offset of this batch within the CRAM list.
BATCH_START=195
# Samples per job; the job script iterates aou_crams_1 .. aou_crams_5, so keep this at 5 or below.
BATCH_SIZE=5
# NOTE(review): hard-coded bucket — confirm it is the workspace bucket the script was uploaded to.
BASH_SCRIPT="gs://fc-secure-5d2c6afb-811e-4186-af87-8d68408d1816/dsub/scripts/HLA_typing.sh"
## -------------------------------------------------------------------------------------------------------------

array_2=("${bashArray[@]:${BATCH_START}:${BATCH_SIZE}}")
echo "${array_2[@]}"

if (( ${#array_2[@]} == 0 )); then
  echo "No CRAMs at offset ${BATCH_START} (list has ${#bashArray[@]} entries); nothing to submit." >&2
  exit 1
fi

# One CRAM/index input pair per sample actually present, so a short final batch
# does not pass empty --input values.
input_args=()
for idx in "${!array_2[@]}"; do
  n=$((idx + 1))
  input_args+=(--input "aou_crams_${n}=${array_2[idx]}")
  input_args+=(--input "aou_cram_index_${n}=${array_2[idx]}.crai")
done

# JOB_NAME is not defined in this cell; use it when the environment provides it
# and fall back to a fixed name otherwise.
dsub \
        --provider google-cls-v2 \
        --user-project "${GOOGLE_PROJECT}" \
        --project "${GOOGLE_PROJECT}" \
        --network "${AOU_NETWORK}" \
        --subnetwork "${AOU_SUBNETWORK}" \
        --service-account "$(gcloud config get-value account)" \
        --user "${DSUB_USER_NAME}" \
        --regions us-central1 \
        --logging "${WORKSPACE_BUCKET}/data/hla_type/logging/" \
        --min-ram 64 \
        --min-cores 8 \
        --boot-disk-size 65 \
        --disk-size 160 \
        --name "${JOB_NAME:-hla-la}_v1" \
        --script "${BASH_SCRIPT}" \
        --image 'gcr.io/hla-la/mdaya-hla-la:latest' \
        --output-recursive OUTPUT_PATH="${WORKSPACE_BUCKET}/data/hla_type_test/" \
        --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
        "${input_args[@]}"

In [None]:
#check status; replace the --jobs value with the id of the job to inspect
# NOTE(review): the project, user and job id below are hard-coded for one
# specific run; -f requests the full status output.
!dstat --provider google-cls-v2 --project terra-vpc-sc-ae994fde --location us-central1 --jobs 'cram-paral--hemanth-karnati--240416-211309-86' --users 'hemanth-karnati' --status '*' -f

In [None]:
%%bash

#collect HLA*LA hla types
# Each gsutil ls lists the G-group result files matching one path layout. The
# first redirect (>) starts hla_samples_typed.txt afresh; the next two (>>)
# append to it.
gsutil ls $WORKSPACE_BUCKET/data/hla_type_test/*/R1_bestguess_G.txt > hla_samples_typed.txt
gsutil ls $WORKSPACE_BUCKET/data/hla_type_test/*_output_G.txt >> hla_samples_typed.txt
gsutil ls $WORKSPACE_BUCKET/data/hla_type/wgs_*/hla/R1_bestguess_G.txt >> hla_samples_typed.txt

# Print line, word and byte counts for the collected list.
wc hla_samples_typed.txt 

## formatting HLA-LA output

In [None]:
# One result path per row; header=None, so the paths are in column 0.
hla_files = pd.read_csv('hla_samples_typed.txt',header=None)

In [None]:
# Build one wide row per sample: columns are '<Locus>.<Chromosome>' (e.g.
# 'DQA1.1'), values are the called alleles, and the index is the person_id taken
# from the 'wgs_<digits>' part of the file path.
_sample_id_pattern = re.compile(r'(wgs_[0-9]+)')

_type_rows = []
for i in hla_files[0]:
    _id_match = _sample_id_pattern.search(i)
    if _id_match is None:
        # Name the offending path instead of failing on None[1].
        raise ValueError(f'no wgs_<digits> sample id found in path: {i}')
    sample = pd.read_csv(i,sep='\t')
    sample['Locus'] = sample['Locus']+'.'+sample['Chromosome'].astype(str)
    sample['person_id'] = _id_match[1].replace('wgs_','')
    _type_rows.append(sample.pivot(columns='Locus',values='Allele',index='person_id'))

# Concatenate once: avoids re-copying the growing table on every iteration and
# keeps the 'person_id' index name without relying on how concat treats an
# empty seed frame.
hla_types = pd.concat(_type_rows) if _type_rows else pd.DataFrame()

hla_types.reset_index(inplace=True)

In [None]:
# Persist the wide allele table to the workspace bucket (index not written).
hla_types.to_csv(f'{my_bucket}/data/hla_compare/hla_la_type_table.csv',index=False)

In [None]:
# Reload the saved table. Column dtypes are re-inferred by read_csv here, so an
# all-digit person_id column may come back as integers.
hla_types=pd.read_csv(f'{my_bucket}/data/hla_compare/hla_la_type_table.csv')

In [None]:
#DQ genotypes
def _dq_dosage(calls, dqa1_pattern, dqb1_pattern, count_homozygous=True):
    """Score one DQ haplotype per sample from its DQA1/DQB1 allele calls.

    Args:
        calls: frame with the string columns 'DQA1.1', 'DQA1.2', 'DQB1.1', 'DQB1.2'.
        dqa1_pattern: regular expression matched against both DQA1 calls.
        dqb1_pattern: regular expression matched against both DQB1 calls.
        count_homozygous: when True, samples whose two DQA1 calls and two DQB1
            calls all match are scored 2 instead of 1.

    Returns:
        Integer Series aligned with ``calls``: 0 if either pattern matches
        neither call of its locus, 1 if each pattern matches at least one call,
        2 for the homozygous case. Missing calls count as non-matching.
    """
    dqa_1 = calls['DQA1.1'].str.contains(dqa1_pattern, na=False)
    dqa_2 = calls['DQA1.2'].str.contains(dqa1_pattern, na=False)
    dqb_1 = calls['DQB1.1'].str.contains(dqb1_pattern, na=False)
    dqb_2 = calls['DQB1.2'].str.contains(dqb1_pattern, na=False)

    dosage = ((dqa_1 | dqa_2) & (dqb_1 | dqb_2)).astype(int)
    if count_homozygous:
        dosage[dqa_1 & dqa_2 & dqb_1 & dqb_2] = 2
    return dosage


# Any of the three DQA1*03 alleles counts towards DQ8.
_DQ8_DQA1_PATTERN = 'DQA1[*]03:01|DQA1[*]03:02|DQA1[*]03:03'

# Homozygous DQ2.5 is recorded in Dq2_5_cis itself, the column the genotype
# labelling reads, rather than in a separate 'Dq2_5_1' column.
hla_types['Dq2_5_cis'] = _dq_dosage(hla_types, 'DQA1[*]05:01', 'DQB1[*]02:01')

# Only presence is scored for the trans flag; the genotype labelling compares it with 1.
hla_types['Dq2_5_trans'] = _dq_dosage(hla_types, 'DQA1[*]05:05', 'DQB1[*]02:02', count_homozygous=False)

hla_types['Dq2_2'] = _dq_dosage(hla_types, 'DQA1[*]02:01', 'DQB1[*]02:02')

hla_types['Dq7_5'] = _dq_dosage(hla_types, 'DQA1[*]05:05', 'DQB1[*]03:01')

hla_types['Dq8'] = _dq_dosage(hla_types, _DQ8_DQA1_PATTERN, 'DQB1[*]03:02')


In [None]:
# Only the last expression of a cell is displayed, so the four-column selection
# is evaluated but not shown; the full table is.
hla_types[['DQA1.1','DQA1.2','DQB1.1','DQB1.2']]
hla_types

In [None]:
def _dq_genotype(row):
    """Return the DQ genotype label for one row of haplotype flags.

    Rules are tried in order and the first match wins: homozygous haplotypes,
    then DQ2.5 trans, then heterozygous combinations, then 'X/X'.
    """
    if row['Dq2_5_cis'] == 2:
        return 'DQ2.5/DQ2.5'
    if row['Dq2_2'] == 2:
        return 'DQ2.2/DQ2.2'
    if row['Dq7_5'] == 2:
        return 'DQ7.5/DQ7.5'
    if row['Dq8'] == 2:
        return 'DQ8/DQ8'
    if row['Dq2_5_trans'] == 1:
        return 'DQ2.5 trans'
    if row['Dq2_5_cis'] == 1:
        if row['Dq2_2'] == 1:
            return 'DQ2.5/DQ2.2'
        if row['Dq7_5'] == 1:
            return 'DQ2.5/DQ7.5'
        if row['Dq8'] == 1:
            return 'DQ2.5/DQ8'
        return 'DQ2.5/X'
    if row['Dq2_2'] == 1:
        if row['Dq7_5'] == 1:
            return 'DQ2.2/DQ7.5'
        if row['Dq8'] == 1:
            return 'DQ2.2/DQ8'
        return 'DQ2.2/X'
    if row['Dq7_5'] == 1:
        if row['Dq8'] == 1:
            return 'DQ7.5/DQ8'
        return 'DQ7.5/X'
    if row['Dq8'] == 1:
        return 'DQ8/X'
    return 'X/X'


hla_types['genotype'] = hla_types.apply(_dq_genotype, axis=1)


In [None]:
# Samples for which none of the DQ haplotype rules matched.
XX=hla_types[hla_types['genotype']=='X/X']

In [None]:
# person_id for each result path: the digits following 'wgs_'.
hla_files['person_id'] = [re.search(r'wgs_[0-9]+', path)[0].replace('wgs_', '') for path in hla_files[0]]
# hla_types was re-read from CSV earlier, so its person_id may have been parsed
# as integers. Compare as strings so the membership test cannot silently match
# nothing because of a dtype difference.
XX_files = hla_files[hla_files['person_id'].isin(XX['person_id'].astype(str))]

In [None]:
#quality checks for typed files
# Every file is checked at both chromosomes of DQA1 and DQB1. Each failing
# (check, locus) pair prints two lines and appends the file path to that check's
# list, so a path can appear in a list up to four times.
retry_files = []            # perfectG != 1
propkmer_error_files = []   # proportionkMersCovered != 1
quality_error_files = []    # Q1 <= 0.99
recheck = []                # NColumns_UnaccountedAllele_fGT0.2 != 0

_qc_loci = [('DQA1', 1), ('DQA1', 2), ('DQB1', 1), ('DQB1', 2)]

# (column, failure test, message template, list receiving the file path)
_qc_checks = [
    ('perfectG', lambda v: v != 1,
     ' not perfectG on {}; use bestguess', retry_files),
    ('proportionkMersCovered', lambda v: v != 1,
     ' has bad proportionkMersCovered on {}', propkmer_error_files),
    ('Q1', lambda v: v <= 0.99,
     ' has bad Quality on {}', quality_error_files),
    ('NColumns_UnaccountedAllele_fGT0.2', lambda v: v != 0,
     ' has unaccounted alleles on {}', recheck),
]

for i in hla_files[0]:
    df = pd.read_csv(i,sep='\t')
    df.set_index(['Locus','Chromosome'],inplace=True)
    # Checks in the outer loop and loci in the inner loop keep the messages
    # grouped by check for each file.
    for _column, _failed, _message, _bucket in _qc_checks:
        for _locus, _chrom in _qc_loci:
            _value = df.loc[(_locus, _chrom), _column]
            if _failed(_value):
                print('file ', i, _message.format(f'{_locus}.{_chrom}'))
                print(f'{_column} =', _value)
                _bucket.append(i)

# One list entry is appended per failure, so the list lengths are the failure counts.
k = len(retry_files)
e = len(propkmer_error_files)
q = len(quality_error_files)
r = len(recheck)

print('retry = ',k)
print('low kmer = ',e)
print('low quality = ',q)
print('recheck = ',r)

In [None]:
# retry_files holds one entry per failing locus, so a file can occur several
# times; drop repeats (keeping first-seen order) so each non-G result file is
# read once and contributes one row downstream.
non_G_files = list(dict.fromkeys(j.replace('_G','') for j in retry_files))

In [None]:
#check on files where perfectG != 1; results in multiple guesses
# Same wide layout as the G-group table: one row per file, '<Locus>.<Chromosome>'
# columns holding the alleles, indexed by person_id.
_non_g_id_pattern = re.compile(r'(wgs_[0-9]+)')

_non_g_rows = []
for i in non_G_files:
    _non_g_match = _non_g_id_pattern.search(i)
    if _non_g_match is None:
        # Name the offending path instead of failing on None[1].
        raise ValueError(f'no wgs_<digits> sample id found in path: {i}')
    df = pd.read_csv(i,sep='\t')
    df['Locus'] = df['Locus']+'.'+df['Chromosome'].astype(str)
    df['person_id']= _non_g_match[1].replace('wgs_','')
    _non_g_rows.append(df.pivot(columns='Locus',values='Allele',index='person_id'))

# Concatenate once instead of growing the frame inside the loop.
non_G = pd.concat(_non_g_rows) if _non_g_rows else pd.DataFrame()

In [None]:
# DQ haplotype flags for the non-G calls.
# Each entry: (flag column, DQA1 pattern, DQB1 pattern, promote homozygotes to 2?)
_non_g_dq_specs = [
    ('Dq2_5_1', 'DQA1[*]05:01', 'DQB1[*]02:01', True),
    ('Dq2_5_2', 'DQA1[*]05:05', 'DQB1[*]02:02', False),
    ('Dq2_2', 'DQA1[*]02:01', 'DQB1[*]02:02', True),
    ('Dq7_5', 'DQA1[*]05:05', 'DQB1[*]03:01', True),
    ('Dq8', 'DQA1[*]03:01|DQA1[*]03:02|DQA1[*]03:03', 'DQB1[*]03:02', True),
]

# First pass: 1 where each pattern matches at least one call of its locus, else 0.
for _flag, _dqa_pat, _dqb_pat, _promote in _non_g_dq_specs:
    _dqa_any = non_G['DQA1.1'].str.contains(_dqa_pat) | non_G['DQA1.2'].str.contains(_dqa_pat)
    _dqb_any = non_G['DQB1.1'].str.contains(_dqb_pat) | non_G['DQB1.2'].str.contains(_dqb_pat)
    non_G[_flag] = (_dqa_any & _dqb_any).astype(int)

# Second pass: 2 where both DQA1 calls and both DQB1 calls match.
for _flag, _dqa_pat, _dqb_pat, _promote in _non_g_dq_specs:
    if not _promote:
        continue
    _dqa_both = non_G['DQA1.1'].str.contains(_dqa_pat) & non_G['DQA1.2'].str.contains(_dqa_pat)
    _dqb_both = non_G['DQB1.1'].str.contains(_dqb_pat) & non_G['DQB1.2'].str.contains(_dqb_pat)
    non_G.loc[_dqa_both & _dqb_both, _flag] = 2


In [None]:
def _dq_genotype_non_g(row):
    """Return the DQ genotype label for one row of non-G haplotype flags.

    Rules are tried in order and the first match wins: homozygous haplotypes,
    then DQ2.5 trans, then heterozygous combinations, then 'X/X'.
    """
    if row['Dq2_5_1'] == 2:
        return 'DQ2.5/DQ2.5'
    if row['Dq2_2'] == 2:
        return 'DQ2.2/DQ2.2'
    if row['Dq7_5'] == 2:
        return 'DQ7.5/DQ7.5'
    if row['Dq8'] == 2:
        return 'DQ8/DQ8'
    if row['Dq2_5_2'] == 1:
        return 'DQ2.5 trans'
    if row['Dq2_5_1'] == 1:
        if row['Dq2_2'] == 1:
            return 'DQ2.5/DQ2.2'
        if row['Dq7_5'] == 1:
            return 'DQ2.5/DQ7.5'
        if row['Dq8'] == 1:
            return 'DQ2.5/DQ8'
        return 'DQ2.5/X'
    if row['Dq2_2'] == 1:
        if row['Dq7_5'] == 1:
            return 'DQ2.2/DQ7.5'
        if row['Dq8'] == 1:
            return 'DQ2.2/DQ8'
        return 'DQ2.2/X'
    if row['Dq7_5'] == 1:
        if row['Dq8'] == 1:
            return 'DQ7.5/DQ8'
        return 'DQ7.5/X'
    if row['Dq8'] == 1:
        return 'DQ8/X'
    return 'X/X'


non_G['genotype'] = non_G.apply(_dq_genotype_non_g, axis=1)


In [None]:
# Save the non-G table; index=False is not passed, so the index is written as the first column.
non_G.to_csv(f'{my_bucket}/data/hla_compare/hla_types_nonperfectG.csv')

In [None]:
# Save the G-group table including the DQ flags and genotype label (index not written).
hla_types.to_csv(f'{my_bucket}/data/hla_compare/hla_types_hla_la.csv',index=False)

In [None]:
# Reload the non-G table; the index written on save comes back as an ordinary column.
non_G = pd.read_csv(f'{my_bucket}/data/hla_compare/hla_types_nonperfectG.csv')

## Now aggregate the results of the three methods (tag, HIBAG, HLA-LA)

In [None]:
# Load the saved HLA-LA table and the HIBAG table from the workspace bucket.
hla_types=pd.read_csv(f'{my_bucket}/data/hla_compare/hla_types_hla_la.csv')
hibag_types = pd.read_csv(f'{my_bucket}/data/hibag_hla/hla_types_with_DQ.csv')

In [None]:
# Load the tag-based DQ calls and expose their 'haplotype' column as 'genotype'.
hla_tag = (
    pd.read_csv(f'{my_bucket}/data/hla_compare/dq_haplotypes_tag.csv')
    .rename(columns={'haplotype': 'genotype'})
)

In [None]:
# Use 'person_id' as the key name and discard the 'Unnamed: 0' column.
hibag_types = (
    hibag_types
    .rename(columns={'sample.id': 'person_id'})
    .drop(columns='Unnamed: 0')
)

In [None]:
# HLA-LA columns used in the comparison. Taken as an explicit copy because a
# later cell assigns into these columns; writing into a slice of hla_types would
# raise SettingWithCopyWarning and may not behave as intended.
hla_la_comp = hla_types[['person_id', 'A.1', 'A.2', 'B.1', 'B.2', 'C.1', 'C.2', 'DPB1.1', 'DPB1.2', 'DQA1.1', 'DQA1.2', 'DQB1.1', 'DQB1.2',
       'DRB1.1', 'DRB1.2','genotype']].copy()

# Prefix every column (including person_id) with the method name.
hla_la_comp.columns = 'hla-la ' + hla_la_comp.columns

In [None]:
# Tag-method genotype per person, relabelled so it stays distinguishable after merging.
hla_tag_comp = hla_tag[['person_id', 'genotype']].rename(columns={'genotype': 'tag genotype'})

In [None]:
# HIBAG columns used in the comparison, each prefixed with the method name.
_hibag_keep = ['person_id',
               'A.1', 'A.2', 'B.1', 'B.2', 'C.1', 'C.2',
               'DPB1.1', 'DPB1.2', 'DQA1.1', 'DQA1.2', 'DQB1.1', 'DQB1.2',
               'DRB1.1', 'DRB1.2',
               'genotype']
hibag_comp = hibag_types[_hibag_keep]
hibag_comp.columns = ['hibag ' + _name for _name in hibag_comp.columns]

In [None]:
# Reduce every HLA-LA allele call to its first '<digits>:<digits>' fields.
_two_field_pattern = re.compile(r'([0-9]+:[0-9]+)')


def _two_field_allele(allele):
    """Return the first '<digits>:<digits>' run in ``allele``, or NaN if it is missing or has none."""
    found = _two_field_pattern.search(allele) if isinstance(allele, str) else None
    return found[1] if found else np.nan


# Columns 1..14 are the allele calls; column 0 is the person_id and column 15 the genotype label.
for _allele_column in hla_la_comp.columns[1:15]:
    hla_la_comp[_allele_column] = hla_la_comp[_allele_column].map(_two_field_allele)

In [None]:
# Join the three methods on person_id: inner join tag↔HIBAG, then left join
# HLA-LA, and drop the duplicated prefixed key columns.
compare = (
    hla_tag_comp
    .merge(hibag_comp, left_on='person_id', right_on='hibag person_id')
    .merge(hla_la_comp, left_on='person_id', right_on='hla-la person_id', how='left')
    .drop(['hla-la person_id', 'hibag person_id'], axis=1)
)
compare

In [None]:
# Write the comparison table to the workspace bucket, then copy it into the current directory.
compare.to_csv(f'{my_bucket}/data/hla_compare/hla_la_hibag_tag_compare.csv',index=False)
!gsutil cp {my_bucket}/data/hla_compare/hla_la_hibag_tag_compare.csv .