# GWAS via REGENIE for PP

This uses dsub (as opposed to WDL + Cromwell) to submit bash scripts corresponding to REGENIE for GWAS.

Template for this code, i.e. the GWAS for LDL-C done by Bick et al., 2024: https://workbench.researchallofus.org/workspaces/aou-rw-5981f9dc/aouldlgwasregeniedsubctv6duplicate/analysis/preview/4.0_regenie_dsub_HP_TM.ipynb

Modified for my applications.

In [1]:
## Python Package Import
import sys
import os 
import numpy as np
import pandas as pd
from datetime import datetime

## Ensure dsub is up to date.
# Runs pip3 from the notebook to upgrade the dsub package each time this cell is executed.
!pip3 install --upgrade dsub



In [2]:
# Define environment variables and paths shared by the rest of the notebook.
# Export JOB_ID to the environment so that it's easier to use within %%bash cells.
# NOTE(review): LINE_COUNT_JOB_ID is only assigned by the later `%%bash --out LINE_COUNT_JOB_ID`
# cells; the recorded output of this cell shows the literal text "{LINE_COUNT_JOB_ID}" being
# exported - confirm whether JOB_ID is still needed.
%env JOB_ID={LINE_COUNT_JOB_ID}
## Workspace bucket, read from WORKSPACE_BUCKET (raises KeyError if the variable is unset).
my_bucket = os.environ['WORKSPACE_BUCKET']
## Setting for running dsub jobs
pd.set_option('display.max_colwidth', 0)

# Short user name: the part of OWNER_EMAIL before '@', with '.' replaced by '-'.
# os.getenv returns None when OWNER_EMAIL is unset, in which case .split raises AttributeError.
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')

# Export USER_NAME to the environment so that it's easier to use within %%bash cells.
%env USER_NAME={USER_NAME}

env: JOB_ID={LINE_COUNT_JOB_ID}
env: USER_NAME=jon126


## 1.1a PLINK QC Step

This section submits one dsub job per chromosome to run the preparatory PLINK QC step, which is phenotype-agnostic.

In [3]:
## MODIFY FOR FULL DATA RUN
# Job name for the PLINK QC step; it is passed to dsub as --name and is also a component of the
# results folder path built in the next cell.
JOB_NAME='1_regenie_plinkprep'

# Export JOB_NAME to the environment so that it's easier to use within %%bash cells.
%env JOB_NAME={JOB_NAME}

env: JOB_NAME=1_regenie_plinkprep


In [4]:
## Analysis results folder:
##   <WORKSPACE_BUCKET>/dsub/results/<JOB_NAME>/<USER_NAME>/<YYYYMMDD>
_folder_parts = [
    os.getenv('WORKSPACE_BUCKET'),
    'dsub',
    'results',
    JOB_NAME,
    USER_NAME,
    datetime.now().strftime('%Y%m%d'),
]
line_count_results_folder = os.path.join(*_folder_parts)

# Bare expression so the notebook displays the resulting path.
line_count_results_folder

'gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241015'

In [5]:
## Where the output files will go
output_files = os.path.join(line_count_results_folder, "results")
print(output_files)

OUTPUT_FILES = output_files

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env OUTPUT_FILES={OUTPUT_FILES}

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241015/results
env: OUTPUT_FILES=gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241015/results


In [16]:
# PLINK preparatory QC script for REGENIE (submitted by dsub once per chromosome).
#
# The script reads these environment variables, which the dsub cell sets:
#   bedfile / bimfile / famfile - PLINK input files (dsub --input)
#   chr                         - chromosome label used in the output file names
#   OUTPUT_PATH                 - output directory (dsub --output-recursive)
#
# A raw string is used so that the bash line continuations (backslash + newline) are written to
# the file exactly as shown, instead of being consumed by Python's string escape processing.
filename='1_regenie_plinkprep.sh'

script = r'''
set -o pipefail
set -o errexit

# Variant QC (--geno, --hwe) followed by LD pruning (--indep-pairwise).
plink \
    --bed "${bedfile}" \
    --bim "${bimfile}" \
    --fam "${famfile}" \
    --geno 0.01 \
    --hwe 1e-15 \
    --indep-pairwise 1000 100 0.8 \
    --write-snplist \
    --memory 14000 \
    --out "qc_ldpruned_snps_chr${chr}"

output_snplist="qc_ldpruned_snps_chr${chr}.snplist"
output_ldprune_in="qc_ldpruned_snps_chr${chr}.prune.in"
output_ldprune_out="qc_ldpruned_snps_chr${chr}.prune.out"

# Move the three result files into the directory that dsub copies back to the bucket.
mv -t "${OUTPUT_PATH}" -- "${output_snplist}" "${output_ldprune_in}" "${output_ldprune_out}"
'''

# Write the script next to the notebook; a later cell uploads it to the bucket.
with open(filename,'w') as fp:
    fp.write(script)


In [17]:
# Upload the PLINK QC script to <my_bucket>/dsub/scripts/ so that dsub can read it from the bucket.
!gsutil cp ./1_regenie_plinkprep.sh {my_bucket}/dsub/scripts/ 

Copying file://./1_regenie_plinkprep.sh [Content-Type=text/x-sh]...
/ [1 files][  527.0 B/  527.0 B]                                                
Operation completed over 1 objects/527.0 B.                                      


In [10]:
# Check the file is there: list every .sh script under <my_bucket>/dsub/scripts/.
!gsutil ls {my_bucket}/dsub/scripts/*.sh

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/scripts/1_regenie_plinkprep.sh
gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/scripts/2_regenie_aous_pp.sh


In [9]:
# !gsutil -u $GOOGLE_PROJECT ls -lh gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/

423.72 GiB  2023-09-26T17:40:39Z  gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr1.bed
356.34 MiB  2023-09-26T13:42:19Z  gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr1.bim
  4.45 MiB  2023-09-26T13:41:54Z  gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr1.fam
272.02 GiB  2023-09-26T16:05:59Z  gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr10.bed
238.88 MiB  2023-09-26T13:42:14Z  gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr10.bim
  4.45 MiB  2023-09-26T13:41:52Z  gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr10.fam
253.58 GiB  2023-09-26T15:59:01Z  gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v

In [9]:
%%bash --out LINE_COUNT_JOB_ID
# Submit the PLINK QC / LD-pruning script to dsub, one job per chromosome.
# `--out` stores this cell's stdout in the Python variable LINE_COUNT_JOB_ID.
# Reads from the environment: OWNER_EMAIL, GOOGLE_PROJECT, WORKSPACE_BUCKET, JOB_NAME, OUTPUT_FILES.

# Get a shorter username to leave more characters for the job name.
DSUB_USER_NAME="${OWNER_EMAIL%%@*}"

# For AoU RWB projects network name is "network".
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork

MACHINE_TYPE="n2-standard-4"
BASH_SCRIPT="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/scripts/1_regenie_plinkprep.sh" # From the gsutil cp cell
PLINK_BED_DIR="gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed"

# Resolved once instead of once per submitted job.
SERVICE_ACCOUNT="$(gcloud config get-value account)"

# The loop covers LOWER .. UPPER-1 (UPPER itself is not included).
# To run across chromosomes 1-22, set LOWER=1 and UPPER=23.
# To run a single chromosome, set LOWER to that chromosome and UPPER to the next number.

# LOWER=1
# UPPER=23

# Test on 1 chromosome first
LOWER=12
UPPER=13

# NOTE(review): the jobs are preemptible and no retry option is passed; the dstat output recorded
# in this notebook shows "worker was terminated" failures - consider dsub's retry options.
for ((chromo = LOWER; chromo < UPPER; chromo++)); do
    plink_prefix="${PLINK_BED_DIR}/acaf_threshold.chr${chromo}"
    dsub \
        --provider google-cls-v2 \
        --user-project "${GOOGLE_PROJECT}" \
        --project "${GOOGLE_PROJECT}" \
        --image "us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.14" \
        --network "${AOU_NETWORK}" \
        --subnetwork "${AOU_SUBNETWORK}" \
        --service-account "${SERVICE_ACCOUNT}" \
        --user "${DSUB_USER_NAME}" \
        --regions us-central1 \
        --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
        --preemptible \
        --disk-size 1000 \
        --boot-disk-size 100 \
        --machine-type "${MACHINE_TYPE}" \
        --name "${JOB_NAME}" \
        --script "${BASH_SCRIPT}" \
        --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
        --input bedfile="${plink_prefix}.bed" \
        --input bimfile="${plink_prefix}.bim" \
        --input famfile="${plink_prefix}.fam" \
        --env chr="${chromo}" \
        --output-recursive OUTPUT_PATH="${OUTPUT_FILES}/${chromo}"
done

Job properties:
  job-id: 1-regenie---jon126--241010-120423-08
  job-name: 1-regenie-plinkprep
  user-id: jon126
Provider internal-id (operation): projects/282373700333/locations/us-central1/operations/11449612967915098292
Launched job-id: 1-regenie---jon126--241010-120423-08
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --location us-central1 --jobs '1-regenie---jon126--241010-120423-08' --users 'jon126' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --location us-central1 --jobs '1-regenie---jon126--241010-120423-08' --users 'jon126'


In [7]:
# Check the status of your job submissions:
# queries all jobs (--jobs '*') of user jon126 in any status (--status '*') with --age 3d.
!dstat \
    --provider google-cls-v2 \
    --project terra-vpc-sc-4e1b6fe8 \
    --location us-central1 \
    --jobs '*' \
    --users 'jon126' \
    --status '*' \
    --age 3d

Job Name         Status                                      Last Update
---------------  ------------------------------------------  --------------
1-regenie-pl...  Success                                     10-10 22:13:50
1-regenie-pl...  Success                                     10-10 23:54:04
1-regenie-pl...  Success                                     10-09 18:52:06
1-regenie-pl...  worker was terminated                       10-09 14:31:17
1-regenie-pl...  Success                                     10-09 21:39:26
1-regenie-pl...  worker was terminated                       10-09 17:44:43
1-regenie-pl...  Success                                     10-10 00:47:23
1-regenie-pl...  Success                                     10-08 18:21:34
1-regenie-pl...  Success                                     10-08 18:32:59
1-regenie-pl...  Success                                     10-08 20:02:00
1-regenie-pl...  Success                                     10-08 20:14:18
1-

In [7]:
# Check the status of your job submissions:
# restricted to jobs with status FAILURE and --age 2d, with full details (--full).

!dstat \
    --provider google-cls-v2 \
    --project terra-vpc-sc-4e1b6fe8 \
    --location us-central1 \
    --jobs '*' \
    --users 'jon126' \
    --status 'FAILURE' \
    --full \
    --age 2d

- create-time: '2024-10-09 11:43:14.656295'
  dsub-version: v0-5-0
  end-time: '2024-10-09 14:31:17.649095'
  envs:
    GOOGLE_PROJECT: terra-vpc-sc-4e1b6fe8
    chr: '12'
  events:
  - name: start
    start-time: 2024-10-09 11:43:21.506790+00:00
  - name: pulling-image
    start-time: 2024-10-09 11:43:55.122212+00:00
  - name: localizing-files
    start-time: 2024-10-09 11:57:43.376502+00:00
  - name: running-docker
    start-time: 2024-10-09 12:49:15.598245+00:00
  - name: 'Execution failed: worker was terminated'
    start-time: 2024-10-09 14:31:17.112276+00:00
  input-recursives: {}
  inputs:
    bedfile: gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr12.bed
    bimfile: gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr12.bim
    famfile: gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr1

In [7]:
#Look at log file
# !gsutil cp gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/logs/1-regenie-plinkprep/jon126/20241008/150958/1-regenie---jon126--241008-150959-63-task-None.log ./

Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/logs/1-regenie-plinkprep/jon126/20241008/150958/1-regenie---jon126--241008-150959-63-task-None.log...
/ [1 files][ 77.2 KiB/ 77.2 KiB]                                                
Operation completed over 1 objects/77.2 KiB.                                     


In [15]:
#Cancel running jobs
# !ddel --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --jobs '*' --users 'jon126' --location us-central1

Delete running jobs:
  user:
    {'jon126'}

  job-id:
    ['*']

Found 20 tasks to delete.
20 jobs deleted


In [16]:
#Move all the plinkprep output folders to the same parent folder in gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/

# !gsutil -m mv -r gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241008/* gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/
# !gsutil -m mv -r gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241009/* gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/
# !gsutil -m mv -r gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241010/* gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/

Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241009/results/10/qc_ldpruned_snps_chr10.prune.in [Content-Type=application/octet-stream]...
Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241009/results/10/qc_ldpruned_snps_chr10.prune.out [Content-Type=application/octet-stream]...
Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241009/results/10/qc_ldpruned_snps_chr10.snplist [Content-Type=application/octet-stream]...
Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241009/results/17/qc_ldpruned_snps_chr17.prune.in [Content-Type=application/octet-stream]...
Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/20241009/results/17/qc_ldpruned_snps_chr17.prune.out [Content-Type=application/octet-stream]...
Copying gs://fc-secure-7

In [19]:
# Evaluate the prune.in file from the plinkprep step:
# copy the chromosome 21 prune.in list from the bucket into the current directory.

!gsutil cp gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/21/qc_ldpruned_snps_chr21.prune.in ./

Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/21/qc_ldpruned_snps_chr21.prune.in...
/ [1 files][  9.4 MiB/  9.4 MiB]                                                
Operation completed over 1 objects/9.4 MiB.                                      


## 1.1b Plinkprep2 Step

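This step builds a follow-up SNP list: starting from the LD-pruned variants of step 1.1a, it keeps only those variants with a minor allele count (MAC) of at least 50 among the individuals who have a non-NA value for the phenotype.

Assuming roughly 10,000 phenotyped individuals (20,000 alleles), a MAC of 50 corresponds to a MAF of about 0.25% (50/20,000; 50/10,000 = 0.5% only if 10,000 is the number of alleles), so this is a reasonable filter.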

In [9]:
## MODIFY FOR FULL DATA RUN
# JOB_NAME: dsub job name, also a component of the results folder path.
# PHENOTYPE: passed to the plinkprep2 script as `pheno` (the phenotype column to look up).
# PHENO_FILEPATH: phenotype file in the bucket, passed to dsub as the pheno_file input.
JOB_NAME='plinkprep2'
PHENOTYPE='NTproBNP'
PHENO_FILEPATH="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_pheno.tsv"

# Export these Python variables to the environment so that they're easier to use within %%bash cells.
%env JOB_NAME={JOB_NAME}
%env PHENOTYPE={PHENOTYPE}
%env PHENO_FILEPATH={PHENO_FILEPATH}

env: JOB_NAME=plinkprep2
env: PHENOTYPE=NTproBNP
env: PHENO_FILEPATH=gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_pheno.tsv


In [10]:
## Analysis Results Folder 
line_count_results_folder = os.path.join(
    os.getenv('WORKSPACE_BUCKET'),
    'dsub',
    'results',
    JOB_NAME,
    PHENOTYPE,
    USER_NAME,
    datetime.now().strftime('%Y%m%d'))

line_count_results_folder

## Where the output files will go
output_files = os.path.join(line_count_results_folder, "results")
print(output_files)

OUTPUT_FILES = output_files

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env OUTPUT_FILES={OUTPUT_FILES}

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/plinkprep2/NTproBNP/jon126/20241022/results
env: OUTPUT_FILES=gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/plinkprep2/NTproBNP/jon126/20241022/results


In [16]:
# Phenotype-specific PLINK preparatory script (submitted by dsub once per chromosome).
#
# The script reads these environment variables, which the dsub cell sets:
#   bedfile / bimfile / famfile - PLINK input files
#   pheno_file                  - tab-separated phenotype file with a header row
#   step1_snplist               - variant list from the first plinkprep step
#   pheno, chr                  - phenotype column name and chromosome label
#   OUTPUT_PATH                 - output directory (dsub --output-recursive)
#
# A raw string is used so that the bash line continuations and the awk '\t' separator are
# written to the file exactly as shown, instead of being processed as Python escapes.
filename='1_regenie_plinkprep2.sh'

script = r'''
set -o pipefail
set -o errexit

# Write the first two columns of every sample whose value in the phenotype column is not "NA".
# Abort if the header has no column named like the phenotype: with col unset, awk would test the
# whole line against "NA" and silently keep every sample.
awk -v var="${pheno}" -F'\t' '
    NR == 1 {
        for (i = 1; i <= NF; i++) if ($i == var) col = i
        if (!col) {
            print "phenotype column not found in header: " var > "/dev/stderr"
            exit 1
        }
        next
    }
    $col != "NA" { print $1, $2 }
' "${pheno_file}" > nonNA_plinkids.txt

# Keep the non-NA samples, restrict to the step 1 variant list and apply the MAC >= 50 filter.
plink \
    --bed "${bedfile}" \
    --bim "${bimfile}" \
    --fam "${famfile}" \
    --keep nonNA_plinkids.txt \
    --extract "${step1_snplist}" \
    --write-snplist \
    --mac 50 \
    --memory 14000 \
    --out "${pheno}_qc_ldpruned_MAC50filtered_chr${chr}"

output_snplist="${pheno}_qc_ldpruned_MAC50filtered_chr${chr}.snplist"
mv -t "${OUTPUT_PATH}" -- "${output_snplist}"
'''

# Write the script next to the notebook; the following lines of the cell upload it.
with open(filename,'w') as fp:
    fp.write(script)

# Upload the plinkprep2 script to <my_bucket>/dsub/scripts/ so that dsub can read it from the bucket.
!gsutil cp ./1_regenie_plinkprep2.sh {my_bucket}/dsub/scripts/

Copying file://./1_regenie_plinkprep2.sh [Content-Type=text/x-sh]...
/ [1 files][  557.0 B/  557.0 B]                                                
Operation completed over 1 objects/557.0 B.                                      


In [26]:
%%bash --out LINE_COUNT_JOB_ID
# Submit the phenotype-specific plinkprep2 script to dsub, one job per chromosome.
# `--out` stores this cell's stdout in the Python variable LINE_COUNT_JOB_ID.
# Reads from the environment: OWNER_EMAIL, GOOGLE_PROJECT, WORKSPACE_BUCKET, JOB_NAME,
# PHENOTYPE, PHENO_FILEPATH, OUTPUT_FILES.

# Get a shorter username to leave more characters for the job name.
DSUB_USER_NAME="${OWNER_EMAIL%%@*}"

# For AoU RWB projects network name is "network".
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork

MACHINE_TYPE="n2-standard-4"
BASH_SCRIPT="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/scripts/1_regenie_plinkprep2.sh" # From the gsutil cp above
PLINK_BED_DIR="gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed"
STEP1_RESULTS_DIR="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results"

# Resolved once instead of once per submitted job.
SERVICE_ACCOUNT="$(gcloud config get-value account)"

# The loop covers LOWER .. UPPER-1 (UPPER itself is not included).
# To run across chromosomes 1-22, set LOWER=1 and UPPER=23.
# To run a single chromosome, set LOWER to that chromosome and UPPER to the next number.

# Currently set to run chromosomes 1-22.
LOWER=1
UPPER=23

# All chromosomes share one OUTPUT_PATH; the output file names carry the chromosome label.
for ((chromo = LOWER; chromo < UPPER; chromo++)); do
    plink_prefix="${PLINK_BED_DIR}/acaf_threshold.chr${chromo}"
    dsub \
        --provider google-cls-v2 \
        --user-project "${GOOGLE_PROJECT}" \
        --project "${GOOGLE_PROJECT}" \
        --image "us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.14" \
        --network "${AOU_NETWORK}" \
        --subnetwork "${AOU_SUBNETWORK}" \
        --service-account "${SERVICE_ACCOUNT}" \
        --user "${DSUB_USER_NAME}" \
        --regions us-central1 \
        --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
        --preemptible \
        --disk-size 1000 \
        --boot-disk-size 100 \
        --machine-type "${MACHINE_TYPE}" \
        --name "${JOB_NAME}" \
        --script "${BASH_SCRIPT}" \
        --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
        --input bedfile="${plink_prefix}.bed" \
        --input bimfile="${plink_prefix}.bim" \
        --input famfile="${plink_prefix}.fam" \
        --input step1_snplist="${STEP1_RESULTS_DIR}/${chromo}/qc_ldpruned_snps_chr${chromo}.prune.in" \
        --input pheno_file="${PHENO_FILEPATH}" \
        --env chr="${chromo}" \
        --env pheno="${PHENOTYPE}" \
        --output-recursive OUTPUT_PATH="${OUTPUT_FILES}"
done

# N.B. Make sure there are no spaces after a trailing \ - otherwise the dsub command breaks.

Job properties:
  job-id: plinkprep2--jon126--241022-154010-03
  job-name: plinkprep2
  user-id: jon126
Provider internal-id (operation): projects/282373700333/locations/us-central1/operations/5677982850354247549
Launched job-id: plinkprep2--jon126--241022-154010-03
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --location us-central1 --jobs 'plinkprep2--jon126--241022-154010-03' --users 'jon126' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --location us-central1 --jobs 'plinkprep2--jon126--241022-154010-03' --users 'jon126'
Job properties:
  job-id: plinkprep2--jon126--241022-154012-15
  job-name: plinkprep2
  user-id: jon126
Provider internal-id (operation): projects/282373700333/locations/us-central1/operations/10068118253030462636
Launched job-id: plinkprep2--jon126--241022-154012-15
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --location

In [29]:
# Check the status of your job submissions:
# queries all jobs (--jobs '*') of user jon126 in any status (--status '*') with --age 1d.
!dstat \
    --provider google-cls-v2 \
    --project terra-vpc-sc-4e1b6fe8 \
    --location us-central1 \
    --jobs '*' \
    --users 'jon126' \
    --status '*' \
    --age 1d

Job Name         Status                                      Last Update
---------------  ------------------------------------------  --------------
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:24
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:24
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:30
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:27
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:47
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:26
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:13
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:13
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:15
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:24
plinkprep2       Pulling "us.gcr.io/broad-dsp-gcr-public...  10-22 15:41:12
pl

In [27]:
# Check the status of your job submissions:
# restricted to jobs with status SUCCESS and --age 1d, with full details (--full).
!dstat \
    --provider google-cls-v2 \
    --project terra-vpc-sc-4e1b6fe8 \
    --location us-central1 \
    --jobs '*' \
    --users 'jon126' \
    --status 'SUCCESS' \
    --full \
    --age 1d

- create-time: '2024-10-22 14:35:28.001853'
  dsub-version: v0-5-0
  end-time: '2024-10-22 15:20:00.342526'
  envs:
    GOOGLE_PROJECT: terra-vpc-sc-4e1b6fe8
    chrom: '21'
    pheno: NTproBNP
  events:
  - name: start
    start-time: 2024-10-22 14:35:40.001515+00:00
  - name: pulling-image
    start-time: 2024-10-22 14:36:18.962501+00:00
  - name: localizing-files
    start-time: 2024-10-22 14:51:45.029876+00:00
  - name: running-docker
    start-time: 2024-10-22 15:13:39.643414+00:00
  - name: delocalizing-files
    start-time: 2024-10-22 15:19:55.774982+00:00
  - name: ok
    start-time: 2024-10-22 15:20:00.342526+00:00
  input-recursives: {}
  inputs:
    bedfile: gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr21.bed
    bimfile: gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr21.bim
    famfile: gs://fc-aou-datasets-controlled/v7/wgs/short

In [28]:
# Copy the log of one plinkprep2 job from the bucket into the current directory for inspection.
!gsutil cp gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/logs/plinkprep2/jon126/20241022/143526/plinkprep2--jon126--241022-143527-86-task-None.log ./

Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/logs/plinkprep2/jon126/20241022/143526/plinkprep2--jon126--241022-143527-86-task-None.log...
/ [1 files][ 16.1 KiB/ 16.1 KiB]                                                
Operation completed over 1 objects/16.1 KiB.                                     


In [15]:
#Cancel running jobs
# !ddel --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --jobs '*' --users 'jon126' --location us-central1

Delete running jobs:
  user:
    {'jon126'}

  job-id:
    ['*']

Found 1 tasks to delete.
1 job deleted


In [25]:
# List the plinkprep2 results written for NTproBNP on 20241022.
!gsutil ls gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/plinkprep2/NTproBNP/jon126/20241022/results

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/plinkprep2/NTproBNP/jon126/20241022/results/NTproBNP_qc_ldpruned_MAC50filtered_chr.snplist
CommandException: "rm" command does not support "file://" URLs. Did you mean to use a gs:// URL?


## 1.2 REGENIE HCM GWAS Step

In [3]:
## MODIFY FOR FULL DATA RUN 
JOB_NAME='REGENIE_hcm'
PHENOTYPE='hcm'
TRAIT_TYPE='cc'
PHENO_FILEPATH="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/hcm_regenie_pheno.tsv"
COV_FILEPATH="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/hcm_regenie_cov.tsv"

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env JOB_NAME={JOB_NAME}
%env PHENOTYPE={PHENOTYPE}
%env TRAIT_TYPE={TRAIT_TYPE}
%env PHENO_FILEPATH={PHENO_FILEPATH}
%env COV_FILEPATH={COV_FILEPATH}

env: JOB_NAME=REGENIE_hcm
env: PHENOTYPE=hcm
env: TRAIT_TYPE=cc
env: PHENO_FILEPATH=gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/hcm_regenie_pheno.tsv
env: COV_FILEPATH=gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/hcm_regenie_cov.tsv


In [27]:
#Check the pheno and covar files

# !gsutil cp 'gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/hcm_regenie_pheno.tsv' ./
# !gsutil cp 'gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/hcm_regenie_cov.tsv' ./

Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/hcm_regenie_pheno.tsv...
/ [1 files][  5.2 MiB/  5.2 MiB]                                                
Operation completed over 1 objects/5.2 MiB.                                      
Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/hcm_regenie_cov.tsv...
\ [1 files][ 62.9 MiB/ 62.9 MiB]                                                
Operation completed over 1 objects/62.9 MiB.                                     


In [4]:
## Analysis results folder:
##   <WORKSPACE_BUCKET>/dsub/results/<JOB_NAME>/<USER_NAME>/<YYYYMMDD>
_folder_parts = [
    os.getenv('WORKSPACE_BUCKET'),
    'dsub',
    'results',
    JOB_NAME,
    USER_NAME,
    datetime.now().strftime('%Y%m%d'),
]
line_count_results_folder = os.path.join(*_folder_parts)

# Bare expression so the notebook displays the resulting path.
line_count_results_folder

'gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/REGENIE_hcm/jon126/20241021'

In [5]:
## Where the output files will go
output_files = os.path.join(line_count_results_folder, "results")
print(output_files)

OUTPUT_FILES = output_files

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env OUTPUT_FILES={OUTPUT_FILES}

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/REGENIE_hcm/jon126/20241021/results
env: OUTPUT_FILES=gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/REGENIE_hcm/jon126/20241021/results


In [45]:
# REGENIE step 1 + step 2 script for the HCM GWAS (submitted by dsub once per chromosome).
# Note that no variant filtering beyond --minMAC is done in the REGENIE calls because the
# acaf_threshold call set is assumed to have been filtered already.
#
# The script reads these environment variables, which the dsub cell sets:
#   bedfile / bimfile / famfile         - PLINK input files
#   pheno_file, cov_file, step1_snplist - phenotype, covariate and step 1 variant-list inputs
#   trait                               - REGENIE trait-type flag without the leading dashes
#   chrom, pheno                        - chromosome label and phenotype column name
#   OUTPUT_PATH                         - output directory (dsub --output-recursive)
#
# A raw string is used so that the bash line continuations are written to the file exactly as
# shown. A backslash followed by a space is NOT a continuation: it ends the command early.
#
# NOTE(review): step 1 is fitted on a single chromosome's variants here, while REGENIE documents
# step 1 as a whole-genome model with leave-one-chromosome-out predictions - confirm intended.
filename2='2_regenie_aous_hcm.sh'

script = r'''
set -o pipefail
set -o errexit

# Echo the input paths into the job log.
echo "${bedfile}"
echo "${bimfile}"
echo "${famfile}"

# PLINK prefix = path of the .bed input without its extension. dsub mirrors the bucket layout
# when localizing inputs, so the .bim/.fam inputs sit next to the .bed file.
bed_prefix="${bedfile%.bed}"

# Step 1: fit the regression model on the variant list and write the prediction list.
regenie \
    --step 1 \
    --bed "${bed_prefix}" \
    --phenoFile "${pheno_file}" \
    --phenoCol "${pheno}" \
    --covarFile "${cov_file}" \
    --catCovarList sex \
    --covarColList "age,ht,wt,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10" \
    --bsize 1000 \
    --minMAC 50 \
    --extract "${step1_snplist}" \
    --verbose \
    --"${trait}" \
    --ref-first \
    --out "${pheno}_step1_chr${chrom}"

# Step 2: association testing using the step 1 predictions.
regenie \
    --step 2 \
    --bed "${bed_prefix}" \
    --phenoFile "${pheno_file}" \
    --phenoCol "${pheno}" \
    --covarFile "${cov_file}" \
    --catCovarList sex \
    --covarColList "age,ht,wt,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10" \
    --pred "${pheno}_step1_chr${chrom}_pred.list" \
    --bsize 1000 \
    --minMAC 50 \
    --verbose \
    --"${trait}" \
    --ref-first \
    --out "${pheno}_step2_chr${chrom}"

# REGENIE names step 2 results <out prefix>_<phenotype name>.regenie.
regenie_results="${pheno}_step2_chr${chrom}_${pheno}.regenie"
mv -t "${OUTPUT_PATH}" -- "${regenie_results}"
'''

# Write the script next to the notebook; a later cell uploads it to the bucket.
with open(filename2,'w') as fp:
    fp.write(script)

In [46]:
# Upload the HCM REGENIE script to <my_bucket>/dsub/scripts/ so that dsub can read it from the bucket.
!gsutil cp ./2_regenie_aous_hcm.sh {my_bucket}/dsub/scripts/

Copying file://./2_regenie_aous_hcm.sh [Content-Type=text/x-sh]...
/ [1 files][  1.2 KiB/  1.2 KiB]                                                
Operation completed over 1 objects/1.2 KiB.                                      


In [None]:
# Check the files are there: list every .sh script under <my_bucket>/dsub/scripts/.
!gsutil ls {my_bucket}/dsub/scripts/*.sh

In [None]:
%%bash --out LINE_COUNT_JOB_ID
# Submit the HCM REGENIE script to dsub, one job per chromosome.
# `--out` stores this cell's stdout in the Python variable LINE_COUNT_JOB_ID.
# Reads from the environment: OWNER_EMAIL, GOOGLE_PROJECT, WORKSPACE_BUCKET,
# ARTIFACT_REGISTRY_DOCKER_REPO, JOB_NAME, PHENOTYPE, TRAIT_TYPE, PHENO_FILEPATH, COV_FILEPATH,
# OUTPUT_FILES.

# Get a shorter username to leave more characters for the job name.
DSUB_USER_NAME="${OWNER_EMAIL%%@*}"

# For AoU RWB projects network name is "network".
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork

MACHINE_TYPE="n2-standard-4"
BASH_SCRIPT="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/scripts/2_regenie_aous_hcm.sh" # From the gsutil cp cell
PLINK_BED_DIR="gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed"
STEP1_RESULTS_DIR="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results"

# Resolved once instead of once per submitted job.
SERVICE_ACCOUNT="$(gcloud config get-value account)"

# The loop covers LOWER .. UPPER-1 (UPPER itself is not included).
# To run across chromosomes 1-22, set LOWER=1 and UPPER=23.
# To run a single chromosome, set LOWER to that chromosome and UPPER to the next number.

# LOWER=1
# UPPER=23

# Test on 1 chromosome first
LOWER=21
UPPER=22
for ((chromo = LOWER; chromo < UPPER; chromo++)); do
    plink_prefix="${PLINK_BED_DIR}/acaf_threshold.chr${chromo}"
    dsub \
        --provider google-cls-v2 \
        --user-project "${GOOGLE_PROJECT}" \
        --project "${GOOGLE_PROJECT}" \
        --image "${ARTIFACT_REGISTRY_DOCKER_REPO}/skoyamamd/regenie_3.4.1:latest" \
        --network "${AOU_NETWORK}" \
        --subnetwork "${AOU_SUBNETWORK}" \
        --service-account "${SERVICE_ACCOUNT}" \
        --user "${DSUB_USER_NAME}" \
        --regions us-central1 \
        --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
        --preemptible \
        --disk-size 1000 \
        --boot-disk-size 100 \
        --machine-type "${MACHINE_TYPE}" \
        --name "${JOB_NAME}" \
        --script "${BASH_SCRIPT}" \
        --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
        --input bedfile="${plink_prefix}.bed" \
        --input bimfile="${plink_prefix}.bim" \
        --input famfile="${plink_prefix}.fam" \
        --input pheno_file="${PHENO_FILEPATH}" \
        --input cov_file="${COV_FILEPATH}" \
        --input step1_snplist="${STEP1_RESULTS_DIR}/${chromo}/qc_ldpruned_snps_chr${chromo}.prune.in" \
        --env trait="${TRAIT_TYPE}" \
        --env chrom="${chromo}" \
        --env pheno="${PHENOTYPE}" \
        --output-recursive OUTPUT_PATH="${OUTPUT_FILES}/${chromo}"
done

# N.B. Make sure there are no spaces after a trailing \ - otherwise the dsub command breaks.

## REGENIE  PP GWAS Step

### DSub Parameter Setting

This requires modification for each different phenotype you run.

In [14]:
## MODIFY FOR FULL DATA RUN
# JOB_NAME: dsub job name, also a component of the results folder path.
# PHENOTYPE: phenotype column, passed to the REGENIE script as `pheno`.
# TRAIT_TYPE: passed to the REGENIE script as `trait` and used there as the flag --<trait>.
# PHENO_FILEPATH / COV_FILEPATH: phenotype and covariate files in the bucket.
JOB_NAME='REGENIE_ntprobnp'
PHENOTYPE='NTproBNP' #NTproBNP or NPPB or TNNI3 or TNNT2
TRAIT_TYPE='qt'
PHENO_FILEPATH="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_pheno.tsv"
COV_FILEPATH="gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_cov.tsv"

# Export these Python variables to the environment so that they're easier to use within %%bash cells.
%env JOB_NAME={JOB_NAME}
%env PHENOTYPE={PHENOTYPE}
%env TRAIT_TYPE={TRAIT_TYPE}
%env PHENO_FILEPATH={PHENO_FILEPATH}
%env COV_FILEPATH={COV_FILEPATH}

env: JOB_NAME=REGENIE_ntprobnp
env: PHENOTYPE=NTproBNP
env: TRAIT_TYPE=qt
env: PHENO_FILEPATH=gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_pheno.tsv
env: COV_FILEPATH=gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_cov.tsv


In [25]:
#Check the .fam file to get the correct FID and IID format 

# !gsutil -u $GOOGLE_PROJECT cp gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr21.fam ./

# !gsutil cp gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_pheno.tsv ./
# !gsutil cp gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_cov.tsv ./

Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_cov.tsv...
- [1 files][ 59.3 MiB/ 59.3 MiB]                                                
Operation completed over 1 objects/59.3 MiB.                                     


In [15]:
## Analysis results folder:
##   <WORKSPACE_BUCKET>/dsub/results/<JOB_NAME>/<USER_NAME>/<YYYYMMDD>
_folder_parts = [
    os.getenv('WORKSPACE_BUCKET'),
    'dsub',
    'results',
    JOB_NAME,
    USER_NAME,
    datetime.now().strftime('%Y%m%d'),
]
line_count_results_folder = os.path.join(*_folder_parts)

# Bare expression so the notebook displays the resulting path.
line_count_results_folder

'gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/REGENIE_ntprobnp/jon126/20241021'

In [16]:
## Where the output files will go
output_files = os.path.join(line_count_results_folder, "results")
print(output_files)

OUTPUT_FILES = output_files

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env OUTPUT_FILES={OUTPUT_FILES}

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/REGENIE_ntprobnp/jon126/20241021/results
env: OUTPUT_FILES=gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/REGENIE_ntprobnp/jon126/20241021/results


### REGENIE Bash Script

This cell writes the REGENIE .sh script to the local Jupyter disk and then uploads it to the GCP bucket so that dsub can run it.

In [47]:
# Writes the REGENIE bash script (step 1 + step 2 for one chromosome) to the
# local Jupyter disk. dsub runs this script once per chromosome and supplies:
#   inputs : bedfile, bimfile, famfile, pheno_file, cov_file, step1_snplist
#   envs   : chrom, pheno, trait
#   output : OUTPUT_PATH (directory that dsub copies back to the bucket)
filename2='2_regenie_aous_pp.sh'

# Raw string on purpose: in a normal ''' string Python treats a backslash
# followed by a newline as a line continuation and drops both, so the written
# script would lose its "\" continuations and collapse each command onto one line.
script = r'''
set -o pipefail
set -o errexit

# Localised paths of the PLINK inputs, echoed so they show up in the dsub log.
echo "${bedfile}"
echo "${bimfile}"
echo "${famfile}"

# PLINK prefix = localised .bed path without its extension. The matching
# .bim/.fam come from the same bucket folder, so dsub places them alongside it.
bed_prefix="${bedfile%.bed}"

# Quantitative covariates shared by step 1 and step 2.
covar_cols="ht,wt,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,age_${pheno}"

# NOTE(review): step 1 is fitted on this single chromosome's LD-pruned SNPs
# only; confirm that is intended rather than a genome-wide step 1.
regenie \
    --step 1 \
    --bed "${bed_prefix}" \
    --phenoFile "${pheno_file}" \
    --phenoCol "${pheno}" \
    --covarFile "${cov_file}" \
    --catCovarList "sex" \
    --covarColList "${covar_cols}" \
    --bsize 1000 \
    --minMAC 50 \
    --extract "${step1_snplist}" \
    --verbose \
    --"${trait}" \
    --apply-rint \
    --ref-first \
    --out "${pheno}"_step1_chr"${chrom}"

#regenie pt 2
regenie \
    --step 2 \
    --bed "${bed_prefix}" \
    --phenoFile "${pheno_file}" \
    --phenoCol "${pheno}" \
    --covarFile "${cov_file}" \
    --catCovarList "sex" \
    --covarColList "${covar_cols}" \
    --pred "${pheno}"_step1_chr"${chrom}"_pred.list \
    --bsize 1000 \
    --minMAC 50 \
    --verbose \
    --"${trait}" \
    --apply-rint \
    --ref-first \
    --out "${pheno}"_step2_chr"${chrom}"

# REGENIE writes step-2 results to "<--out prefix>_<phenotype>.regenie".
regenie_results="${pheno}_step2_chr${chrom}_${pheno}.regenie"
mv "${regenie_results}" -t "${OUTPUT_PATH}"
'''

with open(filename2,'w') as fp:
    fp.write(script)

In [48]:
#Upload to GCP Bucket
# Copies the script written by the previous cell into {my_bucket}/dsub/scripts/,
# which is where the dsub submission cell's BASH_SCRIPT points.
!gsutil cp ./2_regenie_aous_pp.sh {my_bucket}/dsub/scripts/

Copying file://./2_regenie_aous_pp.sh [Content-Type=text/x-sh]...
/ [1 files][  1.2 KiB/  1.2 KiB]                                                
Operation completed over 1 objects/1.2 KiB.                                      


In [35]:
#Check the script files are there
# Lists every .sh object under {my_bucket}/dsub/scripts/.
!gsutil ls {my_bucket}/dsub/scripts/*.sh

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/scripts/1_regenie_plinkprep.sh
gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/scripts/2_regenie_aous_pp.sh


In [None]:
#Check for the input files
# Lists the .tsv objects under {my_bucket}/PP/DATA/ (the folder that
# PHENO_FILEPATH and COV_FILEPATH point into).
!gsutil ls {my_bucket}/PP/DATA/*.tsv

In [21]:
#Check for the ldpruned snplist files
# Recursive listing of the PLINK prep results, one sub-folder per chromosome.
# The dsub submission cell passes each chromosome's .prune.in file as step1_snplist.
!gsutil ls -r {my_bucket}/dsub/results/1_regenie_plinkprep/jon126/results/

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/:

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/1/:
gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/1/qc_ldpruned_snps_chr1.prune.in
gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/1/qc_ldpruned_snps_chr1.prune.out
gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/1/qc_ldpruned_snps_chr1.snplist

gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/10/:
gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/10/qc_ldpruned_snps_chr10.prune.in
gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/results/1_regenie_plinkprep/jon126/results/10/qc_ldpruned_snps_chr10.prune.out
gs://fc-secure-79

### Dsub Submission Script for REGENIE

Convention used in the submission cell below: all `--input` file paths are wrapped in double quotes, 
whereas `--env` environment variables (for dsub) are NOT wrapped in quotes, e.g. `--env chrom=${chromo}`. So if you need to pass a Bash environment variable through `--env` (in dsub), write it without the double quotes, e.g. `--env GOOGLE_PROJECT=${GOOGLE_PROJECT}`.

Bash environment variables are referenced in the `"${...}"` format.

In [18]:
# Prints the registry prefix that the dsub --image argument is built from.
!echo "${ARTIFACT_REGISTRY_DOCKER_REPO}" #Base location for public Docker images via Dockerhub

us-central1-docker.pkg.dev/all-of-us-rw-prod/aou-rw-gar-remote-repo-docker-prod


In [49]:
%%bash --out LINE_COUNT_JOB_ID
# Submits one dsub job per chromosome. Each job runs the REGENIE script
# (2_regenie_aous_pp.sh) against that chromosome's PLINK bed/bim/fam files.
#
# Environment read by this cell: OWNER_EMAIL, GOOGLE_PROJECT, WORKSPACE_BUCKET,
# ARTIFACT_REGISTRY_DOCKER_REPO, JOB_NAME, PHENOTYPE, TRAIT_TYPE,
# PHENO_FILEPATH, COV_FILEPATH, OUTPUT_FILES.

# Stop at the first failed submission instead of silently carrying on.
set -o errexit
set -o pipefail

# Abort before submitting anything if a required variable is unset or empty.
: "${OWNER_EMAIL:?must be set}"
: "${GOOGLE_PROJECT:?must be set}"
: "${WORKSPACE_BUCKET:?must be set}"
: "${ARTIFACT_REGISTRY_DOCKER_REPO:?must be set}"
: "${JOB_NAME:?must be set}"
: "${PHENOTYPE:?must be set}"
: "${TRAIT_TYPE:?must be set}"
: "${PHENO_FILEPATH:?must be set}"
: "${COV_FILEPATH:?must be set}"
: "${OUTPUT_FILES:?must be set}"

# Get a shorter username to leave more characters for the job name.
DSUB_USER_NAME="${OWNER_EMAIL%%@*}"

# For AoU RWB projects network name is "network".
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork

MACHINE_TYPE="n2-standard-4"
# Script uploaded by the gsutil cp cell above.
BASH_SCRIPT="${WORKSPACE_BUCKET}/dsub/scripts/2_regenie_aous_pp.sh"

# Folder holding the per-chromosome PLINK bed/bim/fam files.
PLINK_BED_DIR="gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed"
# Folder holding the per-chromosome LD-pruned SNP lists from the PLINK prep step.
STEP1_SNP_DIR="${WORKSPACE_BUCKET}/dsub/results/1_regenie_plinkprep/jon126/results"

# The loop below stops BEFORE reaching UPPER (the upper bound is exclusive).
# To run the regression across all chromosomes, set LOWER to 1 and UPPER to 23.
# To run a single chromosome, set LOWER to that chromosome and UPPER to the next one.

# LOWER=1
# UPPER=23

#Test on 1 chromosome first
LOWER=21
UPPER=22
for ((chromo = LOWER; chromo < UPPER; chromo += 1)); do
    plink_prefix="${PLINK_BED_DIR}/acaf_threshold.chr${chromo}"

    dsub \
    --provider google-cls-v2 \
    --user-project "${GOOGLE_PROJECT}" \
    --project "${GOOGLE_PROJECT}" \
    --image "${ARTIFACT_REGISTRY_DOCKER_REPO}/skoyamamd/regenie_3.4.1:latest" \
    --network "${AOU_NETWORK}" \
    --subnetwork "${AOU_SUBNETWORK}" \
    --service-account "$(gcloud config get-value account)" \
    --user "${DSUB_USER_NAME}" \
    --regions us-central1 \
    --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
    --preemptible \
    --disk-size 1000 \
    --boot-disk-size 100 \
    --machine-type "${MACHINE_TYPE}" \
    --name "${JOB_NAME}" \
    --script "${BASH_SCRIPT}" \
    --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
    --input bedfile="${plink_prefix}.bed" \
    --input bimfile="${plink_prefix}.bim" \
    --input famfile="${plink_prefix}.fam" \
    --input pheno_file="${PHENO_FILEPATH}" \
    --input cov_file="${COV_FILEPATH}" \
    --input step1_snplist="${STEP1_SNP_DIR}/${chromo}/qc_ldpruned_snps_chr${chromo}.prune.in" \
    --env trait="${TRAIT_TYPE}" \
    --env chrom="${chromo}" \
    --env pheno="${PHENOTYPE}" \
    --output-recursive OUTPUT_PATH="${OUTPUT_FILES}/${chromo}"
done

#N.B Make sure there are no spaces after the \ otherwise the dsub script breaks

Job properties:
  job-id: regenie-nt--jon126--241021-172244-27
  job-name: regenie-ntprobnp
  user-id: jon126
Provider internal-id (operation): projects/282373700333/locations/us-central1/operations/1260923833003924352
Launched job-id: regenie-nt--jon126--241021-172244-27
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --location us-central1 --jobs 'regenie-nt--jon126--241021-172244-27' --users 'jon126' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --location us-central1 --jobs 'regenie-nt--jon126--241021-172244-27' --users 'jon126'


In [52]:
# Check the status of your job submissions
# Summary table of every job (any status) submitted by user 'jon126' in the
# last day in this project/location.

!dstat \
    --provider google-cls-v2 \
    --project terra-vpc-sc-4e1b6fe8 \
    --location us-central1 \
    --jobs '*' \
    --users 'jon126' \
    --status '*' \
    --age 1d
   # --full

Job Name         Status                                      Last Update
---------------  ------------------------------------------  --------------
regenie-ntpr...  Stopped running "user-command": exit st...  10-21 17:49:21
regenie-ntpr...  Stopped running "user-command": exit st...  10-21 16:37:43
regenie-ntpr...  Stopped running "user-command": exit st...  10-21 14:49:37
regenie-ntpr...  Stopped running "user-command": exit st...  10-21 14:14:48
regenie-ntpr...  Stopped running "user-command": exit st...  10-20 22:17:06



In [53]:
# Check the status of your job submissions
# Same query as the previous cell but restricted to FAILURE status, with --full
# to print each job's envs, inputs and event timeline.
!dstat \
    --provider google-cls-v2 \
    --project terra-vpc-sc-4e1b6fe8 \
    --location us-central1 \
    --jobs '*' \
    --users 'jon126' \
    --status 'FAILURE' \
    --full \
    --age 1d

- create-time: '2024-10-21 17:22:44.411188'
  dsub-version: v0-5-0
  end-time: '2024-10-21 17:49:21.447291'
  envs:
    GOOGLE_PROJECT: terra-vpc-sc-4e1b6fe8
    chrom: '21'
    pheno: NTproBNP
    trait: qt
  events:
  - name: start
    start-time: 2024-10-21 17:22:55.073278+00:00
  - name: pulling-image
    start-time: 2024-10-21 17:23:41.082349+00:00
  - name: localizing-files
    start-time: 2024-10-21 17:24:21.676255+00:00
  - name: running-docker
    start-time: 2024-10-21 17:48:51.181046+00:00
  - name: fail
    start-time: 2024-10-21 17:49:18.695973+00:00
  - name: 'Execution failed: generic::failed_precondition: while running "user-command":
      unexpected exit status 1 was not ignored'
    start-time: 2024-10-21 17:49:21.049400+00:00
  input-recursives: {}
  inputs:
    bedfile: gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr21.bed
    bimfile: gs://fc-aou-datasets-controlled/v7/wgs/short_r

In [54]:
# Download the dsub log of the failed job to the local disk for inspection.
# The path follows the --logging pattern of the submission cell:
# dsub/logs/{job-name}/{user-id}/<date>/<time>/{job-id}-{task-id}...log
!gsutil cp gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/logs/regenie-ntprobnp/jon126/20241021/172243/regenie-nt--jon126--241021-172244-27-task-None.log ./
    
# !gsutil cp gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/PP/DATA/pp_regenie_pheno.tsv ./

Copying gs://fc-secure-7953e92c-a6a6-42df-9f19-86d553a9044f/dsub/logs/regenie-ntprobnp/jon126/20241021/172243/regenie-nt--jon126--241021-172244-27-task-None.log...
/ [1 files][ 13.8 KiB/ 13.8 KiB]                                                
Operation completed over 1 objects/13.8 KiB.                                     


In [37]:
# !ddel --provider google-cls-v2 --project terra-vpc-sc-4e1b6fe8 --jobs '*' --users 'jon126' --location us-central1

Delete running jobs:
  user:
    {'jon126'}

  job-id:
    ['*']

Found 1 tasks to delete.
1 job deleted
