In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be
# read and written from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the project on the mounted Drive; the sample metadata is read
# from its data/ subfolder.
PARENT_PATH = "/content/drive/MyDrive/CSE-284-Final-Project"

In [None]:
import pandas as pd
import numpy as np

# Split configuration: fraction of every (population, sex) group that goes to
# training and to validation; whatever is left over goes to testing.
TRAIN_FRACTION = 0.8
VAL_FRACTION = 0.1
SPLIT_SEED = 0  # fixed seed so the shuffle (and therefore the split) is reproducible

# Integer codes stored in the 'Split' column.
SPLIT_TRAIN, SPLIT_VAL, SPLIT_TEST = 0, 1, 2


def assign_splits(samples, train_frac=TRAIN_FRACTION, val_frac=VAL_FRACTION, seed=SPLIT_SEED):
    """Assign each sample to train/val/test, stratified by population and sex.

    Samples are grouped by ('Population code', 'Sex'). Each group is shuffled
    with ``seed``; the first ``int(n * train_frac)`` shuffled rows are labelled
    training, the next ``int(n * val_frac)`` validation, and all remaining rows
    testing, so rounding leftovers always end up in the test set.

    Parameters
    ----------
    samples : pandas.DataFrame
        Must contain the columns 'Sample name', 'Population code',
        'Superpopulation code' and 'Sex'.
    train_frac, val_frac : float
        Fractions of each group used for training and validation. Both must be
        non-negative and sum to at most 1.
    seed : int
        ``random_state`` passed to ``DataFrame.sample`` for every group.

    Returns
    -------
    pandas.DataFrame
        Columns 'Sample_name', 'Population_code', 'Superpopulation_code',
        'Split' (0 = train, 1 = val, 2 = test) and 'Sex', with a fresh 0..n-1
        index. Groups appear in sorted (population, sex) order and rows inside
        a group keep their shuffled order.

    Raises
    ------
    ValueError
        If the fractions are negative or sum to more than 1.

    Notes
    -----
    ``groupby`` drops rows whose population code or sex is missing (pandas'
    default ``dropna=True``), so such samples do not appear in the result.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1; "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    columns = ['Sample_name', 'Population_code', 'Superpopulation_code', 'Split', 'Sex']
    frames = []
    for (population, _sex), group in samples.groupby(['Population code', 'Sex']):
        # Shuffle the samples within the group
        shuffled = group.sample(frac=1, random_state=seed).reset_index(drop=True)
        total_samples = len(shuffled)
        train_size = int(total_samples * train_frac)
        val_size = int(total_samples * val_frac)
        test_size = total_samples - train_size - val_size  # absorbs the rounding remainder

        # Label by position in the shuffled order: train block, then val, then test.
        split_codes = np.repeat(
            [SPLIT_TRAIN, SPLIT_VAL, SPLIT_TEST],
            [train_size, val_size, test_size],
        )
        frames.append(pd.DataFrame({
            'Sample_name': shuffled['Sample name'],
            'Population_code': population,
            'Superpopulation_code': shuffled['Superpopulation code'],
            'Split': split_codes,
            'Sex': shuffled['Sex'],
        }))

    if not frames:
        # No groups at all: still return a frame with the expected columns.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)


# Load the sample metadata and keep only samples from the 1000 Genomes phase 3 release
data = pd.read_csv(PARENT_PATH + '/data/igsr_samples.tsv', sep='\t')
data = data[data['Data collections'].str.contains("1000 Genomes phase 3 release", na=False)]

# Build the per-sample split table
results_df = assign_splits(data)
results_df

Unnamed: 0,Sample_name,Population_code,Superpopulation_code,Split,Sex
0,HG01883,ACB,AFR,0,female
1,HG02580,ACB,AFR,0,female
2,HG02419,ACB,AFR,0,female
3,HG02095,ACB,AFR,0,female
4,HG02315,ACB,AFR,0,female
...,...,...,...,...,...
3110,NA19186,YRI,AFR,2,male
3111,NA19207,YRI,AFR,2,male
3112,NA19173,YRI,AFR,2,male
3113,NA18521,YRI,AFR,2,male


In [None]:
# Write the split table to split_samples.csv in the working directory, without the row index.
results_df.to_csv(path_or_buf='split_samples.csv', index=False)

In [None]:
! cp split_samples.csv /content/drive/MyDrive/CSE-284-Final-Project/data

In [None]:
! mkdir split_ids
! mkdir split_ids/by_superpopulation
! mkdir split_ids/by_population

In [None]:
sh = """

SPLIT_CSV=/content/split_samples.csv
OUT_DIR=/content/split_ids/by_superpopulation/

for SUPERPOP in "AFR" "AMR" "EAS" "EUR" "SAS"

do

    # Print header
    echo "#FID IID" > ${OUT_DIR}${SUPERPOP}_train.txt
    # Process and filter for training set
    awk -F"," -v superpop="$SUPERPOP" '($3==superpop && $4=="0") {print $3"_"$2, $1}' $SPLIT_CSV >> ${OUT_DIR}${SUPERPOP}_train.txt

    # Print header
    echo "#FID IID" > ${OUT_DIR}${SUPERPOP}_val.txt
    # Process and filter for training set
    awk -F"," -v superpop="$SUPERPOP" '($3==superpop && $4=="1") {print $3"_"$2, $1}' $SPLIT_CSV >> ${OUT_DIR}${SUPERPOP}_val.txt

    # Print header
    echo "#FID IID" > ${OUT_DIR}${SUPERPOP}_test.txt
    # Process and filter for training set
    awk -F"," -v superpop="$SUPERPOP" '($3==superpop && $4=="2") {print $3"_"$2, $1}' $SPLIT_CSV >> ${OUT_DIR}${SUPERPOP}_test.txt

done

"""
with open('script.sh', 'w') as file:
  file.write(sh)

! bash script.sh

In [None]:
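# NOTE: disabled alternative to the cell above. It writes only the sample ID
# (column 1) per line and no "#FID IID" header. Not executed.
# sh = """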

# SPLIT_CSV=/content/split_samples.csv
# OUT_DIR=/content/split_ids/by_superpopulation/

# for SUPERPOP in "AFR" "AMR" "EAS" "EUR" "SAS"

# do
#    # For training data
#    awk -F"," -v superpop="$SUPERPOP" '($3==superpop && $4=="0") \
#         {print $1}' $SPLIT_CSV > ${OUT_DIR}${SUPERPOP}_train.txt

#    # For validation data
#    awk -F"," -v superpop="$SUPERPOP" '($3==superpop && $4=="1") \
#         {print $1}' $SPLIT_CSV > ${OUT_DIR}${SUPERPOP}_val.txt

#    # For test data
#    awk -F"," -v superpop="$SUPERPOP" '($3==superpop && $4=="2") \
#         {print $1}' $SPLIT_CSV > ${OUT_DIR}${SUPERPOP}_test.txt

# done

# """
# with open('script.sh', 'w') as file:
#   file.write(sh)

# ! bash script.sh

In [None]:
sh = """

SPLIT_CSV=/content/split_samples.csv
OUT_DIR=/content/split_ids/by_population/

for POP in "ACB" "ASW" "BEB" "CDX" "CEU" "CHB" "CHS" "CLM" "ESN" "FIN" "GBR" "GIH" "GWD" "IBS" "ITU" "JPT" "KHV" "LWK" "MSL" "MXL" "PEL" "PJL" "PUR" "STU" "TSI" "YRI"

do
    # For all data (no splitting)
    # Print header
    echo "#FID IID" > ${OUT_DIR}${POP}_all.txt
    # Process and filter for training set
    awk -F"," -v pop="$POP" '($2==pop) {print $3"_"$2, $1}' $SPLIT_CSV >> ${OUT_DIR}${POP}_all.txt


    # Print header
    echo "#FID IID" > ${OUT_DIR}${POP}_train.txt
    # Process and filter for training set
    awk -F"," -v pop="$POP" '($2==pop && $4=="0") {print $3"_"$2, $1}' $SPLIT_CSV >> ${OUT_DIR}${POP}_train.txt

    # Print header
    echo "#FID IID" > ${OUT_DIR}${POP}_val.txt
    # Process and filter for training set
    awk -F"," -v pop="$POP" '($2==pop && $4=="1") {print $3"_"$2, $1}' $SPLIT_CSV >> ${OUT_DIR}${POP}_val.txt

    # Print header
    echo "#FID IID" > ${OUT_DIR}${POP}_test.txt
    # Process and filter for training set
    awk -F"," -v pop="$POP" '($2==pop && $4=="2") {print $3"_"$2, $1}' $SPLIT_CSV >> ${OUT_DIR}${POP}_test.txt

done

"""
with open('script.sh', 'w') as file:
  file.write(sh)

! bash script.sh

In [None]:
! cp -rf split_ids/ /content/drive/MyDrive/CSE-284-Final-Project/data/

In [None]:
! wget https://s3.amazonaws.com/plink2-assets/alpha5/plink2_linux_x86_64_20240105.zip
! unzip plink2_linux_x86_64_20240105.zip
! ./plink2

--2024-03-06 21:26:48--  https://s3.amazonaws.com/plink2-assets/alpha5/plink2_linux_x86_64_20240105.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.136.208, 54.231.225.56, 54.231.163.24, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.136.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9266484 (8.8M) [application/zip]
Saving to: ‘plink2_linux_x86_64_20240105.zip’


2024-03-06 21:26:49 (16.1 MB/s) - ‘plink2_linux_x86_64_20240105.zip’ saved [9266484/9266484]

Archive:  plink2_linux_x86_64_20240105.zip
  inflating: plink2                  
PLINK v2.00a5.10LM 64-bit Intel (5 Jan 2024)   www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3

  plink2 <input flag(s)...> [command flag(s)...] [other flag(s)...]
  plink2 --help [flag name(s)...]

Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,
--sample-counts, --missing, --hardy, --het, --fst, --indep-pa

In [None]:
! cp /content/drive/MyDrive/CSE-284-Final-Project/data/1000g_combined/1000g.* .

In [None]:
! mkdir 1000g_by_superpopulation

In [None]:
sh = """

echo "." > exclude.snps

BFILE=/content/1000g
KEEP_ID_DIR=/content/split_ids/by_superpopulation
OUT_DIR=/content/1000g_by_superpopulation

for SUPERPOP in "AFR" "AMR" "EAS" "EUR" "SAS"

do

    for SPLIT in "train" "val" "test"

    do
        ./plink2 \
            --make-pgen vzs 'pvar-cols=' \
            --keep ${KEEP_ID_DIR}/${SUPERPOP}_${SPLIT}.txt \
            --max-alleles 2 \
            --maf 0.01 \
            --bfile $BFILE \
            --snps-only just-acgt \
            --rm-dup exclude-all \
            --exclude exclude.snps \
            --out ${OUT_DIR}/${SUPERPOP}_${SPLIT}
    done

done

"""
with open('script.sh', 'w') as file:
  file.write(sh)

! bash script.sh

PLINK v2.00a5.10LM 64-bit Intel (5 Jan 2024)   www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /content/1000g_by_superpopulation/AFR_train.log.
Options in effect:
  --bfile /content/1000g
  --exclude exclude.snps
  --keep /content/split_ids/by_superpopulation/AFR_train.txt
  --maf 0.01
  --make-pgen vzs pvar-cols=
  --max-alleles 2
  --out /content/1000g_by_superpopulation/AFR_train
  --rm-dup exclude-all
  --snps-only just-acgt

Start time: Wed Mar  6 23:32:34 2024
12978 MiB RAM detected, ~11546 available; reserving 6489 MiB for main
workspace.
Using up to 2 compute threads.
2504 samples (1270 females, 1234 males; 2497 founders) loaded from
/content/1000g.fam.
12123481 variants loaded from /content/1000g.bim.
Note: No phenotype data present.
--exclude: 12123481 variants remaining.
Note: Skipping --rm-dup since no duplicate IDs are present.
--keep: 523 samples remaining.
523 samples (267 females, 256 males; 520 f

In [None]:
sh = """

echo "." > exclude.snps

BFILE=/content/1000g
KEEP_ID_DIR=/content/split_ids/by_superpopulation
OUT_DIR=/content/1000g_by_superpopulation

for SUPERPOP in "AFR" "AMR" "EAS" "EUR" "SAS"

do

    for SPLIT in "train" "val" "test"

    do
        ./plink2 \
            --make-bed \
            --keep ${KEEP_ID_DIR}/${SUPERPOP}_${SPLIT}.txt \
            --max-alleles 2 \
            --maf 0.01 \
            --bfile $BFILE \
            --snps-only just-acgt \
            --rm-dup exclude-all \
            --exclude exclude.snps \
            --out ${OUT_DIR}/${SUPERPOP}_${SPLIT}
    done

done

"""
with open('script.sh', 'w') as file:
  file.write(sh)

! bash script.sh

PLINK v2.00a5.10LM 64-bit Intel (5 Jan 2024)   www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /content/1000g_by_superpopulation/AFR_train.log.
Options in effect:
  --bfile /content/1000g
  --exclude exclude.snps
  --keep /content/split_ids/by_superpopulation/AFR_train.txt
  --maf 0.01
  --make-bed
  --max-alleles 2
  --out /content/1000g_by_superpopulation/AFR_train
  --rm-dup exclude-all
  --snps-only just-acgt

Start time: Wed Mar  6 23:52:25 2024
12978 MiB RAM detected, ~11537 available; reserving 6489 MiB for main
workspace.
Using up to 2 compute threads.
2504 samples (1270 females, 1234 males; 2497 founders) loaded from
/content/1000g.fam.
12123481 variants loaded from /content/1000g.bim.
Note: No phenotype data present.
--exclude: 12123481 variants remaining.
Note: Skipping --rm-dup since no duplicate IDs are present.
--keep: 523 samples remaining.
523 samples (267 females, 256 males; 520 founders) remaini

In [None]:
! cp -rf 1000g_by_superpopulation/ /content/drive/MyDrive/CSE-284-Final-Project/data/

In [None]:
! mkdir 1000g_by_population

In [95]:
sh = """

echo "." > exclude.snps

BFILE=/content/1000g
KEEP_ID_DIR=/content/split_ids/by_population
OUT_DIR=/content/1000g_by_population

for POP in "ACB" "ASW" "BEB" "CDX" "CEU" "CHB" "CHS" "CLM" "ESN" "FIN" "GBR" "GIH" "GWD" "IBS" "ITU" "JPT" "KHV" "LWK" "MSL" "MXL" "PEL" "PJL" "PUR" "STU" "TSI" "YRI"

    do
        ./plink2 \
            --make-pgen vzs 'pvar-cols=' \
            --keep ${KEEP_ID_DIR}/${POP}_all.txt \
            --max-alleles 2 \
            --maf 0.01 \
            --bfile $BFILE \
            --snps-only just-acgt \
            --rm-dup exclude-all \
            --exclude exclude.snps \
            --out ${OUT_DIR}/${POP}_all
    done


"""
with open('script.sh', 'w') as file:
  file.write(sh)

! bash script.sh

PLINK v2.00a5.10LM 64-bit Intel (5 Jan 2024)   www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /content/1000g_by_population/ACB_all.log.
Options in effect:
  --bfile /content/1000g
  --exclude exclude.snps
  --keep /content/split_ids/by_population/ACB_all.txt
  --maf 0.01
  --make-pgen vzs pvar-cols=
  --max-alleles 2
  --out /content/1000g_by_population/ACB_all
  --rm-dup exclude-all
  --snps-only just-acgt

Start time: Thu Mar  7 00:33:21 2024
12978 MiB RAM detected, ~11692 available; reserving 6489 MiB for main
workspace.
Using up to 2 compute threads.
2504 samples (1270 females, 1234 males; 2497 founders) loaded from
/content/1000g.fam.
12123481 variants loaded from /content/1000g.bim.
Note: No phenotype data present.
--exclude: 12123481 variants remaining.
Note: Skipping --rm-dup since no duplicate IDs are present.
--keep: 96 samples remaining.
96 samples (49 females, 47 males; 96 founders) remaining after m

In [96]:
sh = """

echo "." > exclude.snps

BFILE=/content/1000g
KEEP_ID_DIR=/content/split_ids/by_population
OUT_DIR=/content/1000g_by_population

for POP in "ACB" "ASW" "BEB" "CDX" "CEU" "CHB" "CHS" "CLM" "ESN" "FIN" "GBR" "GIH" "GWD" "IBS" "ITU" "JPT" "KHV" "LWK" "MSL" "MXL" "PEL" "PJL" "PUR" "STU" "TSI" "YRI"

    do
        ./plink2 \
            --make-bed \
            --keep ${KEEP_ID_DIR}/${POP}_all.txt \
            --max-alleles 2 \
            --maf 0.01 \
            --bfile $BFILE \
            --snps-only just-acgt \
            --rm-dup exclude-all \
            --exclude exclude.snps \
            --out ${OUT_DIR}/${POP}_all
    done


"""
with open('script.sh', 'w') as file:
  file.write(sh)

! bash script.sh

PLINK v2.00a5.10LM 64-bit Intel (5 Jan 2024)   www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /content/1000g_by_population/ACB_all.log.
Options in effect:
  --bfile /content/1000g
  --exclude exclude.snps
  --keep /content/split_ids/by_population/ACB_all.txt
  --maf 0.01
  --make-bed
  --max-alleles 2
  --out /content/1000g_by_population/ACB_all
  --rm-dup exclude-all
  --snps-only just-acgt

Start time: Thu Mar  7 01:20:05 2024
12978 MiB RAM detected, ~11879 available; reserving 6489 MiB for main
workspace.
Using up to 2 compute threads.
2504 samples (1270 females, 1234 males; 2497 founders) loaded from
/content/1000g.fam.
12123481 variants loaded from /content/1000g.bim.
Note: No phenotype data present.
--exclude: 12123481 variants remaining.
Note: Skipping --rm-dup since no duplicate IDs are present.
--keep: 96 samples remaining.
96 samples (49 females, 47 males; 96 founders) remaining after main filters.
Cal

In [97]:
! cp -rf 1000g_by_population/ /content/drive/MyDrive/CSE-284-Final-Project/data/