## Description
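1. Subset each GWAS summary-statistics table (CAD, MDD) to the columns needed for clumping: SNP, CHR, BP, P
2. LD clumping of each trait via PLINK
3. Take the union of the lead (index) SNPs from both traits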

In [9]:
import pandas as pd
import numpy as np

In [10]:
# Step 1 (CAD): load the cleaned CAD summary statistics.
df1 = pd.read_csv("../1_CAD/CAD_clean.csv")

# Keep only the four columns written to the clumping input, in this order.
df1_subset = df1.loc[:, ["SNP", "CHR", "BP", "P"]]

# Write a tab-separated file with a header row and no index column.
df1_subset.to_csv("CAD_clean_clump.txt", sep="\t", index=False)

In [11]:
# Preview the first rows of the CAD clumping input (columns SNP, CHR, BP, P).
df1_subset.head()

Unnamed: 0,SNP,CHR,BP,P
0,rs143225517,1,751756,0.452802
1,rs3094315,1,752566,0.73946
2,rs3131972,1,752721,0.846265
3,rs3131971,1,752894,0.775066
4,rs61770173,1,753405,0.706526


In [12]:
# Step 1 (MDD): load the cleaned MDD summary statistics.
df2 = pd.read_csv("../2_MDD/MDD_clean.csv")

# Keep only the four columns written to the clumping input, in this order.
df2_subset = df2.loc[:, ["SNP", "CHR", "BP", "P"]]

# Write a tab-separated file with a header row and no index column.
df2_subset.to_csv("MDD_clean_clump.txt", sep="\t", index=False)

In [13]:
# Preview the first rows of the MDD clumping input (columns SNP, CHR, BP, P).
df2_subset.head()

Unnamed: 0,SNP,CHR,BP,P
0,rs12238997,1,693731,0.9284
1,rs4951859,1,729679,0.8311
2,rs148120343,1,730087,0.3532
3,rs142557973,1,731718,0.976
4,rs141242758,1,734349,0.9207


## PLINK

In [14]:
%%bash

module load PLINK/1

# PLINK does not create the directory named in --out; make sure it exists so
# this cell also works on a fresh checkout / Restart & Run All.
mkdir -p ./results

# LD clumping of the CAD summary statistics against the EUR reference panel.
#   --clump-p1 1e-05 : p-value threshold a variant must pass to be an index (lead) SNP
#   --clump-p2 1     : p-value threshold for variants clumped under an index SNP
#   --clump-r2 0.1   : r^2 threshold for assigning a variant to an index SNP's clump
#   --clump-kb 250   : maximum distance (kb) from the index SNP
plink --bfile '/gpfs/gibbs/project/bdsi/shared/Genetics/data/sim_data/geno_data/EUR' \
      --clump CAD_clean_clump.txt \
      --clump-p1 1e-05 \
      --clump-p2 1 \
      --clump-r2 0.1 \
      --clump-kb 250 \
      --out ./results/CAD_clumped_results

PLINK v1.90b6.21 64-bit (19 Oct 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to ./results/CAD_clumped_results.log.
Options in effect:
  --bfile /gpfs/gibbs/project/bdsi/shared/Genetics/data/sim_data/geno_data/EUR
  --clump CAD_clean_clump.txt
  --clump-kb 250
  --clump-p1 1e-05
  --clump-p2 1
  --clump-r2 0.1
  --out ./results/CAD_clumped_results

515691 MB RAM detected; reserving 257845 MB for main workspace.
1175356 variants loaded from .bim file.
10000 people (0 males, 0 females, 10000 ambiguous) loaded from .fam.
Ambiguous sex IDs written to ./results/CAD_clumped_results.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 10000 founders and 0 nonfounders present.
Calculating allele frequencies... 1011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889

4871 more top variant IDs missing; see log file.


--clump: 136 clumps formed from 969 top variants.
Results written to ./results/CAD_clumped_results.clumped .


In [15]:
%%bash

module load PLINK/1

# PLINK does not create the directory named in --out; make sure it exists so
# this cell also works on a fresh checkout / Restart & Run All.
mkdir -p ./results

# LD clumping of the MDD summary statistics against the EUR reference panel.
#   --clump-p1 1e-05 : p-value threshold a variant must pass to be an index (lead) SNP
#   --clump-p2 1     : p-value threshold for variants clumped under an index SNP
#   --clump-r2 0.1   : r^2 threshold for assigning a variant to an index SNP's clump
#   --clump-kb 250   : maximum distance (kb) from the index SNP
plink --bfile '/gpfs/gibbs/project/bdsi/shared/Genetics/data/sim_data/geno_data/EUR' \
      --clump MDD_clean_clump.txt \
      --clump-p1 1e-05 \
      --clump-p2 1 \
      --clump-r2 0.1 \
      --clump-kb 250 \
      --out ./results/MDD_clumped_results


PLINK v1.90b6.21 64-bit (19 Oct 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to ./results/MDD_clumped_results.log.
Options in effect:
  --bfile /gpfs/gibbs/project/bdsi/shared/Genetics/data/sim_data/geno_data/EUR
  --clump MDD_clean_clump.txt
  --clump-kb 250
  --clump-p1 1e-05
  --clump-p2 1
  --clump-r2 0.1
  --out ./results/MDD_clumped_results

515691 MB RAM detected; reserving 257845 MB for main workspace.
1175356 variants loaded from .bim file.
10000 people (0 males, 0 females, 10000 ambiguous) loaded from .fam.
Ambiguous sex IDs written to ./results/MDD_clumped_results.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 10000 founders and 0 nonfounders present.
Calculating allele frequencies... 1011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889

10212 more top variant IDs missing; see log file.


--clump: 261 clumps formed from 2338 top variants.
Results written to ./results/MDD_clumped_results.clumped .


In [16]:
## CAD & MDD

# Step 1: Load the PLINK .clumped reports (whitespace-delimited tables).
# sep=r"\s+" is the documented replacement for delim_whitespace=True, which is
# deprecated since pandas 2.2 and emits a FutureWarning.
df1_lead_snps = pd.read_csv("./results/CAD_clumped_results.clumped", sep=r"\s+")
df2_lead_snps = pd.read_csv("./results/MDD_clumped_results.clumped", sep=r"\s+")

# Step 2: Extract the SNP column of each report as a Python list
lead_snp_list1 = df1_lead_snps["SNP"].tolist()
lead_snp_list2 = df2_lead_snps["SNP"].tolist()

# Step 3: Take the union of both lists.
# Sorting makes the order (and therefore the saved file) identical across
# kernel restarts; iteration order of a set of strings is not stable.
union_snps = sorted(set(lead_snp_list1) | set(lead_snp_list2))

# Step 4: Save the union of SNPs to a CSV file (single "SNP" column, no index)
pd.DataFrame({'SNP': union_snps}).to_csv("union_lead_snps.csv", index=False)

In [19]:
# Number of distinct SNP IDs in the CAD/MDD union.
len(union_snps)

396