# Setup
- Install and import the required packages.
- Extract the data if it has not already been extracted.

In [None]:
# Cell 1

# Uncomment the line below if any of the required packages are missing
#!pip install -r ../requirements.txt

In [1]:
# Cell 2
# Silence every Python warning for the rest of the session. The filter is
# installed before the remaining imports, so it also hides warnings emitted
# while those packages are being imported.
import warnings
warnings.filterwarnings('ignore')

import pandas_plink as pp
import statsmodels.api as sm
import matplotlib.pyplot as plt
import subprocess
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.metrics import r2_score
import random
import os
import tarfile
import gzip
# NOTE(review): wildcard import from the project-local `tools` module; names
# used later in the notebook such as extract_data and chr_analysis presumably
# come from here -- confirm, and consider importing them explicitly.
from tools import *

################################################################
# Uncomment the line below if the required data files are missing
# extract_data()
################################################################
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# modules before each cell is executed.
%load_ext autoreload
%autoreload 2

# Ensuring correct installation and setup of plink2

In [2]:
# Cell 3
# Sanity check: invoke the plink2 executable located in the current working
# directory with no arguments.



# IF RUNNING THIS RESULTS IN THE ERROR MESSAGE:
# /bin/bash: ./plink2: Permission denied
# UNCOMMENT THE TWO chmod LINES BELOW, RERUN THE CELL, THEN RE-COMMENT THEM

#######################################
#!chmod 700 plink2
#!chmod 700 plink
#######################################

# OTHERWISE, THE CELL SHOULD OUTPUT SOMETHING LIKE THE TEXT BELOW
# (the bare string is only a reference copy of the expected banner; it is not
# the last expression in the cell, so it is not displayed)
"""
PLINK v2.0.0-a.6 32-bit (20 Oct 2024)              cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3

  plink2 <input flag(s)...> [command flag(s)...] [other flag(s)...]
  plink2 --help [flag name(s)...]

Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,
--sample-counts, --missing, --hardy, --het, --fst, --indep-pairwise,
--r2-phased, --sample-diff, --make-king, --king-cutoff, --pmerge, --pgen-diff,
--write-samples, --write-snplist, --make-grm-list, --pca, --glm, --adjust-file,
--gwas-ssf, --pheno-svd, --clump, --score-list, --variant-score,
--genotyping-rate, --pgen-info, --validate, and --zst-decompress.

"plink2 --help | more" describes all functions.

"""
!./plink2


PLINK v2.0.0-a.6 32-bit (20 Oct 2024)              cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3

  plink2 <input flag(s)...> [command flag(s)...] [other flag(s)...]
  plink2 --help [flag name(s)...]

Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,
--sample-counts, --missing, --hardy, --het, --fst, --indep-pairwise,
--r2-phased, --sample-diff, --make-king, --king-cutoff, --pmerge, --pgen-diff,
--write-samples, --write-snplist, --make-grm-list, --pca, --glm, --adjust-file,
--gwas-ssf, --pheno-svd, --clump, --score-list, --variant-score,
--genotyping-rate, --pgen-info, --validate, and --zst-decompress.

"plink2 --help | more" describes all functions.


# Loading data

In [3]:
# Cell 4
# Phenotype table: gzip-compressed, tab-separated text file.
phenotype_data = '../data/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz'
P = pd.read_csv(phenotype_data, compression='gzip', sep='\t')

# Genotype reference panel, read from the PLINK file set matching the glob:
#   alleles   <- .bim  (SNP records; originally derived from a VCF text file)
#   samples   <- .fam  (family / sample ids)
#   genotypes <- .bed  (genotype matrix whose rows and columns are defined
#                       by `alleles` and `samples`)
alleles, samples, genotypes = pp.read_plink(
    "../data/LDREF/1000G.EUR.*",
    verbose=False,
)

# Evaluate the genotype object returned by read_plink and wrap the result
# in a DataFrame, rebinding the same name.
genotypes = pd.DataFrame(genotypes.compute())


# Conducting cis-eQTL analyses across all genes and chromosomes in the 1000 Genomes dataset

In [None]:
# Takes about 1 hour to run. The analysis files have already been created, so
# this cell is intentionally left commented out; uncomment it only to
# regenerate them. For every chromosome label in `alleles`, the result of
# chr_analysis (with a window of 500000) is written as a tab-separated file
# to 'genome_wide_ciseqtl_analysis/analysis_<chrom>.txt'.
#all_chromosomes = alleles['chrom'].unique()
#output_fp_prefix = 'genome_wide_ciseqtl_analysis/analysis_'
#all_outs = []
#for chrom in all_chromosomes:
#    print(f'Starting analysis for chromosome {chrom}')
#    chr_analysis(chrom, alleles, samples, genotypes, P, window=500000).to_csv(output_fp_prefix+chrom+'.txt', sep='\t')

Starting analysis for chromosome 1
Starting cis-eQTL analyses for chromosome 1
0/2479 analyses completed
100/2479 analyses completed
200/2479 analyses completed
300/2479 analyses completed
400/2479 analyses completed
500/2479 analyses completed
600/2479 analyses completed
700/2479 analyses completed
800/2479 analyses completed
900/2479 analyses completed
1000/2479 analyses completed
1100/2479 analyses completed
1200/2479 analyses completed
1300/2479 analyses completed
1400/2479 analyses completed
1500/2479 analyses completed
1600/2479 analyses completed
1700/2479 analyses completed
1800/2479 analyses completed
1900/2479 analyses completed
2000/2479 analyses completed
2100/2479 analyses completed
2200/2479 analyses completed
2300/2479 analyses completed
2400/2479 analyses completed
2479/2479 analyses completed
Compiling eQTL analysis results
Starting analysis for chromosome 10
Starting cis-eQTL analyses for chromosome 10
0/932 analyses completed
100/932 analyses completed
200/932 analys

# Results from the genome wide cis-eQTL analysis

In [4]:
# One result file per chromosome label found in the allele table.
all_chromosomes = alleles['chrom'].unique()
output_fp_prefix = 'genome_wide_ciseqtl_analysis/analysis_'
all_outs = [
    ''.join((output_fp_prefix, chrom_label, '.txt'))
    for chrom_label in all_chromosomes
]

# Each file is tab-separated; its first (unnamed) column holds the saved index.
all_chr_ciseqtl_dfs = [
    pd.read_csv(result_path, sep='\t', index_col='Unnamed: 0')
    for result_path in all_outs
]

# Stack the per-chromosome tables into a single frame with a fresh 0..n-1 index.
genome_wide_ciseqtl_analysis = pd.concat(
    all_chr_ciseqtl_dfs,
    ignore_index=True,
)
genome_wide_ciseqtl_analysis

Unnamed: 0,snp,chr,bp,p,se,beta,a0,a1,i
0,rs6426089,1.0,225999979.0,0.030385,2.174076,0.524494,G,A,86772.0
1,rs1467143,1.0,226000967.0,0.030385,2.174076,0.524494,G,A,86773.0
2,rs868966,1.0,226006098.0,0.784214,0.274046,0.066733,G,A,86774.0
3,rs10753410,1.0,226008101.0,0.126770,1.530693,0.551515,C,T,86775.0
4,rs10799326,1.0,226009918.0,0.226326,1.212060,0.429218,C,T,86776.0
...,...,...,...,...,...,...,...,...,...
9234069,rs11998787,9.0,118057870.0,0.932509,-0.084750,-0.001781,A,G,1179771.0
9234070,rs9987570,9.0,118061922.0,0.924241,-0.095163,-0.002020,T,G,1179772.0
9234071,rs10982660,9.0,118063450.0,0.432366,0.786079,0.012098,A,G,1179773.0
9234072,rs10982662,9.0,118064892.0,0.534901,-0.621171,-0.007054,A,G,1179774.0


# Getting the significant SNPs from the analysis

In [5]:
# Bonferroni-style significance threshold: 0.05 divided by the number of rows
# in the genome-wide results table.
p_val = 0.05 / len(genome_wide_ciseqtl_analysis)
print(f'p-value: {p_val}')

# Keep only the rows whose p-value falls strictly below the threshold.
genome_wide_ciseqtl_analysis.loc[
    genome_wide_ciseqtl_analysis['p'].lt(p_val)
]

p-value: 5.414728103760052e-09


Unnamed: 0,snp,chr,bp,p,se,beta,a0,a1,i
2930,rs9865,1.0,46827456.0,1.826516e-17,-8.981540,-0.361494,G,A,19475.0
2947,rs324419,1.0,46871986.0,4.723186e-13,-7.524525,-0.315464,T,C,19492.0
2955,rs7520850,1.0,46883668.0,8.812540e-17,-8.766008,-0.388400,A,G,19500.0
7579,rs10863389,1.0,206656292.0,1.020276e-10,6.671585,0.078233,T,C,77547.0
7581,rs11118092,1.0,206656770.0,8.164971e-14,-7.788729,-0.086488,T,C,77549.0
...,...,...,...,...,...,...,...,...,...
9221760,rs2805888,9.0,111880736.0,2.903130e-09,6.097941,0.036827,T,C,1176119.0
9221761,rs6825,9.0,111881927.0,2.082084e-11,-6.931497,-0.033476,C,T,1176120.0
9221767,rs12686736,9.0,111888739.0,5.331356e-09,-5.989441,-0.033173,G,A,1176126.0
9221769,rs17628095,9.0,111889172.0,5.331356e-09,-5.989441,-0.033173,A,G,1176128.0
