In [19]:
import gzip
import pandas as pd
import pandas_plink as pdpl
import statsmodels.api as sm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import subprocess
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
# There are three file types for genetic data that interest us; read_plink
# loads them together and returns the SNP table (bim), the sample table (fam)
# and the genotype matrix (G).
# Multiple different samples of these files can be found on 1000 Genomes.
# In the interest of size and reproducibility we use just EUR.22 (chromosome 22)
# to showcase the method.
# The method can be reproduced with any of the chromosomes included in the data folder.
bim, fam, G = pdpl.read_plink("data/EUR.22.test/1000G.EUR.22")

# Gene expression table (tab-separated, gzip-compressed)
gene_expr = pd.read_csv('data/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz', compression='gzip', sep='\t')

# Gene annotations: open the gzip archive in text mode and parse it as a
# tab-separated table
with gzip.open("data/gene_annot.txt.gz", "rt") as annot_handle:
    gene_annotations = pd.read_csv(annot_handle, sep="\t")


Mapping files:   0%|          | 0/3 [00:00<?, ?it/s][A
Mapping files: 100%|██████████| 3/3 [00:00<00:00, 26.65it/s][A


In [28]:
# Genotype table with one row per individual (fam['iid']) and one column per
# SNP (bim['snp']); G is transposed so that individuals end up on the rows.
geno_df = pd.DataFrame(G.compute().T, index=fam['iid'], columns=bim['snp'])

# Expression table re-oriented the same way: rows become the original column
# labels, columns become gene symbols, and the row axis is named 'iid'.
gene_expr_t = gene_expr.set_index('Gene_Symbol').T.rename_axis('iid')

# Only labels present in both the expression table and the fam file are kept,
# so both tables cover exactly the same individuals.
matching_ids = gene_expr_t.index.intersection(fam['iid'])
geno_df_filtered = geno_df.loc[matching_ids]
gene_expr_filtered = gene_expr_t.loc[matching_ids]

# Hold out 25% of the individuals; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    geno_df_filtered,
    gene_expr_filtered,
    test_size=0.25,
    random_state=1,
)

In [51]:
# cis-eQTL scan for chromosome 22 (training samples only).
# For every gene on the chromosome, each SNP within +/- window_bp of the gene's
# 'Coord' is tested on its own with a simple linear regression:
#     expression ~ intercept + genotype
chromosome = "22"
chr22_genes = gene_expr[gene_expr['Chr'] == chromosome]['Gene_Symbol']

# Half-width of the window around each gene's coordinate, in base pairs
window_bp = 500000

# The chromosome filter on the SNP table does not depend on the gene, so it is
# applied once here instead of once per gene.
bim_on_chrom = bim[bim['chrom'] == chromosome]

# Initialize a list to store results for each SNP and gene as rows
chr22_results_list = []

for the_gene in chr22_genes:
    gene_info = gene_expr[gene_expr['Gene_Symbol'] == the_gene]
    gene_coord = int(gene_info['Coord'].iloc[0])  # looked up once per gene
    snps_in_window = bim_on_chrom[(bim_on_chrom['pos'] >= gene_coord - window_bp) &
                                  (bim_on_chrom['pos'] <= gene_coord + window_bp)]

    # Get the SNP IDs from the filtered SNPs
    snp_ids = snps_in_window['snp'].values

    # Filter the genotype DataFrame for the SNPs in the window
    X_train_filtered = X_train.loc[:, snp_ids]

    # y_train has one column per gene, so the gene's expression is selected
    # directly. For unique gene symbols this is the same Series as
    # y_train.transpose().loc[the_gene], without re-transposing the whole
    # expression matrix on every iteration.
    # Values are coerced to numeric; anything non-numeric becomes NaN.
    y = pd.to_numeric(y_train[the_gene], errors='coerce').rename('gene_expression')

    for snp in X_train_filtered.columns:
        X = pd.to_numeric(X_train_filtered[snp], errors='coerce')  # Convert SNP data to numeric
        X = sm.add_constant(X)  # Add an intercept for the linear regression

        # Align on individual ID and drop rows with missing or invalid data (NaN)
        valid_data = pd.concat([y, X], axis=1).dropna()

        # Run the regression only if valid data exists
        if len(valid_data) > 0:
            # Column 0 is the response; every remaining column is a predictor
            model = sm.OLS(valid_data['gene_expression'], valid_data.iloc[:, 1:]).fit()

            if len(model.pvalues) > 1:
                # Position 1 is the SNP term (position 0 is the intercept)
                chr22_results_list.append({
                    'Gene': the_gene,
                    'SNP': snp,
                    'p-value': model.pvalues.iloc[1],
                    'beta': model.params.iloc[1],
                    'r_squared': model.rsquared
                })
            else:
                # Only one coefficient was fitted. NOTE(review): presumably the
                # SNP is constant across the training samples, in which case
                # add_constant does not add a separate intercept -- confirm.
                print(f"Insufficient model output for SNP {snp}, skipping.")

# Convert the list of results into a DataFrame
chr22_results_df = pd.DataFrame(chr22_results_list)

Insufficient model output for SNP rs17001322, skipping.
Insufficient model output for SNP rs8142085, skipping.
Insufficient model output for SNP rs8142085, skipping.
Insufficient model output for SNP rs138168, skipping.
Insufficient model output for SNP rs138171, skipping.
Insufficient model output for SNP rs138168, skipping.
Insufficient model output for SNP rs8142085, skipping.
Insufficient model output for SNP rs5999980, skipping.
Insufficient model output for SNP rs8142085, skipping.
Insufficient model output for SNP rs8142085, skipping.
Insufficient model output for SNP rs8142085, skipping.
Insufficient model output for SNP rs17001322, skipping.
Insufficient model output for SNP rs5999980, skipping.
Insufficient model output for SNP rs8142085, skipping.
Insufficient model output for SNP rs17001322, skipping.
Insufficient model output for SNP rs5999980, skipping.
Insufficient model output for SNP rs138168, skipping.
Insufficient model output for SNP rs138171, skipping.
Insufficient

In [54]:
# Write the chromosome 22 association results as a tab-separated file, without the row index
chr22_results_df.to_csv('chr22_eqtl_data.txt', index=False, sep='\t')

In [None]:
# For the report, the same analysis is repeated below for each of the other chromosomes:

### Chromosome 1

In [53]:
# Chromosome 1 inputs: PLINK genotypes plus a fresh read of the expression table
bim1, fam1, G1 = pdpl.read_plink("data/EUR.1.test/1000G.EUR.1")
gene_expr1 = pd.read_csv(
    'data/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz',
    sep='\t',
    compression='gzip',
)

# Genotype table with one row per individual (fam1['iid']) and one column per
# SNP (bim1['snp']); G1 is transposed so that individuals end up on the rows.
geno_df1 = pd.DataFrame(G1.compute().T, index=fam1['iid'], columns=bim1['snp'])

# Expression table re-oriented the same way: rows become the original column
# labels, columns become gene symbols, and the row axis is named 'iid'.
gene_expr_t1 = gene_expr1.set_index('Gene_Symbol').T.rename_axis('iid')

# Only labels present in both the expression table and the fam file are kept,
# so both tables cover exactly the same individuals.
matching_ids1 = gene_expr_t1.index.intersection(fam1['iid'])
geno_df_filtered1 = geno_df1.loc[matching_ids1]
gene_expr_filtered1 = gene_expr_t1.loc[matching_ids1]

# Hold out 25% of the individuals; the fixed random_state keeps the split reproducible.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    geno_df_filtered1,
    gene_expr_filtered1,
    test_size=0.25,
    random_state=1,
)

Mapping files: 100%|██████████| 3/3 [00:01<00:00,  2.14it/s]


In [66]:
# cis-eQTL scan for chromosome 1 (training samples only).
# For every gene on the chromosome, each SNP within +/- window_bp of the gene's
# 'Coord' is tested on its own with a simple linear regression:
#     expression ~ intercept + genotype
chromosome = "1"
chr1_genes = gene_expr1[gene_expr1['Chr'] == chromosome]['Gene_Symbol']

# Half-width of the window around each gene's coordinate, in base pairs
window_bp = 500000

# The chromosome filter on the SNP table does not depend on the gene, so it is
# applied once here instead of once per gene.
bim_on_chrom = bim1[bim1['chrom'] == chromosome]

# Initialize a list to store results for each SNP and gene as rows
chr1_results_list = []

for the_gene in chr1_genes:
    gene_info = gene_expr1[gene_expr1['Gene_Symbol'] == the_gene]
    gene_coord = int(gene_info['Coord'].iloc[0])  # looked up once per gene
    snps_in_window = bim_on_chrom[(bim_on_chrom['pos'] >= gene_coord - window_bp) &
                                  (bim_on_chrom['pos'] <= gene_coord + window_bp)]

    # Get the SNP IDs from the filtered SNPs
    snp_ids = snps_in_window['snp'].values

    # Filter the genotype DataFrame for the SNPs in the window
    X_train_filtered = X_train1.loc[:, snp_ids]

    # y_train1 has one column per gene, so the gene's expression is selected
    # directly. For unique gene symbols this is the same Series as
    # y_train1.transpose().loc[the_gene], without re-transposing the whole
    # expression matrix on every iteration.
    # Values are coerced to numeric; anything non-numeric becomes NaN.
    y = pd.to_numeric(y_train1[the_gene], errors='coerce').rename('gene_expression')

    for snp in X_train_filtered.columns:
        X = pd.to_numeric(X_train_filtered[snp], errors='coerce')  # Convert SNP data to numeric
        X = sm.add_constant(X)  # Add an intercept for the linear regression

        # Align on individual ID and drop rows with missing or invalid data (NaN)
        valid_data = pd.concat([y, X], axis=1).dropna()

        # Run the regression only if valid data exists
        if len(valid_data) > 0:
            # Column 0 is the response; every remaining column is a predictor
            model = sm.OLS(valid_data['gene_expression'], valid_data.iloc[:, 1:]).fit()

            if len(model.pvalues) > 1:
                # Position 1 is the SNP term (position 0 is the intercept)
                chr1_results_list.append({
                    'Gene': the_gene,
                    'SNP': snp,
                    'p-value': model.pvalues.iloc[1],
                    'beta': model.params.iloc[1],
                    'r_squared': model.rsquared
                })
            else:
                # Only one coefficient was fitted. NOTE(review): presumably the
                # SNP is constant across the training samples, in which case
                # add_constant does not add a separate intercept -- confirm.
                print(f"Insufficient model output for SNP {snp}, skipping.")

# Convert the list of results into a DataFrame
chr1_results_df = pd.DataFrame(chr1_results_list)

Insufficient model output for SNP rs2782831, skipping.
Insufficient model output for SNP rs9429946, skipping.
Insufficient model output for SNP rs4259582, skipping.
Insufficient model output for SNP rs2501420, skipping.
Insufficient model output for SNP rs7537933, skipping.
Insufficient model output for SNP rs12075758, skipping.
Insufficient model output for SNP rs4259582, skipping.
Insufficient model output for SNP rs2501420, skipping.
Insufficient model output for SNP rs7537933, skipping.
Insufficient model output for SNP rs7531646, skipping.
Insufficient model output for SNP rs12075758, skipping.
Insufficient model output for SNP rs12065314, skipping.
Insufficient model output for SNP rs11805314, skipping.
Insufficient model output for SNP rs2819390, skipping.
Insufficient model output for SNP rs2819386, skipping.
Insufficient model output for SNP rs7547835, skipping.
Insufficient model output for SNP rs2782831, skipping.
Insufficient model output for SNP rs9429946, skipping.
Insuff

KeyboardInterrupt: 

In [None]:
# Write the chromosome 1 association results as a tab-separated file, without the row index
chr1_results_df.to_csv('chr1_eqtl_data.txt', index=False, sep='\t')

### Chromosome 14

In [None]:
# Chromosome 14 inputs: PLINK genotypes plus the expression table
bim14, fam14, G14 = pdpl.read_plink("data/EUR.14.test/1000G.EUR.14")
gene_expr14 = pd.read_csv('data/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz', compression='gzip', sep='\t')

# Combine the files so that we have individuals as rows, and their SNPs as columns.
# The genotype matrix must be this chromosome's own G14 so that it matches the
# SNP ids in bim14 and the individual ids in fam14.
geno_df14 = pd.DataFrame(G14.compute().transpose())
geno_df14.columns = bim14['snp']  # Use SNP IDs as columns
geno_df14.index = fam14['iid']  # Use individual IDs as rows

# Transpose gene expression data so that individual IDs are in the index as well
gene_expr_t14 = gene_expr14.set_index('Gene_Symbol').transpose()

# The data isn't always consistent across all the files, so we want to only pull data we have all the information for
# Extract the individual IDs that match those in fam14['iid']
gene_expr_t14.index.name = 'iid'
matching_ids14 = gene_expr_t14.index.intersection(fam14['iid'])

# Filter both dataframes to only include these matching IDs
geno_df_filtered14 = geno_df14.loc[matching_ids14]
gene_expr_filtered14 = gene_expr_t14.loc[matching_ids14]

# Split into train and test (25% held out; fixed random_state for reproducibility)
X_train14, X_test14, y_train14, y_test14 = train_test_split(geno_df_filtered14, gene_expr_filtered14, test_size=0.25, random_state=1)

In [None]:
# cis-eQTL scan for chromosome 14 (training samples only).
# For every gene on the chromosome, each SNP within +/- window_bp of the gene's
# 'Coord' is tested on its own with a simple linear regression:
#     expression ~ intercept + genotype
# Only the chromosome-14 objects (gene_expr14, bim14, X_train14, y_train14) are
# read here, so the cell does not depend on the chromosome-1 cells having run.
chromosome = "14"
chr14_genes = gene_expr14[gene_expr14['Chr'] == chromosome]['Gene_Symbol']

# Half-width of the window around each gene's coordinate, in base pairs
window_bp = 500000

# The chromosome filter on the SNP table does not depend on the gene, so it is
# applied once here instead of once per gene.
bim_on_chrom = bim14[bim14['chrom'] == chromosome]

# Initialize a list to store results for each SNP and gene as rows
chr14_results_list = []

for the_gene in chr14_genes:
    gene_info = gene_expr14[gene_expr14['Gene_Symbol'] == the_gene]
    gene_coord = int(gene_info['Coord'].iloc[0])  # looked up once per gene
    snps_in_window = bim_on_chrom[(bim_on_chrom['pos'] >= gene_coord - window_bp) &
                                  (bim_on_chrom['pos'] <= gene_coord + window_bp)]

    # Get the SNP IDs from the filtered SNPs
    snp_ids = snps_in_window['snp'].values

    # Filter the genotype DataFrame for the SNPs in the window
    X_train_filtered = X_train14.loc[:, snp_ids]

    # y_train14 has one column per gene, so the gene's expression is selected
    # directly. For unique gene symbols this is the same Series as
    # y_train14.transpose().loc[the_gene], without re-transposing the whole
    # expression matrix on every iteration.
    # Values are coerced to numeric; anything non-numeric becomes NaN.
    y = pd.to_numeric(y_train14[the_gene], errors='coerce').rename('gene_expression')

    for snp in X_train_filtered.columns:
        X = pd.to_numeric(X_train_filtered[snp], errors='coerce')  # Convert SNP data to numeric
        X = sm.add_constant(X)  # Add an intercept for the linear regression

        # Align on individual ID and drop rows with missing or invalid data (NaN)
        valid_data = pd.concat([y, X], axis=1).dropna()

        # Run the regression only if valid data exists
        if len(valid_data) > 0:
            # Column 0 is the response; every remaining column is a predictor
            model = sm.OLS(valid_data['gene_expression'], valid_data.iloc[:, 1:]).fit()

            if len(model.pvalues) > 1:
                # Position 1 is the SNP term (position 0 is the intercept)
                chr14_results_list.append({
                    'Gene': the_gene,
                    'SNP': snp,
                    'p-value': model.pvalues.iloc[1],
                    'beta': model.params.iloc[1],
                    'r_squared': model.rsquared
                })
            else:
                # Only one coefficient was fitted. NOTE(review): presumably the
                # SNP is constant across the training samples, in which case
                # add_constant does not add a separate intercept -- confirm.
                print(f"Insufficient model output for SNP {snp}, skipping.")

# Convert the list of results into a DataFrame
chr14_results_df = pd.DataFrame(chr14_results_list)

In [None]:
# Write the chromosome 14 association results as a tab-separated file, without the row index
chr14_results_df.to_csv('chr14_eqtl_data.txt', index=False, sep='\t')