# UKB Virus Cases

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# may also hide pandas / statsmodels diagnostics raised by the cells below --
# consider narrowing the filter.
warnings.filterwarnings("ignore")

#Directory in Biowulf
# All relative file names read below are resolved against this folder.
os.chdir('/PATH/TO/UKB_Files')

# Loading Files

In [None]:
# Reference tables for UKB; all three files are tab-separated.
# NDD-free controls: subset of phenome, all European ancestry.
NDD_free = pd.read_csv('ALL_NDD_FREE_CONTROLS_AGE60PLUS.txt', sep='\t')
# Covariates for all individuals.
phenome = pd.read_csv('covariates_phenome_to_use.txt', sep='\t')
# ICD10 codes per individual; read without a header row, so the columns are
# labelled 0, 1, ...
massive_ICD10 = pd.read_csv('massive_ICD10_ALL_table.txt', sep='\t', header=None)

In [None]:
# Disease-specific ID files for UKB.
# Tab-separated tables that carry a header row:
AD = pd.read_csv("alzheimer_disease.txt", sep='\t')
PD = pd.read_csv('parkinson_disease.txt', sep='\t')
# Header-less ID lists, parsed with pandas' default separator:
ALS = pd.read_csv('ALS.IDs', header=None)
dementia = pd.read_csv('Dementia.IDs', header=None)
vascular = pd.read_csv('Vascular.IDs', header=None)

In [None]:
# Multiple sclerosis cases are derived from the ICD10 table rather than from a
# dedicated file: G35 is the ICD10 code for multiple sclerosis.
G35 = massive_ICD10[massive_ICD10[1].eq('G35')]
list_MS = G35[0]
# Keep the phenome rows of those individuals and expose the FID column under
# the label 0, which is the column the case-selection cell reads IDs from.
MS = phenome[phenome['IID'].isin(list_MS)].rename(columns={'FID': 0})

# Adding Disease Column

In [None]:
# Choose the disease (NDD) to analyse here.
# `ndd` is the text label used for the outcome column and the output file
# name; `NDD` is the matching table of case IDs loaded earlier.
ndd = "AD"
NDD = AD

In [None]:
# Controls: flag every row as unaffected (0) for the chosen disease, discard
# the FID, BATCH and EUROPEAN columns, and expose the individual ID as 'ID'.
NDD_free = (
    NDD_free
    .assign(**{ndd: 0})
    .drop(columns=['FID', 'BATCH', "EUROPEAN"])
    .rename(columns={'IID': 'ID'})
)
print("Number of controls:", len(NDD_free))

In [None]:
# Cases: the ID column of the disease table is named 'eid' in some tables and
# 0 in the others; normalise it to 0 before collecting the IDs.
NDD = NDD.rename(columns={'eid': 0})
NDD_list = NDD[0].tolist()

# Select the phenome rows of those IDs, keep only rows with EUROPEAN == 1 (to
# match the controls), give the table the same layout as the control table
# and flag every row as affected (1).
has_NDD = (
    phenome[phenome['IID'].isin(NDD_list)]
    .loc[lambda cases: cases['EUROPEAN'] == 1]
    .drop(columns=['FID', 'BATCH', "EUROPEAN"])
    .rename(columns={'IID': 'ID'})
    .assign(**{ndd: 1})
)

print(f"Number of individuals with {ndd}:", len(has_NDD))

In [None]:
# Stack controls (NDD_free) and cases (has_NDD) into one analysis table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own row labels, and the same label can then occur twice in
# the combined frame.
df = pd.concat([NDD_free, has_NDD], ignore_index=True)
df.head()

# Adding ICD10 Codes to dataframe

In [None]:
# Viral exposure definitions -- see Supplementary Table 1.
search_terms = pd.read_csv('Supplementary_Table_1 - FinnGen_Codes.csv')

# Distinct phenocodes and descriptions, taken from the full table.
phenocode_list = search_terms['FinnGen Phenocode'].drop_duplicates().tolist()
predictor_meaning = search_terms['Description'].drop_duplicates().tolist()

# ICD10 code strings of the rows whose cohort is UKB.
ukb_codes = search_terms[search_terms['Cohort'] == 'UKB']
ukb_code_list = ukb_codes['ICD10 Codes'].tolist()

# NOTE(review): later cells pair unique_codes[j] with phenocode_list[j] and
# predictor_meaning[j] by position, which assumes the three lists line up
# entry for entry -- TODO confirm against the table.

# Every comma-separated code string becomes a list of (not yet stripped) codes.
unique_codes = [code_string.split(',') for code_string in ukb_code_list]

# Preview: the codes of the first entry with surrounding whitespace removed.
clean = [piece.strip() for piece in unique_codes[0]]
print(clean)

In [None]:
# Report how many code groups and how many phenocodes were loaded.
print(f"Unique codes: {len(unique_codes)}")
print(f"Phenocode_list: {len(phenocode_list)}")
phenocode_list

# Flatten the per-phenocode code lists into a single list of stripped codes.
flat_list = [raw.strip() for group in unique_codes for raw in group]

In [None]:
#Finding Viral codes in ICD10 list
# Scan the ICD10 table once instead of once per code: keep only the rows whose
# code (column 1) is one of the codes of interest, then collect the individual
# IDs (column 0) recorded under each code.
relevant_rows = massive_ICD10.loc[massive_ICD10[1].isin(flat_list), [0, 1]]
ids_by_code = {
    icd_code: ids.to_numpy()
    for icd_code, ids in relevant_rows.groupby(1)[0]
}

# One 0/1 column per code: 1 when the individual's ID is listed under that
# exact code, 0 otherwise (including codes that never occur in the table).
for code in flat_list:
    carriers = ids_by_code.get(code, [])
    df[code] = np.where(df['ID'].isin(carriers), 1, 0)

In [None]:
#Checking that ICD10 columns were added
# Shows the column labels of df.
df.columns

In [None]:
# Compare the two list lengths and peek at the first entry of each list.
print(f"Unique codes: {len(unique_codes)}")
print(f"Phenocode_list: {len(phenocode_list)}")
print(unique_codes[0])
print(phenocode_list[0])

In [None]:
#Adding the FinnGen Groupings to the dataframe
# For every group of ICD10 codes, add one 0/1 column named after the phenocode
# at the same position: 1 when at least one of the group's code columns is 1.
for j, raw_codes in enumerate(unique_codes):
    phenocode = phenocode_list[j]
    cols = [raw.strip() for raw in raw_codes]

    # Row-wise count of matching codes, capped at 1. The capped Series is
    # assigned explicitly rather than written through `.values`, which only
    # works when `.values` happens to be a writable view of the column.
    df[phenocode] = df[cols].sum(axis=1).clip(upper=1)

In [None]:
#Fill nan values with 0
# NOTE(review): this applies to every column of df, so a missing value in a
# covariate column is replaced by 0 as well -- confirm that is intended.
df = df.fillna(0)
df.head()

In [None]:
# Create lists for the regression
# predictor_list is another name for phenocode_list (the same list object).
# predictor_meaning was already built when the code table was loaded and is
# used unchanged, so it needs no re-assignment here.
predictor_list = phenocode_list

# Regressions

In [None]:
# Run one logistic regression per viral exposure and collect the results.
from statsmodels.stats.multitest import fdrcorrection  # used by the FDR cell below

results = []

# Each model is a binomial GLM:
#   <ndd> ~ <exposure> + AGE_OF_RECRUIT + TOWNSEND + GENETIC_SEX
for position, predictor_name in enumerate(predictor_list):
    predictor_description = predictor_meaning[position]

    # Q("...") is patsy's quoting helper: it looks the column up by name in
    # the `data` argument, so the formula does not depend on a global called
    # `df` and also works for names that are not valid Python identifiers.
    term = 'Q("' + predictor_name + '")'
    this_formula = ndd + " ~ " + term + " + AGE_OF_RECRUIT + TOWNSEND + GENETIC_SEX"
    fitted = smf.glm(formula=this_formula, family=sm.families.Binomial(), data=df).fit()

    # Statistics of the exposure term only.
    beta_coef = fitted.params.loc[term]
    beta_se = fitted.bse.loc[term]
    p_val = fitted.pvalues.loc[term]
    odds_ratio = np.exp(beta_coef)
    # Confidence bounds of the coefficient (conf_int's default level),
    # exponentiated to the odds-ratio scale.
    ci_low, ci_high = np.exp(fitted.conf_int().loc[term])

    # n: number of individuals with the exposure;
    # n_pairs: how many of those also have the disease.
    exposed = df[predictor_name] == 1
    n = df[predictor_name].sum()
    n_pairs = df.loc[exposed, ndd].sum()

    print(predictor_name, odds_ratio, ci_low, ci_high, p_val, n_pairs, n)
    results.append((ndd, predictor_name, predictor_description, odds_ratio, ci_low, ci_high, p_val, n_pairs, n))

output = pd.DataFrame(results, columns=('NDD','CODE', 'DESCRIPTION','odds_ratio', 'ci_min', "ci_max", 'P_VAL', "N_pairs", "N"))

In [None]:
# Keep only the codes whose N_pairs is greater than 2 (at least 3 pairings).
output = output.loc[output['N_pairs'].gt(2)]
output

In [None]:
# FDR correction of the regression p-values.
# Order the rows by ascending p-value, then discard rows containing any NaN;
# fdrcorrection is told that the p-values it receives are already sorted.
output = output.sort_values(by="P_VAL").dropna()
rejected, p_corr = fdrcorrection(output['P_VAL'], is_sorted=True)

# Append the corrected p-values and the reject flags as new columns.
output = output.assign(P_CORR=p_corr, REJECTED=rejected)

In [None]:
#Check results
# Shows the first rows of the corrected results table.
output.head()

In [None]:
# Write the results table for the chosen disease to CSV, without the index.
output.to_csv(f"/PATH/TO/regression_results/{ndd}_virus_UKB_ALL.csv", index=False)