# UKB Virus Cases

In [1]:
# Imports here.
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
warnings.filterwarnings("ignore")

#Directory in Biowulf
os.chdir('/PATH/TO/UKB_Files')

# Loading Files

In [2]:
#Loading all the reference files
NDD_free = pd.read_csv('ALL_NDD_FREE_CONTROLS_AGE60PLUS.txt', delimiter='\t')
phenome = pd.read_csv('covariates_phenome_to_use.txt', delimiter='\t')
massive_ICD10 = pd.read_csv('massive_ICD10_ALL_table.txt', delimiter='\t', header = None)

In [3]:
#Loading all the disease files
AD = pd.read_csv("alzheimer_disease.txt", delimiter = '\t')
ALS = pd.read_csv('ALS.IDs', header = None) 
dementia = pd.read_csv('Dementia.IDs', header = None)
PD = pd.read_csv('parkinson_disease.txt', delimiter='\t')
vascular = pd.read_csv('Vascular.IDs', header = None)

In [4]:
#Getting MS IDs
#G35 is the ICD10 code for Multiple sclerosis
G35 = massive_ICD10.loc[massive_ICD10[1] == 'G35']
list_MS = G35[0]
MS = phenome[phenome['IID'].isin(list_MS)]
MS = MS.rename(columns = {'FID': 0})
#MS

# Adding Disease Column

In [5]:
#Pick your NDD here
NDD = AD
ndd = "AD"

In [6]:
#Add disease column to NDD free df
NDD_free[ndd] = 0

#Drop FID, Batch, and European
NDD_free = NDD_free.drop(columns = ['FID', 'BATCH', "EUROPEAN"])

#Rename ID column
NDD_free = NDD_free.rename(columns = {'IID': 'ID'})
print("Number of controls:", len(NDD_free))
#NDD_free

Number of controls: 96390


In [7]:
#Creating df of people with AD
NDD = NDD.rename(columns = {'eid': 0})
NDD_list = list(NDD[0])
has_NDD = phenome[phenome['IID'].isin(NDD_list)]

#Only select Europeans
has_NDD = has_NDD[has_NDD['EUROPEAN'] == 1]

#Drop FID, Batch, and European
has_NDD = has_NDD.drop(columns = ['FID', 'BATCH', "EUROPEAN"])

#Rename ID column
has_NDD = has_NDD.rename(columns = {'IID': 'ID'})

#Add NDD column
has_NDD[ndd] = 1

print(f"Number of individuals with {ndd}:", len(has_NDD))

Number of individuals with AD: 2342


In [8]:
#Combine NDD_free and has_NDD
df = pd.concat([NDD_free, has_NDD])
df

Unnamed: 0,ID,BIRTH_YEAR,TOWNSEND,AGE_OF_RECRUIT,GENETIC_SEX,AD
0,1000012,1949,-4.901740,61,1,0
1,1000047,1943,-2.440140,65,0,0
2,1000068,1948,-4.377210,61,1,0
3,1000085,1944,-0.774111,65,0,0
4,1000094,1946,-1.702940,61,1,0
...,...,...,...,...,...,...
501957,6019584,1939,3.642590,69,0,1
502065,6020668,1941,-4.469670,68,0,1
502072,6020733,1940,-2.586190,69,0,1
502341,6023425,1948,-3.100560,60,1,1


# Adding ICD10 Codes to dataframe

In [9]:
#Number of codes
search_terms = pd.read_csv('/PATH/TO/FinnGen_UKB_Codes.csv')
search_terms = search_terms.drop(columns=['number of cases - UKB'])
search_terms = search_terms.dropna()
search_terms = search_terms[['phenocode', 'UKB Codes', 'phenotype']]

phenocode_list = list(search_terms['phenocode'])
ukb_code_list = list(search_terms['UKB Codes'])
predictor_meaning = list(search_terms['phenotype'])

unique_codes = []
for code in ukb_code_list:
    unique_codes.append(code.split(','))

clean = []
for i in unique_codes[0]:
    clean.append(i.strip())
print(clean)

['A080', 'A081', 'A082', 'A083', 'A084', 'A085']


In [10]:
print("Unique codes:", len(unique_codes))
print("Phenocode_list:", len(phenocode_list))
phenocode_list

flat_list = []
for xs in unique_codes:
    for x in xs:
        flat_list.append(x.strip())
        
#print(flat_list)

Unique codes: 32
Phenocode_list: 32


In [11]:
#Finding Viral codes in ICD10 list
for code in flat_list:
    viral_ICD10 = massive_ICD10[massive_ICD10[1] == code]
    viral_ICD10 = viral_ICD10.rename(columns = {0: 'ID', 1: "Code"})
    viral_ICD10 = list(viral_ICD10["ID"])
    df[code] = np.where(df['ID'].isin(viral_ICD10), 1, 0)

In [12]:
#Checking that ICD10 columns were added
df.columns

Index(['ID', 'BIRTH_YEAR', 'TOWNSEND', 'AGE_OF_RECRUIT', 'GENETIC_SEX', 'AD',
       'A080', 'A081', 'A082', 'A083',
       ...
       'I401', 'I408', 'I409', 'I41', 'L511', 'L512', 'L518', 'L519', 'E061',
       'J00'],
      dtype='object', length=185)

In [13]:
print("Unique codes:", len(unique_codes))
#print(unique_codes)
print("Phenocode_list:", len(phenocode_list))
#print(phenocode_list)
print(unique_codes[0])
print(phenocode_list[0])

Unique codes: 32
Phenocode_list: 32
['A080', ' A081', ' A082', ' A083', ' A084', ' A085']
AB1_VIRAL_OTHER_INTEST_INFECTIONS


In [14]:
#Adding the FinnGen Groupings to the dataframe
for j in range(len(unique_codes)):
    cols = []
    for i in unique_codes[j]:
        cols.append(i.strip())

    df[phenocode_list[j]] = df[cols].sum(axis=1)
    df[phenocode_list[j]].values[df[phenocode_list[j]] > 1] = 1

In [15]:
#Fill nan values with 0
df = df.fillna(0)
df

Unnamed: 0,ID,BIRTH_YEAR,TOWNSEND,AGE_OF_RECRUIT,GENETIC_SEX,AD,A080,A081,A082,A083,...,J10_INFLUPNEU,J10_INFLUENZA,INFLUENZA,G6_BELLPA,AB1_VARICELLA,AB1_ZOSTER,I9_MYOCARD,L12_ERYTHEMAMULTIF,E4_THYROIDITSUBAC,J10_COLD
0,1000012,1949,-4.901740,61,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000047,1943,-2.440140,65,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1000068,1948,-4.377210,61,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000085,1944,-0.774111,65,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000094,1946,-1.702940,61,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501957,6019584,1939,3.642590,69,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
502065,6020668,1941,-4.469670,68,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
502072,6020733,1940,-2.586190,69,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
502341,6023425,1948,-3.100560,60,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Create lists for the regression
predictor_list = phenocode_list
predictor_meaning = predictor_meaning

# Regressions

In [17]:
# Now nothing left to do is run the regressions and call it a day. 
from statsmodels.stats.multitest import fdrcorrection
results = []

# for predictor in range(1, 10):
for predictor in range(len(predictor_list)):
  predictor_name = predictor_list[predictor]
  predictor_description = predictor_meaning[predictor]
  this_formula = ndd + "~ df['" + predictor_list[predictor] + "']" + " + AGE_OF_RECRUIT + TOWNSEND + GENETIC_SEX"
  fitted = sm.formula.glm(formula=this_formula, family=sm.families.Binomial(), data=df).fit()
  beta_coef  = fitted.params.loc["df['" + predictor_name + "']"]
  beta_se  = fitted.bse.loc["df['" + predictor_name + "']"]
  p_val = fitted.pvalues.loc["df['" + predictor_name + "']"]
  odds_ratio = np.exp(fitted.params.loc["df['" + predictor_name + "']"])
  conf = fitted.conf_int().loc["df['" + predictor_name + "']"]
  m5, m95 = np.exp(conf)
  n = sum(df[predictor_name])
  df2 = df[df[predictor_name]==1]
  n_pairs = sum(df2[ndd])  

  print(predictor_name, odds_ratio, m5, m95, p_val, n_pairs, n)
  results.append((ndd, predictor_name, predictor_description, odds_ratio, m5, m95, p_val, n_pairs, n))

output = pd.DataFrame(results, columns=('NDD','CODE', 'DESCRIPTION','odds_ratio', 'ci_min', "ci_max", 'P_VAL', "N_pairs", "N"))

AB1_VIRAL_OTHER_INTEST_INFECTIONS 3.0940609168285516 1.9313412013323494 4.956769394471464 2.63476953172728e-06 19 249
AB1_VIRAL_HEMOR_FEVER_NOS 1.0000000000000413 1.000000000000037 1.0000000000000457 9.247159469078615e-77 0 0
AB1_VIRAL_WARTS 1.6597055966345904 0.8162518376400794 3.3747215509667416 0.16174341190687802 8 202
AB1_VIRAL_SKIN_MUCOUS_MEMBRANE 1.5170197441542745 0.9182941510782978 2.5061129938065756 0.10370302441554188 16 430
AB1_ANOGENITAL_HERPES_SIMPLEX 1.0000000000000413 1.000000000000037 1.0000000000000457 9.247159469078615e-77 0 0
J10_VIRALPNEUMO 1.3907272512295814 0.3381314634809393 5.7200305094400505 0.6475778591660974 2 57
H7_HERPESKERATITIS 5.681052009896327e-09 0.0 inf 0.9987790189476798 0 15
AB1_VIRAL_NOS 1.9050021122803429 1.1680858712979578 3.106820429015246 0.009807586698138374 17 378
AB1_VIRAL_MENINGITIS 6.339841866791054e-09 0.0 inf 0.998866415376507 0 13
AB1_VIRAL_INOTHER 2.534564708955023 1.023540105633524 6.276274108383951 0.04440403427280231 5 85
AB1_VIRAL

In [19]:
#Only looking at codes that have at least 3 pairings
output = output[output['N_pairs'] > 2]
output

Unnamed: 0,NDD,CODE,DESCRIPTION,odds_ratio,ci_min,ci_max,P_VAL,N_pairs,N
15,AD,AB1_BACT_BIR_OTHER_INF_AGENTS,"Bacterial, viral and other infectious agents",2.698269,2.307814,3.154785,1.464464e-35,183,2936
13,AD,AB1_OTHER_VIRAL,Other viral diseases,2.022796,1.287508,3.178004,0.002240781,20,424
12,AD,AB1_VIRAL_CNS,Viral infections of the central nervous system,5.199928,1.557018,17.36605,0.007370496,3,27
11,AD,AB1_VIRAL_ENCEPHALITIS_NOS,"Viral encephalitis, not elsewhere classified/u...",22.05926,5.471083,88.942325,1.367661e-05,3,9
10,AD,AB1_VIRAL_HEPATITIS,Viral hepatitis,4.218904,2.121242,8.390913,4.068017e-05,9,104
9,AD,AB1_VIRAL_INOTHER,Viral agents as the cause of diseases classifi...,2.534565,1.02354,6.276274,0.04440403,5,85
7,AD,AB1_VIRAL_NOS,"Other viral diseases, not elsewhere classified",1.905002,1.168086,3.10682,0.009807587,17,378
0,AD,AB1_VIRAL_OTHER_INTEST_INFECTIONS,Viral and other specified intestinal infections,3.094061,1.931341,4.956769,2.63477e-06,19,249
3,AD,AB1_VIRAL_SKIN_MUCOUS_MEMBRANE,Viral infections characterized by skin and muc...,1.51702,0.918294,2.506113,0.103703,16,430
2,AD,AB1_VIRAL_WARTS,Viral warts,1.659706,0.816252,3.374722,0.1617434,8,202


In [20]:
#Adding FDR Correction

#Sort P-values
output = output.sort_values(by = "P_VAL")

#Drop Nan-values
output = output.dropna()

#FDR Correction
rejected, p_corr = fdrcorrection(output['P_VAL'], is_sorted=True)
output['P_CORR'] = p_corr
output['REJECTED'] = rejected

In [21]:
output

Unnamed: 0,NDD,CODE,DESCRIPTION,odds_ratio,ci_min,ci_max,P_VAL,N_pairs,N,P_CORR,REJECTED
22,AD,J10_INFLUPNEU,Influenza and pneumonia,2.597245,2.254915,2.991545,5.473309e-40,231,3788,1.0946619999999999e-38,True
15,AD,AB1_BACT_BIR_OTHER_INF_AGENTS,"Bacterial, viral and other infectious agents",2.698269,2.307814,3.154785,1.464464e-35,183,2936,1.464464e-34,True
19,AD,MENINGITIS,Meningitis,62.19728,18.35365,210.775621,3.292997e-11,6,11,2.195332e-10,True
0,AD,AB1_VIRAL_OTHER_INTEST_INFECTIONS,Viral and other specified intestinal infections,3.094061,1.931341,4.956769,2.63477e-06,19,249,1.317385e-05,True
11,AD,AB1_VIRAL_ENCEPHALITIS_NOS,"Viral encephalitis, not elsewhere classified/u...",22.05926,5.471083,88.942325,1.367661e-05,3,9,5.470643e-05,True
10,AD,AB1_VIRAL_HEPATITIS,Viral hepatitis,4.218904,2.121242,8.390913,4.068017e-05,9,104,0.0001356006,True
13,AD,AB1_OTHER_VIRAL,Other viral diseases,2.022796,1.287508,3.178004,0.002240781,20,424,0.006151013,True
29,AD,L12_ERYTHEMAMULTIF,Erythema multiforme,6.737057,1.959963,23.157548,0.002460405,3,20,0.006151013,True
12,AD,AB1_VIRAL_CNS,Viral infections of the central nervous system,5.199928,1.557018,17.36605,0.007370496,3,27,0.01637888,True
7,AD,AB1_VIRAL_NOS,"Other viral diseases, not elsewhere classified",1.905002,1.168086,3.10682,0.009807587,17,378,0.01961517,True


In [22]:
#Save output
output.to_csv('/PATH/TO/regression_results/' + ndd + "_virus_UKB_ALL.csv", index=False)

In [23]:
#Check results
df2 = pd.read_csv('/PATH/TO/regression_results/' + ndd + "_virus_UKB_ALL.csv")
df2.head(25)

Unnamed: 0,NDD,CODE,DESCRIPTION,odds_ratio,ci_min,ci_max,P_VAL,N_pairs,N,P_CORR,REJECTED
0,AD,J10_INFLUPNEU,Influenza and pneumonia,2.597245,2.254915,2.991545,5.473309e-40,231,3788,1.0946619999999999e-38,True
1,AD,AB1_BACT_BIR_OTHER_INF_AGENTS,"Bacterial, viral and other infectious agents",2.698269,2.307814,3.154785,1.464464e-35,183,2936,1.464464e-34,True
2,AD,MENINGITIS,Meningitis,62.19728,18.35365,210.775621,3.292997e-11,6,11,2.195332e-10,True
3,AD,AB1_VIRAL_OTHER_INTEST_INFECTIONS,Viral and other specified intestinal infections,3.094061,1.931341,4.956769,2.63477e-06,19,249,1.317385e-05,True
4,AD,AB1_VIRAL_ENCEPHALITIS_NOS,"Viral encephalitis, not elsewhere classified/u...",22.05926,5.471083,88.942325,1.367661e-05,3,9,5.470643e-05,True
5,AD,AB1_VIRAL_HEPATITIS,Viral hepatitis,4.218904,2.121242,8.390913,4.068017e-05,9,104,0.0001356006,True
6,AD,AB1_OTHER_VIRAL,Other viral diseases,2.022796,1.287508,3.178004,0.002240781,20,424,0.006151013,True
7,AD,L12_ERYTHEMAMULTIF,Erythema multiforme,6.737057,1.959963,23.157548,0.002460405,3,20,0.006151013,True
8,AD,AB1_VIRAL_CNS,Viral infections of the central nervous system,5.199928,1.557018,17.36605,0.007370496,3,27,0.01637888,True
9,AD,AB1_VIRAL_NOS,"Other viral diseases, not elsewhere classified",1.905002,1.168086,3.10682,0.009807587,17,378,0.01961517,True
