# FinnGen Virus FDR Correction

In [1]:
# Imports here.
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection
import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# so deprecation or numerical warnings raised by later cells will not be shown.
warnings.filterwarnings("ignore")

#Directory in Biowulf
# Every CSV read or written later in this notebook uses a relative filename, so
# it is resolved against this working directory.
# NOTE(review): '/PATH/TO' looks like a placeholder - replace it with the real
# data location before running.
os.chdir('/PATH/TO/FinnGen_Data')

In [2]:
#Load the survival-analysis CSV for every disease
#(files downloaded from FinnGen on 5-5-22).
#The files are read in the order listed and bound to the names on the left.
(
    AD_finn,
    ALS_finn,
    dementia_finn,
    MS_finn,
    PD_finn,
    vascular_finn,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'G6_ALZHEIMER_survival-analyses.csv',
        'G6_ALS_survival-analyses.csv',
        'F5_DEMENTIA_survival-analyses.csv',
        'G6_MS_survival-analyses.csv',
        'G6_PARKINSON_survival-analyses.csv',
        'F5_VASCDEM_survival-analyses.csv',
    )
)

#Load our search terms, i.e. the list of viruses
search_terms = pd.read_csv('FinnGen_Search_Terms.csv')

In [3]:
#Pick your NDD here.
#NDD is the loaded disease table to analyse; ndd is the short label that is
#prefixed to the output CSV filename. Change both together so the saved file
#is named after the table that was actually analysed.
NDD, ndd = AD_finn, "AD"

In [4]:
#Endpoint codes of the viruses we want to look for
virus_list = search_terms['Endpoint'].tolist()

#Build the analysis table in one pass over the chosen NDD table
had_virus = (
    NDD
    #keep rows whose prior endpoint is in our list of possible viruses
    .loc[lambda df: df['prior_name'].isin(virus_list)]
    #keep only the rows with lag time 0
    .loc[lambda df: df['hr_lag'] == 0]
    #drop columns we don't need
    .drop(columns=['hr_lag', 'outcome_longname'])
    #fix the column selection and order
    .loc[:, ['outcome_name', 'prior_name', 'prior_longname', 'hr', 'ci_min', 'ci_max', 'p', 'N']]
    #give the identifying columns analysis-friendly names
    .rename(columns={
        'outcome_name': 'NDD',
        'prior_name': 'Viral_Code',
        'prior_longname': 'Virus_Description',
    })
)

In [5]:
#Sanity check: print how many rows survived the filtering,
#then display the first five rows
print(had_virus.shape[0])
had_virus.head(5)

20


Unnamed: 0,NDD,Viral_Code,Virus_Description,hr,ci_min,ci_max,p,N
96,G6_ALZHEIMER,AB1_VIRAL_ENCEPHALITIS_NOS,"Viral encephalitis, not elsewhere classified/u...",30.717384,11.841534,79.682047,1.89328e-12,24
786,G6_ALZHEIMER,J10_INFLUENZA,All influenza,6.809482,4.680332,9.907213,1.150327e-23,214
1109,G6_ALZHEIMER,E4_THYROIDITSUBAC,Subacute thyroiditis,4.976081,2.806601,8.822549,3.973861e-08,19
1140,G6_ALZHEIMER,G6_ENCEPHOTH,Other encephalitis,4.855506,1.650109,14.287499,0.00411059,21
1169,G6_ALZHEIMER,K11_CHRONHEP,"Chronic hepatitis, not elsewhere classified",4.708659,1.875044,11.824507,0.0009732839,16


In [6]:
#Adding FDR Correction

#Order the rows by ascending raw p-value; the correction call below is told
#that its input is already sorted, so this step must come first
had_virus = had_virus.sort_values(by="p")

#FDR correction using the fdrcorrection defaults (per the statsmodels
#documentation: Benjamini-Hochberg procedure, alpha=0.05).
#P_CORR holds the corrected p-values; REJECTED holds the accompanying boolean
#flags, both in the same sorted row order.
rejected, p_corr = fdrcorrection(had_virus["p"].to_numpy(), is_sorted=True)
had_virus = had_virus.assign(P_CORR=p_corr, REJECTED=rejected)

In [7]:
#Display the first five rows, now including the P_CORR and REJECTED columns
had_virus.head(5)

Unnamed: 0,NDD,Viral_Code,Virus_Description,hr,ci_min,ci_max,p,N,P_CORR,REJECTED
1344,G6_ALZHEIMER,J10_INFLUPNEU,Influenza and pneumonia,4.107179,3.382205,4.987549,4.014982e-46,2141,8.029964e-45,True
786,G6_ALZHEIMER,J10_INFLUENZA,All influenza,6.809482,4.680332,9.907213,1.150327e-23,214,1.150327e-22,True
96,G6_ALZHEIMER,AB1_VIRAL_ENCEPHALITIS_NOS,"Viral encephalitis, not elsewhere classified/u...",30.717384,11.841534,79.682047,1.89328e-12,24,1.262186e-11,True
1109,G6_ALZHEIMER,E4_THYROIDITSUBAC,Subacute thyroiditis,4.976081,2.806601,8.822549,3.973861e-08,19,1.98693e-07,True
1185,G6_ALZHEIMER,G6_BELLPA,Bell's palsy,4.649821,2.319405,9.321717,1.485348e-05,79,5.94139e-05,True


In [8]:
#Save to csv: "<ndd>_virus_finn_gen.csv", written relative to the current
#working directory and without the DataFrame index column
had_virus.to_csv(f"{ndd}_virus_finn_gen.csv", index=False)

In [9]:
#Read the saved CSV back from disk and display its first five rows
#to confirm the export can be loaded again
df2 = pd.read_csv(f"{ndd}_virus_finn_gen.csv")
df2.head(5)

Unnamed: 0,NDD,Viral_Code,Virus_Description,hr,ci_min,ci_max,p,N,P_CORR,REJECTED
0,G6_ALZHEIMER,J10_INFLUPNEU,Influenza and pneumonia,4.107179,3.382205,4.987549,4.014982e-46,2141,8.029964e-45,True
1,G6_ALZHEIMER,J10_INFLUENZA,All influenza,6.809482,4.680332,9.907213,1.150327e-23,214,1.150327e-22,True
2,G6_ALZHEIMER,AB1_VIRAL_ENCEPHALITIS_NOS,"Viral encephalitis, not elsewhere classified/u...",30.717384,11.841534,79.682047,1.89328e-12,24,1.262186e-11,True
3,G6_ALZHEIMER,E4_THYROIDITSUBAC,Subacute thyroiditis,4.976081,2.806601,8.822549,3.973861e-08,19,1.98693e-07,True
4,G6_ALZHEIMER,G6_BELLPA,Bell's palsy,4.649821,2.319405,9.321717,1.485348e-05,79,5.94139e-05,True
