# FinnGen Virus Both Directions

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection
import warnings
#Silence every warning for the rest of the session. Note that this also hides
#deprecation and numerical warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

#Directory in Biowulf; the read_csv calls below use file names relative to it
os.chdir('/PATH/TO/FinnGen_Data')

In [None]:
#Per-disease FinnGen survival-analysis tables (files downloaded on 5-5-22).
#The files are read in the order listed, which matches the names they are
#unpacked into below.
disease_files = [
    'G6_ALZHEIMER_survival-analyses.csv',
    'G6_ALS_survival-analyses.csv',
    'F5_DEMENTIA_survival-analyses.csv',
    'G6_MS_survival-analyses.csv',
    'G6_PARKINSON_survival-analyses.csv',
    'F5_VASCDEM_survival-analyses.csv',
]
(AD_finn, ALS_finn, dementia_finn,
 MS_finn, PD_finn, vascular_finn) = [pd.read_csv(path) for path in disease_files]

#Our search terms, i.e. the list of viruses -- created in previous step "FinnGen_Search_Terms"
search_terms = pd.read_csv('FinnGen_Search_Terms.csv')

In [None]:
#Pick your NDD here -- change only `ndd`; the table and the endpoint code follow from it.
#Each entry pairs a loaded disease table with the endpoint code that prefixes the
#file it was read from, so the table and its code can no longer get out of sync.
#NOTE(review): only the "AD" label appears in the original notebook. The other
#labels mirror the dataframe variable names and end up in the output file name --
#adjust them if downstream files use different prefixes.
NDD_OPTIONS = {
    "AD": (AD_finn, "G6_ALZHEIMER"),
    "ALS": (ALS_finn, "G6_ALS"),
    "dementia": (dementia_finn, "F5_DEMENTIA"),
    "MS": (MS_finn, "G6_MS"),
    "PD": (PD_finn, "G6_PARKINSON"),
    "vascular": (vascular_finn, "F5_VASCDEM"),
}
ndd = "AD"
#An unknown label fails here with a KeyError instead of producing mismatched settings
NDD, NDD_code = NDD_OPTIONS[ndd]

In [None]:
#Keep the rows whose prior or outcome endpoint is one of our candidate viruses
virus_list = list(search_terms['phenocode'])

#One boolean mask per direction, evaluated against the selected NDD table
prior_is_virus = NDD['prior_name'].isin(virus_list)
outcome_is_virus = NDD['outcome_name'].isin(virus_list)

had_virus_before = NDD.loc[prior_is_virus]
had_virus_after = NDD.loc[outcome_is_virus]

In [None]:
#Sanity check: number of rows found in each direction, one count per line
print(len(had_virus_before), len(had_virus_after), sep="\n")

In [None]:
#Stack the had_virus_before rows on top of the had_virus_after rows.
#Row labels are carried over from the NDD table (the index is not renumbered).
had_virus = pd.concat(objs=(had_virus_before, had_virus_after), axis=0)
had_virus

In [None]:
#Adding FDR Correction

#Sort P-values ascending (sort_values places missing p-values last)
had_virus = had_virus.sort_values(by="p")

#FDR correction with the library's default alpha and method.
#Only rows that actually have a p-value enter the correction: a NaN left in the
#sorted input propagates through the cumulative minimum inside fdrcorrection and
#turns every corrected p-value into NaN, and it would also count as an extra test.
#Rows without a p-value keep P_CORR = NaN and REJECTED = False.
#A plain numpy mask is used so the selection is positional and does not depend
#on the row labels being unique after the concat above.
has_p = had_virus['p'].notna().to_numpy()
had_virus['P_CORR'] = np.nan
had_virus['REJECTED'] = False
if has_p.any():
    #is_sorted=True is valid because the non-missing p-values are already ascending
    rejected, p_corr = fdrcorrection(had_virus.loc[has_p, 'p'], is_sorted=True)
    had_virus.loc[has_p, 'P_CORR'] = p_corr
    had_virus.loc[has_p, 'REJECTED'] = rejected

In [None]:
#Peek at the first five rows (the smallest p-values after the sort above)
had_virus.head(n=5)

In [None]:
#Making sure we have data in both directions
prior_list = had_virus['prior_name']
outcome_list = had_virus['outcome_name']

a_set = set(prior_list)
b_set = set(outcome_list)

#Always (re)bind `both`. Previously it was only assigned when the intersection
#was non-empty, so an empty intersection raised NameError on a fresh kernel or,
#worse, silently reused the value left over from an earlier run of this cell.
both = a_set & b_set
if not both:
    print("No common elements")

#Drop the NDD's own code so only the other endpoint codes remain. Filtering
#(instead of list.remove) does not raise when NDD_code is absent from the
#intersection, e.g. when one of the two directions has no rows at all.
both_lists = [code for code in both if code != NDD_code]

#Print list of viral codes for which we have data in both directions
print(len(both_lists))
print(both_lists)

In [None]:
#Restrict the FDR-annotated table to the codes seen in both directions
prior_in_both = had_virus['prior_name'].isin(both_lists)
outcome_in_both = had_virus['outcome_name'].isin(both_lists)

#Rows with a kept code as the prior endpoint come first, then rows with a kept
#code as the outcome endpoint
had_virus_before = had_virus.loc[prior_in_both]
had_virus_after = had_virus.loc[outcome_in_both]
had_virus = pd.concat([had_virus_before, had_virus_after])

#Number of rows kept, followed by a preview
print(len(had_virus))
had_virus.head()

In [None]:
#Write the final table to a CSV named after the selected NDD label, without the index column
out_path = "/PATH/TO/regression_results/" + ndd + "_finn_gen_BOTH.csv"
had_virus.to_csv(out_path, index=False)