# Selecting Viral Search Terms from FinnGen

In [None]:
# Imports here.
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Run everything below from the folder holding the FinnGen downloads.
finngen_dir = '/PATH/TO/FinnGen_Files'
os.chdir(finngen_dir)

In [None]:
# Per-disease survival-analysis exports (downloaded from FinnGen on 5-5-22),
# listed in the same order as the variables they are unpacked into below.
survival_files = (
    'G6_ALZHEIMER_survival-analyses.csv',
    'G6_ALS_survival-analyses.csv',
    'F5_DEMENTIA_survival-analyses.csv',
    'G6_MS_survival-analyses.csv',
    'G6_PARKINSON_survival-analyses.csv',
    'F5_VASCDEM_survival-analyses.csv',
)
AD_finn, ALS_finn, dementia_finn, MS_finn, PD_finn, vascular_finn = map(
    pd.read_csv, survival_files
)

# Master table of every FinnGen endpoint (tab-separated; download from FinnGen).
finngen = pd.read_csv("finngen_endpoints.tsv", sep="\t")
print(finngen.shape[0])

In [None]:
# Stack the per-disease tables into one frame with a fresh 0..n-1 index.
disease_frames = [AD_finn, ALS_finn, dementia_finn, MS_finn, PD_finn, vascular_finn]
df = pd.concat(disease_frames, ignore_index=True)
print(df.shape[0])

# Distinct values of the 'prior_name' column across all disease tables.
prior_list = set(df['prior_name'])
print("Number of endpoints with NDD data:", len(prior_list))

In [None]:
# Keep only the endpoints whose phenocode appears in prior_list,
# i.e. the categories we actually have data for.
has_prior_data = finngen['phenocode'].isin(prior_list)
finngen_data = finngen[has_prior_data]
print(finngen_data.shape[0])

In [None]:
#Create a lowercase Series of all the code descriptions
my_list = finngen_data["phenotype"].str.lower()

#List of all viral-related keywords; each one is matched as a plain substring
#of the lowercased description.
# NOTE(review): "bell's_palsy" contains an underscore, so it only matches
# descriptions spelled that way; 's palsy' covers the spaced spelling -- confirm.
virus = ['viral', 'virus', 'mononucleosis', 'epstein', 'ebv', 'encephalitis', 'hepatitis', 'meningitis', 'warts',
         'influenza', "bell's_palsy", 's palsy', 'chicken', 'shingles', 'zoster', 'measles', 'varicella', 'herpes',
         'myocarditis', 'erythema multiforme', "subacute thyroiditis", 'cold']

# Work on an independent copy: the loop below adds one column per keyword, and
# writing those into a filtered view of another frame triggers pandas'
# SettingWithCopyWarning (and is not guaranteed to stick).
finngen_data = finngen_data.copy()

# Collect the matching rows for each keyword, then combine them once at the end.
matches = []
for sub in virus:
    # Position of the keyword in each description: -1 when absent,
    # NaN when the description itself is missing.
    finngen_data[sub] = my_list.str.find(sub)
    # `>= 0` keeps real hits only. A `!= -1` test would also select rows with a
    # missing description, because NaN != -1 evaluates to True.
    df1 = finngen_data.loc[finngen_data[sub] >= 0]
    print(sub, 'codes:', len(df1))
    matches.append(df1)

# DataFrame.append was removed in pandas 2.0; a single pd.concat yields the same
# stacked frame (original row index kept, columns in first-seen order) without
# re-copying the accumulated result on every iteration.
df = pd.concat(matches)

In [None]:
# A code can match several keywords; keep only its first occurrence.
total_codes = df.drop_duplicates(subset='phenocode', keep='first')
print("Without duplicates:", total_codes.shape[0])

In [None]:
# Reduce to the code, its description, and its number of cases.
columns_to_keep = ['phenocode', 'phenotype', 'number of cases']
search_terms = total_codes[columns_to_keep]

print(search_terms.shape[0])
search_terms

In [None]:
# Write the search terms to a CSV (row index omitted) for use in the analysis.
output_file = "FinnGen_search_terms.csv"
search_terms.to_csv(output_file, index=False)