# Selecting Viral Codes from the UKB

In [1]:
# Imports here.
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
warnings.filterwarnings("ignore")

#Directory in Biowulf holding the UKB input files; the files read and written
#further down are named relative to this working directory
UKB_DIR = '/PATH/TO/UKB_Files'
os.chdir(UKB_DIR)

In [2]:
#ICD10 codes and their definitions; the tree-structure columns are not needed here
ICD10 = (
    pd.read_csv("ICD10_coding.tsv", sep="\t")
    .drop(columns=['node_id', 'parent_id', 'selectable'])
)

#Table of IDs and codes; the file has no header row, so columns are labelled 0, 1, ...
massive_ICD10 = pd.read_csv('massive_ICD10_ALL_table.txt', sep='\t', header=None)

In [3]:
#Systematic process for finding ICD10 viral codes:
#search the code descriptions for "viral" and "virus", plus any viral diseases of interest.

#Lowercase copy of all the code descriptions, so the search ignores case
my_list = ICD10["meaning"].str.lower()


def find_codes(term):
    """Return the rows of ICD10 whose lowercased description contains `term`.

    Side effect: a column named `term` is written to ICD10 holding the position of
    the first occurrence of `term` in each description (-1 when it does not occur).

    A missing description yields a NaN position. Testing for ">= 0" keeps such rows
    out of the result, whereas a "!= -1" test would count them as matches for
    every search term.
    """
    ICD10[term] = my_list.str.find(term)
    return ICD10.loc[ICD10[term].ge(0)]


#One search per term. The order matters only for the order in which the
#position columns are added to ICD10.
viral = find_codes("viral")
virus = find_codes("virus")
mono = find_codes("mononucleosis")
epstein = find_codes("epstein")
ebv = find_codes("ebv")
encephalitis = find_codes("encephalitis")
hepatitis = find_codes("hepatitis")
meningitis = find_codes("meningitis")
warts = find_codes("warts")
influenza = find_codes("influenza")
#Descriptions are written with "_" instead of spaces, so "s_palsy" is the tail of "...'s_palsy"
bell = find_codes("s_palsy")
chicken = find_codes("chicken")
shingles = find_codes("shingles")
zoster = find_codes("zoster")
measles = find_codes("measles")
varicella = find_codes("varicella")
herpes = find_codes("herpes")

#Report how many codes each search term matched
match_report = [
    ("Viral codes:", viral),
    ("Virus codes:", virus),
    ("Mononucleosis codes: ", mono),
    ("Epstein-Barr codes: ", epstein),
    ("EBV codes:", ebv),
    ("Encephalitis codes:", encephalitis),
    ("Hepatitis codes:", hepatitis),
    ("Meningitis codes:", meningitis),
    ("Warts codes:", warts),
    ("Influenza codes:", influenza),
    ("Bell's palsy codes:", bell),
    ("Chicken pox codes:", chicken),
    ("varicella codes:", varicella),
    ("shingles codes:", shingles),
    ("zoster codes:", zoster),
    ("measles codes:", measles),
    ("herpes codes:", herpes),
]
for label, matched in match_report:
    print(label, len(matched))

Viral codes: 144
Virus codes: 63
Mononucleosis codes:  5
Epstein-Barr codes:  1
EBV codes: 0
Encephalitis codes: 41
Hepatitis codes: 47
Meningitis codes: 35
Warts codes: 2
Influenza codes: 18
Bell's palsy codes: 1
Chicken pox codes: 1
varicella codes: 6
shingles codes: 0
zoster codes: 9
measles codes: 11
herpes codes: 18


In [4]:
#Stack the per-term matches into one frame; a code that matched several terms appears several times
#NOTE(review): ebv, shingles and measles were searched but are not part of this list.
#ebv and shingles matched 0 codes in the recorded run, but measles matched 11 -
#confirm that leaving measles out is intentional.
matched_frames = [
    viral, virus, mono, epstein, encephalitis, hepatitis, meningitis,
    warts, influenza, bell, chicken, varicella, zoster, herpes,
]
all_matches = pd.concat(matched_frames)
print("Total codes:", len(all_matches))

#Keep a single row per ICD10 code (the first one encountered)
total_codes = all_matches.drop_duplicates(subset=['coding'])
print("Without duplicates:", len(total_codes))

Total codes: 391
Without duplicates: 310


In [5]:
#Codes to drop even though their description matched a search term:
#z-codes ("need for immunization"), y-codes ("Antiviral drugs" and "viral vaccines"),
#and the bacterial and alcoholic entries
imm_codes = ['Z24', 'Z241', 'Z246', 'Z25', "Z258", 'Y415', 'Y590', "J14", 'K701', 'B963']
is_excluded = total_codes['coding'].isin(imm_codes)
virus_codes = total_codes.loc[~is_excluded]

#Final number of possible viral codes in the UKB
print("Total unique codes:", len(virus_codes))

Total unique codes: 300


In [6]:
#Keep only the code and its description; the per-term position columns are no longer needed
search_terms = virus_codes[['coding', 'meaning']]

#Plain Python list of the candidate codes, in the same order as search_terms
unique_codes = search_terms['coding'].tolist()
print("Unique codes:", len(unique_codes))

Unique codes: 300


In [7]:
#Split the candidate codes by how many rows of the ID/code table carry them.
#
#Column 1 of massive_ICD10 holds the ICD10 code (column 0 holds the ID). The rows per
#code are counted once with value_counts() instead of re-filtering the whole table for
#every candidate code.
#NOTE(review): this counts rows, as the per-code filter did. If the table can repeat an
#(ID, code) pair, the count is rows rather than distinct IDs - confirm against the input file.

#Minimum number of samples a code needs to be kept
MIN_SAMPLES = 10

rows_per_code = massive_ICD10[1].value_counts()

codes_with_samples = []   #codes kept, in unique_codes order
not_enough_samples = []   #codes dropped for having too few samples
number_samples = []       #sample count for each kept code, parallel to codes_with_samples

for code in unique_codes:
    #A code that never occurs in the table is absent from the counts: treat it as 0
    n_rows = int(rows_per_code.get(code, 0))

    if n_rows >= MIN_SAMPLES:
        codes_with_samples.append(code)
        number_samples.append(n_rows)
    else:
        not_enough_samples.append(code)

print(f"There are {len(codes_with_samples)} codes with >= {MIN_SAMPLES} samples.")
print(f"We eliminated {len(not_enough_samples)} codes becasue they had less than {MIN_SAMPLES} samples.")

There are 95 codes with >= 10 samples.
We eliminated 205 codes becasue they had less than 10 samples.


In [8]:
#Create df of predictors (codes, meaning, and count)
#
#The count is looked up by code rather than pasted on by position, so the result does
#not depend on the row order of search_terms matching the order of number_samples.
#assign() builds a new frame, which avoids writing a column onto a filtered slice of
#search_terms.
samples_by_code = dict(zip(codes_with_samples, number_samples))
predictor_df = (
    search_terms[search_terms['coding'].isin(codes_with_samples)]
    .assign(N=lambda kept: kept['coding'].map(samples_by_code))
)
predictor_df

Unnamed: 0,coding,meaning,N
61,A080,A08.0_Rotaviral_enteritis,14
64,A083,A08.3_Other_viral_enteritis,38
65,A084,"A08.4_Viral_intestinal_infection,_unspecified",1089
314,A600,A60.0_Herpesviral_infection_of_genitalia_and_u...,40
414,A86,A86_Unspecified_viral_encephalitis,78
...,...,...,...
485,B022,B02.2_Zoster_with_other_nervous_system_involve...,151
486,B023,B02.3_Zoster_ocular_disease,99
488,B028,B02.8_Zoster_with_other_complications,19
489,B029,B02.9_Zoster_without_complication,503


In [9]:
#Save the search terms (code, meaning, N) for the analysis; the row index is not written
search_terms_csv = "UKB_search_terms.csv"
predictor_df.to_csv(search_terms_csv, index=False)

In [10]:
#Read the saved file back in and display it as a check on what was written
saved_terms_path = "UKB_search_terms.csv"
df2 = pd.read_csv(saved_terms_path)
df2

Unnamed: 0,coding,meaning,N
0,A080,A08.0_Rotaviral_enteritis,14
1,A083,A08.3_Other_viral_enteritis,38
2,A084,"A08.4_Viral_intestinal_infection,_unspecified",1089
3,A600,A60.0_Herpesviral_infection_of_genitalia_and_u...,40
4,A86,A86_Unspecified_viral_encephalitis,78
...,...,...,...
90,B022,B02.2_Zoster_with_other_nervous_system_involve...,151
91,B023,B02.3_Zoster_ocular_disease,99
92,B028,B02.8_Zoster_with_other_complications,19
93,B029,B02.9_Zoster_without_complication,503
