In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Allow printed DataFrames to use up to 1000 characters per line before pandas wraps columns.
pd.set_option('display.width', 1000)

In [2]:
def column_stats(df, column_name, n=None):
    """Print and return the value counts of one column together with percentages.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to summarize.
        n: Denominator for the percentage column. Defaults to the number of
            rows in ``df``; pass an explicit value to express the counts
            relative to a different total (e.g. the size of an unfiltered frame).

    Returns:
        DataFrame with one row per distinct (non-null) value, in the order
        produced by ``value_counts()``, with the columns ``column_name``,
        ``<column_name>_count`` and ``<column_name>_percent``.
    """
    if n is None:
        n = df.shape[0]
    count_column_name = column_name + "_count"
    percent_column_name = column_name + "_percent"

    count_df = df[column_name].value_counts().reset_index()
    count_df.columns = [column_name, count_column_name]
    # Vectorized arithmetic instead of a per-row Python lambda.
    count_df[percent_column_name] = count_df[count_column_name] / n * 100

    # dropna=False keeps this equal to len(unique()): a missing value counts as
    # one distinct value here even though value_counts() omits it from the table.
    print(f"Number of unique values = {df[column_name].nunique(dropna=False)}")
    print(f"{count_df}")
    return count_df
    
def filter_with_threshold(df, column_name, threshold):
    """Keep only the rows whose ``column_name`` value is prevalent enough.

    A value is retained when its share of the rows of ``df``, in percent, is
    at least ``threshold``. Column statistics are printed before and after
    filtering; both are expressed relative to the size of the unfiltered ``df``.

    Returns:
        The filtered DataFrame.
    """
    print(f"Size of df = {df.shape}")
    total_rows = df.shape[0]

    stats = column_stats(df, column_name, n=total_rows)
    is_prevalent = stats[column_name + "_percent"] >= threshold
    retained_values = list(stats.loc[is_prevalent, column_name].values)

    filtered_df = df.loc[df[column_name].isin(retained_values)]
    print(f"Size of filtered df = {filtered_df.shape}")
    # Same denominator as above so the percentages stay comparable.
    column_stats(filtered_df, column_name, n=total_rows)
    return filtered_df

In [3]:
def print_seq_len_histogram(df, n_bins):
    """Plot the distribution of ``seq_len`` and print it as a binned table.

    Args:
        df: DataFrame with a numeric ``seq_len`` column.
        n_bins: Number of bins (an integer), used for both the figure and the
            printed table.

    Prints the minimum and maximum sequence length, shows the histogram, and
    then prints one row per bin with its start, end, count and the percentage
    of rows of ``df`` that fall into it.
    """
    seq_lens = df["seq_len"]

    # Pass n_bins to the plot as well: previously the figure ignored it and
    # used seaborn's automatic binning, so it did not match the table below.
    sns.histplot(seq_lens, bins=n_bins)
    print(f"min seq len = {seq_lens.min()}")
    print(f"max seq len = {seq_lens.max()}")
    plt.show()

    freq, bins = np.histogram(seq_lens, bins=n_bins)
    n = df.shape[0]
    # bins holds n_bins + 1 edges; consecutive pairs delimit each bin.
    hist_df = pd.DataFrame({
        "start": bins[:-1],
        "end": bins[1:],
        "count": freq,
        "percentage": freq / n * 100,
    })
    print(hist_df)

## Datasets for virus-host prediction in Non-IDV dataset
Create a dataset of non-IDV (non-immunodeficiency) viruses whose hosts are also present in the IDV dataset. Otherwise, as in the previous case, the only overlap with the top-5 hosts was "Homo sapiens".


### Non-Immunodeficiency virus dataset WITHOUT cut-off based on host-prevalence

In [4]:
# The input path is resolved relative to the current working directory (four levels up).
non_idv_file_path = os.path.join(
    os.getcwd(),
    *([os.pardir] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv.csv",
)
non_idv_df = pd.read_csv(filepath_or_buffer=non_idv_file_path)

In [5]:
# Prevalence of each virus name in the non-IDV dataset.
column_stats(df=non_idv_df, column_name="virus_name")

Number of unique values = 3772
                                  virus_name  virus_name_count  virus_name_percent
0                        Hepacivirus hominis             10947           22.905507
1                          Hepatitis B virus             10152           21.242049
2                       Human papillomavirus              1078            2.255608
3                                Rotavirus A               749            1.567208
4                              Riboviria sp.               563            1.178021
...                                      ...               ...                 ...
3767                      Turtle herpesvirus                 1            0.002092
3768  Australian bass nervous necrosis virus                 1            0.002092
3769                     Feline picornavirus                 1            0.002092
3770                         Teschovirus sp.                 1            0.002092
3771                     Megabat bufavirus 2            

Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Hepacivirus hominis,10947,22.905507
1,Hepatitis B virus,10152,21.242049
2,Human papillomavirus,1078,2.255608
3,Rotavirus A,749,1.567208
4,Riboviria sp.,563,1.178021
...,...,...,...
3767,Turtle herpesvirus,1,0.002092
3768,Australian bass nervous necrosis virus,1,0.002092
3769,Feline picornavirus,1,0.002092
3770,Teschovirus sp.,1,0.002092


In [6]:
# Prevalence of each host name in the non-IDV dataset.
column_stats(df=non_idv_df, column_name="virus_host_name")

Number of unique values = 1304
                virus_host_name  virus_host_name_count  virus_host_name_percent
0                  Homo sapiens                  28782                60.223468
1                    Sus scrofa                   1286                 2.690827
2     Hydrochoerus hydrochaeris                    627                 1.311935
3            Marmota himalayana                    539                 1.127804
4                 Gallus gallus                    484                 1.012722
...                         ...                    ...                      ...
1299      Tamandua tetradactyla                      1                 0.002092
1300     Trematocara nigrifrons                      1                 0.002092
1301    Lamprologus kungweensis                      1                 0.002092
1302            Ursus maritimus                      1                 0.002092
1303             Alouatta pigra                      1                 0.002092

[1304 ro

Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,Homo sapiens,28782,60.223468
1,Sus scrofa,1286,2.690827
2,Hydrochoerus hydrochaeris,627,1.311935
3,Marmota himalayana,539,1.127804
4,Gallus gallus,484,1.012722
...,...,...,...
1299,Tamandua tetradactyla,1,0.002092
1300,Trematocara nigrifrons,1,0.002092
1301,Lamprologus kungweensis,1,0.002092
1302,Ursus maritimus,1,0.002092


In [7]:
# Distinct host names of the non-IDV dataset, kept as a set for the set comparisons further down.
# NOTE(review): names are compared as raw strings. The displayed set contains 'Sus Scrofa' while
# the count table lists 'Sus scrofa' -- case variants may be counted as separate hosts; confirm
# whether the names should be normalized first.
non_idv_hosts = set(non_idv_df.loc[:, "virus_host_name"].unique())
non_idv_hosts

{'Neotoma mexicana',
 'Apodemus chevrieri',
 'Thoopterus nigrescens',
 'Trachycephalus resinifictrix',
 'Leuciscus aspius',
 'Tachysurus fulvidraco',
 'Kaloula pulchra',
 'Ichthyophis bannanicus',
 'Boa dumerili',
 'Boulengerochromis microlepis',
 'Cercopithecus lhoesti',
 'Colobus badius',
 'Odorrana tormota',
 'Neoromicia nanus',
 'Miniopterus inflatus',
 'Lepomis macrochirus',
 'Myotis chinensis',
 'Microgale dobsoni',
 'Benthochromis horii',
 'Callosciurus erythraeus',
 'Phocoena spinipinnis',
 'Platycephalus richardsoni',
 'Iberolacerta cyreni',
 'Prunella montanella',
 'Sphyrna lewini',
 'Bradypus variegatus',
 'Agapornis personata',
 'Piliocolobus tephrosceles',
 'Parapercis lutevittata',
 'Ichneumia albicauda',
 'Procyon lotor',
 'Stenella coeruleoalba',
 'Eophona migratoria',
 'Pseudosciaena crocea',
 'Sciurus variegatoides',
 'Sus Scrofa',
 'Vicugna pacos',
 'Cynomys ludovicianus',
 'Duttaphrynus melanostictus',
 'Melogale moschata',
 'Hylopetes phayrei',
 'Rupicapra rupicapr

### Immunodeficiency virus dataset

In [8]:
# The input path is resolved relative to the current working directory (four levels up).
idv_file_path = os.path.join(
    os.getcwd(),
    *([os.pardir] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv.csv",
)
idv_df = pd.read_csv(filepath_or_buffer=idv_file_path)

In [9]:
# Prevalence of each virus name in the IDV dataset.
column_stats(df=idv_df, column_name="virus_name")

Number of unique values = 7
                            virus_name  virus_name_count  virus_name_percent
0       Human immunodeficiency virus 1            215415           97.885654
1         Human immunodeficiency virus              2095            0.951978
2  Simian-Human immunodeficiency virus              1030            0.468037
3       Human immunodeficiency virus 2               788            0.358071
4        Simian immunodeficiency virus               538            0.244470
5        Feline immunodeficiency virus               198            0.089972
6        Bovine immunodeficiency virus                 4            0.001818


Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Human immunodeficiency virus 1,215415,97.885654
1,Human immunodeficiency virus,2095,0.951978
2,Simian-Human immunodeficiency virus,1030,0.468037
3,Human immunodeficiency virus 2,788,0.358071
4,Simian immunodeficiency virus,538,0.24447
5,Feline immunodeficiency virus,198,0.089972
6,Bovine immunodeficiency virus,4,0.001818


In [10]:
# Host prevalence in the IDV dataset; the table is kept because a later cell looks hosts up in it.
idv_column_stats = column_stats(df=idv_df, column_name="virus_host_name")

Number of unique values = 40
                       virus_host_name  virus_host_name_count  virus_host_name_percent
0                         Homo sapiens                 218631                99.347020
1                       Macaca mulatta                    849                 0.385790
2                          Felis catus                    143                 0.064980
3                      Cercocebus atys                    122                 0.055437
4                        Puma concolor                     24                 0.010906
5                    Macaca nemestrina                     23                 0.010451
6                    Mandrillus sphinx                     23                 0.010451
7               Mandrillus leucophaeus                     20                 0.009088
8              Cercopithecus nictitans                     19                 0.008634
9                Cercopithecus solatus                     17                 0.007725
10            

In [11]:
# Distinct host names of the IDV dataset, kept as a set for the set comparisons further down.
# NOTE(review): the displayed set lists 'Felis catus', 'Felis domesticus' and
# 'Felis silvestris catus' as separate hosts -- possibly synonyms for one species; confirm.
idv_hosts = set(idv_df.loc[:, "virus_host_name"].unique())
idv_hosts

{'Bos taurus',
 'Cercocebus atys',
 'Cercocebus torquatus',
 'Cercocebus torquatus atys',
 'Cercopithecus aethiops',
 'Cercopithecus aethiops pygerythrus',
 'Cercopithecus ascanius',
 'Cercopithecus cephus',
 'Cercopithecus erythrotis',
 'Cercopithecus lhoesti',
 'Cercopithecus mitis',
 'Cercopithecus mona',
 'Cercopithecus neglectus',
 'Cercopithecus nictitans',
 'Cercopithecus solatus',
 'Chlorocebus pygerythrus',
 'Chlorocebus sabaeus',
 'Chlorocebus tantalus',
 'Colobus guereza',
 'Felis catus',
 'Felis domesticus',
 'Felis silvestris catus',
 'Homo sapiens',
 'Lynx rufus',
 'Macaca arctoides',
 'Macaca fascicularis',
 'Macaca mulatta',
 'Macaca nemestrina',
 'Mandrillus leucophaeus',
 'Mandrillus sphinx',
 'Miopithecus ogouensis',
 'Otocolobus manul',
 'Panthera leo',
 'Panthera pardus',
 'Papio ursinus',
 'Piliocolobus badius',
 'Piliocolobus tephrosceles',
 'Procolobus verus',
 'Puma concolor',
 'Semnopithecus entellus'}

### Intersection between Non-IDV and IDV virus hosts

In [12]:
# Sizes of the two host sets and of their differences, intersection and union.
print("\n".join(
    f"{label} = {len(hosts)}"
    for label, hosts in (
        ("non_idv virus hosts", non_idv_hosts),
        ("idv virus hosts", idv_hosts),
        ("non_idv minus idv virus hosts", non_idv_hosts - idv_hosts),
        ("idv minus non_idv virus hosts", idv_hosts - non_idv_hosts),
        ("non_idv intersection idv virus hosts", non_idv_hosts & idv_hosts),
        ("non_idv union idv virus hosts", non_idv_hosts | idv_hosts),
    )
))

non_idv virus hosts = 1304
idv virus hosts = 40
non_idv minus idv virus hosts = 1274
idv minus non_idv virus hosts = 10
non_idv intersection idv virus hosts = 30
non_idv union idv virus hosts = 1314


#### Virus hosts in IDV but NOT in Non-IDV and their prevalence in IDV dataset

In [13]:
# Rows of the IDV host statistics for hosts that never occur in the non-IDV dataset.
idv_column_stats.loc[
    idv_column_stats["virus_host_name"].isin(list(idv_hosts.difference(non_idv_hosts)))
]

Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
3,Cercocebus atys,122,0.055437
9,Cercopithecus solatus,17,0.007725
10,Cercocebus torquatus,16,0.00727
11,Cercopithecus aethiops,14,0.006362
22,Cercopithecus mitis,6,0.002726
23,Cercopithecus erythrotis,5,0.002272
29,Miopithecus ogouensis,4,0.001818
33,Cercopithecus aethiops pygerythrus,3,0.001363
36,Otocolobus manul,1,0.000454
39,Procolobus verus,1,0.000454


#### Virus hosts in both IDV and Non-IDV (intersection); filter the Non-IDV dataset to these hosts

In [14]:
# Restrict the non-IDV dataset to hosts that also appear in the IDV dataset.
common_hosts = list(non_idv_hosts & idv_hosts)
non_idv_idv_common_hosts_non_idv_df = non_idv_df.loc[
    non_idv_df["virus_host_name"].isin(common_hosts)
]
non_idv_idv_common_hosts_non_idv_df

Unnamed: 0,uniref90_id,tax_id,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank,seq,seq_len
0,UniRef90_A0A023HQ48,28875,AGO66969.1,['Homo sapiens'],Homo sapiens,Rotavirus A,species,9606,species,KGWPTGSVYFKEYSSIVDFSVDPQLYCDYNLVLMKYDQNLELDMSE...,218
1,UniRef90_A0A291S408,147712,ATL74331.1,['Homo sapiens'],Homo sapiens,Rhinovirus B,species,9606,species,ATLPTVPSDNVETRTTYMHYDGTETSLESFLGRAACVHVTTIENKL...,110
4,UniRef90_A0A7M1VN82,2732655,QOS14274.1,['Homo sapiens; male'],Homo sapiens,Vientovirus,species,9606,species,MSLHLDCVNLLLTYNDPVRKDLWGYEPLRPTNYLRISKSMNNIRRC...,348
5,UniRef90_A0A291S4N0,147711,ATL74568.1,['Homo sapiens'],Homo sapiens,Rhinovirus A,species,9606,species,TSNVQPEDTVETRYVQTSQTRDEMSIESFLGRSGCVHISTISIDNS...,103
7,UniRef90_A0A291S4R9,147711,ATL74597.1,['Homo sapiens'],Homo sapiens,Rhinovirus A,species,9606,species,TSNVQPEDTVETRYVQTSQTRDEMSIESFLGRSGCVHISTININEE...,103
...,...,...,...,...,...,...,...,...,...,...,...
47787,UniRef90_A0A158WY96,1803956,AJS09554.1,['Homo sapiens'],Homo sapiens,Parechovirus A,species,9606,species,NSSSPNKVHCIVQGRLGDDAKFFCPTGSLVSFQNSWGSQMDLTDPL...,194
47788,UniRef90_A0A158WYC0,1803956,AJS09545.1,['Homo sapiens'],Homo sapiens,Parechovirus A,species,9606,species,LTSAQDEGPLGGEKPNYFLNFRAINVDIFTVSHTKVDNIFGRAWFS...,116
47789,UniRef90_A0A158WZ44,1803956,AJS09547.1,['Homo sapiens'],Homo sapiens,Parechovirus A,species,9606,species,SSSPSSVHCIVQGRLGDDAKFFCPAGSIVTFQNSWGSQMDLTDPLC...,196
47790,UniRef90_A0A158WZ60,1803956,AJS09567.1,['Homo sapiens'],Homo sapiens,Parechovirus A,species,9606,species,NGNLWRSQLTFPKQGHGMLSQFFAYFTGELNIHILYMASSGFLRVA...,207


In [15]:
# 1% cut-off: keep hosts that make up at least 1% of the common-host non-IDV rows.
# The result is not referenced again in the cells visible here.
filtered_df = filter_with_threshold(
    df=non_idv_idv_common_hosts_non_idv_df,
    column_name="virus_host_name",
    threshold=1,
)

Size of df = (29785, 11)
Number of unique values = 30
              virus_host_name  virus_host_name_count  virus_host_name_percent
0                Homo sapiens                  28782                96.632533
1                 Felis catus                    298                 1.000504
2                  Bos taurus                    257                 0.862850
3              Macaca mulatta                    232                 0.778916
4         Macaca fascicularis                     48                 0.161155
5           Mandrillus sphinx                     33                 0.110794
6            Macaca arctoides                     30                 0.100722
7            Felis domesticus                     28                 0.094007
8      Mandrillus leucophaeus                     14                 0.047004
9               Puma concolor                     11                 0.036931
10                 Lynx rufus                     11                 0.036931
11        

In [16]:
# 0.5% cut-off: the threshold is expressed in percent, hence 0.5.
non_idv_filtered_df = filter_with_threshold(
    df=non_idv_idv_common_hosts_non_idv_df,
    column_name="virus_host_name",
    threshold=0.5,
)

Size of df = (29785, 11)
Number of unique values = 30
              virus_host_name  virus_host_name_count  virus_host_name_percent
0                Homo sapiens                  28782                96.632533
1                 Felis catus                    298                 1.000504
2                  Bos taurus                    257                 0.862850
3              Macaca mulatta                    232                 0.778916
4         Macaca fascicularis                     48                 0.161155
5           Mandrillus sphinx                     33                 0.110794
6            Macaca arctoides                     30                 0.100722
7            Felis domesticus                     28                 0.094007
8      Mandrillus leucophaeus                     14                 0.047004
9               Puma concolor                     11                 0.036931
10                 Lynx rufus                     11                 0.036931
11        

In [17]:
# Write the non-IDV rows that survived the 0.5% host cut-off.
output_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/idv_non_idv_common_hosts/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_idv_common_hosts_w_seq_non_idv_t0.005_c4.csv")
# to_csv does not create missing parent directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
non_idv_filtered_df.to_csv(output_file_path, index=False)

In [18]:
# Host names that remain in the filtered non-IDV dataset.
non_idv_filtered_hosts = non_idv_filtered_df.loc[:, "virus_host_name"].unique()

In [19]:
# Restrict the IDV dataset to the hosts retained in the filtered non-IDV dataset.
idv_df_c4 = idv_df.loc[idv_df["virus_host_name"].isin(non_idv_filtered_hosts)]
idv_df_c4

Unnamed: 0,uniref90_id,tax_id,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank,seq,seq_len
0,UniRef90_D6NXI8,11676,ADF86160.1,['Homo sapiens'],Homo sapiens,Human immunodeficiency virus 1,species,9606,species,MAGRSGDSDEELLRVVRIIKQIYQSNPPPNPEGTRQARRNRRRRWR...,116
1,UniRef90_L0BZL1,11676,AFZ96186.1,['Homo sapiens'],Homo sapiens,Human immunodeficiency virus 1,species,9606,species,EXEKEGKISKIGPENPYNTPIFAIKKKDSTKWRKLVDFRELNKRTQ...,199
2,UniRef90_D6NXI9,11676,ADF86161.1,['Homo sapiens'],Homo sapiens,Human immunodeficiency virus 1,species,9606,species,MAGRSGDSDEXLLRTIRLIRILYQSNPPPSSKGTRQARRNRRRRWR...,94
3,UniRef90_L0BZL4,11676,AFZ95585.1,['Homo sapiens'],Homo sapiens,Human immunodeficiency virus 1,species,9606,species,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,377
4,UniRef90_D6NXJ0,11676,ADF86162.1,['Homo sapiens'],Homo sapiens,Human immunodeficiency virus 1,species,9606,species,MAGRSGDSDEXLLXTXRLIRXLYQSNPPPNPEGTRQARRNRRRRWR...,116
...,...,...,...,...,...,...,...,...,...,...,...
220063,UniRef90_A0A158V297,11676,AHY02063.1,['Homo sapiens'],Homo sapiens,Human immunodeficiency virus 1,species,9606,species,PQITLWQRPLVSIKIGGQVKEALLDTGADDTVLEEMNLPGKWKPKM...,341
220064,UniRef90_A0A158V2A8,11676,AHY02073.1,['Homo sapiens'],Homo sapiens,Human immunodeficiency virus 1,species,9606,species,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,341
220065,UniRef90_A0A158V2B2,11676,AHY02083.1,['Homo sapiens'],Homo sapiens,Human immunodeficiency virus 1,species,9606,species,PQITLWQRPLVTIKVGGQLKEALLDTGADDTVLEEMSLPGKWKPKM...,341
220066,UniRef90_A0A158V2B7,11676,AHY02093.1,['Homo sapiens'],Homo sapiens,Human immunodeficiency virus 1,species,9606,species,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEDMXLPGRWKPKM...,341


In [20]:
# Host prevalence within the restricted IDV dataset.
column_stats(df=idv_df_c4, column_name="virus_host_name")

Number of unique values = 4
  virus_host_name  virus_host_name_count  virus_host_name_percent
0    Homo sapiens                 218631                99.546504
1  Macaca mulatta                    849                 0.386564
2     Felis catus                    143                 0.065110
3      Bos taurus                      4                 0.001821


Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,Homo sapiens,218631,99.546504
1,Macaca mulatta,849,0.386564
2,Felis catus,143,0.06511
3,Bos taurus,4,0.001821


In [21]:
# Write the IDV rows restricted to the hosts retained in the filtered non-IDV dataset.
output_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/idv_non_idv_common_hosts/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_non_idv_common_hosts_w_seq_idv_c4.csv")
# to_csv does not create missing parent directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
idv_df_c4.to_csv(output_file_path, index=False)