In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Widen the text repr so printed DataFrames are not wrapped across lines.
pd.options.display.width = 1000

In [2]:
def column_stats(df, column_name, n=None):
    """Print and return the value frequencies of one column.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Name of the column to summarise.
        n: Denominator used for the percentage column. Defaults to the
            number of rows in ``df``.

    Returns:
        DataFrame with one row per distinct value and the columns
        ``column_name``, ``<column_name>_count`` and
        ``<column_name>_percent`` (count / n * 100), in the order produced
        by ``value_counts``.
    """
    total = df.shape[0] if n is None else n
    count_col = f"{column_name}_count"
    percent_col = f"{column_name}_percent"

    values = df[column_name]
    stats = (
        values.value_counts()
        .rename_axis(column_name)
        .reset_index(name=count_col)
    )
    stats[percent_col] = stats[count_col].map(lambda c: int(c) / total * 100)

    # dropna=False keeps a missing value in the unique count, matching
    # len(values.unique()).
    print(f"Number of unique values = {values.nunique(dropna=False)}")
    print(stats)
    return stats
    
def filter_with_threshold(df, column_name, threshold):
    """Keep only rows whose ``column_name`` value is frequent enough.

    A value is kept when its share of the rows in ``df`` (in percent) is at
    least ``threshold``. Frequency tables are printed before and after
    filtering.

    Args:
        df: Input DataFrame.
        column_name: Column whose value prevalence decides the filtering.
        threshold: Minimum prevalence in percent (inclusive).

    Returns:
        The subset of ``df`` whose ``column_name`` value met the threshold.
    """
    print(f"Size of df = {df.shape}")
    total_rows = df.shape[0]

    stats = column_stats(df, column_name, n=total_rows)
    meets_threshold = stats[column_name + "_percent"] >= threshold
    kept_values = list(stats.loc[meets_threshold, column_name].values)

    result = df[df[column_name].isin(kept_values)]
    print(f"Size of filtered df = {result.shape}")
    # The original row count is passed on purpose, so the percentages printed
    # here stay relative to the unfiltered dataset.
    column_stats(result, column_name, n=total_rows)
    return result

In [3]:
def print_seq_len_histogram(df, n_bins):
    """Plot and tabulate the distribution of ``df["seq_len"]``.

    Draws a histogram of the sequence lengths, prints the minimum and
    maximum length, and prints a table with one row per bin holding the bin
    edges (``start``, ``end``), the number of rows in the bin (``count``) and
    that count as a percentage of all rows in ``df`` (``percentage``).

    Args:
        df: DataFrame with a numeric ``seq_len`` column.
        n_bins: Number of bins, applied to both the figure and the table.

    Returns:
        None. All results are plotted or printed.
    """
    seq_lens = df["seq_len"]

    # Pass n_bins to the plot as well; previously the figure used seaborn's
    # automatic binning while the table below used n_bins, so the two views
    # of the same data did not correspond.
    sns.histplot(seq_lens, bins=n_bins)
    print(f"min seq len = {seq_lens.min()}")
    print(f"max seq len = {seq_lens.max()}")
    plt.show()

    freq, bins = np.histogram(seq_lens, bins=n_bins)
    n = df.shape[0]
    # np.histogram returns n_bins + 1 edges: bin i spans bins[i]..bins[i + 1].
    hist_df = pd.DataFrame(
        {
            "start": bins[:-1],
            "end": bins[1:],
            "count": freq,
            "percentage": freq / n * 100,
        }
    )
    print(hist_df)

## Datasets for virus-host prediction in Non-IDV dataset
Create a dataset of non-IDV viruses whose hosts are also present in the IDV dataset. In the previous case, the only overlap with the top-5 hosts was "Homo sapiens".


### Non-Immunodeficiency virus dataset WITHOUT cut-off based on host-prevalence

In [4]:
# Non-IDV dataset; the path is resolved four directories above the current
# working directory.
non_idv_file_path = os.path.join(
    os.getcwd(),
    *([os.pardir] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv.csv",
)
non_idv_df = pd.read_csv(non_idv_file_path)
non_idv_df.shape

(47792, 11)

In [5]:
# Prevalence of each virus name in the non-IDV dataset.
column_stats(non_idv_df, column_name="virus_name")

Number of unique values = 3772
                                  virus_name  virus_name_count  virus_name_percent
0                        Hepacivirus hominis             10947           22.905507
1                          Hepatitis B virus             10152           21.242049
2                       Human papillomavirus              1078            2.255608
3                                Rotavirus A               749            1.567208
4                              Riboviria sp.               563            1.178021
...                                      ...               ...                 ...
3767                      Turtle herpesvirus                 1            0.002092
3768  Australian bass nervous necrosis virus                 1            0.002092
3769                     Feline picornavirus                 1            0.002092
3770                         Teschovirus sp.                 1            0.002092
3771                     Megabat bufavirus 2            

Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Hepacivirus hominis,10947,22.905507
1,Hepatitis B virus,10152,21.242049
2,Human papillomavirus,1078,2.255608
3,Rotavirus A,749,1.567208
4,Riboviria sp.,563,1.178021
...,...,...,...
3767,Turtle herpesvirus,1,0.002092
3768,Australian bass nervous necrosis virus,1,0.002092
3769,Feline picornavirus,1,0.002092
3770,Teschovirus sp.,1,0.002092


In [6]:
# Prevalence of each host species in the non-IDV dataset.
column_stats(non_idv_df, column_name="virus_host_name")

Number of unique values = 1304
                virus_host_name  virus_host_name_count  virus_host_name_percent
0                  Homo sapiens                  28782                60.223468
1                    Sus scrofa                   1286                 2.690827
2     Hydrochoerus hydrochaeris                    627                 1.311935
3            Marmota himalayana                    539                 1.127804
4                 Gallus gallus                    484                 1.012722
...                         ...                    ...                      ...
1299      Tamandua tetradactyla                      1                 0.002092
1300     Trematocara nigrifrons                      1                 0.002092
1301    Lamprologus kungweensis                      1                 0.002092
1302            Ursus maritimus                      1                 0.002092
1303             Alouatta pigra                      1                 0.002092

[1304 ro

Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,Homo sapiens,28782,60.223468
1,Sus scrofa,1286,2.690827
2,Hydrochoerus hydrochaeris,627,1.311935
3,Marmota himalayana,539,1.127804
4,Gallus gallus,484,1.012722
...,...,...,...
1299,Tamandua tetradactyla,1,0.002092
1300,Trematocara nigrifrons,1,0.002092
1301,Lamprologus kungweensis,1,0.002092
1302,Ursus maritimus,1,0.002092


In [7]:
# Distinct host names occurring in the non-IDV dataset.
non_idv_hosts = set(pd.unique(non_idv_df["virus_host_name"]))
non_idv_hosts

{'Emberiza elegans',
 'Arenaria interpres',
 'Clethrionomys glareolus',
 'Cavia porcellus',
 'Andrias davidianus',
 'Potos flavus',
 'Lamprologus ocellatus',
 'Thalassarche chlororhynchos',
 'Apodemus agrarius',
 'Acanthorhynchus tenuirostris',
 'Chilabothrus inornatus',
 'Hipposideros pomona',
 'Crocidura sp.',
 'Limnodynastes ornatus',
 'Gonorynchus abbreviatus',
 'Anolis sagrei',
 'Artibeus planirostris',
 'Chloris chloris',
 'Acerodon celebensis',
 'Myotis laniger',
 'Mephitis mephitis',
 'Papio cynocephalus',
 'Mus musculus',
 'Gerbilliscus leucogaster',
 'Otus scops',
 'Cromileptes altivelis',
 'Aethomys chrysophilus',
 'Salmo salar',
 'Lepus americanus',
 'Parahypsugo crassulus',
 'Nyctimene cephalotes',
 'Leopardus wiedii',
 'Cebus albifrons',
 'Macaca sylvanus',
 'Tamias striatus',
 'Okamejei acutispina',
 'Microtus pennsylvanicus',
 'Colobus guereza',
 'Microcebus murinus',
 'Gallus gallus',
 'Mus musculoides',
 'Mus caroli',
 'Rhinosciurus laticaudatus',
 'Leontopithecus ros

### Immunodeficiency virus dataset

In [8]:
# IDV dataset; the path is resolved four directories above the current
# working directory.
idv_file_path = os.path.join(
    os.getcwd(),
    *([os.pardir] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv.csv",
)
idv_df = pd.read_csv(idv_file_path)
idv_df.shape

(220068, 11)

In [9]:
# Prevalence of each virus name in the IDV dataset.
column_stats(idv_df, column_name="virus_name")

Number of unique values = 7
                            virus_name  virus_name_count  virus_name_percent
0       Human immunodeficiency virus 1            215415           97.885654
1         Human immunodeficiency virus              2095            0.951978
2  Simian-Human immunodeficiency virus              1030            0.468037
3       Human immunodeficiency virus 2               788            0.358071
4        Simian immunodeficiency virus               538            0.244470
5        Feline immunodeficiency virus               198            0.089972
6        Bovine immunodeficiency virus                 4            0.001818


Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Human immunodeficiency virus 1,215415,97.885654
1,Human immunodeficiency virus,2095,0.951978
2,Simian-Human immunodeficiency virus,1030,0.468037
3,Human immunodeficiency virus 2,788,0.358071
4,Simian immunodeficiency virus,538,0.24447
5,Feline immunodeficiency virus,198,0.089972
6,Bovine immunodeficiency virus,4,0.001818


In [10]:
# Host prevalence in the IDV dataset; kept in a variable because a later cell
# filters this table.
idv_column_stats = column_stats(idv_df, column_name="virus_host_name")

Number of unique values = 40
                       virus_host_name  virus_host_name_count  virus_host_name_percent
0                         Homo sapiens                 218631                99.347020
1                       Macaca mulatta                    849                 0.385790
2                          Felis catus                    143                 0.064980
3                      Cercocebus atys                    122                 0.055437
4                        Puma concolor                     24                 0.010906
5                    Macaca nemestrina                     23                 0.010451
6                    Mandrillus sphinx                     23                 0.010451
7               Mandrillus leucophaeus                     20                 0.009088
8              Cercopithecus nictitans                     19                 0.008634
9                Cercopithecus solatus                     17                 0.007725
10            

In [11]:
# Distinct host names occurring in the IDV dataset.
idv_hosts = set(pd.unique(idv_df["virus_host_name"]))
idv_hosts

{'Bos taurus',
 'Cercocebus atys',
 'Cercocebus torquatus',
 'Cercocebus torquatus atys',
 'Cercopithecus aethiops',
 'Cercopithecus aethiops pygerythrus',
 'Cercopithecus ascanius',
 'Cercopithecus cephus',
 'Cercopithecus erythrotis',
 'Cercopithecus lhoesti',
 'Cercopithecus mitis',
 'Cercopithecus mona',
 'Cercopithecus neglectus',
 'Cercopithecus nictitans',
 'Cercopithecus solatus',
 'Chlorocebus pygerythrus',
 'Chlorocebus sabaeus',
 'Chlorocebus tantalus',
 'Colobus guereza',
 'Felis catus',
 'Felis domesticus',
 'Felis silvestris catus',
 'Homo sapiens',
 'Lynx rufus',
 'Macaca arctoides',
 'Macaca fascicularis',
 'Macaca mulatta',
 'Macaca nemestrina',
 'Mandrillus leucophaeus',
 'Mandrillus sphinx',
 'Miopithecus ogouensis',
 'Otocolobus manul',
 'Panthera leo',
 'Panthera pardus',
 'Papio ursinus',
 'Piliocolobus badius',
 'Piliocolobus tephrosceles',
 'Procolobus verus',
 'Puma concolor',
 'Semnopithecus entellus'}

### Intersection between Non-IDV and IDV virus hosts

In [12]:
# Sizes of the two host sets and of their difference, intersection and union.
print(
    f"non_idv virus hosts = {len(non_idv_hosts)}",
    f"idv virus hosts = {len(idv_hosts)}",
    f"non_idv minus idv virus hosts = {len(non_idv_hosts.difference(idv_hosts))}",
    f"idv minus non_idv virus hosts = {len(idv_hosts.difference(non_idv_hosts))}",
    f"non_idv intersection idv virus hosts = {len(non_idv_hosts & idv_hosts)}",
    f"non_idv union idv virus hosts = {len(non_idv_hosts | idv_hosts)}",
    sep="\n",
)

non_idv virus hosts = 1304
idv virus hosts = 40
non_idv minus idv virus hosts = 1274
idv minus non_idv virus hosts = 10
non_idv intersection idv virus hosts = 30
non_idv union idv virus hosts = 1314


#### Virus hosts in IDV but NOT in Non-IDV and their prevalence in IDV dataset

In [18]:
# Rows of the IDV host-prevalence table for hosts that never occur in the
# non-IDV dataset.
idv_column_stats.loc[
    idv_column_stats["virus_host_name"].isin(list(idv_hosts.difference(non_idv_hosts)))
]

Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
3,Cercocebus atys,122,0.055437
9,Cercopithecus solatus,17,0.007725
10,Cercocebus torquatus,16,0.00727
11,Cercopithecus aethiops,14,0.006362
22,Cercopithecus mitis,6,0.002726
23,Cercopithecus erythrotis,5,0.002272
29,Miopithecus ogouensis,4,0.001818
33,Cercopithecus aethiops pygerythrus,3,0.001363
36,Otocolobus manul,1,0.000454
39,Procolobus verus,1,0.000454


In [19]:
# Write the IDV records whose host does not occur in the non-IDV dataset to a
# separate CSV file.
idv_df.loc[idv_df["virus_host_name"].isin(idv_hosts.difference(non_idv_hosts))].to_csv(
    os.path.join(
        os.getcwd(),
        *([os.pardir] * 4),
        "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv_unseen_hosts.csv",
    ),
    index=False,
)