In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.width', 1000)

In [2]:
def column_stats(df, column_name, n=None):
    """Tabulate how often each value of ``column_name`` occurs in ``df``.

    Prints the number of distinct values and the frequency table, then
    returns the table.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to summarise.
        n: Denominator used for the percentage column. Defaults to the
            number of rows in ``df``.

    Returns:
        DataFrame with the columns ``column_name``, ``<column_name>_count``
        and ``<column_name>_percent``, in ``value_counts`` order.
    """
    total = df.shape[0] if n is None else n
    values = df[column_name]
    count_column_name = column_name + "_count"
    percent_column_name = column_name + "_percent"

    def _as_percent(count):
        # Share of `total` represented by a single count, in percent.
        return int(count) / total * 100

    count_df = values.value_counts().reset_index()
    count_df.columns = [column_name, count_column_name]
    count_df[percent_column_name] = count_df[count_column_name].apply(_as_percent)

    # dropna=False so that a missing value is counted as a distinct value too.
    print(f"Number of unique values = {values.nunique(dropna=False)}")
    print(count_df)
    return count_df
    
def filter_with_threshold(df, column_name, threshold):
    """Drop rows whose ``column_name`` value is rarer than ``threshold`` percent.

    Prints the shape and the ``column_stats`` table before and after filtering.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Column whose value frequencies decide which rows are kept.
        threshold: Minimum share (in percent of the rows of ``df``) a value
            must reach for its rows to be kept.

    Returns:
        The rows of ``df`` whose ``column_name`` value meets the threshold.
    """
    total_rows = df.shape[0]
    print(f"Size of df = {df.shape}")

    stats = column_stats(df, column_name, n=total_rows)
    is_frequent = stats[column_name + "_percent"] >= threshold
    kept_values = stats.loc[is_frequent, column_name]
    filtered_df = df[df[column_name].isin(list(kept_values.values))]

    print(f"Size of filtered df = {filtered_df.shape}")
    # Percentages of the filtered frame are still reported relative to the
    # unfiltered row count.
    column_stats(filtered_df, column_name, n=total_rows)
    return filtered_df

In [3]:
def print_seq_len_histogram(df, n_bins):
    """Plot and tabulate the distribution of ``df["seq_len"]``.

    Draws a histogram, prints the minimum and maximum sequence length, and
    prints a table with one row per bin (``start``, ``end``, ``count``,
    ``percentage`` of all rows).

    Args:
        df: DataFrame with a numeric ``seq_len`` column.
        n_bins: Number of bins, used for both the plot and the printed table
            so that the two describe the same histogram.

    Returns:
        None. All results are plotted or printed.
    """
    seq_lens = df["seq_len"]
    n = seq_lens.shape[0]
    if n == 0:
        # min()/max() and the percentage column are undefined without data.
        print("No sequences to summarise: df is empty")
        return

    # Pass n_bins to the plot as well; otherwise seaborn picks its own bin
    # count and the figure no longer matches the table printed below.
    sns.histplot(seq_lens, bins=n_bins)
    print(f"min seq len = {seq_lens.min()}")
    print(f"max seq len = {seq_lens.max()}")
    plt.show()

    freq, bins = np.histogram(seq_lens, bins=n_bins)
    # bins holds the bin edges, one more entry than freq: consecutive pairs
    # are the [start, end] limits of each bin.
    hist_df = pd.DataFrame({
        "start": bins[:-1],
        "end": bins[1:],
        "count": freq,
        "percentage": freq / n * 100,
    })
    print(hist_df)

## Datasets for virus-host prediction in Non-IDV dataset
Create a dataset of non-IDV viruses whose hosts are also present in the IDV dataset. Otherwise, the only overlap with the top-5 hosts in the previous case was "Homo sapiens".


### Non Immunodeficiency Virus, hosts with >= 0.05% prevalence in the dataset

In [4]:
# Load the non-IDV (non-immunodeficiency-virus) metadata CSV. The path is built
# relative to the current working directory, four levels up.
non_idv_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_gte_0.05_prcnt_prevalence.csv")
non_idv_df = pd.read_csv(non_idv_file_path)
# Trailing expression: (rows, columns) is shown as the cell output.
non_idv_df.shape

(42947, 11)

In [5]:
# Count and percentage of records per virus name in the non-IDV dataset.
# column_stats prints the table; the returned frame is also shown as the cell output.
column_stats(non_idv_df, "virus_name")

Number of unique values = 2343
                                 virus_name  virus_name_count  virus_name_percent
0                       Hepacivirus hominis             10939           25.470929
1                         Hepatitis B virus             10126           23.577898
2                      Human papillomavirus              1078            2.510071
3                               Rotavirus A               683            1.590332
4                     Marmot picobirnavirus               529            1.231751
...                                     ...               ...                 ...
2338  Picornavirus fur seal/AAUST34/BR/2012                 1            0.002328
2339   Sakobuvirus fur seal/ATROP22/BR/2012                 1            0.002328
2340   Sakobuvirus fur seal/ATROP16/BR/2012                 1            0.002328
2341   Sakobuvirus fur seal/ATROP14/BR/2012                 1            0.002328
2342                        Hunnivirus 05VZ                 1      

Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Hepacivirus hominis,10939,25.470929
1,Hepatitis B virus,10126,23.577898
2,Human papillomavirus,1078,2.510071
3,Rotavirus A,683,1.590332
4,Marmot picobirnavirus,529,1.231751
...,...,...,...
2338,Picornavirus fur seal/AAUST34/BR/2012,1,0.002328
2339,Sakobuvirus fur seal/ATROP22/BR/2012,1,0.002328
2340,Sakobuvirus fur seal/ATROP16/BR/2012,1,0.002328
2341,Sakobuvirus fur seal/ATROP14/BR/2012,1,0.002328


In [6]:
# Count and percentage of records per host name in the non-IDV dataset.
# column_stats prints the table; the returned frame is also shown as the cell output.
column_stats(non_idv_df, "virus_host_name")

Number of unique values = 148
               virus_host_name  virus_host_name_count  virus_host_name_percent
0                 Homo sapiens                  28782                67.017487
1                   Sus scrofa                   1286                 2.994388
2    Hydrochoerus hydrochaeris                    627                 1.459939
3           Marmota himalayana                    539                 1.255035
4                Gallus gallus                    484                 1.126970
..                         ...                    ...                      ...
143        Eptesicus serotinus                     24                 0.055883
144           Cairina moschata                     24                 0.055883
145                Salmo salar                     24                 0.055883
146              Rattus rattus                     24                 0.055883
147              Anas gracilis                     24                 0.055883

[148 rows x 3 columns]

Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,Homo sapiens,28782,67.017487
1,Sus scrofa,1286,2.994388
2,Hydrochoerus hydrochaeris,627,1.459939
3,Marmota himalayana,539,1.255035
4,Gallus gallus,484,1.126970
...,...,...,...
143,Eptesicus serotinus,24,0.055883
144,Cairina moschata,24,0.055883
145,Salmo salar,24,0.055883
146,Rattus rattus,24,0.055883


In [7]:
# Distinct host names of the non-IDV dataset, kept as a set so they can be
# compared with the IDV hosts using set operations.
non_idv_hosts = set(non_idv_df["virus_host_name"].unique())
# Write the distinct host names to disk as plain strings (fmt="%s").
np.savetxt(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_non_idv_hosts.csv"),
          non_idv_df["virus_host_name"].unique(),
          fmt="%s")
print(len(non_idv_hosts))
# Trailing expression: the full set is shown as the cell output.
non_idv_hosts

148


{'Acanthorhynchus tenuirostris',
 'Acinonyx jubatus',
 'Aegithalos caudatus',
 'Ailuropoda melanoleuca',
 'Ailurus fulgens',
 'Alces alces',
 'Amazona oratrix',
 'Ambystoma tigrinum',
 'Anas castanea',
 'Anas gracilis',
 'Anas platyrhynchos',
 'Anas superciliosa',
 'Anguilla australis',
 'Apodemus agrarius',
 'Apodemus flavicollis',
 'Arctocephalus australis',
 'Arctocephalus tropicalis',
 'Boa constrictor',
 'Bos taurus',
 'Bufo bufo',
 'Cairina moschata',
 'Callorhinus ursinus',
 'Camelus dromedarius',
 'Capra hircus',
 'Cavia porcellus',
 'Cecropis daurica',
 'Chelonia mydas',
 'Chenonetta jubata',
 'Coendou prehensilis',
 'Corynorhinus rafinesquii',
 'Crocodilurus amazonicus',
 'Crocodylus porosus',
 'Cygnus columbianus',
 'Cyprinus carpio',
 'Dendrocopos leucotos',
 'Desmodus rotundus',
 'Dipodomys merriami',
 'Eidolon helvum',
 'Elephas maximus',
 'Emberiza chrysophrys',
 'Emberiza spodocephala',
 'Enhydra lutris',
 'Eptesicus fuscus',
 'Eptesicus serotinus',
 'Equus asinus',
 'E

### Immunodeficiency virus dataset

In [8]:
# Load the IDV (immunodeficiency-virus) metadata CSV. The path is built
# relative to the current working directory, four levels up.
idv_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv.csv")
idv_df = pd.read_csv(idv_file_path)
# Trailing expression: (rows, columns) is shown as the cell output.
idv_df.shape

(220068, 11)

In [9]:
# Count and percentage of records per virus name in the IDV dataset.
# column_stats prints the table; the returned frame is also shown as the cell output.
column_stats(idv_df, "virus_name")

Number of unique values = 7
                            virus_name  virus_name_count  virus_name_percent
0       Human immunodeficiency virus 1            215415           97.885654
1         Human immunodeficiency virus              2095            0.951978
2  Simian-Human immunodeficiency virus              1030            0.468037
3       Human immunodeficiency virus 2               788            0.358071
4        Simian immunodeficiency virus               538            0.244470
5        Feline immunodeficiency virus               198            0.089972
6        Bovine immunodeficiency virus                 4            0.001818


Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Human immunodeficiency virus 1,215415,97.885654
1,Human immunodeficiency virus,2095,0.951978
2,Simian-Human immunodeficiency virus,1030,0.468037
3,Human immunodeficiency virus 2,788,0.358071
4,Simian immunodeficiency virus,538,0.24447
5,Feline immunodeficiency virus,198,0.089972
6,Bovine immunodeficiency virus,4,0.001818


In [10]:
# Per-host counts and percentages for the IDV dataset. The result is kept in
# idv_column_stats because later cells filter it by host set.
idv_column_stats = column_stats(idv_df, "virus_host_name")

Number of unique values = 40
                       virus_host_name  virus_host_name_count  virus_host_name_percent
0                         Homo sapiens                 218631                99.347020
1                       Macaca mulatta                    849                 0.385790
2                          Felis catus                    143                 0.064980
3                      Cercocebus atys                    122                 0.055437
4                        Puma concolor                     24                 0.010906
5                    Macaca nemestrina                     23                 0.010451
6                    Mandrillus sphinx                     23                 0.010451
7               Mandrillus leucophaeus                     20                 0.009088
8              Cercopithecus nictitans                     19                 0.008634
9                Cercopithecus solatus                     17                 0.007725
10            

In [11]:
# Distinct host names of the IDV dataset, kept as a set so they can be
# compared with the non-IDV hosts using set operations.
idv_hosts = set(idv_df["virus_host_name"].unique())
# Write the distinct host names to disk as plain strings (fmt="%s").
np.savetxt(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_idv_hosts.csv"),
           idv_df["virus_host_name"].unique(), 
           fmt="%s")
print(len(idv_hosts))
# Trailing expression: the full set is shown as the cell output.
idv_hosts

40


{'Bos taurus',
 'Cercocebus atys',
 'Cercocebus torquatus',
 'Cercocebus torquatus atys',
 'Cercopithecus aethiops',
 'Cercopithecus aethiops pygerythrus',
 'Cercopithecus ascanius',
 'Cercopithecus cephus',
 'Cercopithecus erythrotis',
 'Cercopithecus lhoesti',
 'Cercopithecus mitis',
 'Cercopithecus mona',
 'Cercopithecus neglectus',
 'Cercopithecus nictitans',
 'Cercopithecus solatus',
 'Chlorocebus pygerythrus',
 'Chlorocebus sabaeus',
 'Chlorocebus tantalus',
 'Colobus guereza',
 'Felis catus',
 'Felis domesticus',
 'Felis silvestris catus',
 'Homo sapiens',
 'Lynx rufus',
 'Macaca arctoides',
 'Macaca fascicularis',
 'Macaca mulatta',
 'Macaca nemestrina',
 'Mandrillus leucophaeus',
 'Mandrillus sphinx',
 'Miopithecus ogouensis',
 'Otocolobus manul',
 'Panthera leo',
 'Panthera pardus',
 'Papio ursinus',
 'Piliocolobus badius',
 'Piliocolobus tephrosceles',
 'Procolobus verus',
 'Puma concolor',
 'Semnopithecus entellus'}

### Intersection between Non-IDV and IDV virus hosts

In [12]:
# Compare the two host sets: size of each, both set differences, the
# intersection (hosts present in both datasets) and the union.
print(f"non_idv virus hosts = {len(non_idv_hosts)}")
print(f"idv virus hosts = {len(idv_hosts)}")
print(f"non_idv minus idv virus hosts = {len(non_idv_hosts - idv_hosts)}")
print(f"idv minus non_idv virus hosts = {len(idv_hosts - non_idv_hosts)}")
print(f"non_idv intersection idv virus hosts = {len(non_idv_hosts.intersection(idv_hosts))}")
print(f"non_idv union idv virus hosts = {len(non_idv_hosts.union(idv_hosts))}")

non_idv virus hosts = 148
idv virus hosts = 40
non_idv minus idv virus hosts = 140
idv minus non_idv virus hosts = 32
non_idv intersection idv virus hosts = 8
non_idv union idv virus hosts = 180


#### Unseen virus, seen hosts: Virus hosts in IDV and in Non-IDV and their prevalence in IDV dataset

In [13]:
# Rows of the IDV host statistics for hosts that occur in both datasets
# ("seen hosts"): their count and percentage within the IDV dataset.
idv_column_stats[idv_column_stats["virus_host_name"].isin(list(non_idv_hosts.intersection(idv_hosts)))]

Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,Homo sapiens,218631,99.34702
1,Macaca mulatta,849,0.38579
2,Felis catus,143,0.06498
6,Mandrillus sphinx,23,0.010451
24,Felis domesticus,5,0.002272
25,Macaca arctoides,5,0.002272
26,Macaca fascicularis,5,0.002272
28,Bos taurus,4,0.001818


In [29]:
# Keep only the IDV records whose host also occurs in the non-IDV dataset
# ("seen hosts"), report the shape and save the subset without the index.
sub_idv_df = idv_df[idv_df["virus_host_name"].isin(non_idv_hosts.intersection(idv_hosts))]
print(sub_idv_df.shape)
sub_idv_df.to_csv(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv_seen_hosts.csv"), index=False)

(219665, 11)


In [30]:
# Stack the seen-hosts IDV subset on top of the full non-IDV dataset and save it.
# The original row indices are kept by concat but dropped on write (index=False).
combined_df = pd.concat([sub_idv_df, non_idv_df])
print(combined_df.shape)
combined_df.to_csv(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv_seen_hosts_w_non_idv.csv"), index=False)

(262612, 11)


In [31]:
# Without human hosts - only for testing.
# Drops every record whose host is "Homo sapiens" from the combined dataset,
# prints the resulting shape and saves that subset without the index.
print(combined_df[combined_df["virus_host_name"] != "Homo sapiens"].shape)
combined_df[combined_df["virus_host_name"] != "Homo sapiens"].to_csv(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv_seen_hosts_w_non_idv_wo_human.csv"), index=False)

(15199, 11)


In [32]:
# 90th, 95th and 99th percentile of sequence length among the non-human records
# of the combined dataset; shown as the cell output.
np.percentile(combined_df[combined_df["virus_host_name"] != "Homo sapiens"]["seq_len"].values, [90, 95, 99])

array([ 700.  ,  934.  , 2337.02])

In [33]:
# Non-human records of the combined dataset.
combined_df_wo_human = combined_df[combined_df["virus_host_name"] != "Homo sapiens"]
# 99th-percentile sequence length, computed from the data rather than copied
# from a previous cell's printed output, so the cut-off stays correct whenever
# the upstream data changes.
seq_len_p99 = np.percentile(combined_df_wo_human["seq_len"].values, 99)
# Apply the cut-off once and reuse the result for both the report and the export.
combined_df_wo_human_lte_p99 = combined_df_wo_human[combined_df_wo_human["seq_len"] <= seq_len_p99]
print(combined_df_wo_human_lte_p99.shape)
combined_df_wo_human_lte_p99.to_csv(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv_seen_hosts_w_non_idv_wo_human_seqlen_lte_99prcntile.csv"), index=False)

(15047, 11)


#### Unseen virus, unseen hosts: Virus hosts in IDV but NOT in Non-IDV and their prevalence in IDV dataset

In [20]:
# Rows of the IDV host statistics for hosts that occur only in the IDV dataset
# ("unseen hosts"): their count and percentage within the IDV dataset.
idv_column_stats[idv_column_stats["virus_host_name"].isin(list(idv_hosts - non_idv_hosts))]

Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
3,Cercocebus atys,122,0.055437
4,Puma concolor,24,0.010906
5,Macaca nemestrina,23,0.010451
7,Mandrillus leucophaeus,20,0.009088
8,Cercopithecus nictitans,19,0.008634
9,Cercopithecus solatus,17,0.007725
10,Cercocebus torquatus,16,0.00727
11,Cercopithecus aethiops,14,0.006362
12,Cercopithecus cephus,13,0.005907
13,Chlorocebus sabaeus,13,0.005907


In [21]:
# Keep only the IDV records whose host does NOT occur in the non-IDV dataset
# ("unseen hosts"), report the shape and save the subset without the index.
# NOTE(review): this rebinds sub_idv_df, which held the seen-hosts subset in the
# previous section; cells run out of order will silently pick up the wrong subset.
sub_idv_df = idv_df[idv_df["virus_host_name"].isin(idv_hosts - non_idv_hosts)]
print(sub_idv_df.shape)
sub_idv_df.to_csv(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv_unseen_hosts.csv"), index=False)

(403, 11)


In [22]:
# Stack the unseen-hosts IDV subset on top of the full non-IDV dataset.
# NOTE(review): this rebinds combined_df, which held the seen-hosts combination in
# the previous section; the cells below depend on this cell having run last.
combined_df = pd.concat([sub_idv_df, non_idv_df])
print(combined_df.shape)

(43350, 11)


In [24]:
# Save the combined dataset (unseen-hosts IDV subset + non-IDV) without the index.
combined_df.to_csv(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv_unseen_hosts_w_non_idv.csv"), index=False)

In [25]:
# 90th, 95th and 99th percentile of sequence length in the combined dataset;
# shown as the cell output.
np.percentile(combined_df["seq_len"].values, [90, 95, 99])

array([ 591.,  846., 3009.])

In [26]:
# Keep only sequences no longer than the 99th percentile of seq_len and save them.
# The cut-off is computed from combined_df itself rather than hard-coded from a
# previous cell's printed output, so it stays correct whenever the data changes.
seq_len_p99 = np.percentile(combined_df["seq_len"].values, 99)
combined_df[combined_df["seq_len"] <= seq_len_p99].to_csv(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv_unseen_hosts_w_non_idv_seqlen_lte_99prcntile.csv"), index=False)