# UniRef90 - Virus Host DB mapping Dataset Analysis


In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Print every row of a DataFrame instead of eliding the middle with "...";
# column_stats() below prints its full frequency table and relies on this.
pd.set_option("display.max_rows", None)
# Allow up to 1000 characters per printed line so wide frames are not wrapped.
pd.set_option('display.width', 1000)
from ast import literal_eval

In [2]:
def column_stats(df, column_name):
    n = df.shape[0]
    count_column_name = column_name + "_count"
    count_df = pd.DataFrame(df[column_name].value_counts()).reset_index()
    count_df.columns=[column_name, count_column_name]
    count_df[column_name + "_percent"] = count_df[count_column_name].apply(lambda x: int(x)/n*100)
    print(f"Number of unique values = {len(df[column_name].unique())}")
    print(f"{count_df}")

In [3]:
def print_seq_len_histogram(df, n_bins):
    """Plot the sequence-length distribution and print it as a binned table.

    Args:
        df: DataFrame with a numeric ``seq_len`` column.
        n_bins: Bin specification passed to both ``sns.histplot`` and
            ``np.histogram`` (typically the number of equal-width bins).

    Prints the minimum and maximum sequence length, shows the histogram
    plot, then prints one row per bin with its ``start``/``end`` edges, the
    number of rows falling in it (``count``) and that count as a percentage
    of all rows in ``df``. If ``df`` has no rows, a short notice is printed
    instead and nothing is plotted. Returns ``None``.
    """
    seq_lens = df["seq_len"]
    n = len(seq_lens)
    if n == 0:
        # min/max and the percentage column are undefined without any rows.
        print("seq_len is empty; no histogram to show")
        return

    # Use the same binning for the plot and the printed table so they agree.
    sns.histplot(seq_lens, bins=n_bins)
    print(f"min seq len = {seq_lens.min()}")
    print(f"max seq len = {seq_lens.max()}")
    plt.show()

    # np.histogram returns len(freq) + 1 edges; consecutive edges bound a bin.
    freq, bins = np.histogram(seq_lens, bins=n_bins)
    hist_df = pd.DataFrame({
        "start": bins[:-1],
        "end": bins[1:],
        "count": freq,
        "percentage": freq / n * 100,
    })
    print(hist_df)

In [4]:
def analyze_df(file_path):
    """Load a dataset CSV, add sequence lengths, and print summary statistics.

    Args:
        file_path: Path of the CSV file to read. The file must provide the
            columns ``seq``, ``virus_name`` and ``virus_host_name``.

    Returns:
        The loaded DataFrame with an extra ``seq_len`` column holding
        ``len()`` of each ``seq`` value.

    Prints the frame's shape and first rows, then the per-value statistics
    of the virus and host name columns via ``column_stats``.
    """
    dataset = pd.read_csv(file_path)
    dataset["seq_len"] = dataset["seq"].map(len)

    print("df size = ", dataset.shape)
    print(dataset.head())

    for name_column in ("virus_name", "virus_host_name"):
        column_stats(dataset, name_column)
    return dataset

### uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq.csv

In [5]:
# The input directory sits four levels above the current working directory.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq.csv",
)
df = analyze_df(file_path)

df size =  (459807, 9)
           uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank       virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A023HI16   11676          9606  Human immunodeficiency virus 1          species          Homo sapiens               species  MLPLETIGAIIALVIAGIIAIVVWTIVYIEYRKLLRQKKIDRLIDR...       82
1  UniRef90_A0A023HI18   11676          9606  Human immunodeficiency virus 1          species          Homo sapiens               species  MRVKGIRKNCPHLWRWGTMLLGMLMICSAAEQLWVTVYYGVPVWKE...      851
2  UniRef90_A0A023HIB6   11676          9606  Human immunodeficiency virus 1          species          Homo sapiens               species  MEPVDPNLEPWNHPGSKPTTACSKCYCKKCCWHCQLCFLKKGLGIS...      101
3  UniRef90_A0A023HQ48   28875          9534                     Rotavirus A          species  Chlorocebus aethiops               species  KGWPTGSVYFKEYSSIVDFSVDPQLYCDYNLVLMKYDQNLELDMSE

### uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq.csv

In [15]:
# The input directory sits four levels above the current working directory.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq.csv",
)
df = analyze_df(file_path)

df size =  (364415, 9)
           uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A023HI16   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  MLPLETIGAIIALVIAGIIAIVVWTIVYIEYRKLLRQKKIDRLIDR...       82
1  UniRef90_A0A023HI18   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  MRVKGIRKNCPHLWRWGTMLLGMLMICSAAEQLWVTVYYGVPVWKE...      851
2  UniRef90_A0A023HIB6   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  MEPVDPNLEPWNHPGSKPTTACSKCYCKKCCWHCQLCFLKKGLGIS...      101
3  UniRef90_A0A023I5A1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  TVKIGGQLKEALLDTGADDTVLEDINLPGKWKPKMIGGIGGFIKVK...      319
4  UniRef90_A0A02

Number of unique values = 271
                 virus_host_name  virus_host_name_count  virus_host_name_percent
0                   Homo sapiens                 350792                96.261680
1                 Paguma larvata                   1573                 0.431651
2                     Sus scrofa                   1287                 0.353169
3                    Felis catus                    748                 0.205260
4      Hydrochoerus hydrochaeris                    723                 0.198400
5                     Bos taurus                    697                 0.191265
6                  Gallus gallus                    546                 0.149829
7              Gopherus morafkai                    369                 0.101258
8           Marmota flaviventris                    369                 0.101258
9             Petroica australis                    362                 0.099337
10                  Mus musculus                    330                 0.09055

### uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_multi_host_seq.csv

In [17]:
# The input directory sits four levels above the current working directory.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_multi_host_seq.csv",
)
df = analyze_df(file_path)

df size =  (95392, 9)
           uniref90_id  tax_id  host_tax_ids                         virus_name virus_taxon_rank          virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A023HQ48   28875          9534                        Rotavirus A          species     Chlorocebus aethiops               species  KGWPTGSVYFKEYSSIVDFSVDPQLYCDYNLVLMKYDQNLELDMSE...      218
1  UniRef90_A0A023HQ48   28875          9606                        Rotavirus A          species             Homo sapiens               species  KGWPTGSVYFKEYSSIVDFSVDPQLYCDYNLVLMKYDQNLELDMSE...      218
2  UniRef90_A0A023HQ48   28875          9913                        Rotavirus A          species               Bos taurus               species  KGWPTGSVYFKEYSSIVDFSVDPQLYCDYNLVLMKYDQNLELDMSE...      218
3  UniRef90_A0A023HQ48   28875         60710                        Rotavirus A          species  Chlorocebus pygerythrus               species  KGWPTGSVYFKEYSSIV

### uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_wo_single_host_virus.csv

In [11]:
# The input directory sits four levels above the current working directory.
# NOTE(review): the recorded output for this file is an empty DataFrame
# (0 rows) — confirm that is the expected result of the upstream filtering.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_wo_single_host_virus.csv",
)
df = analyze_df(file_path)

df size =  (0, 9)
Empty DataFrame
Columns: [uniref90_id, tax_id, host_tax_ids, virus_name, virus_taxon_rank, virus_host_name, virus_host_taxon_rank, seq, seq_len]
Index: []
Number of unique values = 0
Empty DataFrame
Columns: [virus_name, virus_name_count, virus_name_percent]
Index: []
Number of unique values = 0
Empty DataFrame
Columns: [virus_host_name, virus_host_name_count, virus_host_name_percent]
Index: []


### uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_single_host_virus.csv

In [12]:
# The input directory sits four levels above the current working directory.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_single_host_virus.csv",
)
df = analyze_df(file_path)

df size =  (364415, 9)
           uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A023HI16   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  MLPLETIGAIIALVIAGIIAIVVWTIVYIEYRKLLRQKKIDRLIDR...       82
1  UniRef90_A0A023HI18   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  MRVKGIRKNCPHLWRWGTMLLGMLMICSAAEQLWVTVYYGVPVWKE...      851
2  UniRef90_A0A023HIB6   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  MEPVDPNLEPWNHPGSKPTTACSKCYCKKCCWHCQLCFLKKGLGIS...      101
3  UniRef90_A0A023I5A1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  TVKIGGQLKEALLDTGADDTVLEDINLPGKWKPKMIGGIGGFIKVK...      319
4  UniRef90_A0A02

Number of unique values = 271
                 virus_host_name  virus_host_name_count  virus_host_name_percent
0                   Homo sapiens                 350792                96.261680
1                 Paguma larvata                   1573                 0.431651
2                     Sus scrofa                   1287                 0.353169
3                    Felis catus                    748                 0.205260
4      Hydrochoerus hydrochaeris                    723                 0.198400
5                     Bos taurus                    697                 0.191265
6                  Gallus gallus                    546                 0.149829
7              Gopherus morafkai                    369                 0.101258
8           Marmota flaviventris                    369                 0.101258
9             Petroica australis                    362                 0.099337
10                  Mus musculus                    330                 0.09055

### uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus.csv

In [13]:
# The input directory sits four levels above the current working directory.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus.csv",
)
df = analyze_df(file_path)

df size =  (95392, 9)
           uniref90_id  tax_id  host_tax_ids                         virus_name virus_taxon_rank          virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A023HQ48   28875          9534                        Rotavirus A          species     Chlorocebus aethiops               species  KGWPTGSVYFKEYSSIVDFSVDPQLYCDYNLVLMKYDQNLELDMSE...      218
1  UniRef90_A0A023HQ48   28875          9606                        Rotavirus A          species             Homo sapiens               species  KGWPTGSVYFKEYSSIVDFSVDPQLYCDYNLVLMKYDQNLELDMSE...      218
2  UniRef90_A0A023HQ48   28875          9913                        Rotavirus A          species               Bos taurus               species  KGWPTGSVYFKEYSSIVDFSVDPQLYCDYNLVLMKYDQNLELDMSE...      218
3  UniRef90_A0A023HQ48   28875         60710                        Rotavirus A          species  Chlorocebus pygerythrus               species  KGWPTGSVYFKEYSSIV

### uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_single_host_virus.csv

In [14]:
# The input directory sits four levels above the current working directory.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_virushostdb_hosts_pruned_metadata_species_vertebrates_w_seq_single_host_virus.csv",
)
df = analyze_df(file_path)

df size =  (364415, 9)
           uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A023HI16   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  MLPLETIGAIIALVIAGIIAIVVWTIVYIEYRKLLRQKKIDRLIDR...       82
1  UniRef90_A0A023HI18   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  MRVKGIRKNCPHLWRWGTMLLGMLMICSAAEQLWVTVYYGVPVWKE...      851
2  UniRef90_A0A023HIB6   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  MEPVDPNLEPWNHPGSKPTTACSKCYCKKCCWHCQLCFLKKGLGIS...      101
3  UniRef90_A0A023I5A1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  TVKIGGQLKEALLDTGADDTVLEDINLPGKWKPKMIGGIGGFIKVK...      319
4  UniRef90_A0A02

Number of unique values = 271
                 virus_host_name  virus_host_name_count  virus_host_name_percent
0                   Homo sapiens                 350792                96.261680
1                 Paguma larvata                   1573                 0.431651
2                     Sus scrofa                   1287                 0.353169
3                    Felis catus                    748                 0.205260
4      Hydrochoerus hydrochaeris                    723                 0.198400
5                     Bos taurus                    697                 0.191265
6                  Gallus gallus                    546                 0.149829
7              Gopherus morafkai                    369                 0.101258
8           Marmota flaviventris                    369                 0.101258
9             Petroica australis                    362                 0.099337
10                  Mus musculus                    330                 0.09055

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_wo_multi_host_seq.csv

In [19]:
# The input directory sits four levels above the current working directory.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_wo_multi_host_seq.csv",
)
df = analyze_df(file_path)

df size =  (354, 9)
           uniref90_id  tax_id  host_tax_ids                                   virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0      UniRef90_L7QJ56   10245          9606                               Vaccinia virus          species     Homo sapiens               species  MPNQNIHQLSEYQTSVSQVAVTHPPKPETPQISEYQDHNELYSASN...       86
1  UniRef90_A0A2H4EYA2   10266          9940                               Sheeppox virus          species       Ovis aries               species  MYNNNAFSIGTVLFLIVLIIVIVISLYLLFQLVNCFYLFKFLNKVK...       79
2  UniRef90_A0A2H4F4C4   10266          9940                               Sheeppox virus          species       Ovis aries               species  MKEVYTNFNISLNTTSQKIEVMGSVPTIDGKDPSIDIRIVSKPKKE...      106
3      UniRef90_M1S0W2  180170          8237  Infectious spleen and kidney necrosis virus          species  Thunnus thynnus               species  MLMWCPVTS

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_multi_host_seq.csv

In [20]:
# The input directory sits four levels above the current working directory.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_multi_host_seq.csv",
)
df = analyze_df(file_path)

df size =  (45555, 9)
           uniref90_id  tax_id  host_tax_ids         virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A7M3S772   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
1  UniRef90_A0A7M3S772   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
2  UniRef90_A0A291S6Q7   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
3  UniRef90_A0A291S6Q7   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
4  UniRef90_A0A291S6S8   10407          9606  Hepatitis B virus          speci