# UniRef90 - UniProt mapping Dataset Analysis


In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Never truncate rows when a DataFrame is printed (None removes the row limit).
pd.set_option("display.max_rows", None)
# Widen the printed output so wide frames are less likely to wrap.
pd.set_option("display.width", 1000)
from ast import literal_eval

In [2]:
def column_stats(df, column_name):
    """Print how often each distinct value of ``column_name`` occurs in ``df``.

    Two things are written to stdout:

    1. the number of distinct values in the column, and
    2. a table with one row per distinct value holding the value, its row
       count (``<column_name>_count``) and its share of all rows in percent
       (``<column_name>_percent``), most frequent value first.

    Missing values are kept as their own row, so the table has a row for the
    missing value that the distinct-value count already includes, and the
    percentages add up to 100.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to summarise.

    Returns:
        None. The summary is only printed.
    """
    n = df.shape[0]
    count_column_name = column_name + "_count"

    # dropna=False: Series.unique() (used below) counts a missing value as a
    # distinct value, so the table must keep it too or the two would disagree.
    count_df = df[column_name].value_counts(dropna=False).reset_index()
    # The column labels produced by reset_index() differ between pandas
    # versions, so set both explicitly.
    count_df.columns = [column_name, count_column_name]

    # Vectorised share of all rows, in percent.
    count_df[column_name + "_percent"] = count_df[count_column_name] / n * 100

    print(f"Number of unique values = {len(df[column_name].unique())}")
    print(count_df)

In [3]:
def print_seq_len_histogram(df, n_bins):
    """Plot the distribution of ``df["seq_len"]`` and print it as a binned table.

    The function draws a seaborn histogram of the ``seq_len`` column, prints
    the minimum and maximum sequence length, shows the figure, and finally
    prints a table with one row per bin: the bin's ``start`` and ``end`` edge,
    the number of rows that fall into it (``count``) and that number as a
    percentage of all rows (``percentage``).

    Args:
        df: DataFrame with a numeric ``seq_len`` column.
        n_bins: Number of equal-width bins. It is used for both the plot and
            the printed table so that the two describe the same binning.

    Returns:
        None. Output goes to the current matplotlib figure and stdout.
    """
    seq_lens = df["seq_len"]

    # Pass n_bins to the plot as well; otherwise seaborn picks its own bin
    # count and the figure would not match the table printed below.
    sns.histplot(seq_lens, bins=n_bins)
    print(f"min seq len = {seq_lens.min()}")
    print(f"max seq len = {seq_lens.max()}")
    plt.show()

    # np.histogram returns len(freq) + 1 edges: bin i spans bins[i]..bins[i + 1].
    freq, bins = np.histogram(seq_lens, bins=n_bins)
    n = df.shape[0]
    hist_df = pd.DataFrame(
        {
            "start": bins[:-1],
            "end": bins[1:],
            "count": freq,
            "percentage": freq / n * 100,
        }
    )
    print(hist_df)

In [4]:
def analyze_df(file_path):
    """Load a CSV, add a sequence-length column and print summary statistics.

    The file is read with ``pd.read_csv``; a ``seq_len`` column holding the
    length of each entry in the ``seq`` column is added; the frame's shape
    and first rows are printed, followed by the value statistics of the
    ``virus_name`` and ``virus_host_name`` columns.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The loaded DataFrame, including the added ``seq_len`` column.
    """
    df = pd.read_csv(file_path)
    df["seq_len"] = df["seq"].apply(len)

    print("df size = ", df.shape)
    print(df.head())

    for summarised_column in ("virus_name", "virus_host_name"):
        column_stats(df, summarised_column)
    return df

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq.csv

In [5]:
# The path is resolved relative to the current working directory (four levels up).
file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq.csv",
)
# Load the file and print its summary statistics.
df = analyze_df(file_path)

df size =  (371157, 9)
       uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_L0BZH8   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTVKIGGQXXEALLDTGADDTVLEDINLPGKWKPXM...      366
1  UniRef90_L0BZI0   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEDMSLPGRWKPKM...      340
2  UniRef90_L0BZI1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  ELENEGKISKIGPENPYNTPVFAIKKKNSTKWRKVVDFRELNKRTQ...      199
3  UniRef90_L0BZI3   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPIVTIKVGGQLKEALLDTGADDTVLEEMXLPGXWKPKM...      366
4  UniRef90_L0BZI5   11676          9

Number of unique values = 175
                virus_host_name  virus_host_name_count  virus_host_name_percent
0                  Homo sapiens                 338132                91.102148
1               Pan troglodytes                  16951                 4.567070
2                    Sus scrofa                   2547                 0.686233
3                  Mus musculus                   1251                 0.337054
4        Potamochoerus larvatus                   1044                 0.281283
5        Phacochoerus africanus                   1044                 0.281283
6      Phacochoerus aethiopicus                   1044                 0.281283
7                 Gallus gallus                    834                 0.224703
8          Chlorocebus aethiops                    828                 0.223086
9         Cercopithecus hamlyni                    815                 0.219584
10        Bandicota bengalensis                    812                 0.218775
11        

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq.csv

In [6]:
# The path is resolved relative to the current working directory (four levels up).
file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq.csv",
)
# Load the file and print its summary statistics.
df = analyze_df(file_path)

df size =  (325602, 9)
       uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_L0BZH8   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTVKIGGQXXEALLDTGADDTVLEDINLPGKWKPXM...      366
1  UniRef90_L0BZI0   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEDMSLPGRWKPKM...      340
2  UniRef90_L0BZI1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  ELENEGKISKIGPENPYNTPVFAIKKKNSTKWRKVVDFRELNKRTQ...      199
3  UniRef90_L0BZI3   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPIVTIKVGGQLKEALLDTGADDTVLEEMXLPGXWKPKM...      366
4  UniRef90_L0BZI5   11676          9

Number of unique values = 56
             virus_host_name  virus_host_name_count  virus_host_name_percent
0               Homo sapiens                 321964                98.882685
1            Pan troglodytes                   2007                 0.616397
2                 Sus scrofa                    630                 0.193488
3                 Bos taurus                    268                 0.082309
4               Capra hircus                    117                 0.035933
5              Procyon lotor                     95                 0.029177
6      Oryctolagus cuniculus                     77                 0.023649
7               Mus musculus                     55                 0.016892
8                 Ovis aries                     49                 0.015049
9     Phascolarctos cinereus                     47                 0.014435
10            Macaca mulatta                     23                 0.007064
11             Gallus gallus                   

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_multi_host_seq.csv

In [7]:
# The path is resolved relative to the current working directory (four levels up).
file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_multi_host_seq.csv",
)
# Load the file and print its summary statistics.
df = analyze_df(file_path)

df size =  (45555, 9)
           uniref90_id  tax_id  host_tax_ids         virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A7M3S772   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
1  UniRef90_A0A7M3S772   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
2  UniRef90_A0A291S6Q7   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
3  UniRef90_A0A291S6Q7   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
4  UniRef90_A0A291S6S8   10407          9606  Hepatitis B virus          speci

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_wo_single_host_virus.csv

In [10]:
# The path is resolved relative to the current working directory (four levels up).
file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_wo_single_host_virus.csv",
)
# Load the file and print its summary statistics.
df = analyze_df(file_path)

df size =  (195, 9)
           uniref90_id  tax_id  host_tax_ids         virus_name virus_taxon_rank          virus_host_name virus_host_taxon_rank                                                seq  seq_len
0      UniRef90_L7QJ56   10245          9606     Vaccinia virus          species             Homo sapiens               species  MPNQNIHQLSEYQTSVSQVAVTHPPKPETPQISEYQDHNELYSASN...       86
1      UniRef90_A2T3M0   28875          9913        Rotavirus A          species               Bos taurus               species  MATFKDACYHYRKISKLNSSILKLGANDEWRPAPITKFKGWCLDCC...      491
2      UniRef90_A2T3N5   28875         60710        Rotavirus A          species  Chlorocebus pygerythrus               species  MLKMESTQQMAVSIINSSFEAAVVAATSALENMGIEYDYQDIYSRV...      315
3  UniRef90_A0A2I6TDN3   10245          9606     Vaccinia virus          species             Homo sapiens               species  MGFCIPLRSKMLKRGSRKSSSILARRPTPKKMNIVTDLENRLKKNS...      100
4      UniRef90_M5AVV3   10407          

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_single_host_virus.csv

In [11]:
# The path is resolved relative to the current working directory (four levels up).
file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_single_host_virus.csv",
)
# Load the file and print its summary statistics.
df = analyze_df(file_path)

df size =  (325407, 9)
       uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_L0BZH8   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTVKIGGQXXEALLDTGADDTVLEDINLPGKWKPXM...      366
1  UniRef90_L0BZI0   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEDMSLPGRWKPKM...      340
2  UniRef90_L0BZI1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  ELENEGKISKIGPENPYNTPVFAIKKKNSTKWRKVVDFRELNKRTQ...      199
3  UniRef90_L0BZI3   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPIVTIKVGGQLKEALLDTGADDTVLEEMXLPGXWKPKM...      366
4  UniRef90_L0BZI5   11676          9

Number of unique values = 50
             virus_host_name  virus_host_name_count  virus_host_name_percent
0               Homo sapiens                 321878                98.915512
1            Pan troglodytes                   2004                 0.615844
2                 Sus scrofa                    617                 0.189609
3                 Bos taurus                    232                 0.071295
4               Capra hircus                    117                 0.035955
5              Procyon lotor                     95                 0.029194
6      Oryctolagus cuniculus                     75                 0.023048
7                 Ovis aries                     49                 0.015058
8     Phascolarctos cinereus                     47                 0.014443
9               Mus musculus                     43                 0.013214
10               Felis catus                     20                 0.006146
11           Cyprinus carpio                   

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus.csv

In [12]:
# The path is resolved relative to the current working directory (four levels up).
file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus.csv",
)
# Load the file and print its summary statistics.
df = analyze_df(file_path)

df size =  (45909, 9)
           uniref90_id  tax_id  host_tax_ids         virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A7M3S772   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
1  UniRef90_A0A7M3S772   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
2  UniRef90_A0A291S6Q7   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
3  UniRef90_A0A291S6Q7   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
4  UniRef90_A0A291S6S8   10407          9606  Hepatitis B virus          speci

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_single_host_virus.csv

In [13]:
# The path is resolved relative to the current working directory (four levels up).
file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_single_host_virus.csv",
)
# Load the file and print its summary statistics.
df = analyze_df(file_path)

df size =  (325248, 9)
       uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_L0BZH8   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTVKIGGQXXEALLDTGADDTVLEDINLPGKWKPXM...      366
1  UniRef90_L0BZI0   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEDMSLPGRWKPKM...      340
2  UniRef90_L0BZI1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  ELENEGKISKIGPENPYNTPVFAIKKKNSTKWRKVVDFRELNKRTQ...      199
3  UniRef90_L0BZI3   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPIVTIKVGGQLKEALLDTGADDTVLEEMXLPGXWKPKM...      366
4  UniRef90_L0BZI5   11676          9

Number of unique values = 48
             virus_host_name  virus_host_name_count  virus_host_name_percent
0               Homo sapiens                 321877                98.963560
1            Pan troglodytes                   2004                 0.616145
2                 Sus scrofa                    510                 0.156803
3                 Bos taurus                    232                 0.071330
4               Capra hircus                    117                 0.035973
5              Procyon lotor                     95                 0.029208
6      Oryctolagus cuniculus                     75                 0.023059
7     Phascolarctos cinereus                     47                 0.014451
8               Mus musculus                     43                 0.013221
9                 Ovis aries                     25                 0.007686
10           Cyprinus carpio                     18                 0.005534
11            Macaca mulatta                   

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_wo_multi_host_seq.csv

In [19]:
# The path is resolved relative to the current working directory (four levels up).
file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_wo_multi_host_seq.csv",
)
# Load the file and print its summary statistics.
df = analyze_df(file_path)

df size =  (354, 9)
           uniref90_id  tax_id  host_tax_ids                                   virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0      UniRef90_L7QJ56   10245          9606                               Vaccinia virus          species     Homo sapiens               species  MPNQNIHQLSEYQTSVSQVAVTHPPKPETPQISEYQDHNELYSASN...       86
1  UniRef90_A0A2H4EYA2   10266          9940                               Sheeppox virus          species       Ovis aries               species  MYNNNAFSIGTVLFLIVLIIVIVISLYLLFQLVNCFYLFKFLNKVK...       79
2  UniRef90_A0A2H4F4C4   10266          9940                               Sheeppox virus          species       Ovis aries               species  MKEVYTNFNISLNTTSQKIEVMGSVPTIDGKDPSIDIRIVSKPKKE...      106
3      UniRef90_M1S0W2  180170          8237  Infectious spleen and kidney necrosis virus          species  Thunnus thynnus               species  MLMWCPVTS

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_multi_host_seq.csv

In [20]:
# The path is resolved relative to the current working directory (four levels up).
file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_multi_host_seq.csv",
)
# Load the file and print its summary statistics.
df = analyze_df(file_path)

df size =  (45555, 9)
           uniref90_id  tax_id  host_tax_ids         virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A7M3S772   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
1  UniRef90_A0A7M3S772   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
2  UniRef90_A0A291S6Q7   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
3  UniRef90_A0A291S6Q7   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
4  UniRef90_A0A291S6S8   10407          9606  Hepatitis B virus          speci