# UniRef90 - UniProt mapping Dataset Analysis


In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Print every row of a DataFrame instead of truncating long tables, so the
# value-count tables printed by the cells below are shown in full.
pd.set_option("display.max_rows", None)
# Allow up to 1000 characters per printed line so wide frames are not wrapped.
pd.set_option('display.width', 1000)
from ast import literal_eval

In [2]:
def column_stats(df, column_name):
    """Print how often each distinct value of ``column_name`` occurs in ``df``.

    Two things are printed: the number of distinct values in the column, and a
    table with one row per value holding its occurrence count and that count
    expressed as a percentage of the total number of rows in ``df``.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to summarise.
    """
    values = df[column_name]
    total_rows = len(df)
    count_col = column_name + "_count"
    percent_col = column_name + "_percent"

    # One row per distinct value: the value itself plus its occurrence count.
    summary = values.value_counts().rename_axis(column_name).reset_index(name=count_col)
    summary[percent_col] = summary[count_col] / total_rows * 100

    print(f"Number of unique values = {len(values.unique())}")
    print(summary)

In [3]:
def print_seq_len_histogram(df, n_bins):
    """Plot the distribution of ``df["seq_len"]`` and print it as a binned table.

    Prints the minimum and maximum sequence length, shows a seaborn histogram,
    and then prints one row per bin with its ``start``/``end`` edges, the number
    of sequences falling in the bin (``count``) and that number as a
    ``percentage`` of all rows in ``df``.

    An empty ``df`` is reported with a short message instead of raising.

    Args:
        df: DataFrame with a numeric ``seq_len`` column.
        n_bins: Bin specification passed to both ``sns.histplot`` and
            ``np.histogram`` (typically the number of equal-width bins), so the
            plot and the printed table describe the same binning.
    """
    seq_lens = df["seq_len"]
    n = len(seq_lens)
    if n == 0:
        # min()/max() raise on an empty sequence and the percentages below
        # would divide by zero, so report the empty input instead of failing.
        print("No sequences to plot: the DataFrame is empty")
        return

    # Use the same bins for the figure as for the table printed below.
    sns.histplot(seq_lens, bins=n_bins)
    print(f"min seq len = {seq_lens.min()}")
    print(f"max seq len = {seq_lens.max()}")
    plt.show()

    freq, bins = np.histogram(seq_lens, bins=n_bins)
    # `bins` holds len(freq) + 1 edges; pair consecutive edges with each count.
    # Building the rows from the arrays (rather than range(n_bins)) also works
    # when n_bins is an explicit sequence of bin edges.
    hist_df = pd.DataFrame({
        "start": bins[:-1],
        "end": bins[1:],
        "count": freq,
        "percentage": freq / n * 100,
    })
    print(hist_df)

In [4]:
def analyze_df(file_path):
    """Load a CSV and print a short summary of it.

    The file is read with pandas, a ``seq_len`` column holding the length of
    each entry in ``seq`` is added, and then the frame's shape, its first rows
    and the value-count tables for ``virus_name`` and ``virus_host_name`` are
    printed.

    Args:
        file_path: Path to a CSV file with at least the columns ``seq``,
            ``virus_name`` and ``virus_host_name``.
    """
    records = pd.read_csv(file_path)
    records["seq_len"] = records["seq"].map(len)

    print("df size = ", records.shape)
    print(records.head())

    for summarised_column in ("virus_name", "virus_host_name"):
        column_stats(records, summarised_column)

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq.csv

In [5]:
# Resolve the CSV path relative to the current working directory: four levels
# up, then input/data/uniref90/20240131/.
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq.csv")
# Prints the frame's shape, its first rows, and value-count tables for
# virus_name and virus_host_name.
analyze_df(file_path)

df size =  (371157, 9)
       uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_L0BZH8   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTVKIGGQXXEALLDTGADDTVLEDINLPGKWKPXM...      366
1  UniRef90_L0BZI0   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEDMSLPGRWKPKM...      340
2  UniRef90_L0BZI1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  ELENEGKISKIGPENPYNTPVFAIKKKNSTKWRKVVDFRELNKRTQ...      199
3  UniRef90_L0BZI3   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPIVTIKVGGQLKEALLDTGADDTVLEEMXLPGXWKPKM...      366
4  UniRef90_L0BZI5   11676          9

Number of unique values = 175
                virus_host_name  virus_host_name_count  virus_host_name_percent
0                  Homo sapiens                 338132                91.102148
1               Pan troglodytes                  16951                 4.567070
2                    Sus scrofa                   2547                 0.686233
3                  Mus musculus                   1251                 0.337054
4        Potamochoerus larvatus                   1044                 0.281283
5        Phacochoerus africanus                   1044                 0.281283
6      Phacochoerus aethiopicus                   1044                 0.281283
7                 Gallus gallus                    834                 0.224703
8          Chlorocebus aethiops                    828                 0.223086
9         Cercopithecus hamlyni                    815                 0.219584
10        Bandicota bengalensis                    812                 0.218775
11        

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq.csv

In [6]:
# Same data directory as the previous cell (four levels above the working
# directory); this loads the `_wo_multi_host_seq` variant of the dataset.
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq.csv")
# Prints the frame's shape, its first rows, and value-count tables for
# virus_name and virus_host_name.
analyze_df(file_path)

df size =  (343108, 9)
       uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_L0BZH8   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTVKIGGQXXEALLDTGADDTVLEDINLPGKWKPXM...      366
1  UniRef90_L0BZI0   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEDMSLPGRWKPKM...      340
2  UniRef90_L0BZI1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  ELENEGKISKIGPENPYNTPVFAIKKKNSTKWRKVVDFRELNKRTQ...      199
3  UniRef90_L0BZI3   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPIVTIKVGGQLKEALLDTGADDTVLEEMXLPGXWKPKM...      366
4  UniRef90_L0BZI5   11676          9

Number of unique values = 90
             virus_host_name  virus_host_name_count  virus_host_name_percent
0               Homo sapiens                 336408                98.047262
1            Pan troglodytes                   2007                 0.584947
2   Phacochoerus aethiopicus                   1044                 0.304277
3      Bandicota bengalensis                    812                 0.236660
4                 Sus scrofa                    630                 0.183616
5               Capra hircus                    448                 0.130571
6                 Bos taurus                    309                 0.090059
7        Apodemus sylvaticus                    279                 0.081316
8              Procyon lotor                     95                 0.027688
9          Cynomys gunnisoni                     89                 0.025939
10     Oryctolagus cuniculus                     77                 0.022442
11     Dryophytes versicolor                   

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_multi_host_seq.csv

In [7]:
# Loads the `_multi_host_seq` variant from input/data/uniref90/20240131/,
# resolved four levels above the current working directory.
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_multi_host_seq.csv")
# Prints the frame's shape, its first rows, and value-count tables for
# virus_name and virus_host_name.
analyze_df(file_path)

df size =  (0, 9)
Empty DataFrame
Columns: [uniref90_id, tax_id, host_tax_ids, virus_name, virus_taxon_rank, virus_host_name, virus_host_taxon_rank, seq, seq_len]
Index: []
Number of unique values = 0
Empty DataFrame
Columns: [virus_name, virus_name_count, virus_name_percent]
Index: []
Number of unique values = 0
Empty DataFrame
Columns: [virus_host_name, virus_host_name_count, virus_host_name_percent]
Index: []


### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus.csv

In [8]:
# Loads the `_wo_single_host_virus` variant from input/data/uniref90/20240131/,
# resolved four levels above the current working directory.
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus.csv")
# Prints the frame's shape, its first rows, and value-count tables for
# virus_name and virus_host_name.
analyze_df(file_path)

df size =  (45909, 9)
           uniref90_id  tax_id  host_tax_ids         virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A7M3S772   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
1  UniRef90_A0A7M3S772   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
2  UniRef90_A0A291S6Q7   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
3  UniRef90_A0A291S6Q7   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
4  UniRef90_A0A291S6S8   10407          9606  Hepatitis B virus          speci

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_single_host_virus.csv

In [9]:
# Loads the `_single_host_virus` variant from input/data/uniref90/20240131/,
# resolved four levels above the current working directory.
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_single_host_virus.csv")
# Prints the frame's shape, its first rows, and value-count tables for
# virus_name and virus_host_name.
analyze_df(file_path)

df size =  (0, 9)
Empty DataFrame
Columns: [uniref90_id, tax_id, host_tax_ids, virus_name, virus_taxon_rank, virus_host_name, virus_host_taxon_rank, seq, seq_len]
Index: []
Number of unique values = 0
Empty DataFrame
Columns: [virus_name, virus_name_count, virus_name_percent]
Index: []
Number of unique values = 0
Empty DataFrame
Columns: [virus_host_name, virus_host_name_count, virus_host_name_percent]
Index: []
