# UniRef90 - EMBL mapping Dataset Analysis


In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Display configuration: never truncate the rows of a printed frame and allow
# up to 1000 characters per output line before wrapping.
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 1000)
from ast import literal_eval

# Column-name constants shared by the cells below.
UNIREF90_ID = "uniref90_id"
TAX_ID = "tax_id"
SEQUENCE = "seq"
HOST_TAX_IDS = "host_tax_ids"
UNIPROT_HOST_TAX_IDS = "uniprot_host_tax_ids"
EMBL_REF_ID = "embl_ref_id"
EMBL_HOST_NAME = "embl_host_name"

In [2]:
def column_stats(df, column_name):
    """Print how often each distinct value of ``column_name`` occurs in ``df``.

    Two things are written to stdout: the number of distinct values found in
    the column, and a table with one row per value holding its count and its
    share (in percent) of all rows of ``df``.

    Args:
        df: pandas DataFrame that contains ``column_name``.
        column_name: name of the column to summarise.

    Returns:
        None; the summary is only printed.
    """
    total_rows = len(df)
    count_col = column_name + "_count"
    percent_col = column_name + "_percent"

    # reset_index() turns the value labels produced by value_counts() into a
    # regular column, giving a two-column (value, count) table.
    count_df = df[column_name].value_counts().reset_index()
    count_df.columns = [column_name, count_col]
    # Percentages are relative to every row of df.
    count_df[percent_col] = [int(c) / total_rows * 100 for c in count_df[count_col]]

    distinct_values = df[column_name].unique()
    print(f"Number of unique values = {len(distinct_values)}")
    print(f"{count_df}")

In [3]:
def print_seq_len_histogram(df, n_bins):
    """Plot the distribution of ``df["seq_len"]`` and print a binned summary.

    A seaborn histogram is shown first, together with the minimum and maximum
    sequence length. Afterwards a table with one row per bin (start, end,
    count, percentage of all rows) is printed.

    Note that ``n_bins`` is only used for the printed table; the plot is drawn
    with seaborn's own default binning.

    Args:
        df: pandas DataFrame with a ``seq_len`` column.
        n_bins: number of equal-width bins for the printed table.

    Returns:
        None; everything is plotted or printed.
    """
    seq_lengths = df["seq_len"]
    sns.histplot(seq_lengths)
    print(f"min seq len = {min(seq_lengths)}")
    print(f"max seq len = {max(seq_lengths)}")
    plt.show()

    total_rows = len(df)
    # np.histogram returns n_bins counts and n_bins + 1 bin edges.
    freq, bin_edges = np.histogram(seq_lengths, bins=n_bins)
    hist_rows = [
        {
            "start": bin_edges[i],
            "end": bin_edges[i + 1],
            "count": freq[i],
            "percentage": freq[i] / total_rows * 100,
        }
        for i in range(n_bins)
    ]
    print(pd.DataFrame(hist_rows))

In [4]:
def analyze_df(file_path):
    """Load a CSV file and print a quick overview of its contents.

    Prints the frame's shape, its first rows, and value-frequency tables for
    the ``virus_name`` and ``virus_host_name`` columns (via ``column_stats``).

    Args:
        file_path: path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded pandas DataFrame.
    """
    loaded = pd.read_csv(file_path)
    # "seq_len" is not derived here; the file is used exactly as read.
    print("df size = ", loaded.shape)
    print(loaded.head())
    for stats_column in ("virus_name", "virus_host_name"):
        column_stats(loaded, stats_column)
    return loaded

### uniref90_viridae_uniprot_hosts

In [5]:
# Load the UniRef90 -> UniProt host mapping. Column names are supplied
# explicitly through `names`; the column at position 2 is parsed with
# literal_eval (presumably the stringified host_tax_ids list -- confirm
# against the file layout).
file_path = os.path.join(os.getcwd(), "..", "..", "..", "..", "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts.csv")
# on_bad_lines accepts "error", "warn", "skip" or a callable. None is not a
# documented value and newer pandas releases raise ValueError for it, so the
# documented default "error" is passed explicitly.
df = pd.read_csv(file_path, on_bad_lines="error", converters={2: literal_eval},
                 names=[UNIREF90_ID, TAX_ID, HOST_TAX_IDS, EMBL_REF_ID])

In [6]:
# Preview the first rows of the frame loaded above.
df.head()

Unnamed: 0,uniref90_id,tax_id,host_tax_ids,embl_ref_id
0,UniRef90_A0A7M1S2J6,2772075,,QOR59620.1
1,UniRef90_A0A291R8P7,2048544,,ATL63196.1
2,UniRef90_D6NXG6,11676,[9606],ADF86138.1
3,UniRef90_A0A159B6D0,1647469,,AKJ72047.1
4,UniRef90_A0A516LML6,2591644,,QDP55163.1


### Records with duplicate EMBL ids

In [7]:
# Number of distinct EMBL ids that are attached to more than one record.
int((df[EMBL_REF_ID].value_counts() > 1).sum())

12

In [8]:
# Occurrences per EMBL id (index = id, value = number of records carrying it).
embl_ref_ids_count = df[EMBL_REF_ID].value_counts()

In [9]:
# EMBL ids that occur in more than one record.
duplicate_embl_ids = embl_ref_ids_count[embl_ref_ids_count > 1]
# "-" and "None" are removed from the list; they look like placeholders for a
# missing id rather than real accessions (assumption -- confirm upstream).
# errors="ignore" keeps the cell from raising KeyError when a placeholder does
# not occur in the data.
duplicate_embl_ids = duplicate_embl_ids.drop(labels=["-", "None"], errors="ignore")
duplicate_embl_ids

CAA25020.1    2
AAA46960.1    2
AAB59912.1    2
CAA25063.1    2
AAA46223.1    2
AAF44394.1    2
AAB61122.1    2
ABC26008.1    2
AAA42673.1    2
AAB63456.1    2
Name: embl_ref_id, dtype: int64

In [10]:
# Records whose EMBL id is one of the duplicated ids found above.
df_duplicate_embl = df.loc[df[EMBL_REF_ID].isin(duplicate_embl_ids.index)]

In [11]:
# Display the records that share an EMBL id with another record.
df_duplicate_embl

Unnamed: 0,uniref90_id,tax_id,host_tax_ids,embl_ref_id
297542,UniRef90_O12157,11676,[9606],AAB61122.1
297555,UniRef90_O12158,11676,[9606],AAB61122.1
313237,UniRef90_O40986,10821,"[4498, 217170, 4543, 281129, 4505, 270102, 660...",AAB63456.1
313252,UniRef90_O40987,10821,"[4498, 217170, 4543, 281129, 4505, 270102, 660...",AAB63456.1
353387,UniRef90_P00544,11775,[9681],CAA25063.1
353914,UniRef90_P03113,333754,[9606],CAA25020.1
355597,UniRef90_P03339,11775,[9681],CAA25063.1
360431,UniRef90_P05861,11723,[9527],AAB59912.1
361459,UniRef90_P06421,10586,[9606],AAA46960.1
361499,UniRef90_P06424,10586,[9606],AAA46960.1


### Records with a valid (non-repeated) EMBL id

In [12]:
# Any EMBL id carried by more than one record is treated as invalid; every
# record with such an id is removed.
invalid_embl_ids = embl_ref_ids_count.loc[embl_ref_ids_count.gt(1)]
embl_pruned_df = df.loc[~df[EMBL_REF_ID].isin(invalid_embl_ids.index)]

In [13]:
# (rows, columns) left after removing records with a repeated EMBL id.
embl_pruned_df.shape

(1184006, 4)

## EMBL Mapping 
## uniref90_viridae_embl_hosts.csv

In [14]:
# Load the UniRef90 -> EMBL host mapping; `df` now refers to this dataset.
file_path = os.path.join(
    os.getcwd(), "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_embl_hosts.csv",
)
df = pd.read_csv(file_path)

In [17]:
# Number of records without an EMBL host name.
int(df[EMBL_HOST_NAME].isna().sum())

594109

In [18]:
# Keep only the records whose UniRef90 id is present in embl_pruned_df.
df = df.loc[df[UNIREF90_ID].isin(embl_pruned_df[UNIREF90_ID].unique())]

In [19]:
# (rows, columns) of the frame after restricting it to the pruned UniRef90 ids.
df.shape

(1184006, 5)

In [26]:
# Number of remaining records without an EMBL host name.
int(df[EMBL_HOST_NAME].isna().sum())

570798

In [32]:
# Keep only the records that have an EMBL host name, then summarise them.
pruned_df = df.loc[df[EMBL_HOST_NAME].notna()]
print(f"pruned_df shape = {pruned_df.shape}")
# Host-name frequencies, computed on df itself.
embl_unique_values = df[EMBL_HOST_NAME].value_counts()
column_stats(pruned_df, "tax_id")
column_stats(pruned_df, EMBL_HOST_NAME)

pruned_df shape = (613208, 5)
Number of unique values = 26152
        tax_id  tax_id_count  tax_id_percent
0        11676        216611       35.324229
1      2788787         11063        1.804119
2      3052230         11007        1.794986
3        10407         10241        1.670070
4      2591644          5540        0.903445
5        11646          2910        0.474553
6      2731619          2643        0.431012
7        12721          2097        0.341972
8        11320          2027        0.330557
9      1605721          1563        0.254889
10     2788884          1545        0.251954
11       57667          1413        0.230428
12      985782          1406        0.229286
13     3070911          1381        0.225209
14       10239          1378        0.224720
15     1349410          1357        0.221295
16       11723          1317        0.214772
17       28344          1225        0.199769
18      181083          1182        0.192757
19     2060084          1154        0.

                                          embl_host_name  embl_host_name_count  embl_host_name_percent
0                                       ['Homo sapiens']                255509               41.667591
1                                   ['Acanthamoeba sp.']                  7452                1.215248
2                                   ['Escherichia coli']                  4914                0.801359
3                           ['Acanthamoeba castellanii']                  3952                0.644480
4                             ['Vermamoeba vermiformis']                  3546                0.578270
5                               ['Aeromonas hydrophila']                  3448                0.562289
6                             ['Pseudomonas aeruginosa']                  3216                0.524455
7                              ['Klebsiella pneumoniae']                  2867                0.467541
8                            ['Homo sapiens; sex: male']                 

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_wo_single_host_virus.csv

In [10]:
# Load this dataset and print its overview via analyze_df.
file_path = os.path.join(
    os.getcwd(), "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_wo_single_host_virus.csv",
)
df = analyze_df(file_path)

df size =  (195, 9)
           uniref90_id  tax_id  host_tax_ids         virus_name virus_taxon_rank          virus_host_name virus_host_taxon_rank                                                seq  seq_len
0      UniRef90_L7QJ56   10245          9606     Vaccinia virus          species             Homo sapiens               species  MPNQNIHQLSEYQTSVSQVAVTHPPKPETPQISEYQDHNELYSASN...       86
1      UniRef90_A2T3M0   28875          9913        Rotavirus A          species               Bos taurus               species  MATFKDACYHYRKISKLNSSILKLGANDEWRPAPITKFKGWCLDCC...      491
2      UniRef90_A2T3N5   28875         60710        Rotavirus A          species  Chlorocebus pygerythrus               species  MLKMESTQQMAVSIINSSFEAAVVAATSALENMGIEYDYQDIYSRV...      315
3  UniRef90_A0A2I6TDN3   10245          9606     Vaccinia virus          species             Homo sapiens               species  MGFCIPLRSKMLKRGSRKSSSILARRPTPKKMNIVTDLENRLKKNS...      100
4      UniRef90_M5AVV3   10407          

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_single_host_virus.csv

In [11]:
# Load this dataset and print its overview via analyze_df.
file_path = os.path.join(
    os.getcwd(), "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_multi_host_seq_single_host_virus.csv",
)
df = analyze_df(file_path)

df size =  (325407, 9)
       uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_L0BZH8   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTVKIGGQXXEALLDTGADDTVLEDINLPGKWKPXM...      366
1  UniRef90_L0BZI0   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEDMSLPGRWKPKM...      340
2  UniRef90_L0BZI1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  ELENEGKISKIGPENPYNTPVFAIKKKNSTKWRKVVDFRELNKRTQ...      199
3  UniRef90_L0BZI3   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPIVTIKVGGQLKEALLDTGADDTVLEEMXLPGXWKPKM...      366
4  UniRef90_L0BZI5   11676          9

Number of unique values = 50
             virus_host_name  virus_host_name_count  virus_host_name_percent
0               Homo sapiens                 321878                98.915512
1            Pan troglodytes                   2004                 0.615844
2                 Sus scrofa                    617                 0.189609
3                 Bos taurus                    232                 0.071295
4               Capra hircus                    117                 0.035955
5              Procyon lotor                     95                 0.029194
6      Oryctolagus cuniculus                     75                 0.023048
7                 Ovis aries                     49                 0.015058
8     Phascolarctos cinereus                     47                 0.014443
9               Mus musculus                     43                 0.013214
10               Felis catus                     20                 0.006146
11           Cyprinus carpio                   

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus.csv

In [12]:
# Load this dataset and print its overview via analyze_df.
file_path = os.path.join(
    os.getcwd(), "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus.csv",
)
df = analyze_df(file_path)

df size =  (45909, 9)
           uniref90_id  tax_id  host_tax_ids         virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A7M3S772   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
1  UniRef90_A0A7M3S772   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
2  UniRef90_A0A291S6Q7   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
3  UniRef90_A0A291S6Q7   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
4  UniRef90_A0A291S6S8   10407          9606  Hepatitis B virus          speci

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_single_host_virus.csv

In [13]:
# Load this dataset and print its overview via analyze_df.
file_path = os.path.join(
    os.getcwd(), "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_single_host_virus.csv",
)
df = analyze_df(file_path)

df size =  (325248, 9)
       uniref90_id  tax_id  host_tax_ids                      virus_name virus_taxon_rank virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_L0BZH8   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTVKIGGQXXEALLDTGADDTVLEDINLPGKWKPXM...      366
1  UniRef90_L0BZI0   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEDMSLPGRWKPKM...      340
2  UniRef90_L0BZI1   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  ELENEGKISKIGPENPYNTPVFAIKKKNSTKWRKVVDFRELNKRTQ...      199
3  UniRef90_L0BZI3   11676          9606  Human immunodeficiency virus 1          species    Homo sapiens               species  PQITLWQRPIVTIKVGGQLKEALLDTGADDTVLEEMXLPGXWKPKM...      366
4  UniRef90_L0BZI5   11676          9

Number of unique values = 48
             virus_host_name  virus_host_name_count  virus_host_name_percent
0               Homo sapiens                 321877                98.963560
1            Pan troglodytes                   2004                 0.616145
2                 Sus scrofa                    510                 0.156803
3                 Bos taurus                    232                 0.071330
4               Capra hircus                    117                 0.035973
5              Procyon lotor                     95                 0.029208
6      Oryctolagus cuniculus                     75                 0.023059
7     Phascolarctos cinereus                     47                 0.014451
8               Mus musculus                     43                 0.013221
9                 Ovis aries                     25                 0.007686
10           Cyprinus carpio                     18                 0.005534
11            Macaca mulatta                   

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_wo_multi_host_seq.csv

In [19]:
# Load this dataset and print its overview via analyze_df.
file_path = os.path.join(
    os.getcwd(), "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_wo_multi_host_seq.csv",
)
df = analyze_df(file_path)

df size =  (354, 9)
           uniref90_id  tax_id  host_tax_ids                                   virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0      UniRef90_L7QJ56   10245          9606                               Vaccinia virus          species     Homo sapiens               species  MPNQNIHQLSEYQTSVSQVAVTHPPKPETPQISEYQDHNELYSASN...       86
1  UniRef90_A0A2H4EYA2   10266          9940                               Sheeppox virus          species       Ovis aries               species  MYNNNAFSIGTVLFLIVLIIVIVISLYLLFQLVNCFYLFKFLNKVK...       79
2  UniRef90_A0A2H4F4C4   10266          9940                               Sheeppox virus          species       Ovis aries               species  MKEVYTNFNISLNTTSQKIEVMGSVPTIDGKDPSIDIRIVSKPKKE...      106
3      UniRef90_M1S0W2  180170          8237  Infectious spleen and kidney necrosis virus          species  Thunnus thynnus               species  MLMWCPVTS

### uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_multi_host_seq.csv

In [20]:
# Load this dataset and print its overview via analyze_df.
file_path = os.path.join(
    os.getcwd(), "..", "..", "..", "..",
    "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts_pruned_metadata_species_vertebrates_w_seq_wo_single_host_virus_multi_host_seq.csv",
)
df = analyze_df(file_path)

df size =  (45555, 9)
           uniref90_id  tax_id  host_tax_ids         virus_name virus_taxon_rank  virus_host_name virus_host_taxon_rank                                                seq  seq_len
0  UniRef90_A0A7M3S772   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
1  UniRef90_A0A7M3S772   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MDIDPYKEFGASVELLSFLPNDFYPSVRDLLDTAAALYRDALESPX...      183
2  UniRef90_A0A291S6Q7   10407          9606  Hepatitis B virus          species     Homo sapiens               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
3  UniRef90_A0A291S6Q7   10407          9598  Hepatitis B virus          species  Pan troglodytes               species  MLPVCPLLPGSTTTSTGPCKTCTTLAQGTSMFPSCCCSKPSDGNCT...       91
4  UniRef90_A0A291S6S8   10407          9606  Hepatitis B virus          speci