# UniRef90 - EMBL mapping Dataset Analysis


In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Print/display every row of a DataFrame instead of truncating it, and let a
# printed frame use up to 1000 characters per line before pandas wraps it.
pd.set_option("display.max_rows", None)
pd.set_option('display.width', 1000)
from ast import literal_eval

# Column labels used for the UniRef90 / EMBL mapping tables.

# Identifier columns.
UNIREF90_ID = "uniref90_id"
EMBL_REF_ID = "embl_ref_id"
TAX_ID = "tax_id"

# Host annotation columns.
HOST_TAX_IDS = "host_tax_ids"
UNIPROT_HOST_TAX_IDS = "uniprot_host_tax_ids"
EMBL_HOST_NAME = "embl_host_name"

# Sequence column.
SEQUENCE = "seq"

In [2]:
def column_stats(df, column_name):
    """Print how many distinct values a column holds and a frequency table.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: label of the column to summarise.

    Prints:
        * the number of distinct values (missing values count as a value,
          because the count is taken over ``unique()``);
        * a table with one row per value returned by ``value_counts()``,
          its count (``<column_name>_count``) and its share of *all* rows of
          ``df`` in percent (``<column_name>_percent``).

    Returns:
        None.
    """
    total_rows = df.shape[0]
    count_column_name = column_name + "_count"
    percent_column_name = column_name + "_percent"

    # value_counts() yields value -> count; reset_index() turns the values
    # into a regular column so both can be relabelled explicitly.
    count_df = df[column_name].value_counts().to_frame().reset_index()
    count_df.columns = [column_name, count_column_name]

    # Share of the whole frame, not only of the rows that were counted.
    count_df[percent_column_name] = count_df[count_column_name].div(total_rows).mul(100)

    n_unique = df[column_name].nunique(dropna=False)
    print(f"Number of unique values = {n_unique}")
    print(f"{count_df}")

In [3]:
def print_seq_len_histogram(df, n_bins):
    """Plot the sequence-length distribution and print it as a binned table.

    Args:
        df: DataFrame with a ``seq_len`` column (assumed numeric — the column
            is not created anywhere in this function).
        n_bins: number of equal-width bins, used for BOTH the plot and the
            printed table so the two views describe the same histogram.

    Prints:
        The minimum and maximum ``seq_len`` and a table with one row per bin
        (``start``, ``end``, ``count``, ``percentage`` of all rows in ``df``).

    Returns:
        None.
    """
    seq_lens = df["seq_len"]

    # Previously the plot used seaborn's automatic binning and ignored
    # n_bins, so the figure did not match the table printed below.
    sns.histplot(seq_lens, bins=n_bins)
    print(f"min seq len = {seq_lens.min()}")
    print(f"max seq len = {seq_lens.max()}")
    plt.show()

    # np.histogram returns n_bins counts and n_bins + 1 bin edges.
    freq, bins = np.histogram(seq_lens, bins=n_bins)
    n = df.shape[0]
    hist_df = pd.DataFrame(
        {
            "start": bins[:-1],
            "end": bins[1:],
            "count": freq,
            "percentage": freq / n * 100,
        }
    )
    print(hist_df)

In [18]:
def analyze_df(file_path):
    """Load a CSV file and print a quick overview of it.

    Args:
        file_path: path of the CSV file, passed straight to ``pd.read_csv``.

    Prints:
        The frame's shape, its first rows, and the ``column_stats`` report
        for the ``tax_id`` and ``embl_host_name`` columns.

    Returns:
        The loaded DataFrame.
    """
    loaded = pd.read_csv(file_path)
    print("df size = ", loaded.shape)
    print(loaded.head())
    # TAX_ID / EMBL_HOST_NAME are the notebook-level names for the
    # "tax_id" and "embl_host_name" columns.
    column_stats(loaded, TAX_ID)
    column_stats(loaded, EMBL_HOST_NAME)
    return loaded

### uniref90_viridae_uniprot_hosts

In [5]:
# Load the UniRef90 -> UniProt host mapping. Column labels are supplied via
# `names=`; the column at position 2 is parsed with literal_eval (the host
# tax id lists, judging by the displayed frame — TODO confirm).
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_uniprot_hosts.csv")
# `on_bad_lines=None` was only accepted by pandas 1.3-1.5, where it meant the
# default behaviour (raise on malformed lines). pandas 2.x rejects None with a
# ValueError, so spell out the equivalent value explicitly.
df = pd.read_csv(file_path, on_bad_lines="error", converters={2: literal_eval},
                 names=[UNIREF90_ID, TAX_ID, HOST_TAX_IDS, EMBL_REF_ID])

In [6]:
# Preview the first rows of the frame that was just loaded.
df.head()

Unnamed: 0,uniref90_id,tax_id,host_tax_ids,embl_ref_id
0,UniRef90_A0A7M1S2J6,2772075,,QOR59620.1
1,UniRef90_A0A291R8P7,2048544,,ATL63196.1
2,UniRef90_D6NXG6,11676,[9606],ADF86138.1
3,UniRef90_A0A159B6D0,1647469,,AKJ72047.1
4,UniRef90_A0A516LML6,2591644,,QDP55163.1


### Records with duplicate EMBL ids

In [7]:
# Number of distinct EMBL ids that occur on more than one row.
df[EMBL_REF_ID].value_counts().gt(1).sum()

12

In [8]:
# Occurrences per EMBL id (index = EMBL id, value = number of rows).
embl_ref_ids_count = df[EMBL_REF_ID].value_counts()

In [9]:
# EMBL ids that occur on more than one row.
duplicate_embl_ids = embl_ref_ids_count[embl_ref_ids_count > 1]
# "-" and "None" look like placeholders for a missing EMBL id rather than real
# ids (TODO confirm), so they are excluded from the duplicate list.
# drop(..., errors="ignore") does not raise KeyError when a placeholder is
# absent from the repeated ids, unlike the previous Series.pop calls.
duplicate_embl_ids = duplicate_embl_ids.drop(labels=["-", "None"], errors="ignore")
duplicate_embl_ids

CAA25020.1    2
AAA46960.1    2
AAB59912.1    2
CAA25063.1    2
AAA46223.1    2
AAF44394.1    2
AAB61122.1    2
ABC26008.1    2
AAA42673.1    2
AAB63456.1    2
Name: embl_ref_id, dtype: int64

In [10]:
# Rows whose EMBL id is one of the repeated ids in duplicate_embl_ids.
df_duplicate_embl = df.loc[df[EMBL_REF_ID].isin(duplicate_embl_ids.index)]

In [11]:
# (rows, columns) of the duplicate-EMBL-id subset.
df_duplicate_embl.shape

(20, 4)

In [12]:
# Show the rows that share an EMBL id with another row.
df_duplicate_embl

Unnamed: 0,uniref90_id,tax_id,host_tax_ids,embl_ref_id
297542,UniRef90_O12157,11676,[9606],AAB61122.1
297555,UniRef90_O12158,11676,[9606],AAB61122.1
313237,UniRef90_O40986,10821,"[4498, 217170, 4543, 281129, 4505, 270102, 660...",AAB63456.1
313252,UniRef90_O40987,10821,"[4498, 217170, 4543, 281129, 4505, 270102, 660...",AAB63456.1
353387,UniRef90_P00544,11775,[9681],CAA25063.1
353914,UniRef90_P03113,333754,[9606],CAA25020.1
355597,UniRef90_P03339,11775,[9681],CAA25063.1
360431,UniRef90_P05861,11723,[9527],AAB59912.1
361459,UniRef90_P06421,10586,[9606],AAA46960.1
361499,UniRef90_P06424,10586,[9606],AAA46960.1


### Records with valid EMBL id

In [13]:
# Every EMBL id seen on more than one row is treated as invalid here; unlike
# duplicate_embl_ids, nothing is removed from this list afterwards.
invalid_embl_ids = embl_ref_ids_count.loc[embl_ref_ids_count.gt(1)]
# Keep only the rows whose EMBL id is not in that list.
embl_pruned_df = df.loc[~df[EMBL_REF_ID].isin(invalid_embl_ids.index)]

In [14]:
# (rows, columns) left after removing rows with a repeated EMBL id.
embl_pruned_df.shape

(1184006, 4)

## EMBL Mapping 
### uniref90_viridae_embl_hosts_pruned.csv

In [19]:
# The data directory sits four levels above the working directory.
file_path = os.path.join(os.getcwd(), *([os.pardir] * 4), "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned.csv")
df = analyze_df(file_path=file_path)

df size =  (613208, 5)
           uniref90_id   tax_id uniprot_host_tax_ids embl_ref_id                           embl_host_name
0  UniRef90_A0A159B6I9  1647470                  NaN  AKJ72124.1      ['Gordonia malaquae strain BEN700']
1  UniRef90_A0A386AT53   151340                  NaN  AYC54495.1                         ['Homo sapiens']
2      UniRef90_D6NXI8    11676               [9606]  ADF86160.1                         ['Homo sapiens']
3  UniRef90_A0A023HQ45    10941                  NaN  AGO66954.2  ['Homo sapiens; sex: M; age: 7 months']
4  UniRef90_A0A9Y1HTB6  2968691                  NaN  WAX17359.1         ['Parabacteroides merdae BSC93']
Number of unique values = 26152
        tax_id  tax_id_count  tax_id_percent
0        11676        216611       35.324229
1      2788787         11063        1.804119
2      3052230         11007        1.794986
3        10407         10241        1.670070
4      2591644          5540        0.903445
5        11646          2910        0.4

                                          embl_host_name  embl_host_name_count  embl_host_name_percent
0                                       ['Homo sapiens']                255509               41.667591
1                                   ['Acanthamoeba sp.']                  7452                1.215248
2                                   ['Escherichia coli']                  4914                0.801359
3                           ['Acanthamoeba castellanii']                  3952                0.644480
4                             ['Vermamoeba vermiformis']                  3546                0.578270
5                               ['Aeromonas hydrophila']                  3448                0.562289
6                             ['Pseudomonas aeruginosa']                  3216                0.524455
7                              ['Klebsiella pneumoniae']                  2867                0.467541
8                            ['Homo sapiens; sex: male']                 

### uniref90_viridae_embl_hosts_pruned_metadata_species.csv

In [21]:
# The data directory sits four levels above the working directory.
file_path = os.path.join(os.getcwd(), *([os.pardir] * 4), "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species.csv")
df = analyze_df(file_path=file_path)

df size =  (348401, 9)
           uniref90_id  tax_id embl_ref_id    embl_host_name virus_host_name                      virus_name virus_taxon_rank  virus_host_tax_id virus_host_taxon_rank
0      UniRef90_D6NXI8   11676  ADF86160.1  ['Homo sapiens']    Homo sapiens  Human immunodeficiency virus 1          species               9606               species
1      UniRef90_L0BZL1   11676  AFZ96186.1  ['Homo sapiens']    Homo sapiens  Human immunodeficiency virus 1          species               9606               species
2      UniRef90_D6NXI9   11676  ADF86161.1  ['Homo sapiens']    Homo sapiens  Human immunodeficiency virus 1          species               9606               species
3  UniRef90_A0A023HQ48   28875  AGO66969.1  ['Homo sapiens']    Homo sapiens                     Rotavirus A          species               9606               species
4      UniRef90_L0BZL4   11676  AFZ95585.1  ['Homo sapiens']    Homo sapiens  Human immunodeficiency virus 1          species               96

Number of unique values = 5488
                                         embl_host_name  embl_host_name_count  embl_host_name_percent
0                                      ['Homo sapiens']                236110               67.769610
1                                  ['Acanthamoeba sp.']                  5746                1.649249
2                           ['Homo sapiens; sex: male']                  2830                0.812282
3                            ['Acanthamoeba polyphaga']                  2395                0.687426
4                            ['Vermamoeba vermiformis']                  2160                0.619975
5                                  ['Escherichia coli']                  1851                0.531284
6                              ['Aeromonas hydrophila']                  1451                0.416474
7                          ['Acanthamoeba castellanii']                  1415                0.406141
8                         ['Homo sapiens; sex: fema

### uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates.csv

In [23]:
# The data directory sits four levels above the working directory.
file_path = os.path.join(os.getcwd(), *([os.pardir] * 4), "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates.csv")
df = analyze_df(file_path=file_path)

df size =  (267865, 9)
           uniref90_id  tax_id embl_ref_id    embl_host_name virus_host_name                      virus_name virus_taxon_rank  virus_host_tax_id virus_host_taxon_rank
0      UniRef90_D6NXI8   11676  ADF86160.1  ['Homo sapiens']    Homo sapiens  Human immunodeficiency virus 1          species               9606               species
1      UniRef90_L0BZL1   11676  AFZ96186.1  ['Homo sapiens']    Homo sapiens  Human immunodeficiency virus 1          species               9606               species
2      UniRef90_D6NXI9   11676  ADF86161.1  ['Homo sapiens']    Homo sapiens  Human immunodeficiency virus 1          species               9606               species
3  UniRef90_A0A023HQ48   28875  AGO66969.1  ['Homo sapiens']    Homo sapiens                     Rotavirus A          species               9606               species
4      UniRef90_L0BZL4   11676  AFZ95585.1  ['Homo sapiens']    Homo sapiens  Human immunodeficiency virus 1          species               96

In [24]:
# Frequency tables for the virus and host name columns of the current frame.
column_stats(df=df, column_name="virus_name")
column_stats(df=df, column_name="virus_host_name")

Number of unique values = 3779
                                             virus_name  virus_name_count  virus_name_percent
0                        Human immunodeficiency virus 1            215415           80.419241
1                                   Hepacivirus hominis             10947            4.086760
2                                     Hepatitis B virus             10152            3.789969
3                          Human immunodeficiency virus              2095            0.782110
4                                  Human papillomavirus              1078            0.402442
5                   Simian-Human immunodeficiency virus              1030            0.384522
6                        Human immunodeficiency virus 2               788            0.294178
7                                           Rotavirus A               749            0.279618
8                                         Riboviria sp.               563            0.210181
9                         Sim

In [25]:
# Number of non-null uniref90_id values per (virus, host) pair, largest first.
(
    df.groupby(["virus_name", "virus_host_name"])[["uniref90_id"]]
    .count()
    .sort_values("uniref90_id", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,uniref90_id
virus_name,virus_host_name,Unnamed: 2_level_1
Human immunodeficiency virus 1,Homo sapiens,215406
Hepacivirus hominis,Homo sapiens,10926
Hepatitis B virus,Homo sapiens,10117
Human immunodeficiency virus,Homo sapiens,2095
Human papillomavirus,Homo sapiens,1078
Human immunodeficiency virus 2,Homo sapiens,788
Simian-Human immunodeficiency virus,Macaca mulatta,689
Rotavirus A,Homo sapiens,567
Marmot picobirnavirus,Marmota himalayana,529
Bacteriophage sp.,Homo sapiens,411


In [32]:
# Number of non-null virus_host_name entries (i.e. rows, not distinct hosts)
# per UniRef90 id, largest first.
host_counts = (
    df.groupby(["uniref90_id"])[["virus_host_name"]]
    .count()
    .sort_values("virus_host_name", ascending=False)
)

In [35]:
# How many UniRef90 ids have more than one host entry.
host_counts["virus_host_name"].gt(1).sum()

5

In [36]:
# The UniRef90 ids with more than one host entry, with their counts.
host_counts.loc[host_counts["virus_host_name"].gt(1)]

Unnamed: 0_level_0,virus_host_name
uniref90_id,Unnamed: 1_level_1
UniRef90_A0A059TZL9,2
UniRef90_A0A059TXS9,2
UniRef90_A0A7G0QVS7,2
UniRef90_A0A059TSQ3,2
UniRef90_A0A7G0QZ23,2


In [37]:
# Index of the UniRef90 ids that have more than one host entry.
duplicate_uniref_ids = host_counts.loc[host_counts["virus_host_name"].gt(1)].index

In [41]:
# Rows that exactly repeat an earlier row; with its defaults,
# DataFrame.duplicated() leaves the first occurrence unmarked.
duplicate_rows = df[df.duplicated()]
# The previous bare `sum(df.duplicated())` was not the last expression of the
# cell, so its value was never shown. Report the count explicitly.
print(f"Number of duplicated rows = {len(duplicate_rows)}")
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_duplicates.csv")
duplicate_rows.to_csv(file_path, index=False)

### uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv.csv

In [50]:
# The data directory sits four levels above the working directory.
file_path = os.path.join(os.getcwd(), *([os.pardir] * 4), "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv.csv")
df = pd.read_csv(file_path)
# Frequency tables for virus and host names, then the frame's dimensions.
column_stats(df=df, column_name="virus_name")
column_stats(df=df, column_name="virus_host_name")
df.shape

Number of unique values = 3772
                                             virus_name  virus_name_count  virus_name_percent
0                                   Hepacivirus hominis             10947           22.905507
1                                     Hepatitis B virus             10152           21.242049
2                                  Human papillomavirus              1078            2.255608
3                                           Rotavirus A               749            1.567208
4                                         Riboviria sp.               563            1.178021
5                                 Marmot picobirnavirus               529            1.106880
6                               Paslahepevirus balayani               431            0.901825
7                                     Bacteriophage sp.               411            0.859977
8                                 Hepatitis delta virus               400            0.836960
9                            

                                        virus_host_name  virus_host_name_count  virus_host_name_percent
0                                          Homo sapiens                  28782                60.223468
1                                            Sus scrofa                   1286                 2.690827
2                             Hydrochoerus hydrochaeris                    627                 1.311935
3                                    Marmota himalayana                    539                 1.127804
4                                         Gallus gallus                    484                 1.012722
5                                       Cyprinus carpio                    411                 0.859977
6                                    Petroica australis                    408                 0.853699
7                                           Felis catus                    298                 0.623535
8                                     Gopherus morafkai         

(47792, 11)

### uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_t0.01_c5.csv

In [52]:
# The data directory sits four levels above the working directory.
file_path = os.path.join(os.getcwd(), *([os.pardir] * 4), "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_t0.01_c5.csv")
df = pd.read_csv(file_path)
# Frequency tables for virus and host names.
column_stats(df=df, column_name="virus_name")
column_stats(df=df, column_name="virus_host_name")

Number of unique values = 661
                                            virus_name  virus_name_count  virus_name_percent
0                                  Hepacivirus hominis             10926           34.447317
1                                    Hepatitis B virus             10120           31.906173
2                                 Human papillomavirus              1078            3.398701
3                                          Rotavirus A               629            1.983101
4                                Marmot picobirnavirus               529            1.667823
5                                    Bacteriophage sp.               411            1.295794
6                                Hepatitis delta virus               400            1.261114
7                              Paslahepevirus balayani               380            1.198058
8                                    Torque teno virus               363            1.144461
9                              Human bet

### uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_t0.007_c7.csv

In [55]:
# The data directory sits four levels above the working directory.
file_path = os.path.join(os.getcwd(), *([os.pardir] * 4), "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_t0.007_c7.csv")
df = pd.read_csv(file_path)
# Frequency tables for virus and host names.
column_stats(df=df, column_name="virus_name")
column_stats(df=df, column_name="virus_host_name")

Number of unique values = 763
                                            virus_name  virus_name_count  virus_name_percent
0                                  Hepacivirus hominis             10926           33.580232
1                                    Hepatitis B virus             10120           31.103052
2                                 Human papillomavirus              1078            3.313151
3                                          Rotavirus A               629            1.933184
4                                Marmot picobirnavirus               529            1.625841
5                                    Bacteriophage sp.               411            1.263177
6                                Hepatitis delta virus               400            1.229370
7                                     Carp edema virus               392            1.204782
8                              Paslahepevirus balayani               380            1.167901
9                                    Tor

### uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_t0.005_c13.csv

In [54]:
# The data directory sits four levels above the working directory.
file_path = os.path.join(os.getcwd(), *([os.pardir] * 4), "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_t0.005_c13.csv")
df = pd.read_csv(file_path)
# Frequency tables for virus and host names.
column_stats(df=df, column_name="virus_name")
column_stats(df=df, column_name="virus_host_name")

Number of unique values = 1032
                                             virus_name  virus_name_count  virus_name_percent
0                                   Hepacivirus hominis             10926           31.954843
1                                     Hepatitis B virus             10120           29.597567
2                                  Human papillomavirus              1078            3.152784
3                                           Rotavirus A               635            1.857160
4                                 Marmot picobirnavirus               529            1.547146
5                                     Bacteriophage sp.               411            1.202036
6                                 Hepatitis delta virus               400            1.169864
7                                      Carp edema virus               392            1.146467
8                               Paslahepevirus balayani               380            1.111371
9                            

### uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv.csv

In [48]:
# The data directory sits four levels above the working directory.
file_path = os.path.join(os.getcwd(), *([os.pardir] * 4), "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_idv.csv")
df = pd.read_csv(file_path)
# Frequency tables for virus and host names.
column_stats(df=df, column_name="virus_name")
column_stats(df=df, column_name="virus_host_name")

Number of unique values = 7
                            virus_name  virus_name_count  virus_name_percent
0       Human immunodeficiency virus 1            215415           97.885654
1         Human immunodeficiency virus              2095            0.951978
2  Simian-Human immunodeficiency virus              1030            0.468037
3       Human immunodeficiency virus 2               788            0.358071
4        Simian immunodeficiency virus               538            0.244470
5        Feline immunodeficiency virus               198            0.089972
6        Bovine immunodeficiency virus                 4            0.001818
Number of unique values = 40
                       virus_host_name  virus_host_name_count  virus_host_name_percent
0                         Homo sapiens                 218631                99.347020
1                       Macaca mulatta                    849                 0.385790
2                          Felis catus                    143     