# Coronaviridae S Protein - EMBL mapping Dataset Analysis


In [23]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Display configuration: print every row of a DataFrame/Series (no truncation)
# and allow up to 1000 characters per line before the printed table wraps.
pd.set_option("display.max_rows", None)
pd.set_option('display.width', 1000)
# literal_eval is only referenced from a commented-out read_csv call below.
from ast import literal_eval

# Column names of the coronaviridae UniRef90 / EMBL mapping CSV files.
UNIREF90_ID = "uniref90_id"
TAX_ID = "tax_id"
# NOTE(review): no loaded table in this notebook shows a "seq" column;
# presumably it belongs to the *_w_seq.csv file — confirm.
SEQUENCE = "seq"
HOST_TAX_IDS = "host_tax_ids"
UNIPROT_HOST_TAX_IDS ="uniprot_host_tax_ids"
EMBL_REF_ID = "embl_ref_id"
EMBL_HOST_NAME ="embl_host_name"

In [24]:
def column_stats(df, column_name):
    """Print a frequency table for one column of ``df``.

    Two things are written to stdout: the number of distinct values found in
    the column, and a table with one row per value holding its count and its
    share (in percent) of all rows of ``df``.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to summarise.

    Returns:
        None. The summary is only printed.
    """
    total_rows = len(df)
    count_label = column_name + "_count"
    percent_label = column_name + "_percent"

    # One row per distinct value, most frequent first (value_counts default).
    summary = df[column_name].value_counts().to_frame().reset_index()
    summary.columns = [column_name, count_label]
    # Percentages are relative to the full row count of ``df``.
    summary[percent_label] = summary[count_label] / total_rows * 100

    distinct_values = df[column_name].unique()
    print(f"Number of unique values = {len(distinct_values)}")
    print(summary)

In [3]:
def print_seq_len_histogram(df, n_bins):
    """Plot and tabulate the distribution of sequence lengths.

    Draws a histogram of ``df["seq_len"]``, prints the shortest and longest
    length, shows the figure, and finally prints a table with one row per bin
    (``start``, ``end``, ``count`` and ``percentage`` of all rows).

    Args:
        df: DataFrame with a numeric ``seq_len`` column.
        n_bins: Number of equal-width bins, used for both the figure and the
            printed table.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    seq_lens = df["seq_len"]
    if seq_lens.empty:
        # Fail with a clear message instead of an opaque "min() arg is an
        # empty sequence" error.
        raise ValueError("cannot build a sequence-length histogram from an empty DataFrame")

    # Use the same bin count as the table below so that the figure and the
    # printed counts describe the same binning.
    sns.histplot(seq_lens, bins=n_bins)
    print(f"min seq len = {seq_lens.min()}")
    print(f"max seq len = {seq_lens.max()}")
    plt.show()

    freq, bins = np.histogram(seq_lens, bins=n_bins)
    n = df.shape[0]
    # ``bins`` holds the bin edges: bin i spans bins[i] .. bins[i + 1].
    hist_df = pd.DataFrame({
        "start": bins[:-1],
        "end": bins[1:],
        "count": freq,
        "percentage": freq / n * 100,
    })
    print(hist_df)

In [4]:
def analyze_df(file_path):
    """Load a CSV file and print a quick overview of its contents.

    Prints the shape and the first rows of the loaded table, followed by the
    ``column_stats`` summaries of the ``tax_id`` and ``embl_host_name`` columns.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame.
    """
    frame = pd.read_csv(file_path)
    # NOTE: no ``seq_len`` column is derived here; that step is disabled.
    print("df size = ", frame.shape)
    print(frame.head())
    for summarised_column in ("tax_id", "embl_host_name"):
        column_stats(frame, summarised_column)
    return frame

## UniRef90 sequences

coronaviridae_s_uniref90_embl_hosts.csv

coronaviridae_s_uniref90_embl_hosts_pruned.csv

coronaviridae_s_uniref90_embl_hosts_pruned_metadata.csv

coronaviridae_s_uniref90_embl_hosts_pruned_metadata_virus_host_genus.csv

coronaviridae_s_uniref90_embl_hosts_pruned_metadata_virus_host_genus_vertebrates.csv

coronaviridae_s_uniref90_embl_hosts_pruned_metadata_virus_host_genus_vertebrates_w_seq.csv

In [22]:
# Load the "..._virus_host_genus_vertebrates.csv" file. The path is resolved
# relative to the current working directory (four directory levels up).
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_virus_host_genus_vertebrates.csv")
# Disabled alternative read that supplied column names explicitly and parsed
# column 2 with literal_eval:
# df = pd.read_csv(file_path, on_bad_lines=None, converters={2: literal_eval}, names=[UNIREF90_ID, TAX_ID, HOST_TAX_IDS, EMBL_REF_ID])
df = pd.read_csv(file_path)
print(df.shape)
# Preview the first rows of the loaded table.
df.head()

(482, 10)


Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus


In [20]:
# Number of rows whose virus_host_name is not the literal string "None".
# NOTE(review): the saved output (860) exceeds the 482 rows reported by the
# load cell above, so it appears to come from an earlier run — re-run to refresh.
(df["virus_host_name"] != "None").sum()

860

In [21]:
# Frequency table of the values in `virus_host_name`.
# NOTE(review): the saved percentages correspond to 860 rows (172 = 20%), not
# the 482 rows reported by the load cell above — the output looks stale; re-run.
column_stats(df, "virus_host_name")

Number of unique values = 128
                                virus_host_name  virus_host_name_count  virus_host_name_percent
0                                       chicken                    172                20.000000
1                                          Homo                    162                18.837209
2                                        Gallus                    100                11.627907
3                                         Felis                     48                 5.581395
4                                   Rhinolophus                     39                 4.534884
5                                       porcine                     18                 2.093023
6                                         Canis                     18                 2.093023
7                                           cat                     17                 1.976744
8                                           pig                     16                 1.860465
9         

In [18]:
# Frequency table of the raw EMBL host annotations (list-formatted strings).
# NOTE(review): the saved percentages correspond to 860 rows, not the 482 rows
# reported by the load cell above — the output looks stale; re-run.
column_stats(df, "embl_host_name")

Number of unique values = 169
                                        embl_host_name  embl_host_name_count  embl_host_name_percent
0                                          ['chicken']                   167               19.418605
1                                     ['Homo sapiens']                   155               18.023256
2                                    ['Gallus gallus']                    97               11.279070
3                                      ['Felis catus']                    48                5.581395
4                                          ['porcine']                    18                2.093023
5                                              ['cat']                    17                1.976744
6                           ['Canis lupus familiaris']                    17                1.976744
7                                              ['pig']                    16                1.860465
8                         ['Gallus gallus domesticus']       

### Records with duplicate EMBL ids

In [7]:
# Number of distinct EMBL reference ids that occur on more than one row.
(df[EMBL_REF_ID].value_counts() > 1).sum()

1

In [8]:
# Occurrences of each EMBL reference id (index = id, value = row count).
embl_ref_ids_count = df[EMBL_REF_ID].value_counts()

In [9]:
# Show only the ids that occur more than once. In the saved output the single
# repeated value is the "None" placeholder.
embl_ref_ids_count[embl_ref_ids_count > 1]

None    90
Name: embl_ref_id, dtype: int64

In [10]:
# Ids that occur more than once, excluding the "None" placeholder that marks a
# missing EMBL reference. drop(..., errors="ignore") is used instead of
# pop("None") so the cell does not raise KeyError when the placeholder is not
# among the repeated values (e.g. for a file without missing references).
duplicate_embl_ids = embl_ref_ids_count[embl_ref_ids_count > 1].drop("None", errors="ignore")
duplicate_embl_ids

Series([], Name: embl_ref_id, dtype: int64)

In [11]:
# Rows of `df` whose EMBL reference id is one of the duplicated ids.
df_duplicate_embl = df[df[EMBL_REF_ID].isin(duplicate_embl_ids.index)]

In [12]:
# (rows, columns) of the duplicate-id subset.
# NOTE(review): the saved output reports 4 columns while the frames loaded in
# this notebook have 5 or more — it looks like output from an older run.
df_duplicate_embl.shape

(0, 4)

In [13]:
# Display the duplicate-id rows (empty in the saved output).
df_duplicate_embl

Unnamed: 0,uniref90_id,tax_id,host_tax_ids,embl_ref_id


### Records with valid EMBL id

In [14]:
# Keep only rows whose EMBL reference id is not repeated.
# NOTE(review): "occurs more than once" is used as a proxy for the "None"
# placeholder (the only repeated value in the saved output above). Any genuinely
# duplicated EMBL id would be dropped as well — confirm this is intended.
invalid_embl_ids = embl_ref_ids_count[embl_ref_ids_count > 1]
embl_pruned_df = df[~df[EMBL_REF_ID].isin(invalid_embl_ids.index)]

In [15]:
# (rows, columns) of the subset with non-repeated EMBL ids.
# NOTE(review): the saved output (1349, 4) does not match the 482-row,
# 10-column frame reported by the load cell above — it looks stale; re-run.
embl_pruned_df.shape

(1349, 4)

## EMBL Mapping 
### coronaviridae_s_uniref90_embl_hosts.csv

In [16]:
# Load the unpruned UniRef90 -> EMBL host mapping and print its overview
# (shape, first rows, tax_id and embl_host_name frequency tables).
# The path is resolved relative to the current working directory.
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts.csv")
df = analyze_df(file_path)

df size =  (1439, 5)
           uniref90_id  tax_id host_tax_ids embl_ref_id embl_host_name
0  UniRef90_A0A023PUW9  694009          NaN  AHX37558.1            NaN
1  UniRef90_A0A8F1BB48   11120          NaN  QWM97422.1            NaN
2  UniRef90_A0A8F1BB52   11120          NaN  QWM97437.1            NaN
3  UniRef90_A0A023VWC7   11120          NaN  AHY19803.1            NaN
4  UniRef90_A0A8F1BBN9   11120          NaN  QWM97409.1            NaN
Number of unique values = 176
      tax_id  tax_id_count  tax_id_percent
0      11120           507       35.232801
1    2697049           126        8.756081
2      12663           104        7.227241
3      31631            99        6.879778
4     694014            88        6.115358
5      28295            53        3.683113
6      11118            35        2.432245
7      11153            31        2.154274
8    2501931            24        1.667825
9     693997            20        1.389854
10   1508220            15        1.042391
11     

In [17]:
# Rich-display preview of the first rows of the unpruned mapping.
df.head()

Unnamed: 0,uniref90_id,tax_id,host_tax_ids,embl_ref_id,embl_host_name
0,UniRef90_A0A023PUW9,694009,,AHX37558.1,
1,UniRef90_A0A8F1BB48,11120,,QWM97422.1,
2,UniRef90_A0A8F1BB52,11120,,QWM97437.1,
3,UniRef90_A0A023VWC7,11120,,AHY19803.1,
4,UniRef90_A0A8F1BBN9,11120,,QWM97409.1,


In [29]:
# Cross-tabulate which host annotations (UniProt vs. EMBL) are missing.
lacks_uniprot_host = df['host_tax_ids'].isna()
lacks_embl_host = df['embl_host_name'].isna()

print(f"Total sequences = {len(df)}")
print(f"Sequences without Uniprot hosts = {lacks_uniprot_host.sum()}")
print(f"Sequences without EMBL hosts = {lacks_embl_host.sum()}")
print(f"Sequences with EMBL hosts, without UniProt hosts = {(lacks_uniprot_host & ~lacks_embl_host).sum()}")
print(f"Sequences with Uniprot hosts, without EMBL hosts = {(~lacks_uniprot_host & lacks_embl_host).sum()}")
print(f"Sequences without Uniprot hosts, without EMBL hosts = {(lacks_uniprot_host & lacks_embl_host).sum()}")

Total sequences = 1439
Sequences without Uniprot hosts = 1171
Sequences without EMBL hosts = 579
Sequences with EMBL hosts, without UniProt hosts = 683
Sequences with Uniprot hosts, without EMBL hosts = 91
Sequences without Uniprot hosts, without EMBL hosts = 488


### coronaviridae_s_uniref90_embl_hosts_pruned.csv

In [30]:
# Load the pruned mapping file and print its overview (shape, first rows,
# tax_id and embl_host_name frequency tables).
# The path is resolved relative to the current working directory.
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned.csv")
df = analyze_df(file_path)

df size =  (860, 5)
           uniref90_id  tax_id uniprot_host_tax_ids embl_ref_id    embl_host_name
0  UniRef90_A0A0H3VFW7   31631               [9606]  AKA09662.1  ['Homo sapiens']
1  UniRef90_A0A0K0KPB6   31631               [9606]  AIL49495.1  ['Homo sapiens']
2  UniRef90_A0A0K0KPM4   31631               [9606]  AIL49520.1  ['Homo sapiens']
3  UniRef90_A0A0K0KPR8   31631               [9606]  AIL49509.1  ['Homo sapiens']
4  UniRef90_A0A0K0L8I8   31631               [9606]  AIV41891.1  ['Homo sapiens']
Number of unique values = 114
      tax_id  tax_id_count  tax_id_percent
0      11120           276       32.093023
1    2697049            93       10.813953
2      12663            70        8.139535
3     694014            64        7.441860
4      31631            55        6.395349
5      28295            47        5.465116
6      11118            22        2.558140
7      11153            20        2.325581
8    2501931            16        1.860465
9     693997            16  

In [33]:
# Rich-display preview of the first rows of the pruned mapping.
df.head()

Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens']
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens']
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens']
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens']
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens']


### coronaviridae_s_uniref90_embl_hosts_pruned_metadata_virus_host_genus.csv

In [5]:
# Load the "..._pruned_metadata_virus_host_genus.csv" file and print its
# overview (shape, first rows, tax_id and embl_host_name frequency tables).
# The path is resolved relative to the current working directory.
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_virus_host_genus.csv")
df = analyze_df(file_path)

df size =  (860, 10)
           uniref90_id  tax_id uniprot_host_tax_ids embl_ref_id    embl_host_name virus_host_name              virus_name virus_taxon_rank  virus_host_tax_id virus_host_taxon_rank
0  UniRef90_A0A0H3VFW7   31631               [9606]  AKA09662.1  ['Homo sapiens']            Homo  Human coronavirus OC43          no rank               9605                 genus
1  UniRef90_A0A0K0KPB6   31631               [9606]  AIL49495.1  ['Homo sapiens']            Homo  Human coronavirus OC43          no rank               9605                 genus
2  UniRef90_A0A0K0KPM4   31631               [9606]  AIL49520.1  ['Homo sapiens']            Homo  Human coronavirus OC43          no rank               9605                 genus
3  UniRef90_A0A0K0KPR8   31631               [9606]  AIL49509.1  ['Homo sapiens']            Homo  Human coronavirus OC43          no rank               9605                 genus
4  UniRef90_A0A0K0L8I8   31631               [9606]  AIV41891.1  ['Homo sapiens

In [6]:
# Frequency table of the values in `virus_host_name` for the loaded file.
column_stats(df, "virus_host_name")

Number of unique values = 128
                                virus_host_name  virus_host_name_count  virus_host_name_percent
0                                       chicken                    172                20.000000
1                                          Homo                    162                18.837209
2                                        Gallus                    100                11.627907
3                                         Felis                     48                 5.581395
4                                   Rhinolophus                     39                 4.534884
5                                       porcine                     18                 2.093023
6                                         Canis                     18                 2.093023
7                                           cat                     17                 1.976744
8                                           pig                     16                 1.860465
9         

In [7]:
# Number of sequences per (virus name, virus taxon rank), ordered by rank and
# then by count, both descending.
(
    df[["uniref90_id", "virus_name", "virus_taxon_rank"]]
    .groupby(["virus_name", "virus_taxon_rank"])
    .count()
    .sort_values(["virus_taxon_rank", "uniref90_id"], ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,uniref90_id
virus_name,virus_taxon_rank,Unnamed: 2_level_1
Nyctacovirus,subgenus,1
Sarbecovirus,subgenus,1
Tegacovirus,subgenus,1
Orthocoronavirinae,subfamily,16
Avian coronavirus,species,64
Porcine epidemic diarrhea virus,species,47
Alphacoronavirus 1,species,16
Bat coronavirus,species,15
Middle East respiratory syndrome-related coronavirus,species,8
Swine acute diarrhea syndrome related coronavirus,species,8


In [8]:
# Rich-display preview of the first rows, including the metadata columns.
df.head()

Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],Homo,Human coronavirus OC43,no rank,9605,genus


In [9]:
# Number of sequences per (host name, host taxon rank), ordered by rank and
# then by count, both descending.
(
    df[["uniref90_id", "virus_host_name", "virus_host_taxon_rank"]]
    .groupby(["virus_host_name", "virus_host_taxon_rank"])
    .count()
    .sort_values(["virus_host_taxon_rank", "uniref90_id"], ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,uniref90_id
virus_host_name,virus_host_taxon_rank,Unnamed: 2_level_1
Canis familiaris,subspecies,8
Gallus gallus domesticus,species,14
Allactaga sibirica,species,1
Hipposideros commersoni,species,1
Hipposideros vittatus,species,1
Rhinolophus Ferrumequinum,species,1
Rousettus leschenaulti,species,1
grey-backed thrush,species,1
Homo,genus,162
Gallus,genus,100


### coronaviridae_s_uniref90_embl_hosts_pruned_metadata_species.csv

In [31]:
# Load the "..._pruned_metadata_species.csv" file and print its overview
# (shape, first rows, tax_id and embl_host_name frequency tables).
# The path is resolved relative to the current working directory.
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_species.csv")
df = analyze_df(file_path)

df size =  (124, 9)
           uniref90_id  tax_id embl_ref_id     embl_host_name virus_host_name         virus_name virus_taxon_rank  virus_host_tax_id virus_host_taxon_rank
0  UniRef90_A0A0K0QRF0  694014  AKR15707.1  ['Gallus gallus']   Gallus gallus  Avian coronavirus          species               9031               species
1  UniRef90_A0A0K0QRF5  694014  AKR15706.1  ['Gallus gallus']   Gallus gallus  Avian coronavirus          species               9031               species
2  UniRef90_A0A0K0QRL7  694014  AKR15708.1  ['Gallus gallus']   Gallus gallus  Avian coronavirus          species               9031               species
3  UniRef90_A0A0K0QSF9  694014  AKR15709.1  ['Gallus gallus']   Gallus gallus  Avian coronavirus          species               9031               species
4      UniRef90_C0LZT6  694014  ACN87331.1  ['Gallus gallus']   Gallus gallus  Avian coronavirus          species               9031               species
Number of unique values = 44
     tax_id  tax_id_c

In [32]:
# (rows, columns) of the species-level file.
df.shape

(124, 9)