In [36]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
from ast import literal_eval
import unicodedata
import matplotlib.pyplot as plt
pd.set_option("display.max_rows", None)
pd.set_option('display.width', 1000)

In [15]:
def column_stats(df, id_column, name_column, tax_id_column, n=None):
    if n is None:
        n = df.shape[0]
    count_column = name_column + "_count"
    count_df = df[[name_column, tax_id_column, id_column]].groupby([name_column, tax_id_column]).count().reset_index()
    count_df.columns=[name_column, tax_id_column, count_column]
    count_df[name_column + "_percent"] = count_df[count_column].apply(lambda x: int(x)/n*100)
    print(f"Number of unique values = {len(df[name_column].unique())}")
    return count_df
    
def filter_with_threshold(df, column_name, threshold):
    print(f"Size of df = {df.shape}")
    n = df.shape[0]
    count_df = column_stats(df, column_name, n=n)
    percent_column_name = column_name + "_percent"
    filtered_count_df = count_df[count_df[percent_column_name] >= threshold]
    filtered_df = df[df[column_name].isin(list(filtered_count_df[column_name].values))]
    print(f"Size of filtered df = {filtered_df.shape}")
    column_stats(filtered_df, column_name, n=n)
    return filtered_df

In [3]:
def print_seq_len_histogram(df, n_bins):
    sns.histplot(df["seq_len"])
    print(f"min seq len = {min(df['seq_len'])}")
    print(f"max seq len = {max(df['seq_len'])}")
    plt.show()
    
    freq, bins = np.histogram(df["seq_len"], bins=n_bins)
    n = df.shape[0]
    hist_map = []
    for i in range(n_bins):
        hist_map.append({"start": bins[i], "end":bins[i+1], "count": freq[i], "percentage": freq[i]/n*100})
    hist_df = pd.DataFrame(hist_map)
    print(hist_df)

# Coronaviridae S Protein UniRef90 - EMBL mapping Dataset Generation


### coronaviridae_s_uniref90_embl_hosts_pruned_metadata.csv

In [10]:
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata.csv")
df = pd.read_csv(file_path)
print(df.shape)
df.head()

(860, 10)


Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species


In [16]:
column_stats(df, "uniref90_id", "virus_host_name", "virus_host_tax_id")

Number of unique values = 155


Unnamed: 0,virus_host_name,virus_host_tax_id,virus_host_name_count,virus_host_name_percent
0,15 days old calf,0,1,0.116279
1,allactaga sibirica,234626,1,0.116279
2,anas gracilis,45630,1,0.116279
3,anas superciliosa,45632,3,0.348837
4,arenaria interpres (ruddy turnstone),0,1,0.116279
5,backyard chicken,0,1,0.116279
6,bandicota indica,456472,1,0.116279
7,bandicota savilei,456473,1,0.116279
8,bat,0,12,1.395349
9,bat bf_258i,0,1,0.116279


### Manual cleanup of 
coronaviridae_s_uniref90_embl_hosts_pruned_metadata.csv 

**using**

coronaviridae_s_uniref90_embl_hosts_pruned_metadata_manual_mapping.csv

In [10]:
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata.csv")
df = pd.read_csv(file_path)
print(df.shape)
df.head()

(860, 10)


Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species


In [39]:
mapping_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/manual_mapping_files/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_manual_mapping.csv")
mapping_df = pd.read_csv(mapping_file_path, encoding="unicode_escape")
mapping_df["mapped_virus_host_name"] = mapping_df["mapped_virus_host_name"].apply(lambda x: unicodedata.normalize("NFKD", x))
print(mapping_df.shape)
mapping_df.head()

(54, 3)


Unnamed: 0,virus_host_name,mapped_virus_host_name,mapped_virus_host_tax_id
0,arenaria interpres (ruddy turnstone),arenaria interpres,54971
1,backyard chicken,gallus gallus,9031
2,bovine,bos taurus,9913
3,broiler,gallus gallus,9031
4,broiler chicken,gallus gallus,9031


In [40]:
name_name_map = mapping_df.set_index("virus_host_name")["mapped_virus_host_name"].to_dict()
name_name_map

{'arenaria interpres (ruddy turnstone)': 'arenaria interpres',
 'backyard chicken': 'gallus gallus',
 'bovine': 'bos taurus',
 'broiler': 'gallus gallus',
 'broiler chicken': 'gallus gallus',
 'broiler chickens': 'gallus gallus',
 'cardioderma cor (bat)': 'cardioderma cor',
 'cat': 'felis catus',
 'cattle': 'bos taurus',
 'chaerephon pumilus (bat)': 'mops pumilus',
 'chaerephon sp. (bat)': 'chaerephon sp.',
 'chicken': 'gallus gallus',
 'chicken (broiler)': 'gallus gallus',
 'chicken-broiler': 'gallus gallus',
 'chlamydotis sp. (houbara)': 'chlamydotis undulata',
 'common moorhen': 'gallinula chloropus',
 'delphinapterus leucas (beluga whale)': 'delphinapterus leucas',
 'dog': 'canis lupus',
 'eidolon helvum (bat)': 'eidolon helvum',
 'eptesicus fuscus (big brown bat)': 'eptesicus fuscus',
 'felis catus (cat)': 'felis catus',
 'ferret': 'mustela putorius',
 'galliformes sp. (quail)': 'galliformes sp.',
 'gallus domesticus': 'gallus gallus',
 'gallus gallus (48 week-old breeder layers)'

In [41]:
name_id_map = mapping_df.set_index("virus_host_name")["mapped_virus_host_tax_id"].to_dict()
name_id_map

{'arenaria interpres (ruddy turnstone)': 54971,
 'backyard chicken': 9031,
 'bovine': 9913,
 'broiler': 9031,
 'broiler chicken': 9031,
 'broiler chickens': 9031,
 'cardioderma cor (bat)': 270764,
 'cat': 9685,
 'cattle': 9913,
 'chaerephon pumilus (bat)': 242384,
 'chaerephon sp. (bat)': 3075048,
 'chicken': 9031,
 'chicken (broiler)': 9031,
 'chicken-broiler': 9031,
 'chlamydotis sp. (houbara)': 172680,
 'common moorhen': 9123,
 'delphinapterus leucas (beluga whale)': 9749,
 'dog': 9612,
 'eidolon helvum (bat)': 77214,
 'eptesicus fuscus (big brown bat)': 29078,
 'felis catus (cat)': 9685,
 'ferret': 9668,
 'galliformes sp. (quail)': 1857023,
 'gallus domesticus': 9031,
 'gallus gallus (48 week-old breeder layers)': 9031,
 'gallus gallus (broiler)': 9031,
 'gallus gallus (chicken)': 9031,
 'gallus gallus domesticus (chicken)': 9031,
 'goat': 9925,
 'guinea fowl': 8997,
 'homo sapiens sample 4310600038': 9606,
 'horse': 9796,
 'hydropotes inermis (water deer)': 9883,
 'kennelled dog':

In [54]:
name_keys = list(name_name_map.keys())
name_keys

['arenaria interpres (ruddy turnstone)',
 'backyard chicken',
 'bovine',
 'broiler',
 'broiler chicken',
 'broiler chickens',
 'cardioderma cor (bat)',
 'cat',
 'cattle',
 'chaerephon pumilus (bat)',
 'chaerephon sp. (bat)',
 'chicken',
 'chicken (broiler)',
 'chicken-broiler',
 'chlamydotis sp. (houbara)',
 'common moorhen',
 'delphinapterus leucas (beluga whale)',
 'dog',
 'eidolon helvum (bat)',
 'eptesicus fuscus (big brown bat)',
 'felis catus (cat)',
 'ferret',
 'galliformes sp. (quail)',
 'gallus domesticus',
 'gallus gallus (48 week-old breeder layers)',
 'gallus gallus (broiler)',
 'gallus gallus (chicken)',
 'gallus gallus domesticus (chicken)',
 'goat',
 'guinea fowl',
 'homo sapiens sample 4310600038',
 'horse',
 'hydropotes inermis (water deer)',
 'kennelled dog',
 'layer chicken',
 'manis javanica (malayan pangolin)',
 'miniopterus fuliginosus, feces',
 'miniopterus fuliginosus, intestine',
 'miniopterus inflatus (bat)',
 'miniopterus natalensis (bat)',
 'miniopterus spp.

### Update order: rank, tax_id, name (order is important)
#### 1. Update virus_host_taxon_rank

In [58]:
df["virus_host_taxon_rank"] = df.apply(lambda x: "species" if x["virus_host_name"] in name_keys else x["virus_host_taxon_rank"], axis=1)

#### 2. Update virus_host_tax_id

In [59]:
df["virus_host_tax_id"] = df.apply(lambda x: name_id_map[x["virus_host_name"]] if x["virus_host_name"] in name_keys else x["virus_host_tax_id"], axis=1)

In [63]:
df[df["virus_host_name"].isin(name_keys)][["virus_host_name", "virus_host_tax_id", "virus_host_taxon_rank"]]

Unnamed: 0,virus_host_name,virus_host_tax_id,virus_host_taxon_rank
9,chicken,9031,species
10,dog,9612,species
11,chicken,9031,species
12,chicken,9031,species
13,dog,9612,species
14,delphinapterus leucas (beluga whale),9749,species
15,chicken,9031,species
20,red-whiskered bulbul,182897,species
23,broiler chicken,9031,species
24,white-rumped munia,40157,species


#### 3. Update virus_host_name

In [64]:
df["virus_host_name"] = df.apply(lambda x: name_name_map[x["virus_host_name"]] if x["virus_host_name"] in name_keys else x["virus_host_name"], axis=1)

In [66]:
df[["embl_host_name", "virus_host_name", "virus_host_tax_id", "virus_host_taxon_rank"]]

Unnamed: 0,embl_host_name,virus_host_name,virus_host_tax_id,virus_host_taxon_rank
0,['Homo sapiens'],homo sapiens,9606,species
1,['Homo sapiens'],homo sapiens,9606,species
2,['Homo sapiens'],homo sapiens,9606,species
3,['Homo sapiens'],homo sapiens,9606,species
4,['Homo sapiens'],homo sapiens,9606,species
5,['Homo sapiens'],homo sapiens,9606,species
6,['Homo sapiens'],homo sapiens,9606,species
7,['Homo sapiens'],homo sapiens,9606,species
8,['quail'],quail,0,
9,['chicken'],gallus gallus,9031,species


**Sequence Length Parameters**

In [73]:
np.percentile(non_idv_df["seq_len"].values, [90, 95, 99])

array([ 623.9 , 1035.35, 3011.  ])

**Number of host classes**

In [87]:
# 1% cutoff
filtered_df = filter_with_threshold(non_idv_df, "virus_host_name", 1)

Size of df = (47792, 11)
Number of unique values = 1304
                                        virus_host_name  virus_host_name_count  virus_host_name_percent
0                                          Homo sapiens                  28782                60.223468
1                                            Sus scrofa                   1286                 2.690827
2                             Hydrochoerus hydrochaeris                    627                 1.311935
3                                    Marmota himalayana                    539                 1.127804
4                                         Gallus gallus                    484                 1.012722
5                                       Cyprinus carpio                    411                 0.859977
6                                    Petroica australis                    408                 0.853699
7                                           Felis catus                    298                 0.623535
8       

In [88]:
output_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_t0.01_c5.csv")
filtered_df.to_csv(output_file_path, index=False)

In [89]:
# 0.7% cutoff
filtered_df = filter_with_threshold(non_idv_df, "virus_host_name", 0.7)

Size of df = (47792, 11)
Number of unique values = 1304
                                        virus_host_name  virus_host_name_count  virus_host_name_percent
0                                          Homo sapiens                  28782                60.223468
1                                            Sus scrofa                   1286                 2.690827
2                             Hydrochoerus hydrochaeris                    627                 1.311935
3                                    Marmota himalayana                    539                 1.127804
4                                         Gallus gallus                    484                 1.012722
5                                       Cyprinus carpio                    411                 0.859977
6                                    Petroica australis                    408                 0.853699
7                                           Felis catus                    298                 0.623535
8       

In [90]:
output_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_t0.007_c7.csv")
filtered_df.to_csv(output_file_path, index=False)

In [91]:
# 0.5% cutoff
filtered_df = filter_with_threshold(non_idv_df, "virus_host_name", 0.5)

Size of df = (47792, 11)
Number of unique values = 1304
                                        virus_host_name  virus_host_name_count  virus_host_name_percent
0                                          Homo sapiens                  28782                60.223468
1                                            Sus scrofa                   1286                 2.690827
2                             Hydrochoerus hydrochaeris                    627                 1.311935
3                                    Marmota himalayana                    539                 1.127804
4                                         Gallus gallus                    484                 1.012722
5                                       Cyprinus carpio                    411                 0.859977
6                                    Petroica australis                    408                 0.853699
7                                           Felis catus                    298                 0.623535
8       

In [92]:
output_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/uniref90/20240131/uniref90_viridae_embl_hosts_pruned_metadata_species_vertebrates_w_seq_non_idv_t0.005_c13.csv")
filtered_df.to_csv(output_file_path, index=False)