In [2]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
from ast import literal_eval
import unicodedata
import matplotlib.pyplot as plt
# Notebook-wide display settings: never truncate rows, and allow wide console output.
pd.options.display.max_rows = None
pd.options.display.width = 1000

In [3]:
def column_stats_new(df, id_column, name_column, tax_id_column, n=None):
    """Count records per (name, tax id) pair and express each count as a percentage.

    Args:
        df: DataFrame containing ``id_column``, ``name_column`` and ``tax_id_column``.
        id_column: Column whose non-null entries are counted inside each group.
        name_column: First grouping column; also the prefix of the generated
            ``<name_column>_count`` and ``<name_column>_percent`` columns.
        tax_id_column: Second grouping column.
        n: Denominator of the percentage. Defaults to the number of rows of ``df``.

    Returns:
        DataFrame with the columns ``[name_column, tax_id_column,
        <name_column>_count, <name_column>_percent]``.

    Side effects:
        Prints the number of distinct values found in ``name_column``.
    """
    total = df.shape[0] if n is None else n
    count_label = name_column + "_count"
    percent_label = name_column + "_percent"

    group_keys = [name_column, tax_id_column]
    grouped = df[[name_column, tax_id_column, id_column]].groupby(group_keys)
    stats = grouped.count().reset_index()
    stats.columns = [name_column, tax_id_column, count_label]

    def to_percent(group_size):
        # Share of the total represented by one group, in percent.
        return int(group_size) / total * 100

    stats[percent_label] = stats[count_label].apply(to_percent)
    print(f"Number of unique values = {len(df[name_column].unique())}")
    return stats

In [4]:
def column_stats(df, column_name, n=None):
    """Tabulate how often each value of ``column_name`` occurs, with percentages.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Column to summarise; also the prefix of the generated
            ``<column_name>_count`` and ``<column_name>_percent`` columns.
        n: Denominator of the percentage. Defaults to the number of rows of ``df``.

    Returns:
        DataFrame with the columns ``[column_name, <column_name>_count,
        <column_name>_percent]``, in the order produced by ``value_counts``.

    Side effects:
        Prints the number of distinct values and the full statistics table.
    """
    total = df.shape[0] if n is None else n
    count_label = column_name + "_count"
    percent_label = column_name + "_percent"

    occurrences = df[column_name].value_counts()
    stats = pd.DataFrame(occurrences).reset_index()
    stats.columns = [column_name, count_label]

    def to_percent(value_count):
        # Share of the total represented by one value, in percent.
        return int(value_count) / total * 100

    stats[percent_label] = stats[count_label].apply(to_percent)
    print(f"Number of unique values = {len(df[column_name].unique())}")
    print(f"{stats}")
    return stats
    
def filter_with_threshold(df, column_name, threshold):
    """Keep only the rows whose ``column_name`` value is frequent enough.

    A value is kept when its share of the rows of ``df`` (in percent, as computed
    by ``column_stats``) is greater than or equal to ``threshold``.

    Args:
        df: DataFrame to filter.
        column_name: Column whose value frequencies drive the filter.
        threshold: Minimum share, in percent of the rows of ``df``, a value needs
            in order to be kept.

    Returns:
        An independent copy of the rows of ``df`` whose ``column_name`` value
        passes the threshold.

    Side effects:
        Prints the frame sizes and the ``column_stats`` tables before and after
        filtering. The second table still uses the original row count as the
        denominator, so the percentages are comparable between the two tables.
    """
    print(f"Size of df = {df.shape}")
    n = df.shape[0]
    count_df = column_stats(df, column_name, n=n)
    percent_column_name = column_name + "_percent"
    frequent_values = count_df.loc[count_df[percent_column_name] >= threshold, column_name].tolist()
    # .copy() hands the caller a frame that is independent of `df`, so adding or
    # modifying columns on the result does not raise SettingWithCopyWarning.
    filtered_df = df[df[column_name].isin(frequent_values)].copy()
    print(f"Size of filtered df = {filtered_df.shape}")
    column_stats(filtered_df, column_name, n=n)
    return filtered_df

In [5]:
def print_seq_len_histogram(df, n_bins):
    """Plot the distribution of ``df["seq_len"]`` and print it as a binned table.

    Args:
        df: DataFrame with a numeric ``seq_len`` column.
        n_bins: Number of equal-width bins, used for both the figure and the table.

    Returns:
        None. The function shows a seaborn histogram and prints the minimum and
        maximum sequence length followed by a table with one row per bin
        (``start``, ``end``, ``count``, ``percentage`` of all rows).
    """
    seq_lens = df["seq_len"]

    # Pass n_bins to the plot as well, so the figure and the printed table
    # describe the same binning.
    sns.histplot(seq_lens, bins=n_bins)
    print(f"min seq len = {seq_lens.min()}")
    print(f"max seq len = {seq_lens.max()}")
    plt.show()

    freq, bins = np.histogram(seq_lens, bins=n_bins)
    n = df.shape[0]
    # np.histogram returns n_bins + 1 edges: consecutive pairs delimit each bin.
    hist_df = pd.DataFrame({
        "start": bins[:-1],
        "end": bins[1:],
        "count": freq,
        "percentage": freq / n * 100,
    })
    print(hist_df)

# Coronaviridae S Protein UniRef90 - EMBL mapping Dataset Generation
## Coronaviridae S Protein

### coronaviridae_s_uniref90_embl_hosts_pruned_metadata.csv

In [6]:
# Load the pruned host metadata; the data directory sits four levels above the working directory.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata.csv",
)
df = pd.read_csv(file_path)
print(df.shape)
df.head()

(860, 10)


Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species


In [7]:
# Per-host record counts with the host tax id kept next to each host name.
# column_stats_new is the helper that takes (id, name, tax id) columns;
# column_stats accepts a single column name and rejects this argument list.
column_stats_new(df, "uniref90_id", "virus_host_name", "virus_host_tax_id")

Number of unique values = 153


Unnamed: 0,virus_host_name,virus_host_tax_id,virus_host_name_count,virus_host_name_percent
0,15 days old calf,0,1,0.116279
1,allactaga sibirica,234626,1,0.116279
2,anas gracilis,45630,1,0.116279
3,anas superciliosa,45632,3,0.348837
4,arenaria interpres (ruddy turnstone),0,1,0.116279
5,backyard chicken,0,1,0.116279
6,bandicota indica,456472,1,0.116279
7,bandicota savilei,456473,1,0.116279
8,bat,0,12,1.395349
9,bat bf_258i,0,1,0.116279


### Manual cleanup of `coronaviridae_s_uniref90_embl_hosts_pruned_metadata.csv`

Host names are corrected **using** the manual mapping file
`coronaviridae_s_uniref90_embl_hosts_pruned_metadata_manual_mapping.csv`.

In [8]:
# Start the cleanup from a fresh read of the pruned host metadata.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata.csv",
)
df = pd.read_csv(file_path)
print(df.shape)
df.head()

(860, 10)


Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species


In [9]:
# Load the manually curated host-name mapping.
mapping_file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/coronaviridae/20240313/manual_mapping_files/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_manual_mapping.csv",
)
mapping_df = pd.read_csv(mapping_file_path, encoding="unicode_escape")
# NFKD-normalize the curated names through the vectorized string accessor: it applies the
# same Unicode normalization per value but leaves a blank (NaN) cell untouched instead of
# raising TypeError.
mapping_df["mapped_virus_host_name"] = mapping_df["mapped_virus_host_name"].str.normalize("NFKD")
print(mapping_df.shape)
mapping_df.head()

(54, 3)


Unnamed: 0,virus_host_name,mapped_virus_host_name,mapped_virus_host_tax_id
0,arenaria interpres (ruddy turnstone),arenaria interpres,54971
1,backyard chicken,gallus gallus,9031
2,bovine,bos taurus,9913
3,broiler,gallus gallus,9031
4,broiler chicken,gallus gallus,9031


In [10]:
# Lookup table: raw host name -> curated host name.
name_name_map = dict(zip(mapping_df["virus_host_name"], mapping_df["mapped_virus_host_name"]))
name_name_map

{'arenaria interpres (ruddy turnstone)': 'arenaria interpres',
 'backyard chicken': 'gallus gallus',
 'bovine': 'bos taurus',
 'broiler': 'gallus gallus',
 'broiler chicken': 'gallus gallus',
 'broiler chickens': 'gallus gallus',
 'cardioderma cor (bat)': 'cardioderma cor',
 'cat': 'felis catus',
 'cattle': 'bos taurus',
 'chaerephon pumilus (bat)': 'mops pumilus',
 'chaerephon sp. (bat)': 'chaerephon sp.',
 'chicken': 'gallus gallus',
 'chicken (broiler)': 'gallus gallus',
 'chicken-broiler': 'gallus gallus',
 'chlamydotis sp. (houbara)': 'chlamydotis undulata',
 'common moorhen': 'gallinula chloropus',
 'delphinapterus leucas (beluga whale)': 'delphinapterus leucas',
 'dog': 'canis lupus',
 'eidolon helvum (bat)': 'eidolon helvum',
 'eptesicus fuscus (big brown bat)': 'eptesicus fuscus',
 'felis catus (cat)': 'felis catus',
 'ferret': 'mustela putorius',
 'galliformes sp. (quail)': 'galliformes sp.',
 'gallus domesticus': 'gallus gallus',
 'gallus gallus (48 week-old breeder layers)'

In [11]:
# Lookup table: raw host name -> curated host tax id.
name_id_map = dict(zip(mapping_df["virus_host_name"], mapping_df["mapped_virus_host_tax_id"]))
name_id_map

{'arenaria interpres (ruddy turnstone)': 54971,
 'backyard chicken': 9031,
 'bovine': 9913,
 'broiler': 9031,
 'broiler chicken': 9031,
 'broiler chickens': 9031,
 'cardioderma cor (bat)': 270764,
 'cat': 9685,
 'cattle': 9913,
 'chaerephon pumilus (bat)': 242384,
 'chaerephon sp. (bat)': 3075048,
 'chicken': 9031,
 'chicken (broiler)': 9031,
 'chicken-broiler': 9031,
 'chlamydotis sp. (houbara)': 172680,
 'common moorhen': 9123,
 'delphinapterus leucas (beluga whale)': 9749,
 'dog': 9612,
 'eidolon helvum (bat)': 77214,
 'eptesicus fuscus (big brown bat)': 29078,
 'felis catus (cat)': 9685,
 'ferret': 9668,
 'galliformes sp. (quail)': 1857023,
 'gallus domesticus': 9031,
 'gallus gallus (48 week-old breeder layers)': 9031,
 'gallus gallus (broiler)': 9031,
 'gallus gallus (chicken)': 9031,
 'gallus gallus domesticus (chicken)': 9031,
 'goat': 9925,
 'guinea fowl': 8997,
 'homo sapiens sample 4310600038': 9606,
 'horse': 9796,
 'hydropotes inermis (water deer)': 9883,
 'kennelled dog':

In [12]:
# Raw host names that have a manual correction (iterating a dict yields its keys).
name_keys = list(name_name_map)
name_keys

['arenaria interpres (ruddy turnstone)',
 'backyard chicken',
 'bovine',
 'broiler',
 'broiler chicken',
 'broiler chickens',
 'cardioderma cor (bat)',
 'cat',
 'cattle',
 'chaerephon pumilus (bat)',
 'chaerephon sp. (bat)',
 'chicken',
 'chicken (broiler)',
 'chicken-broiler',
 'chlamydotis sp. (houbara)',
 'common moorhen',
 'delphinapterus leucas (beluga whale)',
 'dog',
 'eidolon helvum (bat)',
 'eptesicus fuscus (big brown bat)',
 'felis catus (cat)',
 'ferret',
 'galliformes sp. (quail)',
 'gallus domesticus',
 'gallus gallus (48 week-old breeder layers)',
 'gallus gallus (broiler)',
 'gallus gallus (chicken)',
 'gallus gallus domesticus (chicken)',
 'goat',
 'guinea fowl',
 'homo sapiens sample 4310600038',
 'horse',
 'hydropotes inermis (water deer)',
 'kennelled dog',
 'layer chicken',
 'manis javanica (malayan pangolin)',
 'miniopterus fuliginosus, feces',
 'miniopterus fuliginosus, intestine',
 'miniopterus inflatus (bat)',
 'miniopterus natalensis (bat)',
 'miniopterus spp.

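### Update order: rank, tax_id, name (order is important)

Each step selects the rows to correct by their original `virus_host_name`, so the name itself must be overwritten last; otherwise the rank and tax_id lookups would no longer find the rows.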
#### 1. Update virus_host_taxon_rank

In [13]:
# Every manually mapped host is recorded at species rank; other rows keep their rank.
df["virus_host_taxon_rank"] = [
    "species" if host_name in name_keys else taxon_rank
    for host_name, taxon_rank in zip(df["virus_host_name"], df["virus_host_taxon_rank"])
]

#### 2. Update virus_host_tax_id

In [14]:
# Replace the tax id of manually mapped hosts; other rows keep their tax id.
df["virus_host_tax_id"] = [
    name_id_map[host_name] if host_name in name_keys else tax_id
    for host_name, tax_id in zip(df["virus_host_name"], df["virus_host_tax_id"])
]

In [15]:
# Inspect the rows touched by the manual mapping.
is_mapped = df["virus_host_name"].isin(name_keys)
df.loc[is_mapped, ["virus_host_name", "virus_host_tax_id", "virus_host_taxon_rank"]]

Unnamed: 0,virus_host_name,virus_host_tax_id,virus_host_taxon_rank
9,chicken,9031,species
10,dog,9612,species
11,chicken,9031,species
12,chicken,9031,species
13,dog,9612,species
14,delphinapterus leucas (beluga whale),9749,species
15,chicken,9031,species
20,red-whiskered bulbul,182897,species
23,broiler chicken,9031,species
24,white-rumped munia,40157,species


#### 3. Update virus_host_name

In [16]:
# Replace raw host names that have a manual correction; other names are kept as they are.
df["virus_host_name"] = [
    name_name_map[host_name] if host_name in name_keys else host_name
    for host_name in df["virus_host_name"]
]

In [17]:
# Compare the original EMBL host annotation with the host columns after the updates.
host_columns = ["embl_host_name", "virus_host_name", "virus_host_tax_id", "virus_host_taxon_rank"]
df.loc[:, host_columns]

Unnamed: 0,embl_host_name,virus_host_name,virus_host_tax_id,virus_host_taxon_rank
0,['Homo sapiens'],homo sapiens,9606,species
1,['Homo sapiens'],homo sapiens,9606,species
2,['Homo sapiens'],homo sapiens,9606,species
3,['Homo sapiens'],homo sapiens,9606,species
4,['Homo sapiens'],homo sapiens,9606,species
5,['Homo sapiens'],homo sapiens,9606,species
6,['Homo sapiens'],homo sapiens,9606,species
7,['Homo sapiens'],homo sapiens,9606,species
8,['quail'],quail,0,
9,['chicken'],gallus gallus,9031,species


In [19]:
# Quick look at the frame before it is written out.
print(df.shape)
df.head(n=5)

(860, 10)


Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species


In [20]:
# Write the frame next to the input file, with a `_corrected` suffix.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected.csv",
)
df.to_csv(file_path, index=False)

### coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq.csv

Filter based on host prevalence

In [36]:
# Load the `..._corrected_species_virus_host_vertebrates_w_seq.csv` file.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq.csv",
)
df = pd.read_csv(file_path)
print(df.shape)
df.head()

(814, 11)


Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank,seq
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,AFAVIGDLKCPLDTSRKGSFNNKDTGPPFISTDTVDVTNGLGTYYV...
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDPKLKGSFNDRDTGPPSISTDTV...
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDPRLKGSFNDRDTGPPSISTDTV...
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDPKLKGSFNNRDTGPPSISTDTV...
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDTRLKGSFNNRDTGPPSISTDTV...


In [37]:
# Host prevalence: count and percentage of sequences per host.
column_stats(df, column_name="virus_host_name")

Number of unique values = 98
              virus_host_name  virus_host_name_count  virus_host_name_percent
0               gallus gallus                    332                40.786241
1                homo sapiens                    163                20.024570
2                 felis catus                     68                 8.353808
3                  sus scrofa                     54                 6.633907
4                 canis lupus                     33                 4.054054
5             rhinolophus sp.                     11                 1.351351
6            mustela putorius                     11                 1.351351
7         rhinolophus sinicus                      9                 1.105651
8      rhinolophus acuminatus                      5                 0.614251
9              eidolon helvum                      4                 0.491400
10            pan troglodytes                      4                 0.491400
11  rhinolophus ferrumequinum      

Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,gallus gallus,332,40.786241
1,homo sapiens,163,20.02457
2,felis catus,68,8.353808
3,sus scrofa,54,6.633907
4,canis lupus,33,4.054054
5,rhinolophus sp.,11,1.351351
6,mustela putorius,11,1.351351
7,rhinolophus sinicus,9,1.105651
8,rhinolophus acuminatus,5,0.614251
9,eidolon helvum,4,0.4914


**Number of host classes**

In [38]:
# 1% cutoff: the threshold is in percent, so keep hosts that account for at least 1% of the rows.
filtered_df = filter_with_threshold(df, column_name="virus_host_name", threshold=1)

Size of df = (814, 11)
Number of unique values = 98
              virus_host_name  virus_host_name_count  virus_host_name_percent
0               gallus gallus                    332                40.786241
1                homo sapiens                    163                20.024570
2                 felis catus                     68                 8.353808
3                  sus scrofa                     54                 6.633907
4                 canis lupus                     33                 4.054054
5             rhinolophus sp.                     11                 1.351351
6            mustela putorius                     11                 1.351351
7         rhinolophus sinicus                      9                 1.105651
8      rhinolophus acuminatus                      5                 0.614251
9              eidolon helvum                      4                 0.491400
10            pan troglodytes                      4                 0.491400
11  rhinolop

In [13]:
output_file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8.csv",
)
# Binary target: human host vs. any other host.
# `assign` returns a new frame, so the label is added without writing into a filtered
# selection of another frame (which raises SettingWithCopyWarning); `np.where` computes
# the label for the whole column at once instead of row by row.
is_human = filtered_df["virus_host_name"] == "homo sapiens"
filtered_df = filtered_df.assign(
    human_binary_label=np.where(is_human, "homo sapiens", "NOT homo sapiens")
)
filtered_df.to_csv(output_file_path, index=False)

## Coronaviridae - S protein - Human infecting

In [15]:
# Load the `..._w_seq_t0.01_c8.csv` dataset, which includes the `human_binary_label` column.
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/coronaviridae/20240313/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8.csv",
)
df = pd.read_csv(file_path)
print(df.shape)
df.head()

(681, 12)


Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank,seq,human_binary_label
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,AFAVIGDLKCPLDTSRKGSFNNKDTGPPFISTDTVDVTNGLGTYYV...,homo sapiens
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDPKLKGSFNDRDTGPPSISTDTV...,homo sapiens
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDPRLKGSFNDRDTGPPSISTDTV...,homo sapiens
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDPKLKGSFNNRDTGPPSISTDTV...,homo sapiens
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDTRLKGSFNNRDTGPPSISTDTV...,homo sapiens


In [16]:
# Distribution of virus names across the whole frame.
column_stats(df, column_name="virus_name")

Number of unique values = 37
                                           virus_name  virus_name_count  virus_name_percent
0                         Infectious bronchitis virus               271           39.794420
1     Severe acute respiratory syndrome coronavirus 2                93           13.656388
2                                  Feline coronavirus                63            9.251101
3                                   Avian coronavirus                60            8.810573
4                              Human coronavirus OC43                51            7.488987
5                     Porcine epidemic diarrhea virus                47            6.901615
6                                  Canine coronavirus                17            2.496329
7                                  Alphacoronavirus 1                15            2.202643
8   Swine acute diarrhea syndrome related coronavirus                 8            1.174743
9                                       Coronavirid

Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Infectious bronchitis virus,271,39.79442
1,Severe acute respiratory syndrome coronavirus 2,93,13.656388
2,Feline coronavirus,63,9.251101
3,Avian coronavirus,60,8.810573
4,Human coronavirus OC43,51,7.488987
5,Porcine epidemic diarrhea virus,47,6.901615
6,Canine coronavirus,17,2.496329
7,Alphacoronavirus 1,15,2.202643
8,Swine acute diarrhea syndrome related coronavirus,8,1.174743
9,Coronaviridae,8,1.174743


In [18]:
# Restrict to sequences whose host is human.
is_human_host = df["virus_host_name"] == "homo sapiens"
human_df = df.loc[is_human_host]
print(human_df.shape)
human_df.head()

(163, 12)


Unnamed: 0,uniref90_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank,seq,human_binary_label
0,UniRef90_A0A0H3VFW7,31631,[9606],AKA09662.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,AFAVIGDLKCPLDTSRKGSFNNKDTGPPFISTDTVDVTNGLGTYYV...,homo sapiens
1,UniRef90_A0A0K0KPB6,31631,[9606],AIL49495.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDPKLKGSFNDRDTGPPSISTDTV...,homo sapiens
2,UniRef90_A0A0K0KPM4,31631,[9606],AIL49520.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDPRLKGSFNDRDTGPPSISTDTV...,homo sapiens
3,UniRef90_A0A0K0KPR8,31631,[9606],AIL49509.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDPKLKGSFNNRDTGPPSISTDTV...,homo sapiens
4,UniRef90_A0A0K0L8I8,31631,[9606],AIV41891.1,['Homo sapiens'],homo sapiens,Human coronavirus OC43,no rank,9606,species,MFLILLISLPTAFAVIGDLNCPLDTRLKGSFNNRDTGPPSISTDTV...,homo sapiens


In [19]:
# Distribution of virus names among the human-host sequences only.
column_stats(human_df, column_name="virus_name")

Number of unique values = 10
                                          virus_name  virus_name_count  virus_name_percent
0    Severe acute respiratory syndrome coronavirus 2                93           57.055215
1                             Human coronavirus OC43                51           31.288344
2                             Human coronavirus HKU1                 5            3.067485
3                             Human coronavirus 229E                 3            1.840491
4  Middle East respiratory syndrome-related coron...                 3            1.840491
5                                      Coronaviridae                 3            1.840491
6                             Human coronavirus NL63                 2            1.226994
7                     unidentified human coronavirus                 1            0.613497
8                                  Betacoronavirus 1                 1            0.613497
9                Human coronavirus Feline-like Hu142         

Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Severe acute respiratory syndrome coronavirus 2,93,57.055215
1,Human coronavirus OC43,51,31.288344
2,Human coronavirus HKU1,5,3.067485
3,Human coronavirus 229E,3,1.840491
4,Middle East respiratory syndrome-related coron...,3,1.840491
5,Coronaviridae,3,1.840491
6,Human coronavirus NL63,2,1.226994
7,unidentified human coronavirus,1,0.613497
8,Betacoronavirus 1,1,0.613497
9,Human coronavirus Feline-like Hu142,1,0.613497


In [20]:
file_path = os.path.join(
    os.getcwd(),
    *([".."] * 4),
    "input/data/coronaviridae/20240313/human/coronaviridae_s_uniref90_human.csv",
)
# The file goes into a `human/` subdirectory; create it if needed so the write
# does not fail on a fresh checkout where the directory does not exist yet.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
human_df.to_csv(file_path, index=False)

In [21]:
# Class balance of the binary human / non-human label.
binary_labels = df["human_binary_label"]
binary_labels.value_counts()

NOT homo sapiens    518
homo sapiens        163
Name: human_binary_label, dtype: int64