In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
from ast import literal_eval
import unicodedata
import matplotlib.pyplot as plt
pd.set_option("display.max_rows", None)
pd.set_option('display.width', 1000)

In [2]:
def column_stats_new(df, id_column, name_column, tax_id_column, n=None):
    if n is None:
        n = df.shape[0]
    count_column = name_column + "_count"
    count_df = df[[name_column, tax_id_column, id_column]].groupby([name_column, tax_id_column]).count().reset_index()
    count_df.columns=[name_column, tax_id_column, count_column]
    count_df[name_column + "_percent"] = count_df[count_column].apply(lambda x: int(x)/n*100)
    print(f"Number of unique values = {len(df[name_column].unique())}")
    return count_df

In [3]:
def column_stats(df, column_name, n=None):
    if n is None:
        n = df.shape[0]
    count_column_name = column_name + "_count"
    count_df = pd.DataFrame(df[column_name].value_counts()).reset_index()
    count_df.columns=[column_name, count_column_name]
    count_df[column_name + "_percent"] = count_df[count_column_name].apply(lambda x: int(x)/n*100)
    print(f"Number of unique values = {len(df[column_name].unique())}")
    print(f"{count_df}")
    return count_df
    
def filter_with_threshold(df, column_name, threshold):
    print(f"Size of df = {df.shape}")
    n = df.shape[0]
    count_df = column_stats(df, column_name, n=n)
    percent_column_name = column_name + "_percent"
    filtered_count_df = count_df[count_df[percent_column_name] >= threshold]
    filtered_df = df[df[column_name].isin(list(filtered_count_df[column_name].values))]
    print(f"Size of filtered df = {filtered_df.shape}")
    column_stats(filtered_df, column_name, n=n)
    return filtered_df

In [4]:
def print_seq_len_histogram(df, n_bins):
    sns.histplot(df["seq_len"])
    print(f"min seq len = {min(df['seq_len'])}")
    print(f"max seq len = {max(df['seq_len'])}")
    plt.show()
    
    freq, bins = np.histogram(df["seq_len"], bins=n_bins)
    n = df.shape[0]
    hist_map = []
    for i in range(n_bins):
        hist_map.append({"start": bins[i], "end":bins[i+1], "count": freq[i], "percentage": freq[i]/n*100})
    hist_df = pd.DataFrame(hist_map)
    print(hist_df)

# Coronaviridae S Protein UniProt - EMBL mapping Dataset Generation
## Coronaviridae S Protein

### coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata.csv

In [31]:
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/uniprot/coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata.csv")
df = pd.read_csv(file_path)
print(df.shape)
df.head()

(3925, 10)


Unnamed: 0,uniprot_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,A0A6G8HJ43,28295,,QIM40843.1,['pig'],pig,Porcine epidemic diarrhea virus,species,0,
1,A0A0G3FCN8,28295,,AKJ85723.1,['swine'],swine,Porcine epidemic diarrhea virus,species,0,
2,A0A8B1JDD3,2697049,[9606],QTZ33356.1,['Homo sapiens'],homo sapiens,Severe acute respiratory syndrome coronavirus 2,no rank,9606,species
3,A0A6G8HJ63,28295,,QIM40847.1,['pig'],pig,Porcine epidemic diarrhea virus,species,0,
4,A0A0H4A793,28295,,AKN45969.1,['swine'],swine,Porcine epidemic diarrhea virus,species,0,


In [32]:
count_df = column_stats_new(df, "uniprot_id", "virus_host_name", "virus_host_tax_id")
count_df

Number of unique values = 133


Unnamed: 0,virus_host_name,virus_host_tax_id,virus_host_name_count,virus_host_name_percent
0,alpaca,0,1,0.025478
1,antelope,0,1,0.025478
2,apodemus agrarius,39030,1,0.025478
3,apodemus peninsulae,105297,1,0.025478
4,aselliscus stoliczkanus,188568,1,0.025478
5,bat,0,33,0.840764
6,bat bf_258i,0,2,0.050955
7,bat bf_506i,0,2,0.050955
8,bos taurus,9913,17,0.433121
9,bovine,0,28,0.713376


In [33]:
count_df.to_csv(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/uniprot/coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata_cleanup.csv"), index=False)

### Manual cleanup of 
coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata.csv 

**using**

coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata_manual_mapping.csv

In [34]:
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/uniprot/coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata.csv")
df = pd.read_csv(file_path)
print(df.shape)
df.head()

(3925, 10)


Unnamed: 0,uniprot_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,A0A6G8HJ43,28295,,QIM40843.1,['pig'],pig,Porcine epidemic diarrhea virus,species,0,
1,A0A0G3FCN8,28295,,AKJ85723.1,['swine'],swine,Porcine epidemic diarrhea virus,species,0,
2,A0A8B1JDD3,2697049,[9606],QTZ33356.1,['Homo sapiens'],homo sapiens,Severe acute respiratory syndrome coronavirus 2,no rank,9606,species
3,A0A6G8HJ63,28295,,QIM40847.1,['pig'],pig,Porcine epidemic diarrhea virus,species,0,
4,A0A0H4A793,28295,,AKN45969.1,['swine'],swine,Porcine epidemic diarrhea virus,species,0,


In [35]:
mapping_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/manual_mapping_files/coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata_manual_mapping.csv")
mapping_df = pd.read_csv(mapping_file_path, encoding="unicode_escape")
mapping_df["mapped_virus_host_name"] = mapping_df["mapped_virus_host_name"].apply(lambda x: unicodedata.normalize("NFKD", x))
print(mapping_df.shape)
mapping_df.head()

(54, 3)


Unnamed: 0,virus_host_name,mapped_virus_host_tax_id,mapped_virus_host_name
0,alpaca,30538,Vicugna pacos
1,bovine,9913,Bos taurus
2,broiler chicken,9031,Gallus gallus
3,broilers,9031,Gallus gallus
4,cat,9685,Felis catus


In [36]:
name_name_map = mapping_df.set_index("virus_host_name")["mapped_virus_host_name"].to_dict()
name_name_map

{'alpaca': 'Vicugna pacos',
 'bovine': 'Bos taurus',
 'broiler chicken': 'Gallus gallus',
 'broilers': 'Gallus gallus',
 'cat': 'Felis catus',
 'cattle': 'Bos taurus',
 'chaerephon plicata': 'Chaerephon plicatus',
 'chicken': 'Gallus gallus',
 'dog': 'Canis lupus',
 'domestic cat': 'Felis catus',
 'domestic pig': 'Sus scrofa',
 'dromedary': 'Camelus dromedarius',
 'dromedary camel': 'Camelus dromedarius',
 'duck': 'Anas platyrhynchos',
 'eidolon helvum (bat)': 'Eidolon helvum',
 'equus caballus adult': 'Equus caballus',
 'felis catus (cat)': 'Felis catus',
 'finishing pig': 'Sus scrofa',
 'giant panda': 'Ailuropoda melanoleuca',
 'giraffe': 'Giraffa camelopardalis',
 'himalayan tahr': 'Hemitragus jemlahicus',
 'homo sapiens sample 4310600064': 'Homo sapiens',
 'homo sapiens sample 4310600157': 'Homo sapiens',
 'homo sapiens sample 4310600253': 'Homo sapiens',
 'homo sapiens sample 4310800120': 'Homo sapiens',
 'hydropotes inermis (water deer)': 'Hydropotes inermis',
 'manis javanica (m

In [37]:
name_id_map = mapping_df.set_index("virus_host_name")["mapped_virus_host_tax_id"].to_dict()
name_id_map

{'alpaca': 30538,
 'bovine': 9913,
 'broiler chicken': 9031,
 'broilers': 9031,
 'cat': 9685,
 'cattle': 9913,
 'chaerephon plicata': 478698,
 'chicken': 9031,
 'dog': 9612,
 'domestic cat': 9685,
 'domestic pig': 9823,
 'dromedary': 9838,
 'dromedary camel': 9838,
 'duck': 8839,
 'eidolon helvum (bat)': 77214,
 'equus caballus adult': 9796,
 'felis catus (cat)': 9685,
 'finishing pig': 9823,
 'giant panda': 9646,
 'giraffe': 9894,
 'himalayan tahr': 37179,
 'homo sapiens sample 4310600064': 9606,
 'homo sapiens sample 4310600157': 9606,
 'homo sapiens sample 4310600253': 9606,
 'homo sapiens sample 4310800120': 9606,
 'hydropotes inermis (water deer)': 9883,
 'manis javanica (malayan pangolin)': 9974,
 'mouse': 10090,
 'mus musculus, severe combined immunedeficiency (scid), female, 6-8 weeks old, liver, sample id: e4m31': 10090,
 'myotis lucifugus (little brown bat)': 59463,
 'neoromicia capensis': 110938,
 'pig': 9823,
 'piglet': 9823,
 'porcine': 9823,
 'rat': 10116,
 'rattus norveg

In [38]:
name_keys = list(name_name_map.keys())
name_keys

['alpaca',
 'bovine',
 'broiler chicken',
 'broilers',
 'cat',
 'cattle',
 'chaerephon plicata',
 'chicken',
 'dog',
 'domestic cat',
 'domestic pig',
 'dromedary',
 'dromedary camel',
 'duck',
 'eidolon helvum (bat)',
 'equus caballus adult',
 'felis catus (cat)',
 'finishing pig',
 'giant panda',
 'giraffe',
 'himalayan tahr',
 'homo sapiens sample 4310600064',
 'homo sapiens sample 4310600157',
 'homo sapiens sample 4310600253',
 'homo sapiens sample 4310800120',
 'hydropotes inermis (water deer)',
 'manis javanica (malayan pangolin)',
 'mouse',
 'mus musculus, severe combined immunedeficiency (scid), female, 6-8 weeks old, liver, sample id: e4m31',
 'myotis lucifugus (little brown bat)',
 'neoromicia capensis',
 'pig',
 'piglet',
 'porcine',
 'rat',
 'rattus norvegicus (norway rat)',
 'rhinilophus ferrumequinum',
 'rhinolophus ferrumequinum (bat)',
 'rhinolophus hipposideros (bat)',
 'rhinolophus sp. (bat)',
 'sambar deer',
 'sucking piglet',
 'suckling pig',
 'sus scrofa (pig)',
 

### Update order: rank, tax_id, name (order is important)
#### 1. Update virus_host_taxon_rank

In [39]:
df["virus_host_taxon_rank"] = df.apply(lambda x: "species" if x["virus_host_name"] in name_keys else x["virus_host_taxon_rank"], axis=1)

#### 2. Update virus_host_tax_id

In [40]:
df["virus_host_tax_id"] = df.apply(lambda x: name_id_map[x["virus_host_name"]] if x["virus_host_name"] in name_keys else x["virus_host_tax_id"], axis=1)

In [41]:
df[df["virus_host_name"].isin(name_keys)][["virus_host_name", "virus_host_tax_id", "virus_host_taxon_rank"]]

Unnamed: 0,virus_host_name,virus_host_tax_id,virus_host_taxon_rank
0,pig,9823,species
1,swine,9823,species
3,pig,9823,species
4,swine,9823,species
7,pig,9823,species
8,swine,9823,species
9,manis javanica (malayan pangolin),9974,species
11,pig,9823,species
12,swine,9823,species
15,pig,9823,species


#### 3. Update virus_host_name

In [42]:
df["virus_host_name"] = df.apply(lambda x: name_name_map[x["virus_host_name"]] if x["virus_host_name"] in name_keys else x["virus_host_name"], axis=1)

In [43]:
df["virus_host_name"] = df["virus_host_name"].str.lower()
df[["embl_host_name", "virus_host_name", "virus_host_tax_id", "virus_host_taxon_rank"]]

Unnamed: 0,embl_host_name,virus_host_name,virus_host_tax_id,virus_host_taxon_rank
0,['pig'],sus scrofa,9823,species
1,['swine'],sus scrofa,9823,species
2,['Homo sapiens'],homo sapiens,9606,species
3,['pig'],sus scrofa,9823,species
4,['swine'],sus scrofa,9823,species
5,['Rhinolophus affinis'],rhinolophus affinis,59477,species
6,['Homo sapiens'],homo sapiens,9606,species
7,['pig'],sus scrofa,9823,species
8,['swine'],sus scrofa,9823,species
9,['Manis javanica (Malayan pangolin)'],manis javanica,9974,species


In [44]:
print(df.shape)
df.head()

(3925, 10)


Unnamed: 0,uniprot_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank
0,A0A6G8HJ43,28295,,QIM40843.1,['pig'],sus scrofa,Porcine epidemic diarrhea virus,species,9823,species
1,A0A0G3FCN8,28295,,AKJ85723.1,['swine'],sus scrofa,Porcine epidemic diarrhea virus,species,9823,species
2,A0A8B1JDD3,2697049,[9606],QTZ33356.1,['Homo sapiens'],homo sapiens,Severe acute respiratory syndrome coronavirus 2,no rank,9606,species
3,A0A6G8HJ63,28295,,QIM40847.1,['pig'],sus scrofa,Porcine epidemic diarrhea virus,species,9823,species
4,A0A0H4A793,28295,,AKN45969.1,['swine'],sus scrofa,Porcine epidemic diarrhea virus,species,9823,species


In [45]:
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/uniprot/coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata_corrected.csv")
df.to_csv(file_path, index=False)

### coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq.csv

Filter based on host prevalence

In [46]:
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/uniprot/coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq.csv")
df = pd.read_csv(file_path)
print(df.shape)
df.head()

(3781, 11)


Unnamed: 0,uniprot_id,tax_id,uniprot_host_tax_ids,embl_ref_id,embl_host_name,virus_host_name,virus_name,virus_taxon_rank,virus_host_tax_id,virus_host_taxon_rank,seq
0,A0A6G8HJ43,28295,,QIM40843.1,['pig'],sus scrofa,Porcine epidemic diarrhea virus,species,9823,species,MKSLTYFWLFLPVLSTLSLPQDVTRCSANTNFRRFFSKFNVQAPAV...
1,A0A0G3FCN8,28295,,AKJ85723.1,['swine'],sus scrofa,Porcine epidemic diarrhea virus,species,9823,species,MTPLIYFWLFLPVLLTLSLPQDVTRCQSTINFRRFFSKFNVQAPAV...
2,A0A8B1JDD3,2697049,[9606],QTZ33356.1,['Homo sapiens'],homo sapiens,Severe acute respiratory syndrome coronavirus 2,no rank,9606,species,SQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPF...
3,A0A6G8HJ63,28295,,QIM40847.1,['pig'],sus scrofa,Porcine epidemic diarrhea virus,species,9823,species,MKSLTYFWLFLPVLSTLSLPQDVTRCQSTINFRRFFSKFNVQAPAV...
4,A0A0H4A793,28295,,AKN45969.1,['swine'],sus scrofa,Porcine epidemic diarrhea virus,species,9823,species,MKSLTYFWLFLPVLSTLSLPQDVTRCSANTNFRRFFSKFNVQAPAV...


In [47]:
column_stats(df, "virus_host_name")

Number of unique values = 77
              virus_host_name  virus_host_name_count  virus_host_name_percent
0                  sus scrofa                   1626                43.004496
1                homo sapiens                   1491                39.434012
2                 felis catus                    104                 2.750595
3         camelus dromedarius                    103                 2.724147
4                  bos taurus                     83                 2.195186
5               gallus gallus                     66                 1.745570
6                 canis lupus                     51                 1.348850
7               bos grunniens                     43                 1.137265
8          scotophilus kuhlii                     22                 0.581857
9         rhinolophus sinicus                     14                 0.370272
10               mus musculus                     10                 0.264480
11      tylonycteris pachypus      

Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,sus scrofa,1626,43.004496
1,homo sapiens,1491,39.434012
2,felis catus,104,2.750595
3,camelus dromedarius,103,2.724147
4,bos taurus,83,2.195186
5,gallus gallus,66,1.74557
6,canis lupus,51,1.34885
7,bos grunniens,43,1.137265
8,scotophilus kuhlii,22,0.581857
9,rhinolophus sinicus,14,0.370272


**Number of host classes**

In [48]:
# 1% cutoff
filtered_df = filter_with_threshold(df, "virus_host_name", 1)

Size of df = (3781, 11)
Number of unique values = 77
              virus_host_name  virus_host_name_count  virus_host_name_percent
0                  sus scrofa                   1626                43.004496
1                homo sapiens                   1491                39.434012
2                 felis catus                    104                 2.750595
3         camelus dromedarius                    103                 2.724147
4                  bos taurus                     83                 2.195186
5               gallus gallus                     66                 1.745570
6                 canis lupus                     51                 1.348850
7               bos grunniens                     43                 1.137265
8          scotophilus kuhlii                     22                 0.581857
9         rhinolophus sinicus                     14                 0.370272
10               mus musculus                     10                 0.264480
11      tyl

In [49]:
output_file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/uniprot/coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8.csv")
filtered_df["human_binary_label"] = filtered_df.apply(lambda x: "homo sapiens" if x["virus_host_name"] == "homo sapiens" else "NOT homo sapiens", axis=1)
filtered_df.to_csv(output_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["human_binary_label"] = filtered_df.apply(lambda x: "homo sapiens" if x["virus_host_name"] == "homo sapiens" else "NOT homo sapiens", axis=1)


## Coronaviridae - S protein - SARS-CoV-2 only

In [53]:
columns = ["uniprot_id", "seq", "virus_name", "virus_host_name", "human_binary_label"]
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/uniprot/coronaviridae_s_uniprot_uniprot_metadata_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8.csv")
df = pd.read_csv(file_path)
df = df[columns]
print(df.shape)
df.head()

(3567, 5)


Unnamed: 0,uniprot_id,seq,virus_name,virus_host_name,human_binary_label
0,A0A6G8HJ43,MKSLTYFWLFLPVLSTLSLPQDVTRCSANTNFRRFFSKFNVQAPAV...,Porcine epidemic diarrhea virus,sus scrofa,NOT homo sapiens
1,A0A0G3FCN8,MTPLIYFWLFLPVLLTLSLPQDVTRCQSTINFRRFFSKFNVQAPAV...,Porcine epidemic diarrhea virus,sus scrofa,NOT homo sapiens
2,A0A8B1JDD3,SQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPF...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
3,A0A6G8HJ63,MKSLTYFWLFLPVLSTLSLPQDVTRCQSTINFRRFFSKFNVQAPAV...,Porcine epidemic diarrhea virus,sus scrofa,NOT homo sapiens
4,A0A0H4A793,MKSLTYFWLFLPVLSTLSLPQDVTRCSANTNFRRFFSKFNVQAPAV...,Porcine epidemic diarrhea virus,sus scrofa,NOT homo sapiens


In [54]:
column_stats(df, "virus_name")

Number of unique values = 58
                                           virus_name  virus_name_count  virus_name_percent
0                     Porcine epidemic diarrhea virus              1544           43.285674
1     Severe acute respiratory syndrome coronavirus 2              1018           28.539389
2   Middle East respiratory syndrome-related coron...               267            7.485282
3                              Human coronavirus OC43               182            5.102327
4                                  Feline coronavirus                83            2.326885
5                                  Bovine coronavirus                76            2.130642
6                         Infectious bronchitis virus                62            1.738155
7                                  Canine coronavirus                51            1.429773
8                              Human coronavirus NL63                45            1.261564
9                                     Yak coronavir

Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Porcine epidemic diarrhea virus,1544,43.285674
1,Severe acute respiratory syndrome coronavirus 2,1018,28.539389
2,Middle East respiratory syndrome-related coron...,267,7.485282
3,Human coronavirus OC43,182,5.102327
4,Feline coronavirus,83,2.326885
5,Bovine coronavirus,76,2.130642
6,Infectious bronchitis virus,62,1.738155
7,Canine coronavirus,51,1.429773
8,Human coronavirus NL63,45,1.261564
9,Yak coronavirus,43,1.205495


In [55]:
sarscov2_df = df[df["virus_name"] == "Severe acute respiratory syndrome coronavirus 2"]
print(sarscov2_df.shape)
sarscov2_df.head()

(1018, 5)


Unnamed: 0,uniprot_id,seq,virus_name,virus_host_name,human_binary_label
2,A0A8B1JDD3,SQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPF...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
5,A0A8B1JDG7,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
8,A0A8B1JDH2,MFVFLVLLPLVSIQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
11,A0A8B1JDK8,SQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPF...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
14,A0A8B1JDP8,VSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFL...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens


In [56]:
column_stats(sarscov2_df, "virus_host_name")

Number of unique values = 2
  virus_host_name  virus_host_name_count  virus_host_name_percent
0    homo sapiens                   1017                99.901768
1     felis catus                      1                 0.098232


Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,homo sapiens,1017,99.901768
1,felis catus,1,0.098232


In [58]:
sarscov2_df = sarscov2_df[sarscov2_df["virus_host_name"] == "homo sapiens"]
column_stats(sarscov2_df, "virus_host_name")

Number of unique values = 1
  virus_host_name  virus_host_name_count  virus_host_name_percent
0    homo sapiens                   1017                    100.0


Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,homo sapiens,1017,100.0


#### Add WIV04 to the dataset for alignment reference

In [61]:
wiv04_df = pd.read_csv(os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/wiv04/sarscov2-S-WIV04ref.csv"))
wiv04_df.rename(columns={"uniref90_id": "uniprot_id"}, inplace=True)
wiv04_df = wiv04_df[columns]
wiv04_df.head()

Unnamed: 0,uniprot_id,seq,virus_name,virus_host_name,human_binary_label
0,WIV04,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,WIV04(MN996528.1) Wuhan variant index virus,homo sapiens,homo sapiens


In [62]:
sarscov2_df = pd.concat([wiv04_df, sarscov2_df])
print(sarscov2_df.shape)
sarscov2_df.head()

(1018, 5)


Unnamed: 0,uniprot_id,seq,virus_name,virus_host_name,human_binary_label
0,WIV04,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,WIV04(MN996528.1) Wuhan variant index virus,homo sapiens,homo sapiens
2,A0A8B1JDD3,SQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPF...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
5,A0A8B1JDG7,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
8,A0A8B1JDH2,MFVFLVLLPLVSIQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
11,A0A8B1JDK8,SQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPF...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens


In [65]:
file_path = os.path.join(os.getcwd(), "..", "..", "..","..", "input/data/coronaviridae/20240313/uniprot/sars_cov_2/coronaviridae_s_uniprot_sars_cov_2.csv")
sarscov2_df.to_csv(file_path, index=False)

In [66]:
df["human_binary_label"].value_counts()

NOT homo sapiens    2076
homo sapiens        1491
Name: human_binary_label, dtype: int64