In [1]:
! pip install biopython

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
from ast import literal_eval
import unicodedata
import matplotlib.pyplot as plt
from Bio import SeqIO
# Display settings: no limit on the number of rows printed for a DataFrame,
# and a display width of 1000 characters.
pd.options.display.max_rows = None
pd.options.display.width = 1000

In [3]:
import sys
import os
# Append the directories 5, 4, 3 and 2 levels above the current working
# directory to the module search path, in that order.
sys.path.extend(
    os.path.join(os.getcwd(), *[".."] * levels_up) for levels_up in (5, 4, 3, 2)
)
sys.path

['/home/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/datasets/generation',
 '/opt/conda/lib/python38.zip',
 '/opt/conda/lib/python3.8',
 '/opt/conda/lib/python3.8/lib-dynload',
 '',
 '/home/blessyantony/.local/lib/python3.8/site-packages',
 '/opt/conda/lib/python3.8/site-packages',
 '/opt/conda/lib/python3.8/site-packages/IPython/extensions',
 '/home/blessyantony/.ipython',
 '/home/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/datasets/generation/../../../../..',
 '/home/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/datasets/generation/../../../..',
 '/home/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/datasets/generation/../../..',
 '/home/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/datasets/generation/../..']

In [4]:
def column_stats(df, column_name, n=None):
    """Print and return the frequency table of one DataFrame column.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to summarise.
        n: Denominator for the percentage column. Defaults to the number of
            rows in ``df``.

    Returns:
        DataFrame with the columns ``column_name`` (distinct values as
        ordered by ``value_counts``), ``<column_name>_count`` and
        ``<column_name>_percent`` (``count / n * 100``).
    """
    if n is None:
        n = df.shape[0]
    count_column_name = column_name + "_count"
    percent_column_name = column_name + "_percent"

    count_df = df[column_name].value_counts().reset_index()
    # Rename both columns explicitly rather than relying on the names that
    # reset_index() assigns.
    count_df.columns = [column_name, count_column_name]
    # Vectorised arithmetic instead of a row-wise apply/lambda.
    count_df[percent_column_name] = count_df[count_column_name] / n * 100

    print(f"Number of unique values = {len(df[column_name].unique())}")
    print(f"{count_df}")
    return count_df

In [5]:
def print_seq_len_histogram(df, n_bins):
    """Plot the ``seq_len`` column and print a binned summary table.

    The seaborn plot is drawn without a ``bins`` argument, so its binning is
    independent of ``n_bins``; ``n_bins`` only controls the printed table.

    Args:
        df: DataFrame with a ``seq_len`` column.
        n_bins: Number of bins for the printed table.
    """
    seq_lens = df["seq_len"]

    sns.histplot(seq_lens)
    print(f"min seq len = {min(seq_lens)}")
    print(f"max seq len = {max(seq_lens)}")
    plt.show()

    counts, edges = np.histogram(seq_lens, bins=n_bins)
    total = df.shape[0]
    # One row per bin: [start, end) edges, raw count and share of all rows.
    rows = [
        {"start": lower, "end": upper, "count": count, "percentage": count / total * 100}
        for lower, upper, count in zip(edges[:-1], edges[1:], counts)
    ]
    print(pd.DataFrame(rows))

In [12]:
def parse_uniref_fasta_file(input_file_path):
    """Read a FASTA file into a DataFrame of record ids and sequences.

    Args:
        input_file_path: Path of the FASTA file to parse.

    Returns:
        DataFrame with one row per FASTA record and the columns
        ``uniref90_id`` (the record id) and ``seq`` (the sequence as a
        string). Both columns are present even when the file has no records.
    """
    print("START: Parsing fasta file")
    print(input_file_path)
    # Only the record id and the sequence are kept from each FASTA record.
    with open(input_file_path) as f:
        sequences = [
            {"uniref90_id": record.id, "seq": str(record.seq)}
            for record in SeqIO.parse(f, "fasta")
        ]

    print("END: Parsing fasta file")
    print(len(sequences))
    print(f"Number of records parsed = {len(sequences)}")
    # Explicit columns keep the schema stable for an empty file, so callers
    # can still rename and merge on these columns.
    return pd.DataFrame(sequences, columns=["uniref90_id", "seq"])
    

# Columns selected from the metadata CSV files read in this notebook.
columns = [
    "uniref90_id",
    "seq",
    "virus_name",
    "virus_host_name",
    "human_binary_label",
]


In [13]:
# Load the WIV04 reference record, restricted to the shared metadata columns.
# The host and label columns are used as stored; no Unicode normalisation
# is applied to them.
wiv04_sequence_file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/coronaviridae/20240313/wiv04/sarscov2-S-WIV04ref.csv",
)
wiv04_df = pd.read_csv(wiv04_sequence_file_path).loc[:, columns]
wiv04_df.head()

Unnamed: 0,uniref90_id,seq,virus_name,virus_host_name,human_binary_label
0,WIV04,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,WIV04(MN996528.1) Wuhan variant index virus,homo sapiens,homo sapiens


In [18]:
def get_aligned_dataset(unaligned_file_path, aligned_file_path):
    """Join aligned FASTA sequences with the metadata of the unaligned CSV.

    The unaligned CSV is reduced to the module-level ``columns`` and the
    module-level ``wiv04_df`` rows are appended to it. The aligned FASTA file
    is parsed with ``parse_uniref_fasta_file`` and its ``seq`` column is
    renamed to ``aligned_seq``. The aligned records are then left-joined to
    the metadata on ``uniref90_id``, so every aligned record is kept.

    Args:
        unaligned_file_path: Path of the CSV holding the unaligned sequences
            and their metadata.
        aligned_file_path: Path of the aligned FASTA file.

    Returns:
        The merged DataFrame.
    """
    print(unaligned_file_path)
    metadata_df = pd.read_csv(unaligned_file_path)[columns]
    print(f"unaligned_df size = {metadata_df.shape}")

    print("Adding wiv04 to unaligned dataset")
    metadata_df = pd.concat([metadata_df, wiv04_df])
    print(f"unaligned_df size = {metadata_df.shape}")

    alignment_df = parse_uniref_fasta_file(aligned_file_path)
    print(f"aligned_df size = {alignment_df.shape}")

    merged_df = alignment_df.rename(columns={"seq": "aligned_seq"}).merge(
        metadata_df[columns], how="left", on="uniref90_id"
    )
    print(f"merged df size = {merged_df.shape}")
    return merged_df

# Coronaviridae S Protein UniRef90 - EMBL mapping Aligned Dataset Generation
## Coronaviridae S Protein

### coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8_aligned.fasta

In [19]:
# Inputs: the unaligned CSV with sequences and metadata, and the aligned FASTA file.
unaligned_file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/coronaviridae/20240313/uniref/alignment/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8.csv",
)
aligned_file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/coronaviridae/20240313/uniref/alignment/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8_aligned.fasta",
)
df = get_aligned_dataset(
    unaligned_file_path=unaligned_file_path,
    aligned_file_path=aligned_file_path,
)
df.head()

/home/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/datasets/generation/../../../../input/data/coronaviridae/20240313/uniref/aligned/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8.csv
unaligned_df size = (681, 5)
Adding wiv04 to unaligned dataset
unaligned_df size = (682, 5)
START: Parsing fasta file
/home/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/datasets/generation/../../../../input/data/coronaviridae/20240313/uniref/aligned/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8_aligned.fasta
END: Parsing fasta file
682
Number of records parsed = 682
aligned_df size = (682, 2)
merged df size = (682, 6)


Unnamed: 0,uniref90_id,aligned_seq,seq,virus_name,virus_host_name,human_binary_label
0,WIV04,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,WIV04(MN996528.1) Wuhan variant index virus,homo sapiens,homo sapiens
1,UniRef90_A0A7U3RIT3,--------------MFVFLVLVPLVSS--------Q----------...,MFVFLVLVPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
2,UniRef90_A0A7U3HGG2,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
3,UniRef90_A0A7U3EEN6,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
4,UniRef90_A0A7U3HDM5,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens


In [20]:
# Write the merged aligned dataset to CSV, without the DataFrame index.
output_file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/coronaviridae/20240313/uniref/alignment/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8_aligned.csv",
)
df.to_csv(output_file_path, index=False)

In [21]:
# Host distribution of the aligned dataset (printed by the helper and also
# shown as the cell result).
column_stats(df, column_name="virus_host_name")

Number of unique values = 8
       virus_host_name  virus_host_name_count  virus_host_name_percent
0        gallus gallus                    332                48.680352
1         homo sapiens                    164                24.046921
2          felis catus                     68                 9.970674
3           sus scrofa                     54                 7.917889
4          canis lupus                     33                 4.838710
5      rhinolophus sp.                     11                 1.612903
6     mustela putorius                     11                 1.612903
7  rhinolophus sinicus                      9                 1.319648


Unnamed: 0,virus_host_name,virus_host_name_count,virus_host_name_percent
0,gallus gallus,332,48.680352
1,homo sapiens,164,24.046921
2,felis catus,68,9.970674
3,sus scrofa,54,7.917889
4,canis lupus,33,4.83871
5,rhinolophus sp.,11,1.612903
6,mustela putorius,11,1.612903
7,rhinolophus sinicus,9,1.319648


## Coronaviridae S protein - SARS-CoV-2 only
### Using `coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8_aligned.csv` as input

In [22]:
# Load the aligned coronaviridae S-protein CSV and report its shape.
cov_s_aligned_file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/coronaviridae/20240313/uniref/alignment/coronaviridae_s_uniref90_embl_hosts_pruned_metadata_corrected_species_virus_host_vertebrates_w_seq_t0.01_c8_aligned.csv",
)
cov_s_aligned_df = pd.read_csv(cov_s_aligned_file_path)
print(cov_s_aligned_df.shape)
cov_s_aligned_df.head()

(682, 6)


Unnamed: 0,uniref90_id,aligned_seq,seq,virus_name,virus_host_name,human_binary_label
0,WIV04,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,WIV04(MN996528.1) Wuhan variant index virus,homo sapiens,homo sapiens
1,UniRef90_A0A7U3RIT3,--------------MFVFLVLVPLVSS--------Q----------...,MFVFLVLVPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
2,UniRef90_A0A7U3HGG2,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
3,UniRef90_A0A7U3EEN6,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
4,UniRef90_A0A7U3HDM5,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens


In [23]:
# Virus-name distribution of the aligned dataset.
column_stats(cov_s_aligned_df, column_name="virus_name")

Number of unique values = 38
                                           virus_name  virus_name_count  virus_name_percent
0                         Infectious bronchitis virus               271           39.736070
1     Severe acute respiratory syndrome coronavirus 2                93           13.636364
2                                  Feline coronavirus                63            9.237537
3                                   Avian coronavirus                60            8.797654
4                              Human coronavirus OC43                51            7.478006
5                     Porcine epidemic diarrhea virus                47            6.891496
6                                  Canine coronavirus                17            2.492669
7                                  Alphacoronavirus 1                15            2.199413
8                                       Coronaviridae                 8            1.173021
9   Swine acute diarrhea syndrome related coronavir

Unnamed: 0,virus_name,virus_name_count,virus_name_percent
0,Infectious bronchitis virus,271,39.73607
1,Severe acute respiratory syndrome coronavirus 2,93,13.636364
2,Feline coronavirus,63,9.237537
3,Avian coronavirus,60,8.797654
4,Human coronavirus OC43,51,7.478006
5,Porcine epidemic diarrhea virus,47,6.891496
6,Canine coronavirus,17,2.492669
7,Alphacoronavirus 1,15,2.199413
8,Coronaviridae,8,1.173021
9,Swine acute diarrhea syndrome related coronavirus,8,1.173021


In [24]:
# Keep only the rows whose virus_name is one of the two names below
# (the SARS-CoV-2 entries and the WIV04 reference record).
sarscov2_virus_names = [
    "Severe acute respiratory syndrome coronavirus 2",
    "WIV04(MN996528.1) Wuhan variant index virus",
]
sarscov2_aligned_df = cov_s_aligned_df.loc[
    cov_s_aligned_df["virus_name"].isin(sarscov2_virus_names)
]
print(sarscov2_aligned_df.shape)
sarscov2_aligned_df.head()

(94, 6)


Unnamed: 0,uniref90_id,aligned_seq,seq,virus_name,virus_host_name,human_binary_label
0,WIV04,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,WIV04(MN996528.1) Wuhan variant index virus,homo sapiens,homo sapiens
1,UniRef90_A0A7U3RIT3,--------------MFVFLVLVPLVSS--------Q----------...,MFVFLVLVPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
2,UniRef90_A0A7U3HGG2,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
3,UniRef90_A0A7U3EEN6,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens
4,UniRef90_A0A7U3HDM5,--------------MFVFLVLLPLVSS--------Q----------...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,Severe acute respiratory syndrome coronavirus 2,homo sapiens,homo sapiens


0    1273
1    1273
2    1273
3    1273
4    1273
Name: seq_len, dtype: int64

In [25]:
# Write the SARS-CoV-2 subset of the aligned dataset to CSV, without the DataFrame index.
aligned_csv_file_path = os.path.join(
    os.getcwd(),
    "..", "..", "..", "..",
    "input/data/coronaviridae/20240313/sarscov2/aligned/uniref90/coronaviridae_s_uniref90_sars_cov_2_aligned.csv",
)
sarscov2_aligned_df.to_csv(aligned_csv_file_path, index=False)