In [1]:
import re
from pathlib import Path
import pandas as pd
from generalutils import read_fasta, read_metadata, write_fasta

In [None]:
# Load the PV1 metadata table through the project's read_metadata helper
# and preview the first rows.
meta_path = Path("../../PV1 Data/PV1_meta_2020-11-23.tsv").resolve()
meta_df = read_metadata(meta_path)
meta_df.head()

In [None]:
# Per-column count of missing values in the metadata.
meta_df.isnull().sum()

In [None]:
# Show every metadata row that has at least one missing value.
row_has_null = meta_df.isna().any(axis="columns")
meta_df.loc[row_has_null]

In [None]:
# Keep only the rows whose Species is exactly "Betacoronavirus 1".
is_betacov1 = meta_df["Species"].eq("Betacoronavirus 1")
betacov1 = meta_df.loc[is_betacov1]
betacov1.head()

In [None]:
# Keep only the rows whose Species is exactly the SARS-related coronavirus label.
is_sars_related = meta_df["Species"].eq("Severe acute respiratory syndrome-related coronavirus")
sars_related_cov = meta_df.loc[is_sars_related]
sars_related_cov.head()

In [None]:
# Stack the two species subsets (Betacoronavirus 1 first) and renumber the
# index from 0 so the combined frame has a clean RangeIndex.
species_frames = [betacov1, sars_related_cov]
merged_df = pd.concat(species_frames, ignore_index=True)

In [None]:
def _strip_oxx_suffixes(match):
    """Rebuild an ``OXX=`` field, truncating each comma-separated entry at its first ``_``.

    ``match`` is the regex match object pandas passes to the callback:
    group 1 is the literal ``OXX=`` prefix, group 2 is the run of
    non-whitespace characters that follows it.
    """
    prefix, entries = match.group(1), match.group(2)
    trimmed = [entry.partition("_")[0] for entry in entries.split(",")]
    return prefix + ",".join(trimmed)


# Rewrite the OXX field of every FullName, dropping anything from the first
# underscore onward in each entry.
# NOTE(review): presumably this makes the names comparable to the FASTA
# headers matched on FullName later in the notebook — confirm.
merged_df["FullName"] = merged_df["FullName"].str.replace(
    r"(\bOXX=)([^\s]+)", _strip_oxx_suffixes, regex=True
)
merged_df.head()

In [None]:
# Persist the combined two-species metadata as CSV (index column omitted).
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; otherwise this cell fails on a fresh
# checkout that has no ./data directory yet.
metadata_csv = Path("./data/sars_rel_cov_and_betacov1_metadata.csv")
metadata_csv.parent.mkdir(parents=True, exist_ok=True)
merged_df.to_csv(metadata_csv, index=False)

In [2]:
# Reload the saved two-species metadata from disk and preview it.
two_species_csv = Path("./data/sars_rel_cov_and_betacov1_metadata.csv")
two_species_df = pd.read_csv(two_species_csv)
two_species_df.head()

Unnamed: 0,CodeName,Category,SpeciesID,Species,FullName,Peptide,Encoding
0,PV1_038394,SetCover,694003,Betacoronavirus 1,"ID=SPIKE_CVHOC AC=P36334;Q66199;Q66290;Q66291;Q696Q6;Q6TNF9;Q86623 OXX=31631,694003,694002,11118",IDTTATSCQLYYNLPAANVSVSRFNPSTWN,ATCGACACCACCGCAACCTCCTGTCAGTTGTACTACAACCTGCCGGCTGCAAACGTCTCTGTATCCCGTTTTAACCCGTCTACGTGGAAC
1,PV1_038387,SetCover,694003,Betacoronavirus 1,"ID=SPIKE_CVHOC AC=P36334;Q66199;Q66290;Q66291;Q696Q6;Q6TNF9;Q86623 OXX=31631,694003,694002,11118",IFVEVNATYYNSWQNLLYDSNGNLYGFRDY,ATCTTTGTAGAAGTAAACGCAACCTACTACAACTCTTGGCAGAACCTGCTGTACGATTCCAACGGCAACCTGTACGGTTTTCGCGACTAC
2,PV1_038395,SetCover,694003,Betacoronavirus 1,"ID=SPIKE_CVP67 AC=Q8BB25 OXX=230237,694003,694002,11118",AQIDRLINGRLTALNAYVSQQLSDSTLVKF,GCTCAGATAGACCGCCTGATCAACGGCCGCCTGACTGCGCTGAATGCGTACGTTTCTCAACAGCTGTCTGACTCCACCCTGGTAAAATTT
3,PV1_039220,SetCover,694003,Betacoronavirus 1,"ID=B9TXV2_9BETC AC=B9TXV2 OXX=11128,694003,694002,11118",SASLFPPWTAAAGVPFYLNVQYRINGIGVT,TCTGCGTCCCTGTTCCCGCCGTGGACTGCAGCTGCTGGTGTACCGTTCTACCTGAACGTACAATACCGTATAAACGGTATTGGTGTTACA
4,PV1_038562,SetCover,694003,Betacoronavirus 1,"ID=SPIKE_CVBEN AC=Q91A26 OXX=233262,694003,694002,11118",GIGTCPAGTNYLTCHNAAQCDCLCTPDPIT,GGTATCGGTACGTGCCCTGCGGGTACCAACTATTTGACCTGTCATAACGCAGCACAGTGCGACTGTCTGTGCACTCCGGACCCGATCACC


In [None]:
# Number of rows labelled "Betacoronavirus 1".
two_species_df["Species"].eq("Betacoronavirus 1").sum()

In [None]:
# Number of rows labelled as SARS-related coronavirus.
two_species_df["Species"].eq("Severe acute respiratory syndrome-related coronavirus").sum()

In [3]:
# Read the full-design target sequences through the project's read_fasta
# helper (called with full_name=True) and preview the first records.
fasta_path = Path("../../PV1 Data/TargetSequences/fulldesign_2019-02-27_wGBKsw.fasta").resolve()
fasta_df = read_fasta(fasta_path, full_name=True)
fasta_df.head()

Unnamed: 0,FullName,Sequence
0,"ID=J9Z4E7_9ADEN AC=J9Z4E7 OXX=129951,129951,10509,10508",MSNSSNSTSLSNFSGIGVGVILTLVILFILILALLCLRVAACCTHVCTYCQLFKRWGQHPR
1,"ID=A8D0M1_ADE02 AC=A8D0M1 OXX=10515,129951,10509,10508",MALTCRLRFPVPGFRGRMHRRRGMAGHGLTGGMRRAHHRRRRASHRRMRGGILPLLIPLIAAAIGAVPGIASVALQAQRH
2,"ID=Q779F4_ADE02 AC=Q779F4 OXX=10515,129951,10509,10508",MIPRVLILLTLVALFCACSTLAAVAHIEVDCIPPFTVYLLYGFVTLILICSLVTVVIAFIQFIDWVCVRIAYLRHHPQYRDRTIADLLRIL
3,"ID=Q2KRZ6_ADE05 AC=Q2KRZ6 OXX=28285,129951,10509,10508",MIPRVFILLTLVALFCACSTLAAVSHIEVDCIPAFTVYLLYGFVTLTLICSLITVVIAFIQCIDWVCVRFAYLRHHPQYRDRTIAELLRIL
4,"ID=E1ARS2_9ADEN AC=E1ARS2 OXX=129951,129951,10509,10508",MIPRVLILLTLVALFCACSTLAAVAHIEVDCIPPFTVYLLYGFVTLILICSLVTVVIAFIQFIDWICVRIAYLRHHPQYRDRTIADLLRIL


In [4]:
# Collect the distinct non-null FullName values from the metadata, then keep
# an independent copy of the FASTA records whose FullName is one of them.
names = set(two_species_df["FullName"].dropna().unique())
in_metadata = fasta_df["FullName"].isin(names)
matched = fasta_df.loc[in_metadata].copy()
matched.head()

Unnamed: 0,FullName,Sequence
65131,"ID=Q76XZ7_CVHSA AC=Q76XZ7 OXX=230471,694009,694002,11118",MKLLIVLTCISLCSCICTVVQRCASNKPHVLEDPCKVQH
65132,"ID=A0A0U1WHK0_CVHSA AC=A0A0U1WHK0 OXX=1503301,694009,694002,11118",MNELTLIDFYLCFLAFLLFLVLIMLLIFWFSLEIQDIEEPCNKV
65133,"ID=A0A0U1WHH4_CVHSA AC=A0A0U1WHH4 OXX=1503299,694009,694002,11118",MNELTLIDFYLCFLAFLLFLVLIMLIIFWFSLELQDIEEPCNKVFETL
65134,"ID=B8Q8W5_CVHSA AC=B8Q8W5 OXX=511429,694009,694002,11118",MNELTLIDFYLCFLAFLLFLVLIMLIIFWFSLEIQDLEEPCTKVYHVLELSRSS
65135,"ID=Q0QDX5_CVHSA AC=Q0QDX5 OXX=347536,694009,694002,11118",MFHLVDFQVTIAEILIIIMKTFRVAIWNLDILISSIVRQLFKPLTKKKYSELDDEEPMELDYP


In [5]:
# (rows, columns) of the FASTA records whose FullName matched the metadata.
matched.shape

(913, 2)

In [6]:
# Write the matched records out through the project's write_fasta helper.
matched_fasta_path = Path("./data/sars_rel_cov_and_betacov1_913.fasta")
write_fasta(matched, matched_fasta_path)