In [None]:
import re
from pathlib import Path
import pandas as pd
from generalutils import read_fasta, read_metadata, write_fasta

In [None]:
# Load the PV1 metadata table. The relative location is turned into an absolute
# path (resolved against the current working directory) before it is handed to
# the project's metadata reader.
pv1_meta_path = Path("../../PV1 Data/PV1_meta_2020-11-23.tsv").resolve()
meta_df = read_metadata(pv1_meta_path)
meta_df.head()

In [None]:
# Number of missing values in each metadata column (summing down the rows).
meta_df.isna().sum(axis=0)

In [None]:
# Show every metadata row that has at least one missing value in any column.
meta_df.loc[meta_df.isna().any(axis="columns")]

In [None]:
# Subset of the metadata whose Species is exactly "Betacoronavirus 1".
# Rows with a missing Species compare unequal and are therefore excluded.
is_betacov1 = meta_df["Species"].eq("Betacoronavirus 1")
betacov1 = meta_df.loc[is_betacov1]
betacov1.head()

In [None]:
# Subset of the metadata whose Species is exactly
# "Severe acute respiratory syndrome-related coronavirus".
is_sars_related = meta_df["Species"].eq(
    "Severe acute respiratory syndrome-related coronavirus"
)
sars_related_cov = meta_df.loc[is_sars_related]
sars_related_cov.head()

In [None]:
# Stack the two species subsets row-wise (Betacoronavirus 1 rows first) and
# renumber the index from 0 so the original meta_df row labels are discarded.
species_subsets = [betacov1, sars_related_cov]
merged_df = pd.concat(species_subsets, axis=0, ignore_index=True)

In [None]:
def _trim_oxx_ids(match: "re.Match[str]") -> str:
    """Rebuild one ``OXX=...`` field with every entry cut at its first underscore.

    ``match`` carries the literal ``OXX=`` tag in group 1 and the run of
    non-whitespace characters that follows it in group 2. Group 2 is split on
    commas, each entry keeps only the text before its first ``_`` (entries
    without an underscore are kept whole), and the entries are re-joined with
    commas behind the unchanged tag.
    """
    tag, id_list = match.group(1), match.group(2)
    trimmed_ids = [entry.split("_", 1)[0] for entry in id_list.split(",")]
    return tag + ",".join(trimmed_ids)


# Rewrite every "OXX=" field inside FullName using the callback above; text
# outside those fields is left untouched.
merged_df["FullName"] = merged_df["FullName"].str.replace(
    r"(\bOXX=)([^\s]+)", _trim_oxx_ids, regex=True
)
merged_df.head()

In [None]:
# Persist the combined two-species metadata (without the index column) as CSV.
merged_metadata_path = Path("./data/sars_rel_cov_and_betacov1_metadata.csv")
# `DataFrame.to_csv` does not create missing directories; make sure the output
# folder exists so a run from a fresh checkout does not fail with an OSError.
merged_metadata_path.parent.mkdir(parents=True, exist_ok=True)
merged_df.to_csv(merged_metadata_path, index=False)

In [None]:
# Reload the two-species metadata from the CSV on disk and preview it.
two_species_csv = Path("./data/sars_rel_cov_and_betacov1_metadata.csv")
two_species_df = pd.read_csv(two_species_csv)
two_species_df.head()

In [None]:
# How many reloaded rows are labelled "Betacoronavirus 1".
two_species_df["Species"].eq("Betacoronavirus 1").sum()

In [None]:
# How many reloaded rows are labelled as SARS-related coronavirus.
two_species_df["Species"].eq(
    "Severe acute respiratory syndrome-related coronavirus"
).sum()

In [None]:
# Read the full-design target FASTA through the project's reader, passing
# full_name=True. The relative path is made absolute first.
target_fasta_path = Path(
    "../../PV1 Data/TargetSequences/fulldesign_2019-02-27_wGBKsw.fasta"
).resolve()
fasta_df = read_fasta(target_fasta_path, full_name=True)
fasta_df.head()

In [None]:
# Distinct, non-missing FullName values from the two-species metadata.
names = set(two_species_df["FullName"].dropna())

# Keep only the FASTA records whose FullName is one of those names; the copy
# makes `matched` independent of `fasta_df`.
name_is_selected = fasta_df["FullName"].isin(names)
matched = fasta_df.loc[name_is_selected].copy()
matched.head()

In [None]:
# (rows, columns) of the matched FASTA records — a quick check of how many sequences were selected.
matched.shape

In [None]:
# Write the matched records out through the project's FASTA writer.
# NOTE(review): the "913" in the file name is presumably the number of matched
# sequences — confirm against `matched.shape` before relying on it.
matched_fasta_path = Path("./data/sars_rel_cov_and_betacov1_913.fasta")
write_fasta(matched, matched_fasta_path)