In [None]:
from pathlib import Path
import re
import pandas as pd
import numpy as np

In [None]:
def read_fasta(fasta_path, full_name=False):
    """Parse a FASTA file into a DataFrame with one row per record.

    Blank lines are skipped and every line is stripped of surrounding
    whitespace. Sequence lines belonging to the same record are concatenated
    without separators.

    Parameters
    ----------
    fasta_path : str or path-like
        File to read (opened as UTF-8 text).
    full_name : bool, default False
        If False, each header must look like ``>ID=<id> AC=<ac> OXX=<oxx>``
        and is split into the ``ID``, ``AC`` and ``OXX`` columns.
        If True, the whole header (everything after the leading ``>``) is
        stored unparsed in a ``FullName`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``["ID", "AC", "OXX", "Sequence"]`` when ``full_name`` is
        False, otherwise ``["FullName", "Sequence"]``.

    Raises
    ------
    ValueError
        If a sequence line appears before the first header, or (only when
        ``full_name`` is False) a header does not match the expected format.
    """
    header_pattern = re.compile(r"^>ID=([^\s]+)\s+AC=([^\s]+)\s+OXX=([^\s]+)\s*$")
    columns = ["FullName", "Sequence"] if full_name else ["ID", "AC", "OXX", "Sequence"]
    rows = []
    curr = None

    def _finish(record):
        # Collapse the collected sequence chunks into one string and store the record.
        if record is not None:
            record["Sequence"] = "".join(record["Sequence"])
            rows.append(record)

    with open(fasta_path, "r", encoding="utf-8") as fasta:
        for raw in fasta:
            line = raw.strip()
            if not line:
                continue

            if not line.startswith(">"):
                if curr is None:
                    raise ValueError("Found sequence before any header")
                curr["Sequence"].append(line)
                continue

            # A new header closes the record that was being built.
            _finish(curr)

            if full_name:
                # Drop only the leading '>' marker. Splitting on '>' would cut
                # the name short whenever the header itself contains a '>'.
                curr = {"FullName": line[1:],
                        "Sequence": []}
            else:
                match_ = header_pattern.match(line)
                if not match_:
                    raise ValueError(f"Header does not match expected format: '{line}'")

                curr = {"ID": match_.group(1),
                        "AC": match_.group(2),
                        "OXX": match_.group(3),
                        "Sequence": []}

    # The last record has no following header to close it.
    _finish(curr)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Remove pandas' display limits on row count, column count and cell width
# so previews are rendered without truncation.
for _display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [None]:
# Read the tab-delimited metadata table and preview its first rows.
meta_path = Path("./PV1_meta_2020-11-23.tsv")
meta_df = pd.read_csv(meta_path, delimiter="\t")
meta_df.head()

In [None]:
# Distinct labels found in the "Category" column.
category_column = meta_df["Category"]
category_column.unique()

In [None]:
# Preview the rows whose Category is "SetCover".
meta_df.loc[meta_df["Category"].eq("SetCover")].head()

In [None]:
# Preview the rows whose Category is "PositiveControl".
meta_df.loc[meta_df["Category"].eq("PositiveControl")].head()

In [None]:
# Preview the rows whose Category is "NegativeControl".
meta_df.loc[meta_df["Category"].eq("NegativeControl")].head()

In [None]:
# Keep only the SetCover rows; positive and negative controls are ignored.
# .copy() makes setcover_df an independent frame. Later cells overwrite its
# AlignStart / AlignStop / FullName columns, and doing that on a boolean-mask
# slice of meta_df triggers pandas' SettingWithCopyWarning.
setcover_df = meta_df[meta_df["Category"] == "SetCover"].copy()
setcover_df.head()

In [None]:
# Parse the design FASTA, keeping each header (minus the leading '>') unparsed
# in a "FullName" column next to its sequence.
fasta_path = Path("./fulldesign_2019-02-27_wGBKsw.fasta")
fasta_df = read_fasta(fasta_path=fasta_path, full_name=True)
fasta_df.head()

In [None]:
# Capture the two underscore-separated integers that follow the OXX= value(s)
# in each FullName as AlignStart / AlignStop. Names that do not match the
# pattern yield NaN in both columns.
ALIGN_COORD_PATTERN = r"OXX=[^\s,]+(?:,[^\s,]+)*_(?P<AlignStart>\d+)_(?P<AlignStop>\d+)"
oxx_match = setcover_df["FullName"].str.extract(ALIGN_COORD_PATTERN)
oxx_match.head()

In [None]:
# Prefer the coordinates parsed out of FullName. Where nothing was parsed,
# fall back to the value already in the metadata column. The result is stored
# as nullable Int64 so missing values do not force a float dtype.
for coord_col in ("AlignStart", "AlignStop"):
    parsed_coord = pd.to_numeric(oxx_match[coord_col], errors="coerce")
    setcover_df[coord_col] = parsed_coord.fillna(setcover_df[coord_col]).astype("Int64")
setcover_df.head()

In [None]:
def _drop_oxx_suffixes(match):
    """Rebuild an ``OXX=...`` field, cutting each comma-separated entry at its first ``_``."""
    prefix, value = match.group(1), match.group(2)
    entries = [entry.split("_", 1)[0] for entry in value.split(",")]
    return prefix + ",".join(entries)


# Rewrite the OXX= field of every FullName without the "_..." suffixes.
# NOTE(review): presumably this makes the names match the FASTA headers used
# as the merge key further down — confirm against the FASTA file.
setcover_df["FullName"] = setcover_df["FullName"].str.replace(
    r"(\bOXX=)([^\s]+)", _drop_oxx_suffixes, regex=True
)
setcover_df.head()

In [None]:
# Look up each SetCover row's sequence in the FASTA table by FullName and
# store it in the "Protein" column. validate="m:1" makes pandas raise if a
# FullName occurs more than once in the FASTA table. With how="left", rows
# with no FASTA match are kept and get a missing Protein value.
merged = (
    setcover_df
    .merge(fasta_df[["FullName", "Sequence"]], on="FullName", how="left", validate="m:1")
    .assign(Protein=lambda frame: frame["Sequence"])
    .drop(columns=["Sequence"])
)
merged.head()

In [None]:
# Write the merged table to disk as a tab-separated file, without the index.
merged_path = Path("./PV1_meta_fixed.tsv")
merged.to_csv(path_or_buf=merged_path, sep="\t", index=False)