In [28]:
from Bio import SeqIO
from pathlib import Path
from Bio.SeqRecord import SeqRecord
import pandas as pd
import re
import os


# Names handled here look like "CYP44LK4": the literal prefix "CYP", a run of
# digits (family), a run of upper-case letters (subfamily) and a final run of
# digits (gene).

# Whole name including the prefix. The quantifiers sit outside the character
# classes on purpose: "[\d+]" would match exactly ONE character that is a
# digit or a literal "+", not "one or more digits".
identifier = r"CYP\d+[A-Z]+\d+"
# Same name without the prefix; the lookbehind requires "CYP" but does not
# include it in the match.
pattern_id = re.compile(r"(?<=CYP)[\d]+[A-Z]+[\d]+")
# The next three patterns are applied to the output of pattern_id.
pattern_fam = re.compile(r"^[\d]+")  # leading digits
pattern_subfam = re.compile(r"[A-Z]+")  # first run of upper-case letters
pattern_gene = re.compile(r"[\d]+$")  # trailing digits

# Smoke check of every pattern on one known name.
test = "CYP44LK4"
print(re.search(pattern_id, test).group())
test = "44LK4"
print(re.search(pattern_fam, test).group())
print(re.search(pattern_subfam, test).group())
print(re.search(pattern_gene, test).group())


def match_or_none(pattern, s):
    """Return the text of the first match of ``pattern`` in ``s``, or None.

    Args:
        pattern: A regular expression, either a pattern string or a compiled
            pattern, as accepted by ``re.search``.
        s: The value to search. Values that ``re.search`` cannot search
            (for example ``None`` or a float) are treated as "no match".

    Returns:
        The matched substring (``Match.group()``), or ``None`` when there is
        no match or ``s`` is not searchable.

    Raises:
        re.error: If ``pattern`` is a string that is not a valid regular
            expression. This is a programming error and is not suppressed.
    """
    try:
        match = re.search(pattern, s)
    except TypeError:
        # Non-string input (e.g. a None produced by an earlier failed match).
        return None
    if match is None:
        return None
    return match.group()

44LK4
44
LK
4


In [29]:
# Base data directory.
data_dir = Path("/hits/fast/cme/bodynems/data")

# Parse the FASTA file and materialise all records as a list.
fasta_records = SeqIO.parse(data_dir / "backup" / "arthropod.fasta", format="fasta")
seqs: list[SeqRecord] = list(fasta_records)
print(len(seqs))

# One [record id, sequence length] row per record.
stats = []
for seq in seqs:
    row = [seq.id, len(seq)]
    stats.append(row)

stats_df = pd.DataFrame(data=stats, columns=["id_full", "length"])

11118


In [30]:
# Extract the name that follows "CYP" from each full record id, then split it
# into family / subfamily / gene. Anything that does not match becomes None.
stats_df["id"] = stats_df["id_full"].map(lambda s: match_or_none(pattern_id, s))
stats_df["family"] = stats_df["id"].map(lambda s: match_or_none(pattern_fam, s))
stats_df["subfamily"] = stats_df["id"].map(lambda s: match_or_none(pattern_subfam, s))
stats_df["gene"] = stats_df["id"].map(lambda s: match_or_none(pattern_gene, s))

# Keep only families that occur at least this many times.
MIN_FAMILY_SIZE = 5
family_counts = stats_df["family"].value_counts()
families = family_counts[family_counts >= MIN_FAMILY_SIZE].index

# Record id -> family (stats_df rows are in the same order as seqs).
id_map = dict(zip([seq.id for seq in seqs], stats_df["family"]))

# Bucket the records in a single pass over seqs rather than rescanning the
# whole list once per family. Keys keep the order of `families`; records keep
# their order within seqs.
seq_groups = {family: [] for family in families}
for seq in seqs:
    bucket = seq_groups.get(id_map[seq.id])
    if bucket is not None:  # unparsed ids and small families have no bucket
        bucket.append(seq)

out_dir = data_dir / "paper" / "arthropod"
os.makedirs(out_dir, exist_ok=True)

In [31]:
from tqdm import tqdm

# Write each family's records to <out_dir>/CYP_<family>/sequences.fasta.
for family, group in tqdm(seq_groups.items()):
    dataset_dir = out_dir / f"CYP_{family}"
    # exist_ok=True keeps the cell re-runnable: without it a second run raises
    # FileExistsError on the first directory that already exists.
    os.makedirs(dataset_dir, exist_ok=True)
    SeqIO.write(group, dataset_dir / "sequences.fasta", "fasta")

100%|██████████| 237/237 [00:00<00:00, 309.72it/s]
