In [None]:
import numpy as np
from Bio import SeqIO
from collections import Counter
from pathlib import Path

# Locate the two input FASTA files: look under /content first and, if either
# file is absent there, fall back to the parent of the current working directory.
PROJECT_ROOT = Path('/content')
POS_FASTA = PROJECT_ROOT.joinpath('reps_30_rep_seq_pos.fasta')
NEG_FASTA = PROJECT_ROOT.joinpath('reps_30_rep_seq_neg.fasta')
if not (POS_FASTA.exists() and NEG_FASTA.exists()):
    PROJECT_ROOT = Path('..').resolve()
    POS_FASTA = PROJECT_ROOT.joinpath('reps_30_rep_seq_pos.fasta')
    NEG_FASTA = PROJECT_ROOT.joinpath('reps_30_rep_seq_neg.fasta')
    # Neither location holds both files: stop before any feature extraction.
    if not (POS_FASTA.exists() and NEG_FASTA.exists()):
        raise FileNotFoundError('Missing FASTA files for manual features')
print('Loading sequences from', POS_FASTA.parent)

# Load protein sequences from FASTA files
def load_sequences(fasta_file):
    """Return the sequences stored in a FASTA file as plain strings.

    Args:
        fasta_file: Path (or handle) handed to ``SeqIO.parse`` with the
            ``"fasta"`` format.

    Returns:
        A list with one ``str`` per record, in file order. Record
        identifiers and descriptions are discarded.
    """
    return [str(entry.seq) for entry in SeqIO.parse(fasta_file, "fasta")]

# Define the feature computation function
def compute_manual_features(sequence):
    """Return the hand-crafted feature vector for one protein sequence.

    Only the grouped-tripeptide feature (``compute_gtpc``) contributes to the
    output at the moment; the ACC, CKSAAP and DDE extractors are defined here
    but their calls are commented out at the bottom of the function.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.
            Matching is case-sensitive and uses upper-case codes.

    Returns:
        A 1-D ``numpy.ndarray`` of five floats, one per residue group, in the
        order the groups are declared in ``compute_gtpc``.
    """
    # The 20 standard residues, shared by the composition-based extractors.
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

    # Amino Acid Composition (ACC)
    def compute_acc(sequence):
        """Return the relative frequency of each of the 20 standard residues."""
        if not sequence:
            # An empty sequence has no composition; avoid dividing by zero.
            return np.zeros(len(amino_acids), dtype=float)
        length = len(sequence)
        counts = Counter(sequence)
        return np.array([counts[aa] / length for aa in amino_acids])

    # Composition of K-Spaced Amino Acid Pairs (CKSAAP)
    def compute_cksaap(sequence, k=2):
        """Return residue-pair frequencies for gaps 0..k, 400 values per gap.

        Sequences of length ``k + 1`` or shorter yield an all-zero vector.
        """
        pairs = [a1 + a2 for a1 in amino_acids for a2 in amino_acids]
        if len(sequence) <= k + 1:
            return np.zeros(len(pairs) * (k + 1), dtype=float)
        cksaap = []
        for gap in range(k + 1):
            # Always positive here because of the length guard above.
            window = len(sequence) - gap - 1
            counts = Counter(
                left + right
                for left, right in zip(sequence, sequence[gap + 1:])
            )
            cksaap.extend(counts[pair] / window for pair in pairs)
        return np.array(cksaap)

    # Dipeptide Deviation from Expected Mean (DDE)
    def compute_dde(sequence):
        """Return (observed - expected) / total for each of the 400 dipeptides.

        NOTE(review): the expected count is derived from the sequence's own
        residue frequencies -- confirm this matches the intended DDE
        definition.
        """
        total_pairs = len(sequence) - 1
        if total_pairs <= 0:
            return np.zeros(len(amino_acids) ** 2, dtype=float)
        length = len(sequence)
        residue_counts = Counter(sequence)
        # Count adjacent pairs position by position. str.count() reports only
        # non-overlapping matches, so "AAA" would give one "AA" instead of two.
        pair_counts = Counter(
            left + right for left, right in zip(sequence, sequence[1:])
        )
        dde = []
        for aa1 in amino_acids:
            for aa2 in amino_acids:
                observed = pair_counts[aa1 + aa2]
                expected = (
                    (residue_counts[aa1] / length)
                    * (residue_counts[aa2] / length)
                    * total_pairs
                )
                dde.append((observed - expected) / total_pairs)
        return np.array(dde)

    # Grouped Tripeptide Composition (GTPC)
    def compute_gtpc(sequence):
        """Return, per residue group, the fraction of tripeptides lying entirely in it.

        A tripeptide is counted only when all three residues belong to the
        same group, so the result has one value per group (five in total).
        Sequences shorter than three residues yield an all-zero vector.
        """
        # NOTE(review): 'C' and 'P' belong to no group, so any tripeptide
        # containing them is never counted -- confirm this is intentional.
        groups = {
            'A': 'GAVLMI',  # Non-polar
            'B': 'FWY',     # Aromatic
            'C': 'KRH',     # Positively charged
            'D': 'DE',      # Negatively charged
            'E': 'STNQ',    # Polar, uncharged
        }
        total_tris = len(sequence) - 2
        if total_tris <= 0:
            return np.zeros(len(groups), dtype=float)
        # The groups are disjoint, so each residue maps to at most one label;
        # residues outside every group map to None.
        residue_group = {
            aa: label for label, members in groups.items() for aa in members
        }
        labels = [residue_group.get(aa) for aa in sequence]
        group_counts = dict.fromkeys(groups, 0)
        for first, second, third in zip(labels, labels[1:], labels[2:]):
            if first is not None and first == second == third:
                group_counts[first] += 1
        return np.array([group_counts[label] / total_tris for label in groups])

    # Concatenate all features. Only GTPC is enabled; the other extractors
    # are left disabled below.
    # acc = compute_acc(sequence)
    # cksaap = compute_cksaap(sequence)
    # dde = compute_dde(sequence)
    gtpc = compute_gtpc(sequence)
    return np.concatenate([gtpc])

# Read both classes of sequences from their FASTA files
positive_sequences = load_sequences(POS_FASTA)
negative_sequences = load_sequences(NEG_FASTA)

# Positives come first in the combined list and are labelled 1; negatives are labelled 0
sequences = [*positive_sequences, *negative_sequences]
labels = [1 for _ in positive_sequences] + [0 for _ in negative_sequences]

# One feature row per sequence, in the same order as the labels
X = np.array(list(map(compute_manual_features, sequences)))
y = np.array(labels)

# Persist features and labels side by side under the project root
OUT_DIR = PROJECT_ROOT.joinpath('manual_features')
OUT_DIR.mkdir(parents=True, exist_ok=True)
np.save(OUT_DIR.joinpath('manu_features_gtpc.npy'), X)
np.save(OUT_DIR.joinpath('manu_labels_gtpc.npy'), y)
print('Feature extraction completed and saved to', OUT_DIR)
