# CleavAI

© 2025 JForCell Corporation. All Rights Reserved.

In [1]:
import pandas as pd
import numpy as np
import random

## Compose dataset

In [2]:
# Read the cleavage windows from disk and tag every row with the positive
# class label (1) in a single chained expression.
df_pos = pd.read_csv("./furin_cleavage_windows.csv").assign(label=1)

In [3]:
# One-letter codes of the 20 amino acids that peptides are drawn from.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

def random_peptide(window=8, p1_index=3, p1_residue="R"):
    """Return a random peptide whose P1 position holds a fixed residue.

    Every position is drawn uniformly and independently from ``AMINO_ACIDS``;
    the residue at ``p1_index`` is then overwritten with ``p1_residue``. This
    yields the same distribution as drawing whole peptides and discarding those
    without the required P1 residue, but needs exactly one draw instead of an
    unbounded retry loop.

    Parameters
    ----------
    window : int, default 8
        Length of the peptide to generate.
    p1_index : int, default 3
        Zero-based index of the P1 position inside the window. Negative
        values index from the end, as with any Python list.
    p1_residue : str, default "R"
        One-letter code placed at the P1 position.

    Returns
    -------
    str
        A peptide of length ``window`` with ``p1_residue`` at ``p1_index``.

    Raises
    ------
    IndexError
        If ``p1_index`` does not fall inside the window (for example the
        default index 3 with ``window < 4``).
    """
    residues = random.choices(AMINO_ACIDS, k=window)
    # Pin P1 directly rather than rejection-sampling until it matches.
    residues[p1_index] = p1_residue
    return "".join(residues)

# Seed the RNG so that "Restart & Run All" regenerates the same decoy set.
random.seed(42)

# A decoy must never coincide with a known cleavage window, otherwise the same
# sequence would be present in the dataset with both labels.
# NOTE(review): assumes the positive windows have the same length as the
# generated decoys (8 residues) -- confirm against the CSV.
known_positives = set(df_pos["cleavage_window"])

# Draw unique decoys until there is exactly one negative per positive, keeping
# the two classes balanced. These are random sequences, not experimentally
# confirmed non-substrates.
seen_negatives = set()
neg_peptides = []
while len(neg_peptides) < len(df_pos):
    candidate = random_peptide()
    if candidate in known_positives or candidate in seen_negatives:
        continue
    seen_negatives.add(candidate)
    neg_peptides.append(candidate)

df_neg = pd.DataFrame({
    "cleavage_window": neg_peptides,
    "label": 0
})

In [None]:
# Combine positives and negatives into one frame and shuffle the rows. The
# shuffle is seeded so that the row order, and therefore the downstream
# train/test split, is identical on every run.
df_all = (
    pd.concat([df_pos[["cleavage_window", "label"]], df_neg], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [5]:
def one_hot_encode(seq):
    """One-hot encode a peptide into an 8 x 20 integer matrix.

    Row ``i`` corresponds to position ``i`` of ``seq`` and each column to one
    letter of ``AMINO_ACIDS`` (in that order). Characters that are not in
    ``AMINO_ACIDS`` leave their row all zeros, as do rows beyond the end of a
    sequence shorter than 8.
    """
    column_of = dict(zip(AMINO_ACIDS, range(len(AMINO_ACIDS))))
    matrix = np.zeros((8, 20), dtype=int)
    for row, residue in enumerate(seq):
        col = column_of.get(residue)
        if col is None:
            # Unknown symbol: keep this position as an all-zero row.
            continue
        matrix[row, col] = 1
    return matrix

# Encode every window and stack the per-peptide matrices along a new leading
# axis; the labels are taken out as a plain NumPy array in the same row order.
encoded_windows = [one_hot_encode(window) for window in df_all["cleavage_window"]]
X = np.stack(encoded_windows)
y = df_all["label"].to_numpy()

## Testing simple classification models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Flatten each per-peptide one-hot matrix into a single feature vector, since
# the classifier expects a 2-D (samples, features) array.
X_flat = X.reshape((X.shape[0], -1))

# Hold out 20% of the samples for evaluation. The split is seeded and
# stratified so both subsets keep the same class ratio on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_flat, y, test_size=0.2, random_state=42, stratify=y
)

# Seed the forest as well: bootstrap sampling and feature selection are random,
# so without a fixed seed the reported metrics change from run to run.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out set.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))