# CleavAI

© 2025 JForCell Corporation. All Rights Reserved.

- Synthetic negatives: totally random but P1=R
- Hard negatives: biologically realistic motifs, but Furin did not cleave

In [24]:
import pandas as pd
from sqlalchemy import create_engine
import random

In [25]:
engine = create_engine(f"mysql+pymysql://jforcell:jforcell@localhost/merops")

In [26]:
furin_mernums = (
    'MER0000375', 'MER0000381', 'MER0000377', 'MER0000383', 'MER0002984', 'MER0002578', 'MER0000964', 'MER0004695'
)

query = f"""
SELECT s.uniprot_acc, s.sequence
FROM substrate s
WHERE LOWER(s.uniprot_acc) NOT IN (
    SELECT LOWER(c.uniprot_acc)
    FROM cleavage c
    WHERE c.mernum IN {furin_mernums}
)
AND CHAR_LENGTH(s.sequence) > 50;
"""

df_nonfurin = pd.read_sql(query, engine)

In [27]:
df_nonfurin.head()

Unnamed: 0,uniprot_acc,sequence
0,A0A023PXC2,MLPLCLTFLSFFLSLGGSFKAVMTKEEADGTTEAAACLFWIFNWTV...
1,A0A023PXD5,MMTAAKRLGLYSALRACSATVFRSNLHPKVTVATMFCSVGTIPDVA...
2,A0A023PXI4,MYWPCLVITPFTVGESFCLLLSLGIPLDTGILNIWSLSSISRHLEK...
3,A0A023PYC6,MVSSFFMASTLLAISSCFNSSISRAKGYNDSLESESLEFDVVDVVD...
4,A0A023PYD9,MCGVVVVIVALVPADPLLPAFACGCSCDAPVFIPFFNISSSIILIC...


In [28]:
def extract_windows(seq, window=8):
    return [
        seq[i:i+window]
        for i in range(len(seq) - window + 1)
        if seq[i+3] == 'R' and 'X' not in seq[i:i+window]
    ]

all_neg = set()
for _, row in df_nonfurin.iterrows():
    all_neg.update(extract_windows(row["sequence"]))

df_hard = pd.DataFrame({"cleavage_window": list(all_neg)})
df_hard.head()

Unnamed: 0,cleavage_window
0,GKPRYAET
1,DMGRSFLQ
2,VNKRKSKY
3,LKTRTRRR
4,APVRLPLE


In [29]:
df_hard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1016210 entries, 0 to 1016209
Data columns (total 1 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   cleavage_window  1016210 non-null  object
dtypes: object(1)
memory usage: 7.8+ MB


### Undersampling

In [30]:
desired_neg = 809 * 3 # 1:3 ratio with positive samples
df_sampled = df.sample(n=desired_neg, random_state=42)

In [31]:
# df_neg = pd.concat([df_hard, df_synth], ignore_index=True).drop_duplicates()
df = df_sampled.copy()
df["label"] = 0
df[["cleavage_window", "label"]].to_csv("negative_windows.csv", index=False)