In [1]:
pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 2.8/2.8 MB 18.1 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [7]:
# protein_ml_example.ipynb

import os
import numpy as np
import pandas as pd

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis  # for basic protein stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import warnings
# Install a blanket "ignore" filter so no Python warnings are printed in cell output.
# NOTE(review): this also hides warnings raised by the libraries used further down;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')  # to keep output clean (optional)

# Paths to your FASTA files.
# Both are relative, so they resolve against the kernel's current working directory.
# enzyme_file is loaded with label 1 and non_enzyme_file with label 0 further down.
enzyme_file = r"enzyme_proteins.fasta"
non_enzyme_file = r"non_enzyme_proteins.fasta"

def load_fasta_as_df(fasta_path, label):
    """
    Parse a FASTA file and return a DataFrame with one row per record.

    Parameters
    ----------
    fasta_path : path-like or file handle
        Passed straight to ``SeqIO.parse(..., "fasta")``.
    label : any
        Value stored in the ``label`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns, always in this order:
          - seq_id       (record id)
          - description  (FASTA header)
          - sequence     (record sequence as a plain string)
          - label
        A file with no records yields an empty frame that still carries
        these four columns, so column lookups downstream keep working.
    """
    columns = ["seq_id", "description", "sequence", "label"]

    # Consume the parser lazily; no intermediate list of records is needed.
    rows = [
        {
            "seq_id": rec.id,
            "description": rec.description,
            "sequence": str(rec.seq),
            "label": label,
        }
        for rec in SeqIO.parse(fasta_path, "fasta")
    ]

    # Passing `columns` fixes the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Read each class from its own FASTA file (1 = enzyme, 0 = non-enzyme)
df_enzyme = load_fasta_as_df(enzyme_file, label=1)
df_non_enzyme = load_fasta_as_df(non_enzyme_file, label=0)

# Stack the two classes, shuffle the rows reproducibly, and renumber the index
df = (
    pd.concat([df_enzyme, df_non_enzyme], ignore_index=True)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

df.head()



Unnamed: 0,seq_id,description,sequence,label
0,FakeEnzyme1,FakeEnzyme1,MKTLLITALLAVALATECQTDRVLDA,1
1,FakeEnzyme2,FakeEnzyme2,GGHHGRILTVAGILVLLFLLNAAYALTAC,1
2,FakeNonEnzyme3,FakeNonEnzyme3,MGAALTKLFLIISAIANEVASVSFNYGHT,0
3,FakeEnzyme3,FakeEnzyme3,MKIGFATNLLSAQADAIVEEILASVINSGLLA,1
4,FakeNonEnzyme2,FakeNonEnzyme2,GGTADMVLELAHIIELVDQRIIVDDNIT,0


In [13]:
def extract_features(sequence, residues=("A", "C", "D", "E", "K", "R")):
    """
    Given a protein sequence (string), return a dictionary of features.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence. Must be non-empty.
    residues : iterable of str, optional
        One-letter residue codes to expose as ``frac_<residue>`` features
        (count of that residue divided by the sequence length). Defaults to
        the six residues A, C, D, E, K, R.

    Returns
    -------
    dict
        ``seq_length``, ``iso_point``, ``aromaticity``, ``instability``,
        ``gravy``, ``helix_fraction``, ``turn_fraction``, ``sheet_fraction``,
        followed by one ``frac_<residue>`` entry per requested residue.

    Raises
    ------
    ValueError
        If ``sequence`` is empty; the per-residue fractions would otherwise
        divide by zero.
    """
    # Fail early with a clear message instead of a ZeroDivisionError below.
    if not sequence:
        raise ValueError("Cannot extract features from an empty protein sequence.")

    seq_len = len(sequence)
    analyzed_seq = ProteinAnalysis(sequence)

    # Dict of raw counts per residue; converted to fractions on demand below.
    aa_count = analyzed_seq.count_amino_acids()

    # Tuple ordered as (helix, turn, sheet).
    ss_fraction = analyzed_seq.secondary_structure_fraction()

    features = {
        "seq_length": seq_len,
        "iso_point": analyzed_seq.isoelectric_point(),
        "aromaticity": analyzed_seq.aromaticity(),
        "instability": analyzed_seq.instability_index(),
        "gravy": analyzed_seq.gravy(),  # grand average of hydropathicity
        "helix_fraction": ss_fraction[0],
        "turn_fraction": ss_fraction[1],
        "sheet_fraction": ss_fraction[2],
    }

    # Residues missing from the count dict contribute a fraction of 0.0.
    for residue in residues:
        features[f"frac_{residue}"] = aa_count.get(residue, 0) / seq_len

    return features

In [15]:
# One feature dictionary per sequence, in the same row order as df
feature_dicts = [extract_features(seq) for seq in df["sequence"]]

# Turn the dictionaries into a table (one column per feature)
feature_df = pd.DataFrame(feature_dicts)

# Attach the class labels from the original frame
feature_df["label"] = df["label"]

print(feature_df.shape)
feature_df.head()

(6, 15)


Unnamed: 0,seq_length,iso_point,aromaticity,instability,gravy,helix_fraction,turn_fraction,sheet_fraction,frac_A,frac_C,frac_D,frac_E,frac_K,frac_R,label
0,26,4.556577,0.0,28.123077,0.919231,0.538462,0.076923,0.5,0.192308,0.038462,0.076923,0.038462,0.038462,0.038462,1
1,29,8.239398,0.068966,44.006897,1.365517,0.413793,0.172414,0.517241,0.172414,0.034483,0.0,0.0,0.0,0.034483,1
2,29,6.50121,0.103448,-1.727586,0.906897,0.37931,0.241379,0.448276,0.172414,0.0,0.0,0.034483,0.034483,0.0,0
3,32,4.136935,0.03125,19.25625,1.003125,0.46875,0.25,0.40625,0.1875,0.0,0.03125,0.0625,0.03125,0.0,1
4,28,4.050028,0.0,38.264286,0.503571,0.285714,0.25,0.464286,0.071429,0.0,0.142857,0.071429,0.0,0.035714,0


In [17]:
# Separate the feature matrix from the target column
X = feature_df.drop(columns="label")
y = feature_df["label"]

# Train-test split (80% train, 20% test), stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create and train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.3f}")

print("Classification Report:")
# zero_division=0 makes the "class never predicted" case explicit: precision is
# reported as 0.0 without relying on the global warnings filter to hide the
# undefined-metric warning.
print(classification_report(y_test, y_pred, zero_division=0))


Test Accuracy: 0.500
Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

