In [11]:
import pandas as pd
import model_rf_base
import os, sys
sys.path.append(os.path.abspath("../../etc/"))
import config

In [12]:
# Load the feature/label table from a path relative to the notebook's working directory.
# NOTE(review): the file name suggests the features were standardized before this
# point; if the scaler was fitted on the full table (before the train/test split
# done in rf_antibody.train), the held-out metrics may be optimistic — TODO confirm.
df = pd.read_csv("./data/standardized_set.csv")

In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
from enum import Enum
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

class rf_antibody:
    """Train random-forest classifiers on the label columns of one target.

    The table passed in is expected to hold the label columns
    ``is_binding_<target>`` and ``is_neutral_<target>`` next to the feature
    columns. Rows whose label is unknown are dropped separately for each
    label, so the two classifiers may be trained on different row subsets.

    Attributes:
        target: Target name used to build the label column names.
        exclude: Columns that are never used as features (both label columns
            plus the identifier / sequence columns).
        df: The table given to the constructor (stored by reference, not copied).
        rf_classifier_binding: Fitted binding classifier, ``None`` until
            ``train_binding`` has run.
        rf_classifier_neutral: Fitted neutral classifier, ``None`` until
            ``train_neutral`` has run.
    """

    # Label value that marks a sample whose outcome is unknown.
    UNKNOWN_LABEL = 2

    def __init__(self, target: str, df: pd.DataFrame) -> None:
        """Store the table and derive the columns to keep out of the features.

        Args:
            target: Target name; selects the ``is_binding_<target>`` and
                ``is_neutral_<target>`` label columns.
            df: Table holding the label columns and the feature columns.
        """
        self.target = target
        # Defined once so every label shares the same feature set. Add any
        # other identifier / string columns here.
        self.exclude = [f'is_binding_{target}', f'is_neutral_{target}', 'name', 'CDRH3', 'CDRL3']
        self.df = df
        self.rf_classifier_neutral = None
        self.rf_classifier_binding = None

    def get_x_y(self, label: str):
        """Build the feature matrix and target vector for one label column.

        Args:
            label: Name of the label column in ``self.df``.

        Returns:
            ``(X, y)``: ``X`` holds the numeric, non-excluded columns of the
            rows whose label is known; ``y`` holds the label values of those
            same rows.

        Raises:
            KeyError: If ``label`` is not a column of ``self.df``.
        """
        if label not in self.df.columns:
            raise KeyError(f"label column {label!r} not found in the data frame")

        # 1. Keep only rows with a known label. A missing label (NaN) is as
        #    unknown as the explicit marker and cannot be used as a target.
        labels = self.df[label]
        is_known = labels.notna() & labels.ne(self.UNKNOWN_LABEL)
        known_rows = self.df.loc[is_known]

        # 2. Separate features (X) from the target (y). The requested label is
        #    always dropped from X, even when it is not listed in self.exclude,
        #    so the target can never leak into the features.
        #    errors='ignore' tolerates excluded columns that are absent.
        non_features = list(dict.fromkeys([*self.exclude, label]))
        X = known_rows.drop(columns=non_features, errors='ignore')
        y = known_rows[label]

        # 3. Keep numeric columns only; leftover sequence / id columns would
        #    break scikit-learn.
        X = X.select_dtypes(include=[np.number])

        return X, y

    def train(self, X: pd.DataFrame, y: pd.Series, test_size: float = 0.2,
              random_state: int = 42, n_estimators: int = 100):
        """Fit a class-balanced random forest and report on a held-out split.

        Args:
            X: Feature matrix.
            y: Target labels aligned with ``X``.
            test_size: Fraction of rows held out for the report.
            random_state: Seed used for both the split and the forest.
            n_estimators: Number of trees in the forest.

        Returns:
            ``(classifier, report)``: the fitted ``RandomForestClassifier``
            and the ``classification_report`` computed on the held-out rows.
        """
        # Stratify so both splits keep the class proportions of y.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        # Named `classifier` (not `rf_antibody`) so the class name is not shadowed.
        classifier = RandomForestClassifier(
            n_estimators=n_estimators, random_state=random_state, class_weight='balanced'
        )
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        return classifier, classification_report(y_test, y_pred)

    def _fit_label(self, label: str, **train_kwargs):
        """Build X/y for ``label`` and train on them; returns ``train``'s result."""
        X, y = self.get_x_y(label)
        return self.train(X, y, **train_kwargs)

    def train_binding(self, **train_kwargs):
        """Train on ``is_binding_<target>`` and store the fitted classifier.

        Args:
            **train_kwargs: Optional overrides forwarded to ``train``.

        Returns:
            ``(classifier, report)`` as returned by ``train``.
        """
        self.rf_classifier_binding, report = self._fit_label(
            f'is_binding_{self.target}', **train_kwargs
        )
        return self.rf_classifier_binding, report

    def train_neutral(self, **train_kwargs):
        """Train on ``is_neutral_<target>`` and store the fitted classifier.

        Args:
            **train_kwargs: Optional overrides forwarded to ``train``.

        Returns:
            ``(classifier, report)`` as returned by ``train``.
        """
        self.rf_classifier_neutral, report = self._fit_label(
            f'is_neutral_{self.target}', **train_kwargs
        )
        return self.rf_classifier_neutral, report
    
        
        

In [17]:
# Build the trainer for the target named in the project config, using the table loaded into `df`.
model = rf_antibody(config.TARGET, df)

In [18]:
# Bind the classifier to a descriptive name rather than `_`: in an IPython
# kernel `_` is the last-output variable, and assigning to it shadows that.
# The same fitted classifier is also stored on `model.rf_classifier_binding`.
binding_clf, report = model.train_binding()

In [19]:
# print() is deliberate: the report is a multi-line text table, and a bare
# expression would display its escaped repr instead of the layout.
print(report)

              precision    recall  f1-score   support

           0       1.00      0.92      0.96      1126
           1       0.96      1.00      0.98      2254

    accuracy                           0.97      3380
   macro avg       0.98      0.96      0.97      3380
weighted avg       0.97      0.97      0.97      3380



In [20]:
# Bind the classifier to a descriptive name rather than `_`: in an IPython
# kernel `_` is the last-output variable, and assigning to it shadows that.
# The same fitted classifier is also stored on `model.rf_classifier_neutral`.
neutral_clf, report = model.train_neutral()

In [21]:
# print() is deliberate: the report is a multi-line text table, and a bare
# expression would display its escaped repr instead of the layout.
print(report)

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1612
           1       0.84      0.73      0.78      1113

    accuracy                           0.83      2725
   macro avg       0.83      0.82      0.82      2725
weighted avg       0.83      0.83      0.83      2725

