In [None]:
#!pip install -r requirements2.txt

In [2]:
# ============================================
# 0. Imports & configuration
# ============================================
import pandas as pd
import numpy as np

from pathlib import Path

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    roc_curve,
    confusion_matrix,
    classification_report
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier
import seaborn as sns

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [3]:
# ============================================
# 1. Chargement et fusion des deux CSV
# ============================================

# À adapter à tes chemins réels
POS_PATH = Path("./../src/data/positive_labeled_datasets.csv")
NEG_PATH = Path("./../src/data/negative_labeled_datasets.csv")

df_pos = pd.read_csv(POS_PATH)
df_neg = pd.read_csv(NEG_PATH)

print("Positives shape:", df_pos.shape)
print("Negatives shape:", df_neg.shape)

df_pos["label_interacting"] = 1
df_neg["label_interacting"] = 0

# Fusion + shuffle
df = pd.concat([df_pos, df_neg], ignore_index=True)
df = df.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)

print("Dataset global :", df.shape)
df.head()


Positives shape: (195, 8)
Negatives shape: (200, 8)
Dataset global : (395, 8)


Unnamed: 0,id,ipsae,pdockq2,prodigy_kd,ipTM+pTM,pTM,REF2015,label_interacting
0,3vba__A1_Q58673_and_3vba__B1_Q58673,0.597496,0.5587,0.0001,0.793111,0.775327,1029.927746,1
1,3B5EA_and_3B7HA,0.0,0.0,3.5e-10,0.282176,0.174913,24323.078983,0
2,1Q9UA_and_1LL2A,0.0,0.0,2.6e-06,0.326672,0.232861,4497.103044,0
3,3g7p__A1_B7JA91_and_3g7p__A2_B7JA91,0.814622,0.6361,3.0,0.895859,0.891898,138.955457,1
4,3F75A_and_3FH2A,0.437555,0.409,4.4e-06,0.749802,0.720932,220.085918,0


In [4]:
# ============================================
# 2. Préparation du dataset ML (X, y) + splits
# ============================================

# Colonnes de features (à adapter si tu en ajoutes)
FEATURE_COLS = ["ipsae", "pdockq2", "prodigy_kd", "ipTM+pTM", "pTM", "REF2015"]
TARGET_COL = "label_interacting"

# On enlève les lignes avec des NaN dans les features ou la target
df_ml = df.dropna(subset=FEATURE_COLS + [TARGET_COL]).copy()

X = df_ml[FEATURE_COLS].values
y = df_ml[TARGET_COL].values

print("Taille finale pour le ML :", X.shape)

# Split train / temp (validation+test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4,           # 60% train, 40% temp
    stratify=y,
    random_state=RANDOM_STATE
)

# Split temp en validation et test (20% / 20%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,           # 50% de 40% -> 20% total
    stratify=y_temp,
    random_state=RANDOM_STATE
)

print("Train :", X_train.shape, "Val :", X_val.shape, "Test :", X_test.shape)


Taille finale pour le ML : (391, 6)
Train : (234, 6) Val : (78, 6) Test : (79, 6)


In [6]:
# ============================================
# 3. Baseline : XGBoost Classifier
# ============================================
# XGBoost est un bon point de départ : non linéaire, gère bien les petits datasets,
# pas besoin de scaler les features.

xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

xgb_clf.fit(X_train, y_train)

# Prédictions sur validation et test
y_val_proba = xgb_clf.predict_proba(X_val)[:, 1]
y_test_proba = xgb_clf.predict_proba(X_test)[:, 1]

y_val_pred = (y_val_proba >= 0.5).astype(int)
y_test_pred = (y_test_proba >= 0.5).astype(int)

print("=== Validation set ===")
print("ROC AUC :", roc_auc_score(y_val, y_val_proba))
print("Average Precision :", average_precision_score(y_val, y_val_proba))
print(classification_report(y_val, y_val_pred))

print("\n=== Test set ===")
print("ROC AUC :", roc_auc_score(y_test, y_test_proba))
print("Average Precision :", average_precision_score(y_test, y_test_proba))
print(classification_report(y_test, y_test_pred))


ValueError: could not convert string to float: '0,0000007'