In [7]:
import sys
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LogisticRegression


project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / "src"))

from preprocessing import *

# Charge directement le CSV
df = pd.read_csv(project_root / "data" / "processed" / "quantum_states_10000.csv")
print(f"Dataset: {df.shape}")

# Test SANS norm_squared (vrai challenge)
X, y = prepare_features_and_target(df, include_norm_squared=False)
print(f"Features: {X.shape}")

# Continue avec le reste...
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)
print("\n Split effectué!")

from sklearn.linear_model import LogisticRegression

def create_logistic_regression(C=1.0, solver='lbfgs', max_iter=1000):
    """
    Instancie un modèle de régression logistique.
    
    Args:
        C (float): Inverse de la force de régularisation. Plus C est petit, plus la régularisation est forte.
        solver (str): L'algorithme d'optimisation (ex: 'lbfgs', 'liblinear').
        max_iter (int): Nombre maximum d'itérations pour la convergence du solveur.
    
    Returns:
        model: L'instance du modèle non entraîné.
    """
    model = LogisticRegression(
        C=C, 
        solver=solver, 
        max_iter=max_iter, 
        random_state=42 # Toujours fixer le seed pour la reproductibilité (rigueur scientifique)
    )
    return model

Dataset: (10000, 11)
Features: (10000, 8)
Split effectué:
  Train: 6000 (60.0%)
  Val:   2000 (20.0%)
  Test:  2000 (20.0%)

 Split effectué!


In [3]:
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

print(f"Train: {len(X_train)}")
print(f"Val: {len(X_val)}")
print(f"Test: {len(X_test)}")

# Vérification de la stratification
print(f"\nClasse 1 dans train: {(y_train==1).sum()/len(y_train):.1%}")
print(f"Classe 1 dans val: {(y_val==1).sum()/len(y_val):.1%}")
print(f"Classe 1 dans test: {(y_test==1).sum()/len(y_test):.1%}")

Split effectué:
  Train: 6000 (60.0%)
  Val:   2000 (20.0%)
  Test:  2000 (20.0%)
Train: 6000
Val: 2000
Test: 2000

Classe 1 dans train: 50.0%
Classe 1 dans val: 50.0%
Classe 1 dans test: 50.0%


In [8]:
# Modèle baseline
model = create_logistic_regression(C=1.0)
model, metrics = train_model(model, X_train_sc, y_train, X_val_sc, y_val)

print(f"\n Performance finale: {metrics['accuracy']:.2%}")

NameError: name 'train_model' is not defined

In [None]:
from sklearn.metrics import accuracy_score

y_test_pred = model.predict(X_test_sc)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Accuracy sur TEST: {test_accuracy:.4f}")
print(" Projet terminé!")

# Sauvegarde
save_model(model, "models/baseline_model.joblib")