# Meilleur essai, avec sérialisation du modèle au format pickle

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib
import numpy as np
import os

# Load the training features, the training labels and the test features.
# The first CSV column of each file becomes the frame's row index.
X_train, Y_train, X_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/X_train_update.csv",
        "data/Y_train_CVw08PX.csv",
        "data/X_test_update.csv",
    )
)

# Build a single 'text' column per feature frame: designation and description
# joined by one space, with missing values replaced by empty strings.
for _frame in (X_train, X_test):
    _designation = _frame['designation'].fillna('')
    _description = _frame['description'].fillna('')
    _frame['text'] = _designation + " " + _description

# ======= INDEX SYNCHRONISATION =======
# Reuse a previously saved validation split when one exists so that every run
# evaluates on the same rows; otherwise create a fresh 80/20 split and persist
# the row labels of its validation part for later runs.
test_split_file = 'data/processed_data/test_split_indices.npz'
try:
    # Only the load is guarded: a missing file or a missing 'arr_0' entry
    # triggers the fallback, while errors in the split logic itself surface.
    # NOTE(review): allow_pickle=True can execute arbitrary code when the .npz
    # comes from an untrusted source; kept so existing index files still load.
    test_split_indices = np.load(test_split_file, allow_pickle=True)['arr_0']
except (FileNotFoundError, KeyError):
    print("Fichier d'indices test_split non trouvé, utilisation de train_test_split standard")
    # Hold out 20% of the data for validation.
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train['text'], Y_train.values.ravel(), test_size=0.2, random_state=42
    )
    # train_test_split keeps the pandas index, so the validation rows are
    # exactly X_val_split.index. Matching on text content instead would also
    # pick up training rows whose text duplicates a validation row.
    test_indices = X_val_split.index.values
    os.makedirs(os.path.dirname(test_split_file), exist_ok=True)
    np.savez(test_split_file, arr_0=test_indices)
    print(f"Nouveaux indices test_split sauvegardés: {len(test_indices)} exemples")
else:
    print(f"Indices test_split chargés: {len(test_split_indices)} exemples")

    # Count how many of the saved indices actually exist in X_train.
    n_valid = int(pd.Index(test_split_indices).isin(X_train.index).sum())
    if n_valid < len(test_split_indices):
        print(f"Attention: seulement {n_valid}/{len(test_split_indices)} indices sont valides")

    # Rows whose label is in the saved indices form the validation set.
    test_mask = X_train.index.isin(test_split_indices)
    X_train_split = X_train.loc[~test_mask, 'text']
    X_val_split = X_train.loc[test_mask, 'text']
    # The mask is applied positionally to Y_train.
    # NOTE(review): assumes Y_train rows are in the same order as X_train — confirm.
    y_train_split = Y_train.loc[~test_mask].values.ravel()
    y_val_split = Y_train.loc[test_mask].values.ravel()

    print(f"Division synchronisée: {len(X_train_split)} exemples d'entraînement, {len(X_val_split)} exemples de test")

# Build the text-classification pipeline: TF-IDF features feeding an RBF SVM.
tfidf_step = TfidfVectorizer(max_features=45000)
svm_step = SVC(
    C=12,
    kernel='rbf',
    gamma='scale',
    probability=True,
    class_weight='balanced',
)
pipeline_svm = Pipeline([('tfidf', tfidf_step), ('model', svm_step)])

# Fit the whole pipeline on the training part of the split.
print("Starting training of the SVM model...")
pipeline_svm.fit(X_train_split, y_train_split)
print("Training completed!")

# Score the fitted pipeline on the held-out validation part.
y_pred_val = pipeline_svm.predict(X_val_split)
print("Evaluation of the model on validation set:")
val_report = classification_report(y_val_split, y_pred_val, zero_division=0)
print(val_report)
val_accuracy = accuracy_score(y_val_split, y_pred_val)
print(f"Accuracy on validation set: {val_accuracy}")

# Persist the fitted pipeline (vectorizer + classifier) with joblib.
print("Saving the trained model...")
model_dir = 'data/models/SVM'
model_path = f"{model_dir}/model.pkl"
os.makedirs(model_dir, exist_ok=True)
joblib.dump(pipeline_svm, model_path)
print(f"Model saved as '{model_path}'.")

Indices test_split chargés: 16984 exemples
Division synchronisée: 67932 exemples d'entraînement, 16984 exemples de test
Starting training of the SVM model...
Training completed!
Evaluation of the model on validation set:
              precision    recall  f1-score   support

          10       0.47      0.67      0.55       623
          40       0.75      0.69      0.72       502
          50       0.82      0.85      0.83       336
          60       0.99      0.80      0.88       166
        1140       0.80      0.83      0.81       534
        1160       0.96      0.95      0.95       791
        1180       0.86      0.52      0.64       153
        1280       0.70      0.71      0.70       974
        1281       0.65      0.56      0.60       414
        1300       0.94      0.93      0.94      1009
        1301       0.97      0.94      0.96       161
        1302       0.87      0.74      0.80       498
        1320       0.85      0.83      0.84       648
        1560       0.8