In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import warnings
# Install a global "ignore" filter: every warning raised after this line is
# suppressed for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it would also hide any
# convergence or deprecation warnings emitted while fitting the models in
# this notebook — consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')



In [None]:
# Load the data saved by the previous notebook.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only open pickles that were produced by a trusted source.
with open('data/train_test_split.pkl', 'rb') as f:
    data = pickle.load(f)

# Pull the four entries of the split out of the loaded dict.
X_train, X_test, y_train, y_test = (
    data[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
)

# Per-model store; each model cell adds one entry keyed by the model's name.
results = {}

In [8]:
# Turn the texts into TF-IDF features.
# Vectorizer settings: keep at most 3000 terms, use unigrams and bigrams,
# and drop scikit-learn's built-in English stop-word list.
# The vectorizer is fitted on the training split only and then applied
# as-is to the test split, so the test texts do not influence the fitted
# vocabulary or IDF weights.
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# `results` is initialised once in the data-loading cell and is intentionally
# NOT reset here: re-running this cell after a model cell would otherwise
# silently discard the metrics already stored in it.

In [9]:
# Baseline: multinomial Naive Bayes trained on the TF-IDF features.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Hard labels feed the accuracy; column 1 of predict_proba is the ROC AUC score
# (presumably the positive class — confirm against the label encoding).
nb_pred = nb_model.predict(X_test_tfidf)
nb_pred_proba = nb_model.predict_proba(X_test_tfidf)[:, 1]

nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_pred)
nb_auc = roc_auc_score(y_true=y_test, y_score=nb_pred_proba)

# Keep metrics and raw predictions together under the model's display name.
results['Naive Bayes'] = {
    'accuracy': nb_accuracy,
    'auc': nb_auc,
    'pred': nb_pred,
    'proba': nb_pred_proba,
}
print(f"Naive Bayes - Accuracy: {nb_accuracy:.4f}, AUC: {nb_auc:.4f}")

Naive Bayes - Accuracy: 0.9767, AUC: 0.9876


In [10]:
# Baseline: linear-kernel SVM trained on the TF-IDF features.
# NOTE(review): probability=True makes SVC fit an extra internal calibration
# step so that predict_proba is available; per the scikit-learn docs this is
# slower and the probabilities may not always agree with predict(). If only a
# ranking score for AUC is needed, decision_function is an alternative — verify
# what downstream consumers of 'proba' expect before changing it.
svm_model = SVC(kernel='linear', probability=True, random_state=42).fit(
    X_train_tfidf, y_train
)

# Hard labels feed the accuracy; column 1 of predict_proba is the ROC AUC score
# (presumably the positive class — confirm against the label encoding).
svm_pred = svm_model.predict(X_test_tfidf)
svm_pred_proba = svm_model.predict_proba(X_test_tfidf)[:, 1]

svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_pred)
svm_auc = roc_auc_score(y_true=y_test, y_score=svm_pred_proba)

# Keep metrics and raw predictions together under the model's display name.
results['SVM'] = {
    'accuracy': svm_accuracy,
    'auc': svm_auc,
    'pred': svm_pred,
    'proba': svm_pred_proba,
}
print(f"SVM - Accuracy: {svm_accuracy:.4f}, AUC: {svm_auc:.4f}")

SVM - Accuracy: 0.9830, AUC: 0.9867


In [11]:
# Baseline: logistic regression trained on the TF-IDF features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_tfidf, y_train
)

# Hard labels feed the accuracy; column 1 of predict_proba is the ROC AUC score
# (presumably the positive class — confirm against the label encoding).
lr_pred = lr_model.predict(X_test_tfidf)
lr_pred_proba = lr_model.predict_proba(X_test_tfidf)[:, 1]

lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_pred)
lr_auc = roc_auc_score(y_true=y_test, y_score=lr_pred_proba)

# Keep metrics and raw predictions together under the model's display name.
results['Logistic Regression'] = {
    'accuracy': lr_accuracy,
    'auc': lr_auc,
    'pred': lr_pred,
    'proba': lr_pred_proba,
}
print(f"Logistic Regression - Accuracy: {lr_accuracy:.4f}, AUC: {lr_auc:.4f}")

Logistic Regression - Accuracy: 0.9758, AUC: 0.9850


In [None]:
# Persist the metrics/predictions, the fitted vectorizer and the three fitted
# models together in a single pickle file.
classical_artifacts = {
    'results': results,
    'tfidf': tfidf,
    'nb_model': nb_model,
    'svm_model': svm_model,
    'lr_model': lr_model,
}

with open('data/classical_results.pkl', 'wb') as f:
    pickle.dump(classical_artifacts, f)

print(" Classical models saved to data/classical_results.pkl")

 Classical models saved to data/classical_results.pkl
