## Dans le code de cette étape, on sépare le code qu'on soumet à la compétition de celui qu'on utilise pour les tests locaux. Commençons par le code qu'on a soumis à la compétition.

# Chargement des données

In [0]:
import numpy as np
import pandas as pd

# Training labels come from a CSV file; only its 'label' column is used.
df = pd.read_csv('/kaggle/input/classer-le-text/label_train.csv')
y_train = df['label'].to_numpy()

# Training and test matrices stored as .npy files in the Kaggle input folder.
X_train = np.load('/kaggle/input/classer-le-text/data_train.npy')
X_test = np.load('/kaggle/input/classer-le-text/data_test.npy')

# Multinomial Naive Bayes de base

In [ ]:
from sklearn.naive_bayes import MultinomialNB

# Baseline submission: multinomial Naive Bayes with default settings,
# trained on the unmodified training matrix.
clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# One row per test sample; 'ID' is the 0-based position of the sample.
predictions = pd.DataFrame({'ID': range(len(y_pred)), 'label': y_pred})
predictions.to_csv('predictions3.csv', index=False)

# Multinomial Naive Bayes avec TF-IDF

In [ ]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF re-weighting: the transformer is fitted on the training matrix only
# and then applied unchanged to the test matrix.
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit(X_train).transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Multinomial Naive Bayes with smoothing alpha=0.1 on the TF-IDF features.
clf = MultinomialNB(alpha=0.1).fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

# One row per test sample; 'ID' is the 0-based position of the sample.
predictions = pd.DataFrame({'ID': range(len(y_pred)), 'label': y_pred})
predictions.to_csv('predictions4.csv', index=False)

# Multinomial Naive Bayes avec l'application de SMOTE

In [ ]:
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE

# Resample the training set with SMOTE (fixed seed for reproducibility);
# the test matrix is left untouched.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Default multinomial Naive Bayes trained on the resampled data.
clf = MultinomialNB().fit(X_train_balanced, y_train_balanced)
y_pred = clf.predict(X_test)

# One row per test sample; 'ID' is the 0-based position of the sample.
predictions = pd.DataFrame({'ID': range(len(y_pred)), 'label': y_pred})
predictions.to_csv('predictions5.csv', index=False)

# SVM linéaire de base

In [ ]:
from sklearn.svm import LinearSVC

# Linear SVM with a raised iteration cap (max_iter=10000) and a fixed seed.
clf = LinearSVC(max_iter=10000, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# One row per test sample; 'ID' is the 0-based position of the sample.
predictions = pd.DataFrame({'ID': range(len(y_pred)), 'label': y_pred})
predictions.to_csv('predictions6.csv', index=False)

# SVM linéaire de base avec C

In [ ]:
from sklearn.svm import LinearSVC

# Linear SVM with the regularisation strength C spelled out explicitly.
clf = LinearSVC(C=1.0, max_iter=10000, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# One row per test sample; 'ID' is the 0-based position of the sample.
predictions = pd.DataFrame({'ID': range(len(y_pred)), 'label': y_pred})
predictions.to_csv('predictions7.csv', index=False)

# Random Forest de base

In [ ]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with shallow trees (depth <= 10), minimum split/leaf sizes,
# balanced class weights and a fixed seed; n_jobs=-1 uses every available core.
clf = RandomForestClassifier(
    n_estimators=200, max_depth=10,
    min_samples_split=10, min_samples_leaf=5,
    class_weight='balanced', random_state=42, n_jobs=-1,
).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# One row per test sample; 'ID' is the 0-based position of the sample.
predictions = pd.DataFrame({'ID': range(len(y_pred)), 'label': y_pred})
predictions.to_csv('predictions8.csv', index=False)

# Multinomial NB avec sélection de features par Random Forest

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Step 1: fit a random forest whose only role here is to rank the features
# through its feature_importances_ attribute.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10,
    min_samples_split=10, min_samples_leaf=5,
    class_weight='balanced', random_state=42, n_jobs=-1,
).fit(X_train, y_train)

# One row per feature: its column index and the importance given by the forest.
feature_importance = pd.DataFrame({
    'feature': range(len(rf.feature_importances_)),
    'importance': rf.feature_importances_,
})

# Step 2: keep the column indices of the n_top_features most important features.
n_top_features = 1000
top_features = feature_importance.nlargest(n_top_features, 'importance')['feature'].to_numpy()

# Restrict both matrices to the same selected columns.
X_train_selected = X_train[:, top_features]
X_test_selected = X_test[:, top_features]

# Step 3: multinomial Naive Bayes trained on the reduced feature set.
nb = MultinomialNB().fit(X_train_selected, y_train)
y_pred = nb.predict(X_test_selected)

# One row per test sample; 'ID' is the 0-based position of the sample.
predictions = pd.DataFrame({'ID': range(len(y_pred)), 'label': y_pred})

# NOTE(review): this file name does not follow the 'predictionsN.csv' pattern
# used by the other submission cells -- confirm whether that is intended.
predictions.to_csv('prediction6.csv', index=False)

## Maintenant, on va mettre le code pour tester les modèles localement. Les tests sont faits en utilisant la validation croisée.

# Multinomial Naive Bayes de base

In [ ]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Default multinomial Naive Bayes, scored with 5-fold cross-validation
# on the macro-averaged F1 metric.
clf = MultinomialNB()
cv_scores = cross_val_score(
    estimator=clf, X=X_train, y=y_train, scoring='f1_macro', cv=5
)

# Report the mean score and twice the standard deviation across folds.
print("Résultats MultinomialNB:")
print(f"Score F1 macro moyen (CV): {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Multinomial Naive Bayes avec TF-IDF et différentes valeurs d'alpha

In [ ]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline

# Notebook-level TF-IDF objects, kept so that the names `tfidf`,
# `X_train_tfidf` and `X_test_tfidf` still exist after this cell has run.
# They are NOT used by the cross-validation below.
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Candidate smoothing strengths for MultinomialNB.
alpha_values = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0]
mean_scores = {}

for alpha in alpha_values:
    # TF-IDF is placed inside the pipeline so that its IDF weights are
    # re-learned on the training part of every fold. Fitting it once on the
    # whole of X_train would let each validation fold influence the features
    # it is later scored on (data leakage).
    clf = make_pipeline(TfidfTransformer(), MultinomialNB(alpha=alpha))
    # Mean macro-F1 over 5 folds for this alpha.
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro')
    mean_score = scores.mean()
    mean_scores[alpha] = mean_score
    print(f"Alpha = {alpha}: score moyen = {mean_score:.3f}")

# Select the winner from the recorded scores rather than from a running
# maximum initialised at 0: that way best_alpha is always one of the tested
# values (alpha=0 was never tested). On ties the first alpha wins, as before.
best_alpha = max(mean_scores, key=mean_scores.get)
best_score = mean_scores[best_alpha]

print(f"\nMeilleur alpha: {best_alpha} (score: {best_score:.3f})")

# SVM linéaire de base

In [ ]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# Use the same settings as the submitted baseline linear SVM (max_iter=10000)
# so that the cross-validated score describes the model that is actually
# submitted; with the default iteration cap the evaluated model differs and
# the solver may stop before converging.
clf = LinearSVC(random_state=42, max_iter=10000)

# 5-fold cross-validation on the macro-averaged F1 metric.
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro')
print("Scores de validation croisée:", scores)
# Mean score and twice the standard deviation across folds.
print(f"Score moyen: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# SVM linéaire optimisé avec C=0.1 et class_weight='balanced'

In [ ]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# Linear SVM with stronger regularisation (C=0.1), balanced class weights,
# the primal formulation (dual=False) and an iteration cap of 2000.
clf = LinearSVC(
    C=0.1, class_weight='balanced', dual=False,
    max_iter=2000, random_state=42,
)

# 5-fold cross-validation on the macro-averaged F1 metric.
scores = cross_val_score(
    estimator=clf, X=X_train, y=y_train, scoring='f1_macro', cv=5
)
print("Scores de validation croisée:", scores)
print(f"Score moyen: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Random Forest de base

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest with 200 trees and otherwise default settings (fixed seed).
rf = RandomForestClassifier(n_estimators=200, random_state=42)

# 5-fold cross-validation on the macro-averaged F1 metric.
scores = cross_val_score(
    estimator=rf, X=X_train, y=y_train, scoring='f1_macro', cv=5
)
print("Scores de validation croisée:", scores)
print(f"Score moyen: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Random Forest optimisé

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Constrained random forest: 200 trees, depth limited to 10, at least 10
# samples to split a node and 5 per leaf, balanced class weights, fixed seed.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10,
    min_samples_split=10, min_samples_leaf=5,
    class_weight='balanced', random_state=42, n_jobs=-1,
)

# 5-fold cross-validation on the macro-averaged F1 metric.
scores = cross_val_score(
    estimator=rf, X=X_train, y=y_train, scoring='f1_macro', cv=5
)
print("Scores de validation croisée:", scores)
print(f"Score moyen: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Combinaison de MultinomialNB et RandomForest

In [ ]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# Base learners: a default multinomial Naive Bayes and a constrained,
# class-weighted random forest.
nb = MultinomialNB()
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10,
    min_samples_split=10, min_samples_leaf=5,
    class_weight='balanced', random_state=42, n_jobs=-1,
)

# Soft voting combines the class probabilities predicted by both learners.
ensemble = VotingClassifier(estimators=[('nb', nb), ('rf', rf)], voting='soft')

# 5-fold cross-validation of the ensemble on the macro-averaged F1 metric.
scores = cross_val_score(
    estimator=ensemble, X=X_train, y=y_train, scoring='f1_macro', cv=5
)
print("Scores de validation croisée:", scores)
print(f"Score moyen: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Refit on the complete training set, then predict that same set; the
# resulting confusion matrix therefore describes the fit on training data,
# not performance on unseen data.
y_pred_train = ensemble.fit(X_train, y_train).predict(X_train)
print("\nMatrice de confusion sur l'ensemble d'entraînement:")
print(confusion_matrix(y_train, y_pred_train))

# Sélection de features basée sur RandomForest pour MultinomialNB

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline

# Step 1: random forest used as a feature ranker. This fit on the complete
# training set serves the final model only, not the cross-validation.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# One row per feature: its column index and the importance given by the forest.
feature_importance = pd.DataFrame({
    'feature': range(len(rf.feature_importances_)),
    'importance': rf.feature_importances_
})

# Column indices of the n_top_features most important features.
n_top_features = 1000
top_features = feature_importance.nlargest(n_top_features, 'importance')['feature'].values

# Restrict both matrices to the same selected columns.
X_train_selected = X_train[:, top_features]
X_test_selected = X_test[:, top_features]

print(f"Nombre de features original: {X_train.shape[1]}")
print(f"Nombre de features sélectionnées: {len(top_features)}")

# Step 2: cross-validation. The selection is supervised (it uses y_train), so
# it must be re-fitted inside every fold; scoring MultinomialNB on columns
# chosen with the labels of the validation folds gives an optimistic estimate.
# threshold=-np.inf makes SelectFromModel keep exactly the `max_features`
# most important columns; the cap avoids an error when the matrix has fewer
# columns than n_top_features. The estimator passed in is cloned before each
# fit, so the fitted `rf` above is not modified.
cv_model = make_pipeline(
    SelectFromModel(
        rf,
        threshold=-np.inf,
        max_features=min(n_top_features, X_train.shape[1]),
    ),
    MultinomialNB(),
)
scores = cross_val_score(cv_model, X_train, y_train, cv=5, scoring='f1_macro')
print("\nScores de validation croisée:", scores)
print(f"Score moyen: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Step 3: final MultinomialNB trained on the complete training set, restricted
# to the columns selected by the full-data forest.
nb = MultinomialNB()
nb.fit(X_train_selected, y_train)

# Confusion matrix on the training set itself (fit quality, not generalisation).
y_pred_train = nb.predict(X_train_selected)
print("\nMatrice de confusion sur l'ensemble d'entraînement:")
print(confusion_matrix(y_train, y_pred_train))

# Tests avec différents nombres de features

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline

# Random forest used as a feature ranker. This fit on the complete training
# set drives the per-candidate final models and confusion matrices only.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# One row per feature: its column index and the importance given by the forest.
feature_importance = pd.DataFrame({
    'feature': range(len(rf.feature_importances_)),
    'importance': rf.feature_importances_
})

# Candidate numbers of features to keep.
n_features_list = [500, 1000, 2000, 3000]

results = {}
for n_features in n_features_list:
    print(f"\nTest avec {n_features} features:")

    # Columns selected by the full-data forest (used for the final fit below).
    top_features = feature_importance.nlargest(n_features, 'importance')['feature'].values
    X_train_selected = X_train[:, top_features]
    X_test_selected = X_test[:, top_features]

    # Cross-validation with the supervised selection re-fitted inside every
    # fold: choosing columns with the labels of the validation folds would
    # bias the comparison between candidates. threshold=-np.inf makes
    # SelectFromModel keep exactly the `max_features` most important columns;
    # the cap avoids an error when fewer columns exist than requested.
    # This re-trains the forest once per fold for every candidate, which is
    # slower than reusing a single ranking.
    cv_model = make_pipeline(
        SelectFromModel(
            rf,
            threshold=-np.inf,
            max_features=min(n_features, X_train.shape[1]),
        ),
        MultinomialNB(),
    )
    scores = cross_val_score(cv_model, X_train, y_train, cv=5, scoring='f1_macro')

    print(f"Scores de validation croisée: {scores}")
    print(f"Score moyen: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

    # Final fit on the complete training set and confusion matrix on that
    # same set (fit quality, not generalisation).
    nb = MultinomialNB()
    nb.fit(X_train_selected, y_train)
    y_pred_train = nb.predict(X_train_selected)
    print("\nMatrice de confusion:")
    print(confusion_matrix(y_train, y_pred_train))

    results[n_features] = {
        'mean_score': scores.mean(),
        'std_score': scores.std(),
        'features': top_features
    }

# Candidate with the highest mean cross-validated score (first one on ties).
best_n_features = max(results, key=lambda n: results[n]['mean_score'])
print(f"\nMeilleur nombre de features: {best_n_features}")
print(f"Score: {results[best_n_features]['mean_score']:.3f}")