In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# === 1. Chargement des données ===
df_main = pd.read_excel("C:\\Users\\HP\\Downloads\\CAPAS.xlsx")  # colonnes : description, level1, level2
df_hierarchy = pd.read_excel("C:\\Users\\HP\\Downloads\\extract.xlsx")  # colonnes : level1, level2

# Garder les colonnes nécessaires
df_main = df_main[["Finding Description", "Nature of finding (Level 1)", "Nature of finding (Level 2)"]]
df_main.columns = ["description", "level1", "level2"]
df_main.dropna(inplace=True)

  warn("Workbook contains no default style, apply openpyxl's default")


In [3]:
# === 2. Nettoyage du texte ===
def clean_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r"\s+", " ", text)  # espaces multiples
    text = text.translate(str.maketrans('', '', string.punctuation))  # retirer ponctuation
    return text.strip()

df_main["description"] = df_main["description"].apply(clean_text)

In [4]:
# === 3. Encodage des labels ===
le_level1 = LabelEncoder()
df_main["level1_encoded"] = le_level1.fit_transform(df_main["level1"])

le_level2 = LabelEncoder()
df_main["level2_encoded"] = le_level2.fit_transform(df_main["level2"])

In [5]:
import joblib
joblib.dump(le_level1, "le_level1.joblib")
joblib.dump(le_level2, "le_level2.joblib")


['le_level2.joblib']

In [6]:
# === 4. Split Train/Test ===
X_train_texts, X_test_texts, y_train_l1, y_test_l1, y_train_l2, y_test_l2 = train_test_split(
    df_main["description"],
    df_main["level1_encoded"],
    df_main["level2_encoded"],
    test_size=0.2,
    stratify=df_main["level1_encoded"],
    random_state=42
)

In [7]:
# === 5. TF-IDF Vectorisation ===
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train_texts)
X_test_vect = vectorizer.transform(X_test_texts)

In [8]:
# === 6. Rééquilibrage pour Level1 ===
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_l1_resampled = ros.fit_resample(X_train_vect, y_train_l1)

In [9]:
# === 7. Modèle Level1 avec XGBoost ===
clf_l1 = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
clf_l1.fit(X_train_resampled, y_train_l1_resampled)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [10]:
# === 8. Dictionnaire hiérarchique ===
level1_to_level2 = df_hierarchy.groupby("By nature of issue (Level 1)")["By Nature (Level 2)"].apply(list).to_dict()

In [11]:
models_level2 = {}
le_level2_per_l1 = {}

for level1_label in df_main["level1"].unique():
    subset = df_main[df_main["level1"] == level1_label]
    
    X_l2 = vectorizer.transform(subset["description"])
    y_l2_text = subset["level2"]
    
    le_local = LabelEncoder()
    y_l2_local = le_local.fit_transform(y_l2_text)
    
    X_res, y_res = RandomOverSampler(random_state=42).fit_resample(X_l2, y_l2_local)
    
    clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    clf.fit(X_res, y_res)
    
    models_level2[level1_label] = clf
    le_level2_per_l1[level1_label] = le_local


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


In [12]:
def predict_hierarchical_fast(desc):
    x_input = vectorizer.transform([desc])

    # prédiction level1
    l1_pred_enc = clf_l1.predict(x_input)[0]
    l1_label = le_level1.inverse_transform([l1_pred_enc])[0]

    # prédiction level2 avec modèle préentraîné
    if l1_label not in models_level2:
        return l1_pred_enc, None

    clf_l2 = models_level2[l1_label]
    le_local = le_level2_per_l1[l1_label]

    l2_pred_local = clf_l2.predict(x_input)[0]
    l2_label_text = le_local.inverse_transform([l2_pred_local])[0]

    # retransformer vers l'encodage global
    l2_pred_global = le_level2.transform([l2_label_text])[0]

    return l1_pred_enc, l2_pred_global


In [13]:
l1_preds, l2_preds = [], []

for desc in X_test_texts:
    l1, l2 = predict_hierarchical_fast(desc)
    l1_preds.append(l1)
    l2_preds.append(l2 if l2 is not None else -1)

# Accuracy
acc_l1 = accuracy_score(y_test_l1, l1_preds)
mask_valid = np.array([p != -1 for p in l2_preds])
acc_l2 = accuracy_score(y_test_l2[mask_valid], np.array(l2_preds)[mask_valid])

print("Accuracy Level1:", round(acc_l1 * 100, 2), "%")
print("Accuracy Level2:", round(acc_l2 * 100, 2), "%")


Accuracy Level1: 72.03 %
Accuracy Level2: 69.53 %


In [14]:
def predict_description(desc):
    # Nettoyer la description comme à l'entraînement
    desc_cleaned = clean_text(desc)

    # Prédiction hiérarchique
    l1_enc, l2_enc = predict_hierarchical_fast(desc_cleaned)

    # Tenter l'inverse_transform pour Level1
    try:
        l1_label = le_level1.inverse_transform([l1_enc])[0]
    except ValueError:
        l1_label = f"Label inconnu (enc: {l1_enc})"

    # Tenter l'inverse_transform pour Level2
    if l2_enc == -1 or l2_enc is None:
        l2_label = "Pas de sous-catégorie disponible"
    else:
        try:
            l2_label = le_level2.inverse_transform([l2_enc])[0]
        except ValueError:
            l2_label = f"Label inconnu (enc: {l2_enc})"

    # Affichage console
    print("Description :", desc)
    print("Level1 (catégorie) :", l1_label)
    print("Level2 (sous-catégorie) :", l2_label)



In [15]:
print("\n Rapport Level1")
print(classification_report(y_test_l1, l1_preds, target_names=le_level1.classes_))

print("\n Rapport Level2")
used_labels = np.unique(y_test_l2[mask_valid])
print(classification_report(
    y_test_l2[mask_valid],
    np.array(l2_preds)[mask_valid],
    labels=used_labels,
    target_names=le_level2.inverse_transform(used_labels)
))



 Rapport Level1
                     precision    recall  f1-score   support

  Change Management       0.51      0.56      0.54        32
       Construction       0.50      0.48      0.49        64
Document Management       0.54      0.66      0.60        94
        Engineering       0.83      0.84      0.84       590
           Estimate       0.80      0.75      0.77        16
   Project Controls       0.65      0.62      0.63        81
 Project Governance       0.26      0.24      0.25        21
 Project Management       0.66      0.61      0.64       331
                 Qa       0.63      0.68      0.65        47
    Risk Assessment       0.83      0.84      0.84       101
              Sales       0.50      0.11      0.18         9
       Supply Chain       0.61      0.53      0.57        43
                hse       0.76      0.85      0.80        87

           accuracy                           0.72      1516
          macro avg       0.62      0.60      0.60      1516
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
import joblib
import os

# Créer un dossier pour stocker les modèles
os.makedirs("models", exist_ok=True)

# Sauvegarder vectorizer et modèle level1
joblib.dump(vectorizer, "models/vectorizer.joblib")
joblib.dump(clf_l1, "models/xgb_level1.joblib")

# Sauvegarder les encoders
joblib.dump(le_level1, "models/label_encoder_level1.joblib")
joblib.dump(le_level2, "models/label_encoder_level2.joblib")

# Sauvegarder les modèles de level2
joblib.dump(models_level2, "models/models_level2_dict.joblib")
joblib.dump(le_level2_per_l1, "models/le_level2_per_l1_dict.joblib")


['models/le_level2_per_l1_dict.joblib']