<h1>Modélisation : Prédiction des tags algorithmiques</h1>
<h2>Approches Machine Learning </h2>

<hr>

<h2> Objectif de cette section</h2>

<p>
L’objectif de cette section est de concevoir, entraîner et comparer plusieurs modèles
pour la tâche de <strong>classification multi-labels</strong> des exercices d’algorithmique.
</p>

<p>
Nous explorons successivement :
</p>

<ul>
  <li>des modèles classiques de Machine Learning basés sur le texte,</li>
  <li>des approches hybrides combinant texte et features issues du code source.</li>
</ul>


<h2>Sommaire</h2>

<ol>
  <li><a href="#1-preparation-des-donnees">Préparation des données</a></li>
  <li><a href="#2-One-hot-encoding-et-data-splitting">One-hot-encoding et data splitting</a></li>
  <li><a href="#3-vectorisation-tfidf">Vectorisation TF-IDF</a></li>
  <li><a href="#4-modeles-machine-learning">Modèles Machine Learning classiques</a></li>
  <li><a href="#5-optimisation-des-hyperparametres">Optimisation des hyperparamètres</a></li>
  <li><a href="#6-approche-hybride-texte-code">Approche hybride : texte + code source</a></li>
  <li><a href="#7-comparaison-des-performances">Comparaison des performances</a></li>
</ol>

<hr>


### Importations

In [None]:
!pip install contractions

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss, accuracy_score, classification_report,confusion_matrix
from sklearn.svm import LinearSVC, SVC
from google.colab import drive
import zipfile
import os
import json
from glob import glob
import re
import nltk
import contractions
from tqdm.auto import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from scipy.sparse import hstack, csr_matrix
from pathlib import Path
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import random


In [None]:
# -----------------------------
# Fix the global random seed
# -----------------------------
SEED = 42
np.random.seed(SEED)  # seeds NumPy's legacy global RNG
random.seed(SEED)  # seeds the standard-library `random` module
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects processes launched afterwards — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

### 1. Préparation des données

<p>
Les données sont chargées à partir de fichiers JSON.
Chaque exercice est associé à :
</p>

<ul>
  <li>une description textuelle nettoyée,</li>
  <li>le code source en Python,</li>
  <li>une liste de tags (multi-label).</li>
</ul>


In [None]:
# Monter Google Drive
drive.mount('/content/drive')

In [None]:
# Paths (adapt if needed)
ZIP_PATH = "/content/drive/MyDrive/code_classification_dataset.zip"
DATASET_DIR = "/content/data"

# Create the destination folder if it does not exist yet
os.makedirs(DATASET_DIR, exist_ok=True)

# Extract the archive into the destination folder
with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
    zip_ref.extractall(DATASET_DIR)

# Build the extracted folder path from DATASET_DIR instead of repeating the
# literal path, so that adapting DATASET_DIR above keeps this check working.
print(
    "Fichiers extraits :",
    len(os.listdir(os.path.join(DATASET_DIR, "code_classification_dataset"))),
)


In [None]:
# Fields read from every JSON file; also the column order of the resulting frame.
# Despite the name, the list also contains the "tags" list field.
TEXT_FIELDS = [
    "prob_desc_description",
    "prob_desc_input_spec",
    "prob_desc_output_spec",
    "prob_desc_notes",
    "source_code",
    "tags",
]


def load_dataset(dataset_dir):
    """Load every ``*.json`` file found directly in ``dataset_dir``.

    Each file becomes one row holding the fields listed in ``TEXT_FIELDS``.
    Missing, ``None`` or otherwise falsy values are replaced by ``""``; a
    ``tags`` value that is not a list is replaced by ``[]``.

    Args:
        dataset_dir: Directory containing one JSON document per exercise.

    Returns:
        A ``pandas.DataFrame`` whose columns are exactly ``TEXT_FIELDS`` (in
        that order), with one row per file, ordered by file path. The frame
        is empty, but still has those columns, when no file is found.
    """
    # glob() yields files in arbitrary filesystem order. Sorting makes the row
    # order, and therefore the seeded train/validation split, reproducible.
    json_files = sorted(glob(os.path.join(dataset_dir, "*.json")))
    print(f"{len(json_files)} fichiers JSON trouv√©s")

    records = []
    for path in json_files:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # A fresh dict per file; falsy values (None, "", []) collapse to "".
        record = {field: data.get(field, "") or "" for field in TEXT_FIELDS}

        # Make sure 'tags' is always a list.
        if not isinstance(record["tags"], list):
            record["tags"] = []

        records.append(record)

    # Passing `columns` fixes the column order and keeps the expected columns
    # even when `records` is empty (otherwise later lookups raise KeyError).
    df = pd.DataFrame(records, columns=TEXT_FIELDS)

    # NaN can still appear (json accepts the NaN literal and NaN is truthy).
    return df.fillna("")


In [None]:
df = load_dataset("/content/data/code_classification_dataset")

In [None]:
df.describe()

In [None]:
# The eight tags the models are trained to predict.
TARGET_TAGS = [
    "math",
    "graphs",
    "strings",
    "number theory",
    "trees",
    "geometry",
    "games",
    "probabilities",
]


def filter_target_tags(tags):
    """Return the elements of ``tags`` that belong to ``TARGET_TAGS``.

    The input order and any duplicates are preserved.
    """
    return list(filter(TARGET_TAGS.__contains__, tags))

# Restrict every exercise's tag list to the target tags.
df["tags"] = df["tags"].apply(filter_target_tags)

# Drop the exercises left without any target tag. The explicit copy makes `df`
# an independent frame rather than a slice of the unfiltered one, so the
# columns assigned to it later do not trigger SettingWithCopyWarning.
df = df[df["tags"].map(len) > 0].copy()


In [None]:
df.describe()

In [None]:
df.head(2)

In [None]:
# Colonnes √† concat√©ner
cols_to_concat = [
    "prob_desc_description",
    "prob_desc_input_spec",
    "prob_desc_output_spec",
    "prob_desc_notes"
]

# Cr√©er une nouvelle colonne 'description' en concat√©nant les colonnes avec un espace
df['description'] = df[cols_to_concat].agg(' '.join, axis=1)


In [None]:
df["description"].iloc[0]

In [None]:
df.head(2)

In [None]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

class DataProcessor:
    """Callable that normalises an English text before vectorisation.

    Calling the instance on a string expands contractions, lower-cases the
    text, drops every character that is not an ASCII letter or whitespace,
    removes English stop words and stems the remaining tokens with the
    Snowball stemmer. The result is the tokens joined by single spaces.
    """

    def __init__(self):
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.stemmer = nltk.stem.SnowballStemmer("english")

    def __call__(self, text: str) -> str:
        # Contractions must be expanded while the apostrophes are still there:
        # once special characters are stripped, "don't" has become "don t"
        # and can no longer be recognised.
        text = text.replace("\u2019", "'")  # typographic apostrophe -> ASCII
        text = contractions.fix(text)  # e.g. don't -> do not

        text = text.lower()  # Lowercase
        text = text.replace('-', ' ')  # Replace hyphens with spaces
        text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)  # Remove special characters
        text = re.sub(r"\d+", " ", text)  # Remove digits
        text = re.sub(r"\s+", " ", text)  # Collapse runs of whitespace

        tokens = nltk.word_tokenize(text)
        tokens = [word for word in tokens if word not in self.stopwords]
        tokens = [self.stemmer.stem(word) for word in tokens]

        return " ".join(tokens)


In [None]:
tqdm.pandas()

In [None]:
processor = DataProcessor()
df['description_clean'] = df['description'].progress_apply(processor)

In [None]:
df.head(2)

In [None]:
df.iloc[0]["description_clean"]

In [None]:
# df = pd.concat(
#     [
#         df[["description_clean", "source_code"]].reset_index(drop=True),
#         pd.DataFrame(
#             y,
#             columns=mlb.classes_
#         )
#     ],
#     axis=1
# )

### 2. One-hot-encoding et data splitting
<p>
Les tags sont binarisés à l’aide d’un <code>MultiLabelBinarizer</code>,
et les données sont séparées en ensembles d’entraînement et de validation (80/20).
</p>

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df["tags"])


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    df["description_clean"].tolist(),
    y,
    test_size=0.2,
    random_state=42
)


### 3. Vectorisation TF-IDF

<p>
Le texte des descriptions est vectorisé à l’aide de la méthode <strong>TF-IDF</strong>,
avec les paramètres suivants :
</p>

<ul>
  <li>maximum de 5000 features,</li>
  <li>n-grammes de taille 1 et 2.</li>
</ul>

<p>
Cette représentation permet de capturer à la fois le vocabulaire spécifique
et certaines expressions caractéristiques des problèmes algorithmiques.
</p>


In [None]:
# -----------------------------
# 1. TF-IDF
# -----------------------------
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

### 4. Modèles Machine Learning classiques

<p>
Plusieurs classificateurs de base sont évalués, chacun intégré dans différentes
stratégies multi-label :
</p>

<ul>
  <li><strong>One-vs-Rest (OvR)</strong></li>
  <li><strong>MultiOutputClassifier</strong></li>
  <li><strong>Classifier Chains</strong></li>
</ul>

<p>
Les classificateurs de base testés sont :
</p>

<ul>
  <li>Logistic Regression</li>
  <li>Random Forest</li>
  <li>Support Vector Machines (SVC, LinearSVC)</li>
</ul>

<p>
Les performances sont évaluées à l’aide de métriques adaptées au multi-label,
notamment le <strong>Micro F1-score</strong>, le <strong>Macro F1-score</strong>, la <strong>Hamming loss</strong> (<code>hamming_loss</code>) et la <strong>subset accuracy</strong>.
</p>


In [None]:
# -----------------------------
# 2. Define the base classifiers
# -----------------------------

# One entry per base estimator, keyed by the name used in the model labels below.
# Every estimator except RandomForest is configured with class_weight="balanced".
base_classifiers = {
    "LogisticRegression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ),
    "SVC": SVC(
        kernel="linear",
        probability=True,
        class_weight="balanced"
    ),
    "LinearSVC": LinearSVC(
        class_weight="balanced"
    )
}

# -----------------------------
# 3. Define the multi-label models
# -----------------------------
# Each base estimator is wrapped in three multi-label strategies, giving
# len(base_classifiers) * 3 models keyed "<strategy>_<base name>".
# NOTE(review): the same `clf` instance is handed to all three wrappers;
# presumably each wrapper clones it when fitted — confirm.
models = {}
for name, clf in base_classifiers.items():
    models[f"OVR_{name}"] = OneVsRestClassifier(clf)
    models[f"MultiOutput_{name}"] = MultiOutputClassifier(clf)
    models[f"Chain_{name}"] = ClassifierChain(clf, order='random', random_state=42)
print(models.keys())

In [None]:
# -----------------------------
# Helper: fit a model, then predict
# -----------------------------
def train_and_predict(model, X_train, y_train, X_val):
    """Fit ``model`` on ``(X_train, y_train)`` and return its predictions for ``X_val``.

    The model is fitted in place; only the predictions are returned.
    """
    model.fit(X_train, y_train)
    return model.predict(X_val)

In [None]:
def get_target_names(y_true, mlb=None):
    """Return the label names to display in the classification report.

    Resolution order:
      1. ``y_true`` exposes ``columns`` (e.g. a DataFrame): its column names;
      2. ``y_true`` is a NumPy array and ``mlb`` is given: ``mlb.classes_``;
      3. ``y_true`` is a NumPy array without ``mlb``: ``tag_0``, ``tag_1``, ...

    Raises:
        ValueError: if ``y_true`` has no ``columns`` and is not a NumPy array.
    """
    if hasattr(y_true, "columns"):
        return y_true.columns.tolist()

    if not isinstance(y_true, np.ndarray):
        raise ValueError("Impossible de d√©terminer les noms des labels. Fournissez y_true ou mlb correctement.")

    if mlb is None:
        n_labels = y_true.shape[1]
        return [f"tag_{i}" for i in range(n_labels)]

    return mlb.classes_.tolist()


In [None]:
# -----------------------------
# Helper: compute and print the multi-label metrics
# -----------------------------
def evaluate_multilabel(y_true, y_pred, mlb=None):
    """Compute, print and return multi-label metrics for a set of predictions.

    Prints the aggregate scores followed by the per-tag classification report.

    Args:
        y_true: Binary indicator matrix of the true labels (array or DataFrame).
        y_pred: Binary indicator matrix of the predicted labels.
        mlb: Optional fitted binarizer, used to name the labels when
            ``y_true`` is a NumPy array.

    Returns:
        dict with the keys ``f1_micro``, ``f1_macro``, ``hamming_loss``,
        ``subset_accuracy``, ``precision_micro`` and ``recall_micro``.
    """
    # zero_division=0 matches the classification report below: a label that is
    # never predicted scores 0 without emitting an UndefinedMetricWarning.
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    hamming = hamming_loss(y_true, y_pred)
    # On indicator matrices accuracy_score is the exact-match (subset) accuracy.
    subset_acc = accuracy_score(y_true, y_pred)
    prec_micro = precision_score(y_true, y_pred, average='micro', zero_division=0)
    rec_micro = recall_score(y_true, y_pred, average='micro', zero_division=0)

    print(f"\n‚úÖ Micro F1: {f1_micro:.3f}, Macro F1: {f1_macro:.3f}")
    print(f"Hamming Loss: {hamming:.3f}, Subset Accuracy: {subset_acc:.3f}")
    print(f"Micro Precision: {prec_micro:.3f}, Micro Recall: {rec_micro:.3f}")

    # Resolve the label names shown in the report
    target_names = get_target_names(y_true, mlb)

    print("\n - Classification report par tag :")
    print(classification_report(y_true, y_pred, target_names=target_names, zero_division=0))

    return {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "hamming_loss": hamming,
        "subset_accuracy": subset_acc,
        # Previously computed and printed but not returned.
        "precision_micro": prec_micro,
        "recall_micro": rec_micro,
    }


In [None]:
# -----------------------------
# Helper: persist a model (and nothing else)
# -----------------------------
def save_model(model, save_dir, model_name="best_model.joblib"):
    """Dump ``model`` with joblib to ``save_dir / model_name``.

    ``save_dir`` is created (including parents) when missing. The path of the
    written file is printed and returned.
    """
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    model_path = target_dir / model_name
    joblib.dump(model, model_path)
    print(f"‚úÖ Mod√®le sauvegard√© ici : {model_path}")

    return model_path


In [None]:
# -----------------------------
# Train and evaluate every model
# -----------------------------
results_1 = []
trained_models = {}

for model_name, model in models.items():
    # Fit on the TF-IDF training matrix and predict the validation set
    print(f"\nüöÄ Training {model_name}...")
    y_pred = train_and_predict(model, X_train_tfidf, y_train, X_val_tfidf)

    # Score the predictions
    metrics = evaluate_multilabel(y_val, y_pred, mlb=mlb)

    # One result row per model: its name followed by the four tracked metrics
    results_1.append({
        "model": model_name,
        **{
            key: metrics[key]
            for key in ("f1_micro", "f1_macro", "hamming_loss", "subset_accuracy")
        },
    })

    # Keep the fitted model
    trained_models[model_name] = model


In [None]:
# -----------------------------
# 5. R√©sultats tri√©s par Macro F1
# -----------------------------
results_df_1 = pd.DataFrame(results_1).sort_values(by='f1_macro', ascending=False)
print("\n=== R√©sultats finaux tri√©s par Macro F1 ===")
results_df_1


In [None]:
# -----------------------------
# Identifier le meilleur mod√®le (Macro F1)
# -----------------------------
best_model_info_1 = results_df_1.iloc[0]           # premi√®re ligne = meilleur mod√®le
best_model_name_1 = best_model_info_1["model"]    # nom du mod√®le
best_model_1 = models[best_model_name_1]          # r√©cup√©rer l'objet mod√®le

print(f"\nüèÜ Meilleur mod√®le : {best_model_name_1}")
print(f"Macro F1 : {best_model_info_1['f1_macro']:.3f}, Micro F1 : {best_model_info_1['f1_micro']:.3f}")

print("Hyperparam√®tres du meilleur mod√®le :")
best_params_1 = best_model_1.get_params()
print(best_params_1)

In [None]:
# -----------------------------
# Sauvegarder le mod√®le
# -----------------------------
save_dir_1 = Path("/content/drive/MyDrive/codeforces_classifier/best_classifier")
save_model(best_model_1, save_dir_1,best_model_name_1)


### 5. Optimisation des hyperparamètres

<p>
Une recherche par grille (<code>GridSearchCV</code>) est réalisée
sur le modèle le plus performant afin d’optimiser les hyperparamètres,
en utilisant le <strong>Micro F1-score</strong> comme métrique principale.
</p>

<p>
Cette étape permet d’améliorer la généralisation du modèle
tout en prenant en compte le déséquilibre entre les classes.
</p>


In [None]:
base_model = OneVsRestClassifier(
    LinearSVC(class_weight="balanced", random_state=42)
)


param_grid = {
    "estimator__C": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    "estimator__loss": ["hinge", "squared_hinge"],
    "estimator__max_iter": [1000, 3000],
    "estimator__tol": [1e-4, 1e-3]
}


grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring="f1_micro",
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_tfidf, y_train)
print("‚úÖ Meilleurs param√®tres :")
best_params = grid_search.best_params_
print(best_params)

print("\n‚úÖ Meilleur score CV (F1 micro) :")
print(grid_search.best_score_)

In [None]:
best_model_2 = grid_search.best_estimator_

y_pred = best_model_2.predict(X_val_tfidf)

# √âvaluer le mod√®le et r√©cup√©rer les m√©triques
metrics_2 = evaluate_multilabel(y_val, y_pred, mlb=mlb)

In [None]:
# -----------------------------
# Sauvegarder le mod√®le
# -----------------------------
save_dir_2 = Path("/content/drive/MyDrive/codeforces_classifier/grid_search")

save_model(best_model_2, save_dir_2, model_name="best_grid_search_ovr_linear_svc.joblib")



In [None]:
best_params

In [None]:
# -----------------------------
# Main helper: OVR_LinearSVC pipeline
# -----------------------------
def run_ovr_linear_svc_pipeline(X_train, y_train, X_val, y_val, best_params, save_dir, model_name="best_ovr_linear_svc.joblib", mlb=None):
    """Train, evaluate and save a One-vs-Rest ``LinearSVC``.

    Args:
        X_train, y_train: Training features and label indicator matrix.
        X_val, y_val: Validation features and label indicator matrix.
        best_params: Hyperparameters for ``LinearSVC``. Keys may carry the
            ``estimator__`` prefix produced by a grid search on the
            One-vs-Rest wrapper; that leading prefix is removed.
        save_dir: Directory in which the fitted model is saved.
        model_name: File name of the saved model.
        mlb: Optional fitted binarizer used to name the labels in the report
            when ``y_val`` is a NumPy array.

    Returns:
        (best_model, metrics): the fitted model and its metrics dict.
    """
    prefix = "estimator__"

    # Defaults first, so explicit entries in `best_params` override them instead
    # of raising a duplicate-keyword TypeError. random_state=42 matches the
    # estimator used in the grid search and makes the fit reproducible.
    svc_params = {"class_weight": "balanced", "random_state": 42}
    for key, value in best_params.items():
        # Strip only a leading prefix, never an occurrence inside the name.
        name = key[len(prefix):] if key.startswith(prefix) else key
        svc_params[name] = value

    best_model = OneVsRestClassifier(LinearSVC(**svc_params))

    # Train and predict
    print(f"\nüöÄ Training OVR_LinearSVC with params: {best_params}")
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_val)

    # Evaluate
    metrics = evaluate_multilabel(y_val, y_pred, mlb)

    # Save the fitted model
    save_model(best_model, Path(save_dir), model_name)

    return best_model, metrics

### 6. Approche hybride : texte + code source

<p>
En complément du texte, des features binaires sont extraites
directement du code source afin de capturer des indices algorithmiques explicites :
</p>

<ul>
  <li>présence de DFS/BFS,</li>
  <li>utilisation du modulo ou de la récursion,</li>
  <li>structures de graphes ou d’arbres,</li>
  <li>éléments liés aux probabilités ou aux jeux.</li>
</ul>

<p>
Les features issues du code sont concaténées aux vecteurs TF-IDF
afin de former une représentation hybride plus expressive.
</p>


In [None]:
# def extract_code_features(code):
#     """
#     Extraire des features pertinentes pour pr√©dire les tags algorithmique.
#     """
#     features = {}

#     # -----------------------------
#     # 1Ô∏è‚É£ Structures math√©matiques
#     # -----------------------------
#     features['has_mod'] = int('%' in code)                    # modulo
#     features['has_pow'] = int('**' in code or 'pow(' in code) # exponentiation
#     features['has_factorial'] = int('fact' in code)           # factorielle
#     features['has_comb'] = int('comb' in code)                # combinaisons
#     features['has_math_import'] = int('import math' in code)

#     # -----------------------------
#     # Graphes / parcours
#     # -----------------------------
#     features['has_dfs'] = int('dfs' in code.lower())          # DFS
#     features['has_bfs'] = int('bfs' in code.lower())          # BFS
#     features['has_edges'] = int('edges' in code.lower())
#     features['has_adj'] = int('adj' in code.lower())          # adjacency
#     features['has_graph_list'] = int('graph' in code.lower())

#     # -----------------------------
#     # R√©cursion / structures arborescentes
#     # -----------------------------
#     features['has_recursion'] = int('def' in code and code.count('def') > 1)
#     features['has_tree'] = int('tree' in code.lower())

#     # -----------------------------
#     # Cha√Ænes et manipulation de strings
#     # -----------------------------
#     features['has_string'] = int('str(' in code or '"' in code or "'" in code)
#     # features['has_split'] = int('.split(' in code)
#     features['has_join'] = int('.join(' in code)

#     # -----------------------------
#     # Jeux / probabilit√©s
#     # -----------------------------
#     features['has_random'] = int('random' in code)           # tirage al√©atoire
#     features['has_probability'] = int('prob' in code.lower() or 'chance' in code.lower())

#     # -----------------------------
#     # Boucles / it√©rations
#     # -----------------------------
#     # features['has_for'] = int('for ' in code)
#     features['has_while'] = int('while ' in code)
#     # features['has_nested_loops'] = int(code.count('for ') + code.count('while ') > 1)

#     # -----------------------------
#     # Listes / tableaux
#     # -----------------------------
#     # features['has_list'] = int('[' in code and ']' in code)
#     features['has_append'] = int('.append(' in code)

#     return list(features.values())


In [None]:
import re

def extract_code_features(code: str):
    """Return 41 binary keyword features extracted from a source-code string.

    Every feature is a 0/1 flag based on simple substring or regex matching;
    these are heuristics, not a parse of the program. The order of the
    returned list is fixed (math/number theory, graphs, trees, strings,
    probabilities, games, geometry) so that vectors from different calls
    line up column by column.

    Args:
        code: Source code of a solution. ``None`` or ``""`` yields all zeros.

    Returns:
        list[int] of length 41.
    """
    code = code or ""
    code_l = code.lower()

    def has(*needles):
        """Return 1 when any needle occurs in the lower-cased source."""
        return int(any(needle in code_l for needle in needles))

    def has_exact(*needles):
        """Return 1 when any needle occurs in the source as written."""
        return int(any(needle in code for needle in needles))

    features = {
        # ---------------- math / number theory ----------------
        "has_mod": has_exact("%"),
        "has_pow": has_exact("**", "pow("),
        "has_gcd": has("gcd"),
        "has_lcm": has("lcm"),
        "has_prime": has("prime"),
        "has_factorial": has("fact"),
        "has_math_import": has("import math"),
        "has_bit_ops": has_exact("<<", ">>", "&", "|", "^"),
        # ---------------- graphs ----------------
        "has_dfs": has("dfs"),
        "has_bfs": has("bfs"),
        "has_adj_list": has("adj", "neighbors"),
        "has_edges": has("edges"),
        "has_queue": has("deque", "queue"),
        "has_stack": has("stack"),
        "has_visited": has("visited"),
        # ---------------- trees ----------------
        "has_tree": has("tree"),
        "has_node": has("node"),
        "has_left_right": int("left" in code_l and "right" in code_l),
        # Proxy for recursion: more than one function definition. Match the
        # `def` keyword itself so identifiers such as "defaultdict" are not
        # counted as definitions.
        "has_recursion": int(len(re.findall(r"\bdef\s", code)) > 1),
        "has_depth": has("depth", "height"),
        # ---------------- strings ----------------
        "has_string_literal": int(bool(re.search(r"['\"]", code))),
        "has_split": has_exact(".split("),
        "has_join": has_exact(".join("),
        "has_replace": has_exact(".replace("),
        "has_substring": has("substr", "substring"),
        "has_ord_chr": has_exact("ord(", "chr("),
        # ---------------- probabilities ----------------
        "has_probability": has("prob", "chance"),
        "has_fraction": has("fraction"),
        # A single "/" that is not part of "//": true division is detected
        # even when the program also uses floor division elsewhere.
        "has_float_div": int(bool(re.search(r"(?<!/)/(?!/)", code))),
        "has_expectation": has("expect"),
        # ---------------- games ----------------
        "has_random": has("random"),
        "has_turn": has("turn"),
        "has_player": has("player"),
        "has_score": has("score"),
        "has_game_dp": int("dp" in code_l and ("win" in code_l or "lose" in code_l)),
        # ---------------- geometry ----------------
        "has_point": has("point"),
        "has_distance": has("dist"),
        "has_angle": has("angle"),
        "has_cross_product": has("cross"),
        "has_dot_product": has("dot"),
        "has_hypot": has("hypot"),
    }

    # Dicts preserve insertion order, so the vector layout is the order above.
    return list(features.values())


In [None]:
y

In [None]:
# -----------------------------
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df["tags"])


# -----------------------------
# Split train/val
# -----------------------------
X_train_text, X_val_text, y_train, y_val, train_idx, val_idx = train_test_split(
    df["description_clean"].tolist(),
    y,
    np.arange(len(df)),  # indices positionnels
    test_size=0.2,
    random_state=42
)

In [None]:
# TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_val_tfidf   = vectorizer.transform(X_val_text)

# Extraire features code
X_train_code_features = np.array([extract_code_features(c) for c in df.iloc[train_idx]["source_code"]])
X_val_code_features   = np.array([extract_code_features(c) for c in df.iloc[val_idx]["source_code"]])

# Convertir en sparse matrix
X_train_code_sparse = csr_matrix(X_train_code_features)
X_val_code_sparse   = csr_matrix(X_val_code_features)

# Combiner TF-IDF + code features
X_train_combined = hstack([X_train_tfidf, X_train_code_sparse])
X_val_combined   = hstack([X_val_tfidf, X_val_code_sparse])

print("‚úÖ Shape X_train_combined :", X_train_combined.shape)
print("‚úÖ Shape X_val_combined   :", X_val_combined.shape)


In [None]:
save_dir_3 = "/content/drive/MyDrive/codeforces_classifier/code_source"

best_model_3, metrics_3 = run_ovr_linear_svc_pipeline(
    X_train_combined,
    y_train,
    X_val_combined,
    y_val,
    best_params,
    save_dir_3,
    model_name="best_grid_search_ovr_linear_svc.joblib",
    mlb=mlb
)


### SMOTE

In [None]:
df.head(2)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df["tags"])


In [None]:
dtrain = pd.concat(
    [
        df[["description_clean"]].reset_index(drop=True),
        pd.DataFrame(
            y,
            columns=mlb.classes_
        )
    ],
    axis=1
)

In [None]:
dtrain.head(1)

In [None]:
# X = dtrain['description_clean']
y = dtrain[TARGET_TAGS]

In [None]:
y

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(dtrain['description_clean'])

In [None]:
X.shape

In [None]:
# Split dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def irpl(y : pd.DataFrame) -> pd.Series:
    """Imbalance ratio per label.

    For every column of the indicator frame ``y``, returns the count of the
    most frequent label divided by that column's own count.
    """
    per_label_count = y.sum(axis=0)
    most_frequent_count = per_label_count.max()
    return most_frequent_count / per_label_count

In [None]:
irpls = irpl(y_train)
print(irpls)

In [None]:
tail_labels = irpls.index[irpls > irpls.mean()]
print(tail_labels)

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

class MLSMOTE:
    """SMOTE-style over-sampler for multi-label data.

    "Tail" labels are the labels whose imbalance ratio (count of the most
    frequent label divided by the label's own count) exceeds the mean ratio.
    For each tail label, synthetic samples are created by interpolating
    between a reference sample carrying that label and one of its nearest
    neighbours; a synthetic sample receives every label present among the
    reference sample's neighbours.

    Randomness comes from NumPy's global RNG (``np.random``).

    Args:
        n_neighbors: Number of neighbours queried per reference sample. The
            first neighbour is expected to be the sample itself, so at least
            2 are needed.
        alpha_scale: Exponent applied to the normalised imbalance ratios when
            computing how many synthetic samples each tail label receives.
    """

    def __init__(self,
        n_neighbors : int = 5,
        alpha_scale : float = 0.25,
    ) -> None:

        # Populated by fit_resample()
        self.irpls = None
        self.tail_labels = None
        self.nn = None
        self.labels_count = None

        self.n_neighbors = n_neighbors
        self.alpha_scale = alpha_scale

    def _irpl(self, y : pd.DataFrame) -> pd.Series:
        """Imbalance ratio per label: max label count / each label's count."""
        labels_count = y.sum(axis=0)
        return labels_count.max() / labels_count

    def _tail_labels(self, y : pd.DataFrame) -> list[str]:
        """Labels whose imbalance ratio is above the mean ratio."""
        irpls = self._irpl(y)
        return irpls.index[irpls > irpls.mean()].to_list()

    def _labels_count(self, n_samples : int) -> dict:
        """Number of synthetic samples to generate for each tail label.

        Uses ``self.irpls`` and ``self.tail_labels``. ``n_samples`` is the
        size of the data set being resampled; it is passed explicitly so the
        result never depends on a variable of the enclosing notebook.
        """
        share = self.irpls / self.irpls.sum()
        share = share ** -self.alpha_scale
        share = share / share.sum()
        counts = (share * n_samples).astype(int)
        return counts[self.tail_labels].to_dict()

    def fit_resample(self, X : pd.DataFrame, y : pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Return ``X`` and ``y`` extended with synthetic tail-label samples.

        Args:
            X: Feature frame, one row per sample.
            y: Binary label indicator frame with rows in the same order as ``X``.

        Returns:
            (X_resampled, y_resampled): the original rows followed by the
            synthetic ones. Synthetic rows are indexed from 0 for each tail
            label, so the returned index is generally not unique.
        """
        ### Compute the IRPL for each label
        self.irpls = self._irpl(y)

        ### Get the tail labels
        tail_labels = self._tail_labels(y)
        self.tail_labels = tail_labels

        ### Number of synthetic samples to generate for each tail label
        labels_count = self._labels_count(len(y))
        self.labels_count = labels_count

        ### Restrict X and y to the instances carrying at least one tail label
        has_tail = (y[tail_labels].sum(axis=1) > 0).to_numpy()
        subset_X = X.loc[has_tail].reset_index(drop=True)
        subset_y = y.loc[has_tail].reset_index(drop=True)

        ### Neighbours (row positions in X) of every sample of the subset
        nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn = nn.fit(X)
        neighbors = nn.kneighbors(subset_X, return_distance=False)

        ### Generate synthetic samples
        X_synth = [X]
        y_synth = [y]

        for tail_label, count in labels_count.items():

            # 1- Pick random reference samples among those carrying the label
            candidates = np.flatnonzero(subset_y[tail_label].to_numpy() == 1)
            if count <= 0 or candidates.size == 0:
                # Nothing to generate, or nothing to generate it from.
                continue
            reference_indices = np.random.choice(candidates, count)

            # 2- Pick one random neighbour per reference sample; column 0 is
            #    skipped because it is the nearest point (the sample itself).
            neighbor_columns = np.random.randint(1, self.n_neighbors, count)
            neighbor_rows = neighbors[reference_indices, neighbor_columns]

            # 3- Vector going from each reference sample to its neighbour
            reference_X = subset_X.iloc[reference_indices].values
            gap = X.iloc[neighbor_rows].values - reference_X

            # 4- Interpolate: the new point lies on the segment between the
            #    reference sample (ratio 0) and its neighbour (ratio 1).
            ratio = np.random.rand(count, 1)
            X_new = pd.DataFrame(reference_X + ratio * gap, columns=subset_X.columns)

            # 5- Labels: union of the labels of all neighbours of the reference
            y_new = y.values[neighbors[reference_indices, :].flatten()]
            y_new = y_new.reshape(count, self.n_neighbors, subset_y.shape[1])
            y_new = (y_new.sum(axis=1) > 0).astype(int)
            y_new = pd.DataFrame(y_new, columns=subset_y.columns)

            # 6- Append the synthetic samples to the original dataset
            X_synth.append(X_new)
            y_synth.append(y_new)

        return pd.concat(X_synth), pd.concat(y_synth)

In [None]:
mlsmote = MLSMOTE(alpha_scale=0.1)

X_train_dense = X_train.toarray()  # Convert sparse to dense
X_train_resampled, y_train_resampled = mlsmote.fit_resample(pd.DataFrame(X_train_dense, index=y_train.index), y_train)


In [None]:
X_val_dense = X_val.toarray()

In [None]:
irpls = irpl(y_train_resampled)
print(irpls)

In [None]:
save_dir_4 = "/content/drive/MyDrive/codeforces_classifier/smote_model"

best_model_4, metrics_4 = run_ovr_linear_svc_pipeline(
    X_train_resampled,
    y_train_resampled,
    X_val_dense,
    y_val,
    best_params,
    save_dir_4,
    model_name="ovr_linear_svc.joblib",
)


<h2 id="7-comparaison-des-performances">7. Comparaison des performances</h2>

<p>
Les modèles sont comparés selon plusieurs critères :
</p>

<ul>
  <li>Micro F1-score (performance globale),</li>
  <li>Macro F1-score (équité entre les tags),</li>
  <li>Hamming Loss,</li>
  <li>Performances par tag.</li>
</ul>

<p>
Cette analyse permet d’identifier les forces et limites de chaque approche,
ainsi que les tags les mieux prédits.
</p>


In [None]:
def train_evaluate_model(X_train, y_train, X_val, y_val, best_params, mlb=None):
    """Train a One-vs-Rest ``LinearSVC`` and evaluate it on the validation set.

    Args:
        X_train, y_train: Training features and label indicator matrix.
        X_val, y_val: Validation features and label indicator matrix.
        best_params: Hyperparameters for ``LinearSVC``; a leading
            ``estimator__`` prefix (grid-search naming) is removed.
        mlb: Optional fitted binarizer used to name the labels in the report
            when ``y_val`` is a NumPy array.

    Returns:
        (model, y_pred, metrics): the fitted model, its validation
        predictions and the metrics dict.
    """
    prefix = "estimator__"

    # Defaults first, so explicit entries in `best_params` override them instead
    # of raising a duplicate-keyword TypeError. random_state=42 matches the
    # estimator used in the grid search and makes the fit reproducible.
    svc_params = {"class_weight": "balanced", "random_state": 42}
    for key, value in best_params.items():
        # Strip only a leading prefix, never an occurrence inside the name.
        name = key[len(prefix):] if key.startswith(prefix) else key
        svc_params[name] = value

    model = OneVsRestClassifier(LinearSVC(**svc_params))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    metrics = evaluate_multilabel(y_val, y_pred, mlb)

    return model, y_pred, metrics


In [None]:
def prepare_dataset(df, text_column="description_clean", code_column=None, tags_column="tags",
                    tfidf_max_features=5000, tfidf_ngram_range=(1,2), test_size=0.2, random_state=42):
    """
    Build train/validation features and labels for multi-label classification.

    Args:
        df: DataFrame holding the text, the tags and (optionally) the code.
        text_column: Column with the text that is vectorised with TF-IDF.
        code_column: Column passed row by row to extract_code_features, or
            None to skip the code features.
        tags_column: Column with the collection of tags of each row.
        tfidf_max_features: ``max_features`` of the TfidfVectorizer.
        tfidf_ngram_range: ``ngram_range`` of the TfidfVectorizer.
        test_size: Validation share given to train_test_split.
        random_state: Seed given to train_test_split.

    Returns:
        Tuple (X_train_tfidf, X_val_tfidf, X_train_combined, X_val_combined,
        y_train, y_val, mlb, train_idx, val_idx). When code_column is None the
        "combined" matrices are the TF-IDF matrices themselves.
    """
    # -----------------------------
    # Binarise the tags
    # -----------------------------
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df[tags_column])

    # -----------------------------
    # Train/validation split (on positional row indices)
    # -----------------------------
    train_idx, val_idx = train_test_split(np.arange(len(df)), test_size=test_size, random_state=random_state)

    # Select the column once and take rows positionally, rather than building
    # a full row Series per sample with df.iloc[i][column].
    text = df[text_column]
    X_train_text = text.iloc[train_idx].tolist()
    X_val_text   = text.iloc[val_idx].tolist()
    y_train = y[train_idx]
    y_val   = y[val_idx]

    # -----------------------------
    # TF-IDF on the text (fitted on the training split only)
    # -----------------------------
    vectorizer = TfidfVectorizer(max_features=tfidf_max_features, ngram_range=tfidf_ngram_range)
    X_train_tfidf = vectorizer.fit_transform(X_train_text)
    X_val_tfidf   = vectorizer.transform(X_val_text)

    # -----------------------------
    # Optional: source-code features
    # -----------------------------
    if code_column is not None:
        # Imported here so the function does not rely on a notebook-level
        # import of the sparse helpers.
        from scipy.sparse import csr_matrix, hstack

        code = df[code_column]
        X_train_code_features = np.array([extract_code_features(src) for src in code.iloc[train_idx]])
        X_val_code_features   = np.array([extract_code_features(src) for src in code.iloc[val_idx]])

        X_train_code_sparse = csr_matrix(X_train_code_features)
        X_val_code_sparse   = csr_matrix(X_val_code_features)

        # Combine TF-IDF + code features. CSR is requested explicitly because
        # hstack defaults to COO, which does not support row indexing.
        X_train_combined = hstack([X_train_tfidf, X_train_code_sparse], format="csr")
        X_val_combined   = hstack([X_val_tfidf, X_val_code_sparse], format="csr")
    else:
        X_train_combined = X_train_tfidf
        X_val_combined   = X_val_tfidf

    print("‚úÖ Shape X_train_tfidf :", X_train_tfidf.shape)
    print("‚úÖ Shape X_val_tfidf   :", X_val_tfidf.shape)
    print("‚úÖ Shape X_train_combined :", X_train_combined.shape)
    print("‚úÖ Shape X_val_combined   :", X_val_combined.shape)

    return X_train_tfidf, X_val_tfidf, X_train_combined, X_val_combined, y_train, y_val, mlb, train_idx, val_idx


In [None]:
# Prepare the data (TF-IDF only, and TF-IDF + code features)
X_train_tfidf, X_val_tfidf, X_train_combined, X_val_combined, y_train, y_val, mlb, train_idx, val_idx = prepare_dataset(
    df,
    text_column="description_clean",
    code_column="source_code",  # None to skip the code features
    tags_column="tags",
    tfidf_max_features=5000,
    tfidf_ngram_range=(1,2),
    test_size=0.2,
    random_state=42
)


In [None]:
# Dense TF-IDF matrix (text features only), wrapped in a DataFrame for MLSMOTE.
X_train_dense = X_train_tfidf.toarray()
X_train_frame = pd.DataFrame(X_train_dense)

# Label frame whose columns carry the tag names.
y_df = pd.DataFrame(y_train, columns=mlb.classes_)

# Resample
mlsmote = MLSMOTE(alpha_scale=0.1)
X_train_resampled, y_train_resampled = mlsmote.fit_resample(X_train_frame, y_df)

# Dense validation features; convert only when they are not already an ndarray.
if isinstance(X_val_tfidf, np.ndarray):
    X_val_dense = X_val_tfidf
else:
    X_val_dense = X_val_tfidf.toarray()

In [None]:
# Train the three variants first, in the same order as before.

# Descriptions only (TF-IDF features)
model_desc, y_pred_desc, metrics_desc = train_evaluate_model(
    X_train_tfidf, y_train, X_val_tfidf, y_val, best_params, mlb
)

# Descriptions + source-code features
model_combined, y_pred_combined, metrics_combined = train_evaluate_model(
    X_train_combined, y_train, X_val_combined, y_val, best_params, mlb
)

# Training set resampled with MLSMOTE
model_smote, y_pred_smote, metrics_smote = train_evaluate_model(
    X_train_resampled, y_train_resampled, X_val_dense, y_val, best_params, mlb
)

# One record per approach: its name followed by its metrics.
results_comparison = [
    {"approach": "Descriptions only", **metrics_desc},
    {"approach": "Descriptions + code features", **metrics_combined},
    {"approach": "Resampled (SMOTE)", **metrics_smote},
]


In [None]:
# Index the records by approach name and rank them by descending macro F1.
results_df = (
    pd.DataFrame(results_comparison)
    .set_index("approach")
    .sort_values(by="f1_macro", ascending=False)
)

print("\n=== Comparaison des approches tri√©e par Macro F1 ===")
display(results_df)


In [None]:
def plot_results(results: pd.DataFrame) -> None:
    """
    Draw one bar chart per column of ``results`` on a two-column grid.

    Args:
        results: Table whose index supplies the bar categories and whose
            columns each hold one metric.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when ``results`` has no columns.
    """
    metric_names = list(results.columns)
    if not metric_names:
        # No metric to plot: a grid with zero rows cannot be created.
        return

    n_cols = 2
    n_rows = int(np.ceil(len(metric_names) / n_cols))

    # squeeze=False always yields a 2-D array of axes, whatever the grid size.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(6*n_cols, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, metric in zip(axes, metric_names):
        sns.barplot(data=results, x=results.index, y=metric, ax=ax)
        # Rotate the tick labels without re-assigning the label texts.
        ax.tick_params(axis="x", labelrotation=45)
        ax.set_title(metric)

    # Hide the grid cells left over when the metric count is odd.
    for ax in axes[len(metric_names):]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()


In [None]:
# Quick visualisation: one bar chart per metric column of results_df
plot_results(results_df)

In [None]:
def plot_confusion_matrices(y_true: pd.DataFrame, y_pred: pd.DataFrame):
    """
    Plot a row-normalised 2x2 confusion matrix for every label column.

    Args:
        y_true: Ground-truth labels, one column per label. Columns are assumed
            to be binary indicators (0/1) -- confirm against the caller.
        y_pred: Predicted labels with the same column names as y_true.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when y_true has no columns.
    """
    label_names = list(y_true.columns)
    if not label_names:
        # No label to plot: a grid with zero rows cannot be created.
        return

    n_cols = 4  # number of heatmaps per row
    n_rows = int(np.ceil(len(label_names) / n_cols))

    # squeeze=False always yields a 2-D array of axes, whatever the grid size.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, label in zip(axes, label_names):
        # Fix the class set so the matrix stays 2x2 even when only one class
        # occurs for this label.
        cm = confusion_matrix(y_true[label], y_pred[label], labels=[0, 1]).astype(float)

        # Row-normalise; a class with no true samples keeps a row of zeros
        # instead of producing NaN from a division by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)

        sns.heatmap(cm, ax=ax, annot=True, fmt=".2f", cmap="Blues")
        ax.set_title(f"{label}")

    # Hide the grid cells left over on the last row.
    for ax in axes[len(label_names):]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()


In [None]:
# Row label of the approach with the highest macro F1.
best_approach = results_df["f1_macro"].idxmax()
print(f"\nüéØ Meilleur mod√®le : {best_approach}")

# Pick the matching validation predictions; any other name falls back to the
# predictions of the model trained on the resampled data.
predictions_by_approach = {
    "Descriptions only": y_pred_desc,
    "Descriptions + code features": y_pred_combined,
}
y_pred_best = predictions_by_approach.get(best_approach, y_pred_smote)

# Wrap truth and predictions in DataFrames whose columns are the tag names,
# then plot the confusion matrices of the best model.
y_val_df = pd.DataFrame(y_val, columns=mlb.classes_)
y_pred_best_frame = pd.DataFrame(y_pred_best, columns=mlb.classes_)
plot_confusion_matrices(y_val_df, y_pred_best_frame)


In [None]:
def plot_results(results: pd.DataFrame) -> None:
    """
    Draw one bar chart per column of ``results`` on a two-column grid.

    NOTE: a function with this name is also defined in an earlier cell; when
    the cells run in order, this later definition is the one in effect.

    Args:
        results: Table whose index supplies the bar categories and whose
            columns each hold one metric.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when ``results`` has no columns.
    """
    metric_names = list(results.columns)
    if not metric_names:
        # No metric to plot: a grid with zero rows cannot be created.
        return

    n_cols = 2
    n_rows = int(np.ceil(len(metric_names) / n_cols))

    # squeeze=False always yields a 2-D array of axes, whatever the grid size.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(6*n_cols, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, metric in zip(axes, metric_names):
        sns.barplot(data=results, x=results.index, y=metric, ax=ax)
        # Rotate the tick labels without re-assigning the label texts.
        ax.tick_params(axis="x", labelrotation=45)
        ax.set_title(metric)

    # Hide the grid cells left over when the metric count is odd.
    for ax in axes[len(metric_names):]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()


In [None]:
def evaluate_label_wise(y_true: pd.DataFrame, y_pred: pd.DataFrame, metrics: dict) -> pd.DataFrame:
    """
    Score every label column on its own and append the column-wise average.

    Args:
        y_true: Ground-truth labels, one column per label.
        y_pred: Predicted labels; must expose the same column names as y_true.
        metrics: Mapping {metric name: callable(true_column, predicted_column)}.

    Returns:
        DataFrame indexed by label name with one column per metric, followed
        by a 'Mean' row holding the mean of each metric over the labels.
    """
    label_names = y_true.columns

    # One record per label; labels are visited in column order and metrics in
    # the order of the mapping.
    per_label_scores = [
        {name: score(y_true[label], y_pred[label]) for name, score in metrics.items()}
        for label in label_names
    ]

    table = pd.DataFrame(per_label_scores, index=label_names)
    table.loc['Mean'] = table.mean(axis=0)
    return table


In [None]:
def _with_zero_division_guard(score_fn):
    # Wrap a scorer so it is always called with zero_division=0.
    return lambda y_t, y_p: score_fn(y_t, y_p, zero_division=0)


# Metric name -> callable(true_column, predicted_column), applied per label.
metrics_dict = {
    "F1": _with_zero_division_guard(f1_score),
    "Precision": _with_zero_division_guard(precision_score),
    "Recall": _with_zero_division_guard(recall_score),
    "Accuracy": lambda y_t, y_p: accuracy_score(y_t, y_p),
}


In [None]:
# Predictions of the best model as a DataFrame whose columns are the tag names.
y_pred_best_df = pd.DataFrame(y_pred_best, columns= mlb.classes_)


In [None]:
# Compute the per-label metrics
moc_results = evaluate_label_wise(y_val_df, y_pred_best_df, metrics_dict)

print(f"\nüéØ M√©triques d√©taill√©es par label pour le meilleur mod√®le ({best_approach}) :")
display(moc_results)

In [None]:
# Plot the per-label metrics table, one chart per metric column.
plot_results(moc_results)