In [1]:
from preprocessing import Preprocessor
from data_loader import load_json_data
from evaluation import compute_f1_scores, per_tag_f1
from baseline import tf_idf
import numpy as np
import pandas as pd
import torch

In [2]:
# Tags that every per-tag evaluation in this notebook reports on.
focus_tags = [
    'math', 'graphs', 'strings', 'number theory',
    'trees', 'geometry', 'games', 'probabilities',
]

# Dataset archive, given relative to the notebook folder instead of a
# machine-specific absolute path so the notebook runs on any checkout.
# NOTE(review): assumes the kernel's working directory is the repository's
# `main/` folder (sibling of `data/`) — confirm, or adjust DATA_PATH.
DATA_PATH = "../data/code_classification_dataset.zip"
data = load_json_data(DATA_PATH)

In [None]:
# Text-only preprocessing (use_code=False): split the data into
# train / dev / test texts plus their binarised label matrices.
preprocessor = Preprocessor(use_code=False)
(
    train_text, dev_text, test_text,
    train_labels_bin, dev_labels_bin, test_labels_bin,
) = preprocessor.process_data(data, devset=True, test_size=0.2)  # type: ignore

# Label order exposed by the preprocessor; passed as `classes` / `mlb_classes` below.
mlb_classes = preprocessor.label_order


=== CHECKING SPLIT BALANCE ===

=== Focus Tags Statistics ===
math            | train: 1012 dev: 194 test: 203
graphs          | train: 385 dev:  81 test:  76
strings         | train: 304 dev:  65 test:  53
number theory   | train: 250 dev:  53 test:  47
trees           | train: 212 dev:  60 test:  52
geometry        | train: 120 dev:  21 test:  25
games           | train:  66 dev:  21 test:  18
probabilities   | train:  65 dev:  16 test:  11
TRAIN SIZE: 3487
DEV SIZE: 747
TEST SIZE: 748


# TF IDF baseline model

The first baseline model is a TF-IDF (term frequency–inverse document frequency) model. The inputs are whole problem descriptions or source code, and the targets are the corresponding tags. Only minimal preprocessing (text cleaning) is applied.

The classes are heavily imbalanced, so the decision threshold will probably need to be tuned per class to get the best results.

In [None]:
# Ground-truth test labels, reused by the evaluation cells below.
y_true_binary = test_labels_bin

# Build the TF-IDF baseline and fit it on the training split.
model_tf = tf_idf(classes=mlb_classes)
model_tf.fit(train_text, train_labels_bin)

In [5]:
# Baseline predictions with the default decision threshold (p = 0.5).
y_pred_binary = model_tf.predict(test_text)

# Per-tag F1 on the test split, restricted to the focus tags.
per_tag_f1(
    y_true_binary,
    y_pred_binary,
    focus_tags=focus_tags,
    mlb_classes=mlb_classes,
)

[('math', 0.54),
 ('graphs', 0.55),
 ('strings', 0.47),
 ('number theory', 0.22),
 ('trees', 0.54),
 ('geometry', 0.0),
 ('games', 0.2),
 ('probabilities', 0.0)]

[('math', 0.44),
 ('graphs', 0.42),
 ('strings', 0.63),
 ('number theory', 0.51),
 ('trees', 0.64),
 ('geometry', 0.56),
 ('games', 0.71),
 ('probabilities', 0.0)]


[('math', 0.53),
 ('graphs', 0.51),
 ('strings', 0.48),
 ('number theory', 0.22),
 ('trees', 0.54),
 ('geometry', 0.0),
 ('games', 0.2),
 ('probabilities', 0.0)]

In [6]:
# Tune one global threshold on the dev split: most classes have few
# positive examples, so the default p = 0.5 performs poorly.
# NOTE(review): predict() is called without a threshold afterwards, so
# tune_threshold presumably stores the chosen value on the model — confirm.
best_thresh, best_f1 = model_tf.tune_threshold(
    dev_text,
    dev_labels_bin,
    depth=10,
)

# Re-predict on the test split and report per-tag F1 for the focus tags.
y_pred_binary = model_tf.predict(test_text)
per_tag_f1(
    y_true_binary,
    y_pred_binary,
    focus_tags=focus_tags,
    mlb_classes=mlb_classes,
)

[('math', 0.45),
 ('graphs', 0.58),
 ('strings', 0.68),
 ('number theory', 0.45),
 ('trees', 0.63),
 ('geometry', 0.36),
 ('games', 0.48),
 ('probabilities', 0.0)]

[('math', 0.44),
 ('graphs', 0.42),
 ('strings', 0.63),
 ('number theory', 0.51),
 ('trees', 0.64),
 ('geometry', 0.56),
 ('games', 0.71),
 ('probabilities', 0.0)]

[('math', 0.45),
 ('graphs', 0.58),
 ('strings', 0.68),
 ('number theory', 0.48),
 ('trees', 0.63),
 ('geometry', 0.36),
 ('games', 0.48),
 ('probabilities', 0.0)]
 

In [7]:
# Tune one threshold per class on the dev split for the best results.
# NOTE(review): predict() is called without thresholds afterwards, so
# tune_per_tag_threshold presumably stores them on the model — confirm.
best_thresholds, best_f1_per_class = model_tf.tune_per_tag_threshold(
    dev_text,
    dev_labels_bin,
    depth=20,
)

# Re-predict on the test split and report per-tag F1 for the focus tags.
y_pred_binary = model_tf.predict(test_text)
per_tag_f1(
    y_true_binary,
    y_pred_binary,
    focus_tags=focus_tags,
    mlb_classes=mlb_classes,
)

[('math', 0.56),
 ('graphs', 0.59),
 ('strings', 0.64),
 ('number theory', 0.47),
 ('trees', 0.64),
 ('geometry', 0.49),
 ('games', 0.65),
 ('probabilities', 0.0)]

[('math', 0.55),
 ('graphs', 0.54),
 ('strings', 0.63),
 ('number theory', 0.47),
 ('trees', 0.62),
 ('geometry', 0.58),
 ('games', 0.73),
 ('probabilities', 0.0)]

[('math', 0.55),
 ('graphs', 0.6),
 ('strings', 0.63),
 ('number theory', 0.48),
 ('trees', 0.65),
 ('geometry', 0.47),
 ('games', 0.68),
 ('probabilities', 0.0)]

# BERT model

In [None]:
from preprocessing import BertPreprocessor
from model import CodeBERTClassifier
from transformers import AutoTokenizer

# Pretrained checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = 'microsoft/codebert-base'

# Description + code are truncated together at 512 tokens, so only code
# should get cut off. It might be better to truncate description and code
# separately at 256 tokens each, because descriptions can be very long.
MAX_LEN = 512
BATCH_SIZE = 16

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

ImportError: cannot import name 'BertPreprocessor' from 'preprocessing' (c:\Users\maloc\OneDrive\Documents\Tag Classification\Code-Tag-Classification\main\preprocessing.py)

In [None]:
# Tokenise each split and wrap it in a batched loader for the BERT model.
bpreprocessor = BertPreprocessor(
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
    use_code=False,
)

# Only the training loader is shuffled; dev and test keep their order.
train_loader = bpreprocessor.preprocess_data(train_text, train_labels_bin, shuffle=True)
dev_loader = bpreprocessor.preprocess_data(dev_text, dev_labels_bin, shuffle=False)
# Named `test_loader` (not `testloader`) to match the sibling loaders and
# the name used by the evaluation call at the end of the notebook.
test_loader = bpreprocessor.preprocess_data(test_text, test_labels_bin, shuffle=False)

In [None]:
# Use CUDA when torch reports it as available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One output per label class; the model is moved to the selected device.
model = CodeBERTClassifier(n_classes=len(mlb_classes), model_name=MODEL_NAME).to(device)

print(f"Modèle chargé sur {device}")

Modèle chargé sur cpu


In [None]:
import torch
import numpy as np
from tqdm import tqdm # Barre de progression

def evaluate_model(model, data_loader, device, mlb_classes, threshold=0.5, focus_tags=None):
    """
    Run the model over ``data_loader`` and compute global and per-tag F1 metrics.

    Args:
        model: Object exposing ``eval()`` and callable as
            ``model(input_ids, attention_mask)``, returning logits.
        data_loader: Iterable of batches. Each batch is indexed with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device the model inputs are moved to before the forward pass.
        mlb_classes: Label names, forwarded to ``per_tag_f1`` as ``mlb_classes``.
        threshold: Probability cut-off; a label is predicted when its sigmoid
            probability is strictly greater than this value.
        focus_tags: Tags to report per-tag F1 for. Defaults to the eight
            target tags used throughout the notebook.

    Returns:
        Tuple ``(global_metrics, tag_results)``: the result of
        ``compute_f1_scores`` and the ``(tag, score)`` pairs from ``per_tag_f1``.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    if focus_tags is None:
        # Same eight target tags the notebook evaluates everywhere else.
        focus_tags = ['math', 'graphs', 'strings', 'number theory',
                      'trees', 'geometry', 'games', 'probabilities']

    model.eval()  # Evaluation mode (disables dropout)

    all_preds = []
    all_labels = []

    print("Évaluation en cours...")
    with torch.no_grad():  # No gradient tracking needed for evaluation
        for batch in tqdm(data_loader):
            # 1. Move the model inputs to the target device
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            # 2. Forward pass
            logits = model(input_ids, mask)

            # 3. Logits -> probabilities: the model outputs raw logits,
            #    but thresholding needs values in [0, 1]
            probs = torch.sigmoid(logits)

            # 4. Probabilities -> binary predictions (0 or 1)
            preds_binary = (probs > threshold).float()

            # 5. Store on CPU as numpy. Labels are only compared on the CPU,
            #    so they are never sent to the device.
            all_preds.append(preds_binary.cpu().numpy())
            all_labels.append(batch['labels'].cpu().numpy())

    if not all_preds:
        # np.vstack([]) would fail with a less helpful message.
        raise ValueError("data_loader yielded no batches; nothing to evaluate")

    # 6. Stack all batches into single (samples, labels) arrays
    y_pred_numpy = np.vstack(all_preds)
    y_true_numpy = np.vstack(all_labels)

    # 7. Global metrics
    print("\n--- Métriques Globales ---")
    global_metrics = compute_f1_scores(y_true_numpy, y_pred_numpy)
    print(global_metrics)

    # 8. Per-tag metrics. Keyword arguments match every other per_tag_f1
    #    call in the notebook and do not depend on parameter order.
    print("\n--- Métriques par Tag ---")
    tag_results = per_tag_f1(
        y_true_numpy,
        y_pred_numpy,
        focus_tags=focus_tags,
        mlb_classes=mlb_classes,
    )

    for tag, score in tag_results:
        print(f"{tag:<15} : {score}")

    return global_metrics, tag_results

# Evaluate on the test loader with the default threshold (0.5).
evaluate_model(
    model=model,
    data_loader=test_loader,
    device=device,
    mlb_classes=mlb_classes,
)