In [1]:
from preprocessing import Preprocessor
from data_loader import load_json_data
from evaluation import compute_f1_scores, per_tag_f1
from baseline import tf_idf
import torch
import numpy as np

In [2]:
focus_tags=['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities']
path=r"C:\Users\maloc\OneDrive\Documents\Tag Classification\Code-Tag-Classification\data\code_classification_dataset.zip"
data=load_json_data(path)

In [3]:
preprocessor=Preprocessor(use_code=True,focus_only=True)
train_text, dev_text, test_text, train_labels_bin, dev_labels_bin, test_labels_bin=preprocessor.process_data(data,devset=True,test_size=0.2) # type: ignore
mlb_classes=preprocessor.label_order


=== CHECKING SPLIT BALANCE ===

=== Focus Tags Statistics ===
math            | train: 1157 dev: 128 test: 124
graphs          | train: 435 dev:  54 test:  53
strings         | train: 344 dev:  48 test:  30
number theory   | train: 285 dev:  31 test:  34
trees           | train: 252 dev:  36 test:  36
geometry        | train: 131 dev:  17 test:  18
games           | train:  82 dev:  11 test:  12
probabilities   | train:  73 dev:  12 test:   7
TRAIN SIZE: 3985
DEV SIZE: 498
TEST SIZE: 499




# TF IDF baseline model

First baseline mdoel is an invese frequency model, the data is whole descriptions or source codes and targets are the corresponding tags, minimal preprocessing is applied (cleaning text)

We see great imbalances in classes, need to tune threshold per classes for best results probably

In [5]:
model_tf=tf_idf(classes=mlb_classes)
model_tf.fit(train_text,train_labels_bin)
y_true_binary=test_labels_bin

In [6]:
# Prediction using base threshold p=0.5

y_pred_binary=model_tf.predict(test_text)

per_tag_f1(y_true_binary,y_pred_binary,focus_tags=focus_tags,mlb_classes=mlb_classes)

[('math', 0.55),
 ('graphs', 0.47),
 ('strings', 0.63),
 ('number theory', 0.29),
 ('trees', 0.66),
 ('geometry', 0.11),
 ('games', 0.4),
 ('probabilities', 0.0)]

[('math', 0.44),
 ('graphs', 0.42),
 ('strings', 0.63),
 ('number theory', 0.51),
 ('trees', 0.64),
 ('geometry', 0.56),
 ('games', 0.71),
 ('probabilities', 0.0)]


[('math', 0.53),
 ('graphs', 0.51),
 ('strings', 0.48),
 ('number theory', 0.22),
 ('trees', 0.54),
 ('geometry', 0.0),
 ('games', 0.2),
 ('probabilities', 0.0)]

In [7]:
# Tune threshold globally to get better results, most classes are low count so p=0.5 is quite bad 

best_thresh,best_f1=model_tf.tune_threshold(dev_text,dev_labels_bin,depth=10)

y_pred_binary=model_tf.predict(test_text)
per_tag_f1(y_true_binary,y_pred_binary,focus_tags=focus_tags,mlb_classes=mlb_classes)

[('math', 0.41),
 ('graphs', 0.45),
 ('strings', 0.64),
 ('number theory', 0.53),
 ('trees', 0.61),
 ('geometry', 0.5),
 ('games', 0.67),
 ('probabilities', 0.0)]

[('math', 0.44),
 ('graphs', 0.42),
 ('strings', 0.63),
 ('number theory', 0.51),
 ('trees', 0.64),
 ('geometry', 0.56),
 ('games', 0.71),
 ('probabilities', 0.0)]

[('math', 0.45),
 ('graphs', 0.58),
 ('strings', 0.68),
 ('number theory', 0.48),
 ('trees', 0.63),
 ('geometry', 0.36),
 ('games', 0.48),
 ('probabilities', 0.0)]
 

In [8]:
# Tune threshold per class to get best results

best_thresholds,best_f1_per_class=model_tf.tune_per_tag_threshold(dev_text,dev_labels_bin,depth=20)
y_pred_binary=model_tf.predict(test_text)

per_tag_f1(y_true_binary,y_pred_binary,focus_tags=focus_tags,mlb_classes=mlb_classes)

[('math', 0.55),
 ('graphs', 0.53),
 ('strings', 0.71),
 ('number theory', 0.57),
 ('trees', 0.68),
 ('geometry', 0.5),
 ('games', 0.73),
 ('probabilities', 0.0)]

[('math', 0.55),
 ('graphs', 0.54),
 ('strings', 0.63),
 ('number theory', 0.47),
 ('trees', 0.62),
 ('geometry', 0.58),
 ('games', 0.73),
 ('probabilities', 0.0)]

[('math', 0.55),
 ('graphs', 0.6),
 ('strings', 0.63),
 ('number theory', 0.48),
 ('trees', 0.65),
 ('geometry', 0.47),
 ('games', 0.68),
 ('probabilities', 0.0)]

# BERT model

In [2]:
from preprocessing import BertPreprocessor
from model import CodeBERTClassifier
from transformers import AutoTokenizer

MAX_LEN = 512   # truncating at Description+ code at 512 token so losing only code, 
                # might be better to trunk description and code at 256 because description might be very long
BATCH_SIZE = 16 
MODEL_NAME = 'microsoft/codebert-base'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [11]:
bpreprocessor=BertPreprocessor(tokenizer=tokenizer,max_len=MAX_LEN,batch_size=BATCH_SIZE,use_code=False)
train_loader = bpreprocessor.preprocess_data(train_text,train_labels_bin,shuffle=True)
dev_loader = bpreprocessor.preprocess_data(dev_text,dev_labels_bin,shuffle=False)
test_loader = bpreprocessor.preprocess_data(test_text,test_labels_bin,shuffle=False)

pos_weights=bpreprocessor.get_pos_weight(train_labels_bin)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CodeBERTClassifier(mlb_classes=mlb_classes, model_name=MODEL_NAME)
model = model.to(device)

print(f"Modèle chargé sur {device}")

Modèle chargé sur cpu


In [None]:
history=model.fit(train_loader,dev_loader, epochs=4, lr=2e-5,save_path='best_model.pt',pos_weight=pos_weights)
model.tune_thresholds(dev_loader, device='cuda')
print("\n--- Lancement de l'évaluation ---")
global_metrics, tag_results = model.evaluate_model(
    data_loader=test_loader,  # Utilise ton loader de test ici
    device='cuda',
    target_tags=focus_tags       # Les tags sur lesquels tu veux le détail
)

# Print results
print("\nRésumé F1 Score Global (Micro):", global_metrics['micro_f1'])
print("Résumé F1 Score Global (Macro):", global_metrics['macro_f1'])


AssertionError: Torch not compiled with CUDA enabled