In [8]:
import os

import numpy as np
import torch

from baseline import tf_idf
from data_loader import load_json_data
from evaluation import compute_f1_scores, per_tag_f1, compute_detailed_metrics
from predict import CodeBERTPredictor
from preprocessing import Preprocessor

In [4]:
# Tags the evaluation focuses on (passed as `focus_tags` to per_tag_f1).
focus_tags = [
    'math', 'graphs', 'strings', 'number theory',
    'trees', 'geometry', 'games', 'probabilities',
]

# Location of the zipped dataset. The default is the original author's local
# path; set the CODE_TAG_DATASET environment variable to point somewhere else
# instead of editing this cell.
path = os.environ.get(
    "CODE_TAG_DATASET",
    r"C:\Users\maloc\OneDrive\Documents\Tag Classification\Code-Tag-Classification\data\code_classification_dataset.zip",
)
data = load_json_data(path)

In [5]:
# Build the preprocessor and split the data into train/dev/test text with
# binarized label matrices.
preprocessor = Preprocessor(use_code=True, focus_only=True)
(
    train_text, dev_text, test_text,
    train_labels_bin, dev_labels_bin, test_labels_bin,
) = preprocessor.process_data(data, devset=True, test_size=0.2)  # type: ignore

# Class order of the binarized label columns; reused as the class list for the
# models and for per_tag_f1.
mlb_classes = preprocessor.label_order


=== CHECKING SPLIT BALANCE ===

=== Focus Tags Statistics ===
math            | train: 1157 dev: 128 test: 124
graphs          | train: 435 dev:  54 test:  53
strings         | train: 344 dev:  48 test:  30
number theory   | train: 285 dev:  31 test:  34
trees           | train: 252 dev:  36 test:  36
geometry        | train: 131 dev:  17 test:  18
games           | train:  82 dev:  11 test:  12
probabilities   | train:  73 dev:  12 test:   7
TRAIN SIZE: 3985
DEV SIZE: 498
TEST SIZE: 499




In [22]:
# Re-split with binarize_labels=False to inspect the raw tag lists.
# NOTE(review): this reassigns train/dev/test *_labels_bin with non-binarized
# labels. If the notebook is run top to bottom, the TF-IDF cells that use
# these names will see the raw labels rather than the binarized ones. Consider
# separate names for the raw labels (this requires updating the cells that
# read them).
(
    train_text, dev_text, test_text,
    train_labels_bin, dev_labels_bin, test_labels_bin,
) = preprocessor.process_data(
    data,
    devset=True,
    test_size=0.2,
    binarize_labels=False,
)


=== CHECKING SPLIT BALANCE ===

=== Focus Tags Statistics ===
math            | train: 1157 dev: 128 test: 124
graphs          | train: 435 dev:  54 test:  53
strings         | train: 344 dev:  48 test:  30
number theory   | train: 285 dev:  31 test:  34
trees           | train: 252 dev:  36 test:  36
geometry        | train: 131 dev:  17 test:  18
games           | train:  82 dev:  11 test:  12
probabilities   | train:  73 dev:  12 test:   7
TRAIN SIZE: 3985
DEV SIZE: 498
TEST SIZE: 499


In [24]:
# Collect every distinct label that appears in the training label lists.
set_tot = set().union(*train_labels_bin)

print(set_tot)

{'dp', 'dfs and similar', 'divide and conquer', 'greedy', '2-sat', 'games', 'graph matchings', 'brute force', 'flows', 'geometry', 'schedules', 'fft', 'math', 'constructive algorithms', 'expression parsing', 'implementation', 'bitmasks', '*special', 'matrices', 'binary search', 'probabilities', 'number theory', 'string suffix structures', 'trees', 'combinatorics', 'meet-in-the-middle', 'sortings', 'shortest paths', 'strings', 'data structures', 'two pointers', 'chinese remainder theorem', 'graphs', 'dsu', 'interactive', 'ternary search', 'hashing'}


# TF IDF baseline model

The first baseline model is a TF-IDF (term frequency–inverse document frequency) model: the inputs are whole problem descriptions or source code, and the targets are the corresponding tags. Only minimal preprocessing (text cleaning) is applied.

The classes are heavily imbalanced, so we will probably need to tune the decision threshold per class to get the best results.

In [17]:
# Fit the TF-IDF baseline on the training split.
model_tf = tf_idf(classes=mlb_classes)
model_tf.fit(train_text, train_labels_bin)

# Ground truth used by every evaluation cell that follows.
y_true_binary = test_labels_bin

In [18]:
# Baseline predictions with the default decision threshold (p = 0.5).
y_pred_binary = model_tf.predict(test_text)

per_tag_f1(
    y_true_binary,
    y_pred_binary,
    focus_tags=focus_tags,
    mlb_classes=mlb_classes,
)

[('math', 0.55),
 ('graphs', 0.47),
 ('strings', 0.63),
 ('number theory', 0.29),
 ('trees', 0.66),
 ('geometry', 0.11),
 ('games', 0.4),
 ('probabilities', 0.0)]

[('math', 0.44),
 ('graphs', 0.42),
 ('strings', 0.63),
 ('number theory', 0.51),
 ('trees', 0.64),
 ('geometry', 0.56),
 ('games', 0.71),
 ('probabilities', 0.0)]


[('math', 0.53),
 ('graphs', 0.51),
 ('strings', 0.48),
 ('number theory', 0.22),
 ('trees', 0.54),
 ('geometry', 0.0),
 ('games', 0.2),
 ('probabilities', 0.0)]

In [15]:
# Tune a single global threshold on the dev split: most classes have few
# positive examples, so the default p = 0.5 performs poorly.
# NOTE(review): predict() is called without a threshold argument, so
# tune_threshold presumably stores the selected value on the model - confirm.
best_thresh, best_f1 = model_tf.tune_threshold(
    dev_text,
    dev_labels_bin,
    depth=10,
)

y_pred_binary = model_tf.predict(test_text)
per_tag_f1(
    y_true_binary,
    y_pred_binary,
    focus_tags=focus_tags,
    mlb_classes=mlb_classes,
)

[('math', 0.41),
 ('graphs', 0.45),
 ('strings', 0.64),
 ('number theory', 0.53),
 ('trees', 0.61),
 ('geometry', 0.5),
 ('games', 0.67),
 ('probabilities', 0.0)]

In [19]:
# Aggregate metrics for whichever `y_pred_binary` was computed most recently.
# NOTE(review): several cells assign y_pred_binary, so this result depends on
# the order in which the cells were executed.
compute_detailed_metrics(y_true_binary, y_pred_binary)

{'micro_f1': 0.513,
 'macro_f1': 0.388,
 'jaccard_score': 0.238,
 'precision_macro': 0.67,
 'recall_macro': 0.337}

{'micro_f1': 0.468,
 'macro_f1': 0.477,
 'jaccard_score': 0.297,
 'precision_macro': 0.426,
 'recall_macro': 0.679}

In [7]:
# Tune one threshold per class on the dev split, then re-evaluate on test.
# NOTE(review): predict() is called without thresholds, so
# tune_per_tag_threshold presumably stores them on the model - confirm.
best_thresholds, best_f1_per_class = model_tf.tune_per_tag_threshold(
    dev_text,
    dev_labels_bin,
    depth=20,
)
y_pred_binary = model_tf.predict(test_text)

per_tag_f1(
    y_true_binary,
    y_pred_binary,
    focus_tags=focus_tags,
    mlb_classes=mlb_classes,
)

[('math', 0.55),
 ('graphs', 0.53),
 ('strings', 0.71),
 ('number theory', 0.57),
 ('trees', 0.68),
 ('geometry', 0.5),
 ('games', 0.73),
 ('probabilities', 0.0)]

[('math', 0.55),
 ('graphs', 0.54),
 ('strings', 0.63),
 ('number theory', 0.47),
 ('trees', 0.62),
 ('geometry', 0.58),
 ('games', 0.73),
 ('probabilities', 0.0)]

[('math', 0.55),
 ('graphs', 0.6),
 ('strings', 0.63),
 ('number theory', 0.48),
 ('trees', 0.65),
 ('geometry', 0.47),
 ('games', 0.68),
 ('probabilities', 0.0)]

# BERT model

In [20]:
# Directory holding the saved model file and the tuned-thresholds JSON.
# The default is the original author's local folder; set the
# CODE_TAG_MODELS_DIR environment variable to use another location instead of
# editing this cell.
models_dir = os.environ.get(
    "CODE_TAG_MODELS_DIR",
    r"C:\Users\maloc\OneDrive\Documents\Tag Classification\Code-Tag-Classification\models",
)

# Build both paths from the same directory so the separators stay consistent.
model_path = os.path.join(models_dir, "best_model_with_code_focus.pt")
threshold_path = os.path.join(models_dir, "best_thresholds_with_code_focus.json")

bert_predictor = CodeBERTPredictor(
    mlb_classes,
    model_path=model_path,
    threshold_path=threshold_path,
)
bert_predictor.load_prediction_model()


Chargement du modèle sur cpu...
✅ Seuils optimisés chargés.
Chargement du modèle sur cpu...
✅ Seuils optimisés chargés.


In [7]:
# Single problem statement used to try out the BERT predictor.
# NOTE(review): "sailg" looks like a typo for "sailing"; it is left unchanged
# because this string is the model input and editing it would change the
# prediction recorded below.
test_example = "While sailg on a boat, Inessa noticed a beautiful water lily flower above the lake's surface. She came closer and it turned out that the lily was exactly $$$H$$$ centimeters above the water surface. Inessa grabbed the flower and sailed the distance of $$$L$$$ centimeters. Exactly at this point the flower touched the water surface.  Suppose that the lily grows at some point $$$A$$$ on the lake bottom, and its stem is always a straight segment with one endpoint at point $$$A$$$. Also suppose that initially the flower was exactly above the point $$$A$$$, i.e. its stem was vertical. Can you determine the depth of the lake at point $$$A$$$?"

# Presumably the reference tags for this statement; not used by the call below.
tag_example = ["math", "geometry"]

bert_predictor.predict_single_text(test_example)

(['geometry'],
 array([0.44218746, 0.5610631 , 0.08963292, 0.06434947, 0.40266448,
        0.866883  , 0.08704699, 0.30152518], dtype=float32))

python src/predict_cli.py --text "text"