In [1]:
from preprocessing import Preprocessor
from data_loader import load_json_data
from evaluation import compute_f1_scores, per_tag_f1
from baseline import tf_idf
import numpy as np
import pandas as pd
import torch

In [2]:
focus_tags=['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities']
data=load_json_data(r"C:\Users\maloc\OneDrive\Documents\Tag Classification\Code-Tag-Classification\data\code_classification_dataset.zip")

In [14]:
preprocessor=Preprocessor(use_code=False)
train_text, dev_text, test_text, train_labels_bin, dev_labels_bin, test_labels_bin=preprocessor.process_data(data,devset=True,test_size=0.3) # type: ignore
mlb_classes=preprocessor.label_order


=== CHECKING SPLIT BALANCE ===

=== Focus Tags Statistics ===
math            | train: 1012 dev: 194 test: 203
graphs          | train: 385 dev:  81 test:  76
strings         | train: 304 dev:  65 test:  53
number theory   | train: 250 dev:  53 test:  47
trees           | train: 212 dev:  60 test:  52
geometry        | train: 120 dev:  21 test:  25
games           | train:  66 dev:  21 test:  18
probabilities   | train:  65 dev:  16 test:  11
TRAIN SIZE: 3487
DEV SIZE: 747
TEST SIZE: 748


# TF IDF baseline model

First baseline mdoel is an invese frequency model, the data is whole descriptions or source codes and targets are the corresponding tags, minimal preprocessing is applied (cleaning text)

We see great imbalances in classes, need to tune threshold per classes for best results probably

In [None]:
model_tf=tf_idf(classes=mlb_classes)
model_tf.fit(train_text,train_labels_bin)
y_true_binary=test_labels_bin

In [5]:
# Prediction using base threshold p=0.5

y_pred_binary=model_tf.predict(test_text)

per_tag_f1(y_true_binary,y_pred_binary,focus_tags=focus_tags,mlb_classes=mlb_classes)

[('math', 0.54),
 ('graphs', 0.55),
 ('strings', 0.47),
 ('number theory', 0.22),
 ('trees', 0.54),
 ('geometry', 0.0),
 ('games', 0.2),
 ('probabilities', 0.0)]

[('math', 0.44),
 ('graphs', 0.42),
 ('strings', 0.63),
 ('number theory', 0.51),
 ('trees', 0.64),
 ('geometry', 0.56),
 ('games', 0.71),
 ('probabilities', 0.0)]


[('math', 0.53),
 ('graphs', 0.51),
 ('strings', 0.48),
 ('number theory', 0.22),
 ('trees', 0.54),
 ('geometry', 0.0),
 ('games', 0.2),
 ('probabilities', 0.0)]

In [6]:
# Tune threshold globally to get better results, most classes are low count so p=0.5 is quite bad 

best_thresh,best_f1=model_tf.tune_threshold(dev_text,dev_labels_bin,depth=10)

y_pred_binary=model_tf.predict(test_text)
per_tag_f1(y_true_binary,y_pred_binary,focus_tags=focus_tags,mlb_classes=mlb_classes)

[('math', 0.45),
 ('graphs', 0.58),
 ('strings', 0.68),
 ('number theory', 0.45),
 ('trees', 0.63),
 ('geometry', 0.36),
 ('games', 0.48),
 ('probabilities', 0.0)]

[('math', 0.44),
 ('graphs', 0.42),
 ('strings', 0.63),
 ('number theory', 0.51),
 ('trees', 0.64),
 ('geometry', 0.56),
 ('games', 0.71),
 ('probabilities', 0.0)]

[('math', 0.45),
 ('graphs', 0.58),
 ('strings', 0.68),
 ('number theory', 0.48),
 ('trees', 0.63),
 ('geometry', 0.36),
 ('games', 0.48),
 ('probabilities', 0.0)]
 

In [7]:
# Tune threshold per class to get best results

best_thresholds,best_f1_per_class=model_tf.tune_per_tag_threshold(dev_text,dev_labels_bin,depth=20)
y_pred_binary=model_tf.predict(test_text)

per_tag_f1(y_true_binary,y_pred_binary,focus_tags=focus_tags,mlb_classes=mlb_classes)

[('math', 0.56),
 ('graphs', 0.59),
 ('strings', 0.64),
 ('number theory', 0.47),
 ('trees', 0.64),
 ('geometry', 0.49),
 ('games', 0.65),
 ('probabilities', 0.0)]

[('math', 0.55),
 ('graphs', 0.54),
 ('strings', 0.63),
 ('number theory', 0.47),
 ('trees', 0.62),
 ('geometry', 0.58),
 ('games', 0.73),
 ('probabilities', 0.0)]

[('math', 0.55),
 ('graphs', 0.6),
 ('strings', 0.63),
 ('number theory', 0.48),
 ('trees', 0.65),
 ('geometry', 0.47),
 ('games', 0.68),
 ('probabilities', 0.0)]

# BERT model

In [None]:
from preprocessing import CodeforcesDataset,Tensorizer
from model import CodeBERTClassifier
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

MAX_LEN = 512   # truncating at Description+ code at 512 token so sometimes losing only code, 
                # might be better to trunk description and code at 256 because description might be very long
BATCH_SIZE = 16 
MODEL_NAME = 'microsoft/codebert-base'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [13]:
tensorizer=Tensorizer(tokenizer, max_len=MAX_LEN)
train_dataset=CodeforcesDataset(train_text,train_labels_bin,tensorizer)
dev_dataset=CodeforcesDataset(dev_text,dev_labels_bin,tensorizer)
test_dataset=CodeforcesDataset(test_text,test_labels_bin,tensorizer)

In [15]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,  num_workers=0)
dev_loader   = DataLoader(dev_dataset,   batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, num_workers=0)


# 5. Instanciation du modèle
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CodeBERTClassifier(n_classes=mlb_classes, model_name=MODEL_NAME)
model = model.to(device)

print(f"Modèle chargé sur {device}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/microsoft/codebert-base/resolve/main/pytorch_model.bin: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


pytorch_model.bin:  53%|#####2    | 262M/499M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
 * (tuple of ints size, *, torch.memory_format memory_format = None, Tensor out = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]