In [None]:
!pip install -q transformers datasets torch scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from transformers import EarlyStoppingCallback

import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"



In [None]:
# Load the labelled snippets (expects the CSV at the Colab upload location).
df = pd.read_csv("/content/smells.csv")

# Down-sample every class to the size of the rarest one so the classes are balanced.
label_counts = df["LABEL"].value_counts()
target_size = int(label_counts.min())

# GroupBy.sample draws `n` rows per class without replacement. It replaces
# groupby(...).apply(resample), which triggers the pandas deprecation warning
# about apply operating on the grouping column.
df_balanced = (
    df.groupby("LABEL")
      .sample(n=target_size, random_state=31)
      .sample(frac=1, random_state=31)  # shuffle so the classes are interleaved
      .reset_index(drop=True)
)

print(df_balanced["LABEL"].value_counts())

# Stratified 70 / 10 / 20 split: hold out 30 %, then give two thirds of the
# held-out part to the test set and the remaining third to validation.
train_df, temp_df = train_test_split(
    df_balanced,
    test_size=0.3,
    random_state=31,
    stratify=df_balanced["LABEL"],
)
valid_df, test_df = train_test_split(
    temp_df,
    test_size=2/3,
    random_state=31,
    stratify=temp_df["LABEL"],
)

LABEL
1    161
0    161
Name: count, dtype: int64


  .apply(lambda x: resample(


In [None]:
MODEL_NAME = "microsoft/codebert-base"
MAX_LEN = 512  # every example is truncated / padded to this many tokens

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# CodeBERT uses a RoBERTa tokenizer, which already defines its own padding token.
# Overwriting it with EOS (a GPT-2 idiom) would make the padding ids disagree with
# the pad_token_id stored in the model config, so only fall back to EOS when the
# tokenizer genuinely has no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
class CodeDataset(Dataset):
    """Map-style dataset that tokenizes one code snippet per lookup.

    The ``CODE`` column of ``df`` is cast to ``str`` and the ``LABEL`` column to
    ``int``; both are stored as plain Python lists. Tokenization happens lazily in
    ``__getitem__``, truncating and padding every snippet to ``max_len`` tokens.
    """

    def __init__(self, df, tokenizer, max_len):
        code_column = df["CODE"].astype(str)
        label_column = df["LABEL"].astype(int)
        self.codes = code_column.tolist()
        self.labels = label_column.tolist()
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.codes)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a ``labels`` tensor."""
        encoded = self.tok(
            self.codes[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer is called on a single string with return_tensors="pt";
        # squeeze(0) drops the leading axis from each returned tensor.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split in a CodeDataset that shares the tokenizer and length cap.
train_ds, test_ds, valid_ds = (
    CodeDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, test_df, valid_df)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# The sequence-classification head is not part of the CodeBERT checkpoint and is
# randomly initialised on load, so seed torch first to make that initialisation
# (and therefore the run) repeatable.
torch.manual_seed(31)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,  # binary task: LABEL is 0 or 1
)
model = model.to(device)

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last logits axis. Returns a dict with ``accuracy`` plus
    weighted-average ``f1``, ``precision`` and ``recall`` (undefined scores are
    reported as 0 instead of raising a warning).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    weighted = {"average": "weighted", "zero_division": 0}
    scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )

    results = {"accuracy": accuracy_score(labels, preds)}
    for metric_name, scorer in scorers:
        results[metric_name] = scorer(labels, preds, **weighted)
    return results

In [None]:
# Evaluate, checkpoint and log once per epoch; after training, the checkpoint with
# the best validation F1 is reloaded.
training_args = TrainingArguments(
    output_dir="./codebert_output",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    learning_rate=2e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Keep only the most recent checkpoints on disk; with load_best_model_at_end
    # the best checkpoint is retained as well, so nothing needed is lost.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Mixed precision needs a CUDA device; enabling it unconditionally breaks
    # CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    report_to="none",
)


# Stop early after five evaluations in a row without improvement of the
# monitored metric.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Left as the final expression so the notebook displays the TrainOutput summary.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6662,0.569353,0.71875,0.711712,0.742424,0.71875
2,0.4965,0.570949,0.75,0.74902,0.753968,0.75
3,0.3704,1.128534,0.75,0.74902,0.753968,0.75
4,0.194,1.426651,0.78125,0.781036,0.782353,0.78125
5,0.0463,1.829157,0.75,0.746032,0.766667,0.75
6,0.0723,2.089973,0.71875,0.716256,0.726721,0.71875
7,0.0005,1.782144,0.78125,0.781036,0.782353,0.78125
8,0.0008,1.794717,0.78125,0.781036,0.782353,0.78125
9,0.0003,1.877599,0.78125,0.781036,0.782353,0.78125




TrainOutput(global_step=513, training_loss=0.20526408329678675, metrics={'train_runtime': 14473.5607, 'train_samples_per_second': 0.233, 'train_steps_per_second': 0.059, 'total_flos': 532799887104000.0, 'train_loss': 0.20526408329678675, 'epoch': 9.0})

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory.
trained_dir = "./codebert_trained"
model.save_pretrained(trained_dir)
tokenizer.save_pretrained(trained_dir)

('./codebert_trained/tokenizer_config.json',
 './codebert_trained/special_tokens_map.json',
 './codebert_trained/vocab.json',
 './codebert_trained/merges.txt',
 './codebert_trained/added_tokens.json',
 './codebert_trained/tokenizer.json')

In [None]:
# One predict() pass yields the logits, the gold labels and the aggregate metrics;
# calling evaluate() as well would push the whole test set through the model twice.
# metric_key_prefix="eval" keeps the "eval_" key prefix that evaluate() uses.
predictions = trainer.predict(test_ds, metric_key_prefix="eval")
metrics = predictions.metrics
logits = predictions.predictions
labels = predictions.label_ids
preds = np.argmax(logits, axis=-1)

# Per-class precision / recall / F1. `labels=[0, 1]` pins the row order so the
# display names below always line up with the numeric classes.
report = classification_report(
    labels,
    preds,
    labels=[0, 1],
    target_names=["Clean Code", "Code Smell"],
    output_dict=True,
    zero_division=0,
)

df_classes = pd.DataFrame(report).transpose()

display(df_classes)





Unnamed: 0,precision,recall,f1-score,support
Clean Code,0.692308,0.818182,0.75,33.0
Code Smell,0.769231,0.625,0.689655,32.0
accuracy,0.723077,0.723077,0.723077,0.723077
macro avg,0.730769,0.721591,0.719828,65.0
weighted avg,0.730178,0.723077,0.720292,65.0
