In [2]:
!pip install -q transformers datasets torch scikit-learn

import math
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

from torch.utils.data import Dataset
from datasets import Dataset as HFDataset
from google.colab import drive

from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

device = "cuda" if torch.cuda.is_available() else "cpu"
MAX_LEN = 512



In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df_mlm = pd.read_csv("/content/drive/MyDrive/datasets/mlm.csv")
mlm_texts = df_mlm["CODE"].astype(str).tolist()

mlm_train_texts, mlm_val_texts = train_test_split(
    mlm_texts,
    test_size=0.1,
    random_state=31
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_mlm(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )

mlm_train_ds = HFDataset.from_dict({"text": mlm_train_texts})
mlm_val_ds   = HFDataset.from_dict({"text": mlm_val_texts})

mlm_train_ds = mlm_train_ds.map(tokenize_mlm, batched=True, remove_columns=["text"])
mlm_val_ds   = mlm_val_ds.map(tokenize_mlm, batched=True, remove_columns=["text"])

mlm_train_ds.set_format("torch")
mlm_val_ds.set_format("torch")

mlm_model = AutoModelForMaskedLM.from_pretrained(
    "microsoft/codebert-base"
).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Map:   0%|          | 0/1176 [00:00<?, ? examples/s]

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

mlm_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/output/codebert_ts_mlm",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    fp16=True,
    report_to="none"
)

mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_args,
    train_dataset=mlm_train_ds,
    eval_dataset=mlm_val_ds,
    data_collator=mlm_collator
)

mlm_trainer.train()

mlm_eval = mlm_trainer.evaluate()
mlm_loss = mlm_eval["eval_loss"]
mlm_perplexity = math.exp(mlm_loss)

print("MLM eval loss:", mlm_loss);
print("MLM perplexity:", mlm_perplexity)

mlm_model.save_pretrained("/content/drive/MyDrive/models/codebert_mlm_ts")
tokenizer.save_pretrained("/content/drive/MyDrive/models/codebert_mlm_ts")



Epoch,Training Loss,Validation Loss
1,2.9374,1.124397
2,1.0598,0.787674
3,0.8337,0.695208




MLM eval loss: 0.7360420227050781
MLM perplexity: 2.087656244705594


('/content/drive/MyDrive/models/codebert_mlm_ts/tokenizer_config.json',
 '/content/drive/MyDrive/models/codebert_mlm_ts/special_tokens_map.json',
 '/content/drive/MyDrive/models/codebert_mlm_ts/vocab.json',
 '/content/drive/MyDrive/models/codebert_mlm_ts/merges.txt',
 '/content/drive/MyDrive/models/codebert_mlm_ts/added_tokens.json',
 '/content/drive/MyDrive/models/codebert_mlm_ts/tokenizer.json')

In [8]:
df = pd.read_csv("/content/drive/MyDrive/datasets/smells.csv")

label_counts = df["LABEL"].value_counts()
target_size = label_counts.min()

df_balanced = (
    df.groupby("LABEL", group_keys=False)
      .apply(lambda x: resample(
          x,
          replace=False,
          n_samples=target_size,
          random_state=31
      ))
      .sample(frac=1, random_state=31)
      .reset_index(drop=True)
)

train_df, temp_df = train_test_split(
    df_balanced,
    test_size=0.3,
    random_state=31,
    stratify=df_balanced["LABEL"]
)

valid_df, test_df = train_test_split(
    temp_df,
    test_size=2/3,
    random_state=31,
    stratify=temp_df["LABEL"]
)


  .apply(lambda x: resample(


In [9]:
class CodeDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.codes = df["CODE"].astype(str).tolist()
        self.labels = df["LABEL"].astype(int).tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.codes)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.codes[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_ds = CodeDataset(train_df, tokenizer, MAX_LEN)
valid_ds = CodeDataset(valid_df, tokenizer, MAX_LEN)
test_ds  = CodeDataset(test_df,  tokenizer, MAX_LEN)


In [10]:
MODEL_NAME = "/content/drive/MyDrive/models/codebert_mlm_ts"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted", zero_division=0),
        "precision": precision_score(labels, preds, average="weighted", zero_division=0),
        "recall": recall_score(labels, preds, average="weighted", zero_division=0)
    }


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/models/codebert_mlm_ts and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = TrainingArguments(
    output_dir="/content/codebert_output",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    learning_rate=2e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6689,0.672554,0.625,0.563636,0.785714,0.625
2,0.4983,1.091997,0.59375,0.593353,0.594118,0.59375
3,0.3974,1.153529,0.71875,0.704615,0.770531,0.71875
4,0.2401,1.524891,0.6875,0.686275,0.690476,0.6875
5,0.1429,1.313297,0.75,0.733333,0.833333,0.75
6,0.0935,1.721104,0.71875,0.716256,0.726721,0.71875
7,0.1077,1.729091,0.65625,0.655914,0.656863,0.65625
8,0.0036,1.852244,0.6875,0.686275,0.690476,0.6875
9,0.0009,2.015733,0.6875,0.686275,0.690476,0.6875
10,0.0007,1.830151,0.75,0.740891,0.790909,0.75




TrainOutput(global_step=855, training_loss=0.14374796011163826, metrics={'train_runtime': 27860.8403, 'train_samples_per_second': 0.121, 'train_steps_per_second': 0.031, 'total_flos': 887999811840000.0, 'train_loss': 0.14374796011163826, 'epoch': 15.0})

In [12]:
model.save_pretrained("/content/codebert_trained")
tokenizer.save_pretrained("/content/codebert_trained")

('/content/codebert_trained/tokenizer_config.json',
 '/content/codebert_trained/special_tokens_map.json',
 '/content/codebert_trained/vocab.json',
 '/content/codebert_trained/merges.txt',
 '/content/codebert_trained/added_tokens.json',
 '/content/codebert_trained/tokenizer.json')

In [13]:
predictions = trainer.predict(test_ds)
preds = np.argmax(predictions.predictions, axis=-1)

report = classification_report(
    predictions.label_ids,
    preds,
    target_names=["Clean Code", "Code Smell"],
    output_dict=True,
    zero_division=0
)

df_report = pd.DataFrame(report).transpose()
display(df_report)




Unnamed: 0,precision,recall,f1-score,support
Clean Code,0.884615,0.69697,0.779661,33.0
Code Smell,0.74359,0.90625,0.816901,32.0
accuracy,0.8,0.8,0.8,0.8
macro avg,0.814103,0.80161,0.798281,65.0
weighted avg,0.815187,0.8,0.797995,65.0


In [17]:
import shutil
import os

# Define the folder to zip and the output zip file name
folder_to_zip_output = '/content/codebert_output'
zip_file_name_output = 'codebert_output_archive'

# Check if the folder exists before zipping
if os.path.exists(folder_to_zip_output):
    shutil.make_archive(zip_file_name_output, 'zip', folder_to_zip_output)
    print(f"Folder '{folder_to_zip_output}' compressed into '{zip_file_name_output}.zip'")
else:
    print(f"Folder '{folder_to_zip_output}' does not exist. Skipping zipping.")

Folder '/content/codebert_output' compressed into 'codebert_output_archive.zip'


In [18]:
import shutil
import os

# Define the folder to zip and the output zip file name
folder_to_zip_trained = '/content/codebert_trained'
zip_file_name_trained = 'codebert_trained_archive'

# Check if the folder exists before zipping
if os.path.exists(folder_to_zip_trained):
    shutil.make_archive(zip_file_name_trained, 'zip', folder_to_zip_trained)
    print(f"Folder '{folder_to_zip_trained}' compressed into '{zip_file_name_trained}.zip'")
else:
    print(f"Folder '{folder_to_zip_trained}' does not exist. Skipping zipping.")

Folder '/content/codebert_trained' compressed into 'codebert_trained_archive.zip'
