In [1]:
# Run-wide hyperparameters; tune them here rather than further down.
BATCH_SIZE = 8   # per-device batch size used for both training and evaluation
MAX_LEN = 3000   # every sample is truncated / padded to this many tokens
# NOTE(review): confirm MAX_LEN fits the context window of the selected model.
NUM_EPOCHS = 1   # number of passes over the training split

In [2]:
import os

# Process-wide switches; set before the training libraries are imported.
os.environ.update({
    # Debug aid: makes CUDA errors surface at the failing call (slows training).
    'CUDA_LAUNCH_BLOCKING': "1",
    # Keeps the Trainer from logging to Weights & Biases.
    'WANDB_DISABLED': "true",
})

In [3]:
# %pip (instead of !pip) installs into the environment of the running kernel.
# peft is included because it is imported later in this notebook but was not
# installed by either of the original commands.
%pip install -qU transformers accelerate evaluate gdown peft
%pip install -q auto-gptq optimum bitsandbytes

In [4]:
# Download the prepared dataset archive from Google Drive by file id
# (it is saved as prepared_dataset.zip in the working directory).
!gdown 1OjBBLcPOK4XysDuhU57TrBAKMlzJGrEp

Downloading...
From (original): https://drive.google.com/uc?id=1OjBBLcPOK4XysDuhU57TrBAKMlzJGrEp
From (redirected): https://drive.google.com/uc?id=1OjBBLcPOK4XysDuhU57TrBAKMlzJGrEp&confirm=t&uuid=572bc236-0e21-4c17-b882-a7886d876efc
To: /kaggle/working/prepared_dataset.zip
100%|████████████████████████████████████████| 181M/181M [00:02<00:00, 63.5MB/s]


In [5]:
# Extract the archive quietly. -o overwrites existing files without prompting,
# so no `yes |` pipe is needed (the pipe only produced a "Broken pipe" message).
!unzip -oq prepared_dataset.zip

yes: standard output: Broken pipe


In [6]:
# Checkpoint to fine-tune. Only one assignment is kept so the active choice is
# unambiguous. Alternatives that were listed here before:
#   "IIIT-L/xlm-roberta-large-finetuned-code-mixed-DS"
#   "FacebookAI/xlm-roberta-large"
#   "Groq/Llama-3-Groq-8B-Tool-Use"
MODEL_NAME = "HuggingFaceTB/SmolLM-1.7B"

In [7]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, BitsAndBytesConfig
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from torch.nn import BCEWithLogitsLoss
import os
import gc

In [8]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [9]:
def flush():
    """Free unreferenced Python objects and, when a GPU is present, CUDA memory.

    The garbage collector runs first so tensors that are no longer referenced
    are actually released. PyTorch's cached CUDA blocks are then returned and
    the peak-memory counters reset. The CUDA calls are skipped when no GPU is
    available, so the helper is also safe on the CPU fallback.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

In [10]:
# Load the headerless (file_path, labels) index and name the label column 'labels'.
df = (
    pd.read_csv(
        'prepared_dataset/dataset.csv',
        header=None,
        names=['file_path', 'codesmells'],
    )
    .rename(columns={'codesmells': 'labels'})
)
# Each cell is a comma-separated string; turn it into a list of label names.
df['labels'] = df['labels'].map(lambda raw: raw.split(','))

In [11]:
# Build the label vocabulary in sorted order. Iterating a plain set of strings
# gives an order that can change from one interpreter run to the next, which
# would silently permute the classifier's output columns and make a saved
# checkpoint disagree with a freshly built mapping.
all_labels = sorted({label for sublist in df['labels'] for label in sublist})
label_to_idx = {label: idx for idx, label in enumerate(all_labels)}

# Inverse mapping: output column index -> label name.
id2label = {idx: label for label, idx in label_to_idx.items()}

In [12]:
def encode_labels(labels):
    """Return the multi-hot vector for ``labels``.

    The vector has one slot per entry of ``label_to_idx``; a slot is 1 when its
    label occurs in ``labels`` and 0 otherwise. A label that is missing from
    ``label_to_idx`` raises ``KeyError``.
    """
    hot_positions = {label_to_idx[name] for name in labels}
    return [1 if position in hot_positions else 0
            for position in range(len(label_to_idx))]

In [13]:
# Attach the multi-hot target vector for every row.
df['encoded_labels'] = df['labels'].map(encode_labels)

In [14]:
# Shuffle, then keep a fixed-size random subset so a full run stays short.
N_SAMPLES = 1000  # upper bound on the number of rows kept
df = df.sample(frac=1, random_state=42)
# Cap at the frame's length: sample(n=...) raises ValueError when n exceeds it.
df = df.sample(n=min(N_SAMPLES, len(df)), random_state=42)

In [15]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    shuffle=True,
    random_state=42,
    test_size=0.1,
)

In [16]:
class CodeDataset(Dataset):
    """Map-style dataset that tokenizes one source file per item.

    Each row of ``dataframe`` must provide ``file_path`` (relative to
    ``prepared_dataset/output_code``) and ``encoded_labels`` (the multi-hot
    target list). Files are read and tokenized lazily in ``__getitem__``.

    Items are returned as CPU tensors. Device placement is left to the
    consumer (the Trainer moves each batch itself), which keeps the dataset
    usable with DataLoader worker processes and pinned memory.
    """

    def __init__(self, dataframe, tokenizer):
        self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the tokenizer outputs plus a float ``labels`` tensor for row ``idx``."""
        row = self.dataframe.iloc[idx]
        code_path = os.path.join("prepared_dataset", "output_code", row['file_path'])
        # Explicit encoding so reading does not depend on the platform locale.
        with open(code_path, 'r', encoding='utf-8') as file:
            code = file.read()

        # Float targets, as used for multi-label (BCE-style) classification.
        labels = torch.tensor(row['encoded_labels'], dtype=torch.float)
        inputs = self.tokenizer(
            code,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=MAX_LEN,
            add_special_tokens=True,
        )

        # The tokenizer returns a leading batch axis of size 1; drop only that
        # axis so a genuine size-1 dimension can never be removed by accident.
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        return {**inputs, 'labels': labels}

In [17]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

In [18]:
# bitsandbytes settings for loading the model weights in 4-bit precision.
# NOTE(review): compute dtype is bfloat16 here while TrainingArguments uses
# fp16=True — confirm that this mix is intended for the target GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [19]:
# Load the base checkpoint with the 4-bit quantization settings and a
# classification head with one output per label, configured for multi-label
# classification.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=len(all_labels),
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM-1.7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Make sure tokenizer and model agree on a padding token; the dataset pads
# every sample to a fixed length, so a pad token has to exist.
if tokenizer.pad_token is None:
    # Register a dedicated pad token (the EOS-reuse alternative is kept below).
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # model.config.pad_token_id = model.config.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    # Match the embedding matrix to the (possibly grown) tokenizer vocabulary.
    model.resize_token_embeddings(len(tokenizer))
if model.config.pad_token_id is None:
    # The tokenizer already had a pad token but the model config did not record it.
    model.config.pad_token_id = tokenizer.pad_token_id
# Display the pad token that is now in effect.
tokenizer.pad_token

'[PAD]'

In [21]:
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model,PeftModel

In [22]:
# model = PeftModel.from_pretrained(model=model, model_id="mspoulaei/Code_Smell_Detection_SmolLM")

adapter_config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/406M [00:00<?, ?B/s]

In [None]:
# Put the model in training mode (dropout modules are activated).
model.train() # model in training mode (dropout modules are activated)

# Enable gradient checkpointing (presumably to lower activation memory at the
# cost of extra compute — TODO confirm it is needed alongside the call below).
model.gradient_checkpointing_enable()

# Prepare the quantized model for training with PEFT; the returned model
# replaces the previous reference.
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA config
# modules_to_save = ["lm_head", "embed_tokens"]
# LoRA adapter settings for sequence classification; only the attention
# query projection ("q_proj") receives adapters.
config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["q_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
# Alternative tried earlier: r=16, lora_alpha=32,
# target_modules=["q_proj", "v_proj"], lora_dropout=0.05.

# Wrap the model with the LoRA adapters.
model = get_peft_model(model, config)

# Report how many parameters will actually be trained.
model.print_trainable_parameters()

In [24]:
# One lazily tokenizing dataset per split.
train_dataset = CodeDataset(dataframe=train_df, tokenizer=tokenizer)
test_dataset = CodeDataset(dataframe=test_df, tokenizer=tokenizer)

# train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [25]:
# Padding collator built from the tokenizer.
# NOTE(review): currently unused — the Trainer below is created without a
# data_collator argument.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:

# Optimisation and evaluation settings for the Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    # Warm up over a fraction of the run instead of a fixed 500 steps. With at
    # most 1000 rows, a batch of BATCH_SIZE x 4 accumulation steps and
    # NUM_EPOCHS epochs, the run has only a few dozen optimizer steps, so a
    # 500-step warm-up never finished and the learning rate stayed far below
    # the configured 2e-5.
    warmup_ratio=0.1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    # Effective batch per device = BATCH_SIZE * gradient_accumulation_steps.
    gradient_accumulation_steps=4,
    fp16=True,
    optim="paged_adamw_8bit",
    # Explicitly disable logging integrations; this is the replacement the
    # deprecation warning for the WANDB_DISABLED variable asks for.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [39]:
def compute_metrics(p):
    """Score multi-label predictions for the Trainer.

    ``p.predictions`` holds raw logits and ``p.label_ids`` the multi-hot
    targets. Logits go through a sigmoid and are thresholded at 0.5.

    Returns a dict with:
      - "accuracy": share of individual label decisions that match the targets
      - "precision" / "recall": per-column values, averaged over the columns
      - "f1_score": harmonic mean of those two averaged values
    """
    eps = 1e-7  # keeps the divisions finite when a column has no positives

    targets = torch.tensor(p.label_ids)
    decisions = torch.tensor(p.predictions).sigmoid().gt(0.5).int()

    # Agreement over every individual cell.
    accuracy = decisions.eq(targets).float().mean().item()

    # Per-column counts, reduced over dim 0 (assumed to be the sample axis).
    hits = decisions.mul(targets).sum(dim=0).float()
    flagged = decisions.sum(dim=0).float()
    present = targets.sum(dim=0).float()

    precision = hits.div(flagged + eps).mean().item()
    recall = hits.div(present + eps).mean().item()
    f1 = 2 * precision * recall / (precision + recall + eps)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }

In [40]:
# Wire model, data and metrics together. No data_collator is passed here.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Caching is switched off for training (this silences the warnings) and must be
# re-enabled for inference afterwards.
model.config.use_cache = False
try:
    trainer.train()
finally:
    # Restore caching even when training is interrupted or raises.
    model.config.use_cache = True

In [41]:
# Safety net: make sure caching is on for inference regardless of how the
# training cell ended.
model.config.use_cache = True # in case of errors

In [42]:
# Evaluate on the Trainer's eval_dataset and print the resulting metric dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.7464185953140259, 'eval_model_preparation_time': 0.0036, 'eval_accuracy': 0.5571428537368774, 'eval_precision': 0.06620120257139206, 'eval_recall': 0.29049187898635864, 'eval_f1_score': 0.10782887209749839, 'eval_runtime': 189.0942, 'eval_samples_per_second': 0.529, 'eval_steps_per_second': 0.069}


In [None]:
# Deliberate stop so that "Run All" does not continue into the publishing cells
# below; run those by hand once the evaluation results look acceptable.
raise RuntimeError("Intentional stop: run the Hugging Face upload cells below manually.")

In [None]:
# Install Git LFS and let git store credentials so later pushes can reuse them.
!sudo apt-get install git-lfs
!git config --global credential.helper store

In [None]:
# Log in to the Hugging Face Hub; no token is written in the notebook itself.
from huggingface_hub import login
login()

In [None]:
# Create the target model repository on the Hub; `yes |` answers the CLI's
# confirmation prompt automatically.
!yes | huggingface-cli repo create Code_Smell_Detection_SmolLM

In [None]:
# Set up Git LFS for this user before cloning.
!git lfs install

# Clone the model repository from the Hub.
!git clone https://huggingface.co/mspoulaei/Code_Smell_Detection_SmolLM

# Work inside the clone from here on; later cells save into "./".
%cd Code_Smell_Detection_SmolLM
!git config --global user.email "sadeghpoolaee@gmail.com"
# Tip: using the same email as your huggingface.co account links your commits to your profile
!git config --global user.name "mspoulaei"

In [None]:
# Write the model and tokenizer files into the current directory, which is the
# cloned repository after the %cd above.
model.save_pretrained("./")
tokenizer.save_pretrained("./")

In [None]:
# Point the `origin` remote at the Hub repository.
!git remote set-url origin https://huggingface.co/mspoulaei/Code_Smell_Detection_SmolLM

In [None]:
# Stage everything in the repository directory and record a single commit.
!git add .
!git commit -m "Save model and tokenizer"

In [None]:
# Upload the commit to the Hub repository.
!git push

In [None]:
# Deliberate stop so that "Run All" does not continue into the memory clean-up
# cells below, which delete the model, tokenizer and trainer.
raise RuntimeError("Intentional stop: run the clean-up cells below manually.")

In [None]:
# Drop the notebook's reference to the model so its memory can be reclaimed.
del model

In [None]:
# Drop the notebook's reference to the tokenizer.
del tokenizer

In [None]:
# Drop the notebook's reference to the trainer (it also references the model).
del trainer

In [None]:
# Collect garbage and release cached GPU memory now that the large objects
# have been deleted.
flush()

In [None]:
# !pip install numba

from numba import cuda


In [None]:
# Reset the current GPU through numba. The handle gets its own name so the
# torch `device` defined near the top of the notebook is not overwritten with
# a numba object (re-running earlier cells would otherwise break).
cuda_device = cuda.get_current_device()
cuda_device.reset()