In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['WANDB_DISABLED'] = "true"

In [2]:
!pip install -U -q transformers accelerate bitsandbytes optimum gdown

In [3]:
!gdown 1OjBBLcPOK4XysDuhU57TrBAKMlzJGrEp

Downloading...
From (original): https://drive.google.com/uc?id=1OjBBLcPOK4XysDuhU57TrBAKMlzJGrEp
From (redirected): https://drive.google.com/uc?id=1OjBBLcPOK4XysDuhU57TrBAKMlzJGrEp&confirm=t&uuid=d64287c9-e9a2-4899-983d-c81acf2a5e3e
To: /content/prepared_dataset.zip
100% 181M/181M [00:01<00:00, 96.5MB/s]


In [4]:
!yes | unzip -q prepared_dataset.zip

replace prepared_dataset/dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source11814.java? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source28574.java? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source13400.java? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source7303.java? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source27287.java? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source16068.java? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source33932.java? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source30758.java? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source36694.java? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace prepared_dataset/output_code/source32310.java? [y]es, [n]o, [A]ll, [N]one, [r]en

In [3]:
MODEL_NAME="IIIT-L/xlm-roberta-large-finetuned-code-mixed-DS"
MODEL_NAME="FacebookAI/xlm-roberta-large"
MODEL_NAME="HuggingFaceTB/SmolLM-1.7B"
MODEL_NAME="Groq/Llama-3-Groq-8B-Tool-Use"

In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM,pipeline
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm
import os
import gc

In [5]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
def flush():
  gc.collect()
  torch.cuda.empty_cache()
  torch.cuda.reset_peak_memory_stats()

In [7]:
df = pd.read_csv('prepared_dataset/dataset.csv', header=None, names=['file_path', 'codesmells'])
df.rename(columns={'codesmells': 'labels'}, inplace=True)
df['labels'] = df['labels'].apply(lambda x: x.split(','))

In [8]:
all_labels = set(label for sublist in df['labels'] for label in sublist)
label_to_idx = {label: idx for idx, label in enumerate(all_labels)}
id2label = {i: label for label, i in label_to_idx.items()}

In [9]:
def encode_labels(labels):
    encoded = [0] * len(label_to_idx)
    for label in labels:
        encoded[label_to_idx[label]] = 1
    return encoded

In [10]:
df['encoded_labels'] = df['labels'].apply(encode_labels)

In [11]:
df = df.iloc[1:] # skip first row

In [12]:
df=df.sample(frac=1, random_state=42)

In [13]:
df=df.sample(1000,random_state=42)

In [14]:
class CodeDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        code_path = os.path.join("prepared_dataset","output_code",self.dataframe.iloc[idx]['file_path'])
        with open(code_path, 'r') as file:
            code = file.read()
        labels = torch.tensor(self.dataframe.iloc[idx]['encoded_labels'], dtype=torch.float).to(device)
        return {'code':code, 'labels': labels}

In [15]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [16]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [17]:
model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
)

In [18]:
dataset = CodeDataset(df)

In [19]:
gen = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [20]:
system_prompt=f"""you are an agent should output the correct code smells for multi-classification problem.
these are the classes we have:
{label_to_idx}"""

In [21]:
system_prompt

"you are an agent should output the correct code smells for multi-classification problem.\nthese are the classes we have:\n{'Missing Hierarchy': 0, 'Long Parameter List': 1, 'codesmells': 2, 'Unnecessary Abstraction': 3, 'Imperative Abstraction': 4, 'Empty catch clause': 5, 'Deficient Encapsulation': 6, 'Long Identifier': 7, 'Multifaceted Abstraction': 8, 'Wide Hierarchy': 9, 'Complex Conditional': 10, 'Rebellious Hierarchy': 11, 'Magic Number': 12, 'Missing default': 13, 'Long Method': 14, 'Broken Modularization': 15, 'Broken Hierarchy': 16, 'Unutilized Abstraction': 17, 'Long Statement': 18, 'Cyclic-Dependent Modularization': 19, 'Multipath Hierarchy': 20, 'Deep Hierarchy': 21, 'Hub-like Modularization': 22, 'Insufficient Modularization': 23, 'Cyclic Hierarchy': 24, 'Unexploited Encapsulation': 25, 'Abstract Function Call From Constructor': 26, 'Complex Method': 27}"

In [22]:
def generate_user_prompt(code,oneshot_code,oneshot_labels):
  user_prompt = f"""
  output an array with the correct classes as 1 comma separated for the code below:
  code1:
  ```
  {oneshot_code}
  ```
  labels1:
  {oneshot_labels}
  code2:
  ```
  {code}
  ```
  labels2:
  """
  return user_prompt

In [23]:
prompts=[]
for i,entry in tqdm(enumerate(dataset)):
  last_entry=dataset[i-1]
  user_prompt=generate_user_prompt(entry['code'],last_entry['code'],dataset[i-1]["labels"])
  messages=[
  {"role": "system", "content": system_prompt},
  {"role": "user", "content": user_prompt}
  ]
  prompts.append(messages)

1000it [00:02, 464.07it/s]


In [24]:
generation_args = {
    "max_new_tokens": 100,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}
batch_size = 4

In [25]:
prompts_to_gen=[]
count=0
while count<10:
  for i in range(len(prompts)):
    if len(prompts[i][0]["content"])+len(prompts[i][1]["content"])<2000:
      prompts_to_gen.append(prompts[i])
      count+=1

In [26]:
inference_results=gen(prompts_to_gen, **generation_args)



In [28]:
# inference in batch
# inference_results=[]
# for i in tqdm(range(0, len(prompts[:128]), batch_size)):
#     results=gen(prompts[i:i + batch_size], **generation_args)
#     inference_results.extend(results)
#     torch.cuda.empty_cache()  # Clear cache after processing each batch

In [29]:
dataset[3]["labels"]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')

In [30]:
inference_results[3]

[{'generated_text': 'Based on the provided code, the correct classes are:\n- Missing Hierarchy: 0\n- Long Parameter List: 1\n- codesmells: 2\n- Unnecessary Abstraction: 3\n- Imperative Abstraction: 4\n- Empty catch clause: 5\n- Deficient Encapsulation: 6\n- Long Identifier: 7\n- Multifaceted Abstraction: 8\n- Wide Hierarchy: 9\n- Complex Conditional:'}]

In [32]:
def parse_generated_text(generated_text):
    # Extract class names from the generated text
    lines = generated_text.split('\n')
    class_names = [line.split('. ')[1].strip("'") for line in lines if '. ' in line]
    # Convert class names to indices
    class_indices = [label_to_idx[name] for name in class_names if name in label_to_idx]
    return class_indices

def compute_individual_metrics(preds, labels):
    # Accuracy
    preds = preds.cpu()
    labels = labels.cpu()
    accuracy = (preds == labels).float().mean().item()

    # Precision, Recall, F1 Score
    true_positive = (preds * labels).sum().float()
    predicted_positive = preds.sum().float()
    actual_positive = labels.sum().float()

    # Adding a small epsilon to avoid division by zero
    epsilon = 1e-7

    precision = (true_positive / (predicted_positive + epsilon)).item()
    recall = (true_positive / (actual_positive + epsilon)).item()
    f1_score = (2 * precision * recall / (precision + recall + epsilon))

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score
    }

def compute_overall_metrics(inference_results, ground_truths):
    # Initialize accumulators for metrics
    total_accuracy = 0
    total_precision = 0
    total_recall = 0
    total_f1_score = 0

    for i, result in enumerate(inference_results):
        # Parse the generated text to get predicted class indices
        generated_text = result[0]['generated_text']
        predicted_indices = parse_generated_text(generated_text)

        # Create a binary tensor for predictions
        preds = torch.zeros(len(label_to_idx), dtype=torch.int)
        preds[predicted_indices] = 1

        # Ground truth labels
        labels = torch.tensor(ground_truths[i]["labels"], dtype=torch.int)

        # Compute metrics for this instance
        metrics = compute_individual_metrics(preds, labels)

        # Accumulate metrics
        total_accuracy += metrics["accuracy"]
        total_precision += metrics["precision"]
        total_recall += metrics["recall"]
        total_f1_score += metrics["f1_score"]

    # Average metrics over all instances
    num_instances = len(inference_results)
    overall_metrics = {
        "accuracy": total_accuracy / num_instances,
        "precision": total_precision / num_instances,
        "recall": total_recall / num_instances,
        "f1_score": total_f1_score / num_instances
    }

    return overall_metrics


In [33]:
metrics = compute_overall_metrics(inference_results, dataset)
print(metrics)

{'accuracy': 0.8883928681413332, 'precision': 0.008333333457509676, 'recall': 0.04166666169961294, 'f1_score': 0.013888887628001154}


  labels = torch.tensor(ground_truths[i]["labels"], dtype=torch.int)
