In [1]:
!pip install transformers langchain datasets torch scikit-learn pandas

Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
INFO: pip is looking at multiple versions of datasets to determine which version is compatible with other requirements. This could take a while.
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.49.0
    Uninstalling tqdm-4.49.0:
      Successfully uninstalled tqdm-4.49.0
  Attempting uninstall: datasets
    Found existing installation: datasets 1.12.1
    Uninstalling datasets-1.12.1:
      Successfully uninstalled datasets-1.12.1
Successfully installed datasets-3.2.0 tqdm-4.67.1


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import torch


In [3]:
file_path = 'Shuffled_Formatted_Code_Dataset.csv'
data = pd.read_csv(file_path)

class CodeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        inputs = self.tokenizer(
            row['code'],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        label = torch.tensor(row['label'], dtype=torch.long)
        return inputs, label

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
max_length = 512

test_dataset = CodeDataset(data, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [None]:
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

2024-12-27 20:54:49.642245: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735325689.663989  185214 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735325689.670570  185214 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-27 20:54:49.692787: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base an

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [5]:
def evaluate_model(model, dataloader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            input_ids = inputs['input_ids'].squeeze(1).to(device)
            attention_mask = inputs['attention_mask'].squeeze(1).to(device)
            labels = labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=["Human", "AI"])
    return accuracy, report


In [6]:
accuracy_before, report_before = evaluate_model(model, test_loader)
print("Performance Before Prompt Engineering:")
print(f"Accuracy: {accuracy_before}")
print(f"Classification Report:\n{report_before}")


Performance Before Prompt Engineering:
Accuracy: 0.5
Classification Report:
              precision    recall  f1-score   support

       Human       0.50      1.00      0.67       126
          AI       0.00      0.00      0.00       126

    accuracy                           0.50       252
   macro avg       0.25      0.50      0.33       252
weighted avg       0.25      0.50      0.33       252



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [41]:
from langchain.prompts import PromptTemplate, FewShotPromptTemplate

# Escaped examples for Few-Shot Prompt
examples = [
    {
        "language": "Java",
        "code": """// This function reverses a string based on a given condition
class Solution {{
    public String finalString(String s) {{
        StringBuilder nm = new StringBuilder();
        for (char c : s.toCharArray()) {{
            if (c == 'i') {{
                nm.reverse();
            }} else {{
                nm.append(c);
            }}
        }}
        return nm.toString();
    }}
}}""",
        "label": "Human"
    },
    {
        "language": "Java",
        "code": """// AI-Generated: This function finds the minimum absolute difference
public class Solution {{
    public int minAbsoluteDifference(List<Integer> nums, int x) {{
        TreeMap<Integer, Integer> map = new TreeMap<>();
        map.put(nums.get(0), 0);
        int n = nums.size();
        int minDiff = Integer.MAX_VALUE;

        for (int i = 1; i < n; i++) {{
            if (i >= x) {{
                map.remove(nums.get(i - x));
            }}
            Integer lower = map.floorKey(nums.get(i));
            Integer higher = map.ceilingKey(nums.get(i));
            if (lower != null) {{
                minDiff = Math.min(minDiff, Math.abs(nums.get(i) - lower));
            }}
            if (higher != null) {{
                minDiff = Math.min(minDiff, Math.abs(nums.get(i) - higher));
            }}
            map.put(nums.get(i), i);
        }}
        return minDiff;
    }}
}}""",
        "label": "AI"
    }
]

# Prompt Template
example_prompt = PromptTemplate(
    input_variables=["language", "code", "label"],
    template="Language: {language}\nCode:\n{code}\nLabel: {label}\n"
)

# Few-Shot Prompt Template
few_shot_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    suffix="Language: {language}\nCode:\n{code}\nLabel:",
    input_variables=["language", "code"]
)


In [42]:
class PromptedCodeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        inputs = self.tokenizer(
            few_shot_prompt.format(language=row['language'], code=row['code']),
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        label = torch.tensor(row['label'], dtype=torch.long)
        return inputs, label

prompted_test_dataset = PromptedCodeDataset(data, tokenizer, max_length)
prompted_test_loader = DataLoader(prompted_test_dataset, batch_size=8, shuffle=False)


In [43]:
accuracy_after, report_after = evaluate_model(model, prompted_test_loader)
print("Performance After Prompt Engineering:")
print(f"Accuracy: {accuracy_after}")
print(f"Classification Report:\n{report_after}")


Performance After Prompt Engineering:
Accuracy: 0.5
Classification Report:
              precision    recall  f1-score   support

       Human       0.50      1.00      0.67       126
          AI       0.00      0.00      0.00       126

    accuracy                           0.50       252
   macro avg       0.25      0.50      0.33       252
weighted avg       0.25      0.50      0.33       252



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
