In [None]:
import pandas as pd

# Directory holding the three CSV splits (looks like a Colab Google Drive
# mount path). Change this single constant when the data lives elsewhere.
DATA_DIR = "/content/drive/MyDrive/CODECLONE"

# Each split is read into its own DataFrame.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
valid = pd.read_csv(f"{DATA_DIR}/valid.csv")

In [None]:
import re
def preprocess_java_code(code):
    """Normalize a Java source snippet into a space-separated token string.

    Steps: strip comments, replace string/char literals and numeric literals
    with placeholders, lowercase everything, then split into word tokens and
    single punctuation characters.

    Comments and string/char literals are recognised in a single
    left-to-right scan, so ``//`` or ``/*`` inside a literal (for example
    ``"http://host"``) is not mistaken for a comment, and a quote inside a
    comment is not mistaken for the start of a literal.

    Args:
        code: Java source text (``str``).

    Returns:
        str: the tokens joined by single spaces. Because lowercasing happens
        after substitution, the placeholders appear in the result as
        ``string_literal`` and ``numeric_literal``.
    """
    def _strip_or_mask(match):
        # A comment becomes one space so the tokens on either side of it stay
        # separate; a string/char literal becomes the placeholder.
        if match.group('comment') is not None:
            return ' '
        return 'STRING_LITERAL'

    # 1 + 2. Remove comments and mask string/char literals in one pass.
    # Whichever construct starts first at a given position wins. A line
    # comment runs to the end of the line (or of the input) and the newline
    # itself is kept; a block comment may span several lines.
    code = re.sub(
        r'(?P<literal>"(?:\\.|[^"\\])*"|\'(?:\\.|[^\'\\])*\')'
        r'|(?P<comment>//[^\n]*|/\*[\s\S]*?\*/)',
        _strip_or_mask,
        code,
    )

    # 3. Mask plain integer / decimal literals (forms such as 0x1F or 10L
    #    are not matched by this pattern and pass through as ordinary tokens).
    code = re.sub(r'\b\d+(\.\d+)?\b', 'NUMERIC_LITERAL', code)

    # 4. Normalize case
    code = code.lower()

    # 5. Tokenize: runs of word characters, or any single non-space symbol
    tokens = re.findall(r'\w+|[^\w\s]', code)

    # 6. Collapse all whitespace to single spaces between tokens
    return ' '.join(tokens)

In [None]:
# Add normalized versions of both functions of every pair, for the train and
# test splits (raw column -> normalized column).
for frame in (train, test):
    for raw_col, clean_col in (('func_x', 'function1'), ('func_y', 'function2')):
        frame[clean_col] = frame[raw_col].apply(preprocess_java_code)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

def embed_code(code, tokenizer, model):
    """Embed one code string by mean-pooling the model's first output.

    Args:
        code: text to embed.
        tokenizer: object whose ``encode(code, return_tensors='pt',
            truncation=True, max_length=512)`` returns a tensor of token ids.
        model: torch module called on that tensor; element ``[0]`` of its
            result is averaged over dim 1 (presumably the token axis, i.e.
            an output shaped (batch, tokens, hidden) -- TODO confirm).
            Side effect: the model is moved to CUDA when available, else CPU.

    Returns:
        numpy.ndarray: the pooled result, copied to CPU.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    tokens = tokenizer.encode(code, return_tensors='pt', truncation=True, max_length=512)
    tokens = tokens.to(device)
    # Inference only: without no_grad every call would record an autograd
    # graph for the whole forward pass, wasting memory and time per row.
    with torch.no_grad():
        pooled = model(tokens)[0].mean(dim=1)
    return pooled.detach().cpu().numpy()


In [None]:
# Load the CodeBERT tokenizer and encoder from the same checkpoint
checkpoint = 'microsoft/codebert-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Embed both normalized functions of every pair, row by row, for the train
# and test splits (normalized column -> embedding column).
embedding_columns = (
    ('function1', 'function1_embedded'),
    ('function2', 'function2_embedded'),
)
for frame in (train, test):
    for text_col, vector_col in embedding_columns:
        frame[vector_col] = frame[text_col].apply(
            lambda snippet: embed_code(snippet, tokenizer, model)
        )

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
def _pair_features(frame):
    """Stack each function's embeddings row-wise, then join the two side by side."""
    first = np.vstack(frame['function1_embedded'].values)
    second = np.vstack(frame['function2_embedded'].values)
    return np.hstack((first, second))


# Feature matrices: one row per function pair
X_train, X_test = _pair_features(train), _pair_features(test)

# Targets come from the 'Label' column of each split
Y_train, Y_test = train['Label'].values, test['Label'].values

In [None]:
# Fit a random forest on the paired embeddings
# (or use: SVC(kernel='linear', C=1, random_state=42))
clf = RandomForestClassifier(random_state=42).fit(X_train, Y_train)

# Predict labels for the held-out test pairs
y_pred = clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1, followed by overall accuracy
report = classification_report(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
print(report)
print("Accuracy: ", accuracy)

              precision    recall  f1-score   support

           0       0.98      0.95      0.96      1481
           1       0.68      0.84      0.76       209

    accuracy                           0.93      1690
   macro avg       0.83      0.89      0.86      1690
weighted avg       0.94      0.93      0.94      1690

Accuracy:  0.9325443786982248
