In [1]:
import os
import glob

from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import AutoTokenizer
from transformers import AdamW, get_scheduler
from transformers.modeling_outputs import MaskedLMOutput

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

import torch

In [2]:
cd = os.getcwd()

In [3]:
cd

'C:\\Users\\ismai\\OneDrive\\Masaüstü\\Project Adaptation for Code Modeling models'

In [4]:
# git clone https://github.com/ktorio/ktor.git

In [5]:
def find_kotlin_files(root_dir):
    kotlin_files = []
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.kt'):
                kotlin_files.append(os.path.join(root, file))
    return kotlin_files

In [6]:
def read_kotlin_files(kotlin_files):
    kotlin_code = []
    for file in kotlin_files:
        with open(file, 'r', encoding='utf-8') as f:
            code = f.read()
            kotlin_code.append(code)
    return kotlin_code

In [7]:
# Define the root directory of the cloned Ktor repository
ktor_repo_dir = cd + '/ktor'

In [8]:
ktor_repo_dir

'C:\\Users\\ismai\\OneDrive\\Masaüstü\\Project Adaptation for Code Modeling models/ktor'

In [9]:
# Find Kotlin files in the Ktor project directory
kotlin_files = find_kotlin_files(ktor_repo_dir)

In [10]:
len(kotlin_files)

1971

In [11]:
# Read the contents of Kotlin files
kotlin_code = read_kotlin_files(kotlin_files)

In [12]:
# Print the number of Kotlin files found
print("Number of Kotlin files found:", len(kotlin_files))

# Print the first 500 characters of the first Kotlin file as a sample
print("\nSample Kotlin code:")
print(kotlin_code[0][:500])

Number of Kotlin files found: 1971

Sample Kotlin code:
/*
 * Copyright 2014-2021 JetBrains s.r.o and contributors. Use of this source code is governed by the Apache 2.0 license.
 */
import org.gradle.api.*
import org.gradle.api.tasks.*
import org.gradle.kotlin.dsl.*
import org.jmailen.gradle.kotlinter.tasks.*

fun Project.configureCodestyle() {
    apply(plugin = "org.jmailen.kotlinter")

    kotlinter.apply {
        ignoreFailures = true
        reporters = arrayOf("checkstyle", "plain")
    }

    val editorconfigFile = rootProject.file(".editorc


In [13]:
# %pip install transformers torch

In [14]:
# Define the model name or path for CodeBERT
model_name = "microsoft/codebert-base"

In [15]:
# Load the pre-trained CodeBERT model
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Load the tokenizer associated with the CodeBERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
# Tokenize the Kotlin code
tokenized_kotlin_code = tokenizer(kotlin_code, padding=True, truncation='longest_first', return_tensors="pt", max_length=512)
print("Tokenized input shape:", tokenized_kotlin_code.input_ids.shape)

Tokenized input shape: torch.Size([1971, 512])


In [18]:
# Print out tokenized sequences for inspection
for i, encoding in enumerate(tokenized_kotlin_code.encodings):
    print(f"Tokenized sequence {i + 1}:")
    print(encoding.tokens)
    if i == 1:
        break

Tokenized sequence 1:
['<s>', '/*', 'Ċ', 'Ġ*', 'ĠCopyright', 'Ġ2014', '-', '20', '21', 'ĠJet', 'Br', 'ains', 'Ġs', '.', 'r', '.', 'o', 'Ġand', 'Ġcontributors', '.', 'ĠUse', 'Ġof', 'Ġthis', 'Ġsource', 'Ġcode', 'Ġis', 'Ġgoverned', 'Ġby', 'Ġthe', 'ĠApache', 'Ġ2', '.', '0', 'Ġlicense', '.', 'Ċ', 'Ġ*/', 'Ċ', 'import', 'Ġorg', '.', 'grad', 'le', '.', 'api', '.*', 'Ċ', 'import', 'Ġorg', '.', 'grad', 'le', '.', 'api', '.', 't', 'asks', '.*', 'Ċ', 'import', 'Ġorg', '.', 'grad', 'le', '.', 'k', 'ot', 'lin', '.', 'd', 'sl', '.*', 'Ċ', 'import', 'Ġorg', '.', 'j', 'mail', 'en', '.', 'grad', 'le', '.', 'k', 'ot', 'lin', 'ter', '.', 't', 'asks', '.*', 'Ċ', 'Ċ', 'fun', 'ĠProject', '.', 'config', 'ure', 'Cod', 'estyle', '()', 'Ġ{', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġapply', '(', 'plugin', 'Ġ=', 'Ġ"', 'org', '.', 'j', 'mail', 'en', '.', 'k', 'ot', 'lin', 'ter', '")', 'ĊĊ', 'Ġ', 'Ġ', 'Ġ', 'Ġk', 'ot', 'lin', 'ter', '.', 'apply', 'Ġ{', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġignore', 'Fail', 'ures', 'Ġ=', 'Ġtrue', 'Ċ

In [19]:
# Shuffle the dataset
# kotlin_code_shuffled, _ = train_test_split(kotlin_code, test_size=0.01, random_state=42)

In [20]:
# Split the shuffled dataset into training, validation, and test sets
train_size = 0.6
val_test_size = 0.2
train_kotlin_code, temp_kotlin_code = train_test_split(kotlin_code, train_size=train_size, random_state=42)
val_kotlin_code, test_kotlin_code = train_test_split(temp_kotlin_code, test_size=val_test_size, random_state=42)

In [21]:
# Print the sizes of the training, validation, and test sets
print("Training set size:", len(train_kotlin_code))
print("Validation set size:", len(val_kotlin_code))
print("Test set size:", len(test_kotlin_code))

Training set size: 1182
Validation set size: 631
Test set size: 158


In [22]:
class KotlinDataset(Dataset):
    def __init__(self, tokenized_kotlin_code):
        self.tokenized_kotlin_code = tokenized_kotlin_code

    def __len__(self):
        return len(self.tokenized_kotlin_code)

    def __getitem__(self, idx):
        encoding = self.tokenized_kotlin_code[idx]
        input_ids = encoding.ids
        attention_mask = encoding.attention_mask
        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask)
        }

In [23]:
# Create datasets for training, validation, and test
train_dataset = KotlinDataset(tokenized_kotlin_code[:len(train_kotlin_code)])
val_dataset = KotlinDataset(tokenized_kotlin_code[len(train_kotlin_code):len(train_kotlin_code) + len(val_kotlin_code)])
test_dataset = KotlinDataset(tokenized_kotlin_code[len(train_kotlin_code) + len(val_kotlin_code):])

In [25]:
batch_size = 8

In [26]:
# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [27]:
# Fine-tuning configuration
learning_rate = 5e-4
epochs = 3

In [28]:
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Define the scheduler
num_training_steps = len(train_dataloader) * epochs
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # 10% of training steps for warmup
    num_training_steps=num_training_steps
)



In [None]:
# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the model in training mode
model.train()

# Training loop
for epoch in range(epochs):
    total_loss = 0.0

    # Inside the training loop
    for batch in train_dataloader:
        # Move batch to the appropriate device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(**batch)

        # Check if 'loss' attribute exists
        if hasattr(outputs, "loss") and outputs.loss is not None:
            loss = outputs.loss
            # Backward pass
            loss.backward()
            # Update parameters and learning rate
            optimizer.step()
            scheduler.step()

            # Accumulate loss
            total_loss += loss.item()

    # Calculate average loss for the epoch
    avg_train_loss = total_loss / len(train_dataloader)

        # Validation loop
    val_total_loss = 0.0
    model.eval()
    with torch.no_grad():
        for val_batch in val_dataloader:
            val_batch = {k: v.to(device) for k, v in val_batch.items()}
            val_outputs = model(**val_batch)
            if hasattr(val_outputs, "loss") and val_outputs.loss is not None:
                val_loss = val_outputs.loss
                val_total_loss += val_loss.item()
    # Calculate average validation loss for the epoch
    avg_val_loss = val_total_loss / len(val_dataloader)
    
    # Print average loss for the epoch
    print(f"Epoch {epoch + 1}: Average training loss {avg_train_loss:.4f}, Average validation loss {avg_val_loss:.4f}")


In [None]:
print("a")

In [29]:
# Save the model state dictionary
torch.save(model.state_dict(), 'model_state.pth')

In [30]:
# Initialize a new model instance
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Load the saved model state dictionary
model.load_state_dict(torch.load('model_state.pth'))

<All keys matched successfully>