Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
import torch
from transformers import RobertaForMaskedLM, RobertaTokenizer
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from tqdm import tqdm

# Assuming you've already defined and loaded the model and tokenizer
model = RobertaForMaskedLM.from_pretrained('microsoft/codebert-base-mlm')
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base-mlm')

# Prepare your data
code_snippet = "def add(x, y): return x MASK y"
label = "def add(x, y): return x MASK y"  # Updated label

# Tokenization
inputs = tokenizer.encode_plus(
    code_snippet,
    add_special_tokens=False,
    return_tensors="pt"
)

# Tokenize the label
label_tokens = tokenizer.encode_plus(
    label,
    add_special_tokens=False,
    return_tensors="pt"
)["input_ids"]

# Define Dataset and DataLoader
dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], label_tokens)
dataloader = DataLoader(dataset, batch_size=1)

# Set model to training mode
model.train()

# Define optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
num_epochs = 3
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for input_ids, attention_mask, label_tokens in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label_tokens)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {epoch_loss / len(dataloader)}")


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1/3: 100%|██████████| 1/1 [00:02<00:00,  2.69s/it]


Epoch 1 Loss: 0.9016540050506592


Epoch 2/3: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]


Epoch 2 Loss: 0.34876132011413574


Epoch 3/3: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]

Epoch 3 Loss: 0.34836244583129883





In [38]:
# save the model as CodeCommentsTest


model.save_pretrained("model_directory")
tokenizer.save_pretrained("model_directory")


('model_directory/tokenizer_config.json',
 'model_directory/special_tokens_map.json',
 'model_directory/vocab.json',
 'model_directory/merges.txt',
 'model_directory/added_tokens.json')

In [42]:
# load that model and tokenizer and test it



model = RobertaForMaskedLM.from_pretrained("model_directory")

tokenizer = RobertaTokenizer.from_pretrained("model_directory")

# Define your code snippet with the masked token
code_snippet = "def add(x, y): return x <mask> y"

# Tokenize the code snippet
input_ids = tokenizer.encode(code_snippet, return_tensors="pt")

# Generate predictions
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs.logits[0, input_ids[0].tolist().index(tokenizer.mask_token_id)].argmax().item()

# Decode the predicted token
predicted_token = tokenizer.decode(predictions)

print("Predicted token:", predicted_token)




Predicted token:  +


In [24]:
#  load roberta base and save it

from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('microsoft/unixcoder-base')
model = RobertaModel.from_pretrained('microsoft/unixcoder-base')

model.save_pretrained("UniXCoderBase")

In [43]:
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline, AutoTokenizer

# import the roberta model
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")

# use the model
unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)
output = unmasker("Hello I'm a <mask> model.")
print(output)

# save the model
model.save_pretrained("Code")
# save the tokenizer
tokenizer.save_pretrained("Code")

[{'score': 0.33065104484558105, 'token': 2943, 'token_str': ' male', 'sequence': "Hello I'm a male model."}, {'score': 0.04655442014336586, 'token': 2182, 'token_str': ' female', 'sequence': "Hello I'm a female model."}, {'score': 0.04232990741729736, 'token': 2038, 'token_str': ' professional', 'sequence': "Hello I'm a professional model."}, {'score': 0.03721687197685242, 'token': 2734, 'token_str': ' fashion', 'sequence': "Hello I'm a fashion model."}, {'score': 0.03253675624728203, 'token': 1083, 'token_str': ' Russian', 'sequence': "Hello I'm a Russian model."}]


('Code/tokenizer_config.json',
 'Code/special_tokens_map.json',
 'Code/vocab.json',
 'Code/merges.txt',
 'Code/added_tokens.json',
 'Code/tokenizer.json')

In [44]:
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline, AutoTokenizer

# import the roberta model
tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")
model = RobertaForMaskedLM.from_pretrained("huggingface/CodeBERTa-small-v1")

# use the model
fill_mask = pipeline(
    "fill-mask",
    model="huggingface/CodeBERTa-small-v1",
    tokenizer="huggingface/CodeBERTa-small-v1"
)
output = fill_mask("def add(a, b): return a <mask> b")
print(output)

# save the model
# model.save_pretrained("CodeCommentsAST")
# save the tokenizer
tokenizer.save_pretrained("CodeCommentsAST")

Some weights of the model checkpoint at huggingface/CodeBERTa-small-v1 were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at huggingface/CodeBERTa-small-v1 were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializ

[{'score': 0.3511253893375397, 'token': 16, 'token_str': ',', 'sequence': 'def add(a, b): return a, b'}, {'score': 0.17389361560344696, 'token': 424, 'token_str': ' +', 'sequence': 'def add(a, b): return a + b'}, {'score': 0.0844004675745964, 'token': 397, 'token_str': ' *', 'sequence': 'def add(a, b): return a * b'}, {'score': 0.08336357027292252, 'token': 317, 'token_str': ' if', 'sequence': 'def add(a, b): return a if b'}, {'score': 0.051392022520303726, 'token': 608, 'token_str': ' or', 'sequence': 'def add(a, b): return a or b'}]


('CodeCommentsAST/tokenizer_config.json',
 'CodeCommentsAST/special_tokens_map.json',
 'CodeCommentsAST/vocab.json',
 'CodeCommentsAST/merges.txt',
 'CodeCommentsAST/added_tokens.json',
 'CodeCommentsAST/tokenizer.json')

In [6]:
model.save_pretrained("CodeCommentsAST")

In [45]:
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base-mlm')
# save tokenize
tokenizer.save_pretrained("CodeComments")

('CodeComments/tokenizer_config.json',
 'CodeComments/special_tokens_map.json',
 'CodeComments/vocab.json',
 'CodeComments/merges.txt',
 'CodeComments/added_tokens.json')

In [5]:
# load the code model
from transformers import RobertaForMaskedLM, RobertaTokenizer

model = RobertaForMaskedLM.from_pretrained("huggingface/CodeBERTa-small-v1")
tokenizer = RobertaTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")

# Define your code snippet with the masked token
code_snippet = "def add(x, y): return x <mask> y"

# Tokenize the code snippet
input_ids = tokenizer.encode(code_snippet, return_tensors="pt")

# Generate predictions
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs.logits[0, input_ids[0].tolist().index(tokenizer.mask_token_id)].argmax().item()

# Decode the predicted token
predicted_token = tokenizer.decode(predictions)

print("Predicted token:", predicted_token)


Some weights of the model checkpoint at huggingface/CodeBERTa-small-v1 were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted token: ,


In [6]:
# save the codecommentsast model
model.save_pretrained("CodeCommentsAST")
# save the tokenizer
tokenizer.save_pretrained("CodeCommentsAST")


('CodeCommentsAST/tokenizer_config.json',
 'CodeCommentsAST/special_tokens_map.json',
 'CodeCommentsAST/vocab.json',
 'CodeCommentsAST/merges.txt',
 'CodeCommentsAST/added_tokens.json')

In [7]:
import torch 


# load the ast ast model
from transformers import RobertaForMaskedLM, RobertaTokenizer

model = RobertaForMaskedLM.from_pretrained("CodeCommentsAST")
tokenizer = RobertaTokenizer.from_pretrained("CodeCommentsAST")

# Define your code snippet with the masked token
code_snippet = "def add(x, y): return x <mask> y"

# Tokenize the code snippet
input_ids = tokenizer.encode(code_snippet, return_tensors="pt")

# Generate predictions
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs.logits[0, input_ids[0].tolist().index(tokenizer.mask_token_id)].argmax().item()

# Decode the predicted token
predicted_token = tokenizer.decode(predictions)

print("Predicted token:", predicted_token)



Predicted token: ,


In [9]:
import esprima

# JavaScript code as a string
javascript_code = """
function createTypeScriptLanguageService(options) {
    // Discover referenced files
    const FILES = discoverAndReadFiles(options);
    // Add fake usage files
    options.inlineEntryPoints.forEach((inlineEntryPoint, index) => {
        FILES[`${index}`] = inlineEntryPoint;
    });
    // Add additional typings
    options.typings.forEach((typing) => {
        const filePath = path.join(options.sourcesRoot, typing);
        FILES[typing] = fs.readFileSync(filePath).toString();
    });
    // Resolve libs
    const RESOLVED_LIBS = {};
    options.libs.forEach((filename) => {
        const filepath = path.join(TYPESCRIPT_LIB_FOLDER, filename);
        RESOLVED_LIBS[`${filename}`] = fs.readFileSync(filepath).toString();
    });
    const compilerOptions = ts.convertCompilerOptionsFromJson(options.compilerOptions, options.sourcesRoot).options;
    const host = new TypeScriptLanguageServiceHost(RESOLVED_LIBS, FILES, compilerOptions);
    return ts.createLanguageService(host);
}
"""

# Parse the JavaScript code into an AST
ast_tree = esprima.parseScript(javascript_code)

# Print the AST
print(ast_tree)

{
    type: "Program",
    sourceType: "script",
    body: [
        {
            type: "FunctionDeclaration",
            expression: False,
            isAsync: False,
            id: {
                type: "Identifier",
                name: "createTypeScriptLanguageService"
            },
            params: [
                {
                    type: "Identifier",
                    name: "options"
                }
            ],
            body: {
                type: "BlockStatement",
                body: [
                    {
                        type: "VariableDeclaration",
                        declarations: [
                            {
                                type: "VariableDeclarator",
                                id: {
                                    type: "Identifier",
                                    name: "FILES"
                                },
                                init: {
                                    type: "Call