# Importing necessary libraries 

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
import json
import pandas as pd

# Import the model you would like to fine-tune

In [2]:
model_name = "Salesforce/codegen-350M-multi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.pad_token_id  # Prevents crashing during training



# Import our dataset and combine prompt + response

In [None]:
with open("moreVars.json", "r") as f:
    data = json.load(f)

dataset = pd.DataFrame(data)

dataset["prompts"] = dataset["code"]
dataset["responses"] = dataset["variables"].apply(lambda x: ", ".join(x))  # Convert list to string


combined_texts = [p + "\n" + r for p, r in zip(dataset["prompts"], dataset["responses"])]
hf_dataset = Dataset.from_dict({"text": combined_texts})

# Tokenize data to transform text into numerical format for computational processing.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  

def tokenize_function(examples):
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.select(range(1500))
train_dataset, eval_dataset = tokenized_dataset.train_test_split(test_size=0.2).values()

# Set training arguments and train model on your data!

We want our Training loss and Validation loss to both decrease steadily at the same pace. If our validation loss begins to increase we are at risk of our model overfitting.

In [4]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=6,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=2,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=50,
    evaluation_strategy="steps",  
    eval_steps=50,               
    save_total_limit=2,            
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", 
    greater_is_better=False   
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,'
    [[aSQ
    eval_daaset=eval_dataset,
          )

trainer.train()

trainer.save_model("./my-fully-trained-model") #File we save our model to
tokenizer.save_pretrained("./my-fully-trained-model")



Map:   0%|          | 0/3300 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
50,0.1204,0.105956
100,0.0919,0.086203
150,0.0856,0.080075
200,0.0825,0.08208
250,0.0794,0.085273
300,0.0792,0.080031
350,0.0789,0.077896
400,0.077,0.076162
450,0.0684,0.077273
500,0.0745,0.077939


('./my-fully-trained-model\\tokenizer_config.json',
 './my-fully-trained-model\\special_tokens_map.json',
 './my-fully-trained-model\\vocab.json',
 './my-fully-trained-model\\merges.txt',
 './my-fully-trained-model\\added_tokens.json',
 './my-fully-trained-model\\tokenizer.json')

# Now we load our model back in and apply some post processing functions to ensure accurate results

In [5]:
model = AutoModelForCausalLM.from_pretrained("./my-fully-trained-model")
tokenizer = AutoTokenizer.from_pretrained("./my-fully-trained-model")

In [6]:
import re

def predict_variables(code_snippet):
    prompt = code_snippet.strip() + "\n"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    outputs = model.generate(
        **inputs,
        max_length=128,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result[len(prompt):].strip()

def extract_variable_names(text):
    # Define a list of Python keywords (you can expand this list)
    python_keywords = {'return', 'def', 'class', 'if', 'else', 'for', 'while', 'try', 'except', 'with', 'as', 'import', 'from', 'in', 'sum', 'None'}
    
    # Use regex to capture valid variable names (a better match for most variable naming conventions)
    variables = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', text)
    
    # Filter out Python keywords and common function names
    variables = [var for var in variables if var not in python_keywords]
    
    return list(set(variables))  # Remove duplicates

# Apply the postprocessing function

Here we test our model without post processing

In [27]:
code = "lookup = {'a': 1, 'b': 2}\nstore = [v for v in lookup.values()]"
v
result = predict_variables(code)
print(result)

lookup, store, v


Here we test our model with post processing, showing it stored our values as strings in a list

In [28]:
code = "lookup = {'a': 1, 'b': 2}\nstore = [v for v in lookup.values()]"

result = predict_variables(code)
variables = extract_variable_names(result)
print(variables)  # Should give you only the variable names

['lookup', 'store', 'v']
