# GPT2 MODEL

Installing the dependencies

In [None]:
%pip install transformers torch pandas scikit-learn datasets torch accelerate evaluate

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Downloading scipy-1.10.1-cp38-cp38-win_amd64.whl.metadata (58 kB)
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     ---------------------------------- ----- 51.2/59.0 kB 1.3 MB/s eta 0:00:01
     ---------------------------------------- 59.0/59.0 kB 1.0 MB/s eta 0:00:00
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl (9.3 MB)
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---------------------------------------- 0.1/9.3 MB 2.6 MB/s eta 0:00:04
   -- ------------------------------------- 0.6/9.3 MB 6.0 MB/s eta 0:00:02
   ------- -------------------------------- 1.

Load the CodeSearchNet dataset

In [None]:
from datasets import load_dataset, load_metric

# Load the CodeSearchNet dataset
dataset = load_dataset("code_x_glue_ct_code_to_text", "javascript")
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]


# Sample a subset of the data for quick training/testing
train_data = train_data.shuffle(seed=42).select(range(20000))
val_data = val_data.shuffle(seed=42).select(range(1500))

Initialize the tokenizer and model

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
# Add padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    code = examples['code']
    docstring = examples['docstring']
    inputs = tokenizer(code, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(docstring, max_length=128, truncation=True, padding="max_length")
    inputs['labels'] = labels['input_ids']
    return inputs

# Tokenize the training and validation datasets
tokenized_train_data = train_data.map(tokenize_function, batched=True, remove_columns=['repo', 'path', 'func_name', 'original_string', 'code_tokens', 'docstring_tokens'])
tokenized_val_data = val_data.map(tokenize_function, batched=True, remove_columns=['repo', 'path', 'func_name', 'original_string', 'code_tokens', 'docstring_tokens'])


In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import pandas as pd



# Create a data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./GPT2_20000',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=4,
    #warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./GPT2_20000',
    logging_steps=10,
    save_total_limit=2,
    #load_best_model_at_end=True,
    #metric_for_best_model=True,
    #greater_is_better=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # resume_from_checkpoint=True,
)

# Move model to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# # Load the ROUGE metric
# rouge = load_metric("rouge")

# # Function to compute the metric
# def compute_metrics(pred):
#     labels_ids = pred.label_ids
#     pred_ids = pred.predictions

#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

#     # Compute the metric
#     rouge_output = rouge.compute(predictions=pred_str, references=labels_str)
#     return {
#         "rouge1": rouge_output["rouge1"].mid.fmeasure,
#         "rouge2": rouge_output["rouge2"].mid.fmeasure,
#         "rougeL": rouge_output["rougeL"].mid.fmeasure,
#     }

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    data_collator=data_collator,
    # compute_metrics=compute_metrics
)

# Train the model
# trainer.train(resume_from_checkpoint=True)
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Save the model and tokenizer after training
model.save_pretrained('./GPT2_20000')
tokenizer.save_pretrained('./GPT2_20000')



ModuleNotFoundError: No module named 'datasets'

In [None]:
# import torch
# from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# from datasets import load_dataset, load_metric
# import pandas as pd

# # Function to generate summaries
# def generate_summary(code_snippet):
#     inputs = tokenizer(code_snippet, return_tensors="pt", max_length=512, truncation=True).to(device)
#     outputs = model.generate(inputs.input_ids, max_length=128, num_beams=4, early_stopping=True)
#     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return summary

# # Example usage
# code_snippet = "def add(a, b):\n    return a + b"
# summary = generate_summary(code_snippet)
# print(f"Code:\n{code_snippet}\nSummary:\n{summary}")




In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
torch.cuda.empty_cache()
# Determine the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved model and tokenizer
model_path = './GPT2'  # Change this to your actual model path if different
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.to(device)

# Add padding token if it's not there
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

# Function to generate a summary from a code snippet
def generate_summary(code_snippet, model, tokenizer):
    input_text = f"Summarize the following code: {code_snippet}\nSummary:"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    inputs = inputs.to(device)

    summary_ids = model.generate(
        inputs.input_ids,
        max_length=150,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage
code_snippet = '''
let a=parseInt(4);
let b=parseInt(3);
let res=a+b;
return res;
'''

summary = generate_summary(code_snippet, model, tokenizer)
print("Generated Summary:")
print(summary)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# from datasets import load_dataset, load_metric
# import torch
# from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# model = GPT2LMHeadModel.from_pretrained('gpt2')
# # Initialize the tokenizer and model
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# model = GPT2LMHeadModel.from_pretrained('gpt2')
# # Create a data collator
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False,
# )

# # Set up training arguments
# training_args = TrainingArguments(
#     output_dir='./GPT2',
#     overwrite_output_dir=True,
#     num_train_epochs=3,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./GPT2',
#     logging_steps=10,
#     evaluation_strategy="epoch",
# )

# # Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train_data,
#     eval_dataset=tokenized_val_data,
#     data_collator=data_collator,
# )


# # Determine the device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Load the ROUGE metric
# rouge = load_metric("rouge")

# # Function to compute the metric
# def compute_metrics(pred):
#     labels_ids = pred.label_ids
#     pred_ids = pred.predictions

#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

#     # Compute the metric
#     rouge_output = rouge.compute(predictions=pred_str, references=labels_str)
#     return {
#         "rouge1": rouge_output["rouge1"].mid.fmeasure,
#         "rouge2": rouge_output["rouge2"].mid.fmeasure,
#         "rougeL": rouge_output["rougeL"].mid.fmeasure,
#     }

# # Add the compute_metrics function to the trainer
# trainer.compute_metrics = compute_metrics

# # Re-evaluate the model with the compute_metrics function
# eval_results = trainer.evaluate()
# print(f"Evaluation Results with ROUGE: {eval_results}")

In [None]:
import torch
torch.cuda.empty_cache()