In [20]:
from datasets import load_dataset

dataset = load_dataset("flytech/python-codes-25k" , split='train[:10000]')
dataset

Dataset({
    features: ['output', 'text', 'instruction', 'input'],
    num_rows: 10000
})

This is a Cleaned Python Dataset Covering 25,000 Instructional Tasks
Overview
The dataset has 4 key features (fields): instruction, input, output, and text.
It's a rich source for Python codes, tasks, and extends into behavioral aspects.

1. Dataset Statistics
* Total Entries: 24,813
* Unique Instructions: 24,580
* Unique Inputs: 3,666
* Unique Outputs: 24,581
* Unique Texts: 24,813
* Average Tokens per example: 508
2. Features
* instruction: The instructional task to be performed / User input
* input: Very short, introductive part of AI response or empty
* output: Python code that accomplishes the task
* text: All fields combined together

In [21]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [22]:
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples['text'],padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [23]:
tokenized_datasets

Dataset({
    features: ['output', 'text', 'instruction', 'input', 'input_ids', 'attention_mask'],
    num_rows: 10000
})

In [24]:
from transformers import GPT2LMHeadModel , Trainer , TrainingArguments
from transformers import DataCollatorForLanguageModeling
import torch

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

model = GPT2LMHeadModel.from_pretrained('gpt2')

In [25]:
torch.cuda.empty_cache()
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results_medium',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    gradient_accumulation_steps=2
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,1.6751
1000,1.3653
1500,1.2276
2000,1.1461
2500,1.1231
3000,1.0497
3500,1.0428


TrainOutput(global_step=3750, training_loss=1.218771728515625, metrics={'train_runtime': 6195.7546, 'train_samples_per_second': 4.842, 'train_steps_per_second': 0.605, 'total_flos': 1.567752192e+16, 'train_loss': 1.218771728515625, 'epoch': 3.0})

In [29]:
model.save_pretrained('./fine_tuned_gpt2_medium')
tokenizer.save_pretrained('./fine_tuned_gpt2_medium')


('./fine_tuned_gpt2_medium/tokenizer_config.json',
 './fine_tuned_gpt2_medium/special_tokens_map.json',
 './fine_tuned_gpt2_medium/vocab.json',
 './fine_tuned_gpt2_medium/merges.txt',
 './fine_tuned_gpt2_medium/added_tokens.json')

In [31]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

fine_tuned_model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2_medium')
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2_medium')

prompt = "Create a shopping list based on my inputs"

input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors='pt')

output = fine_tuned_model.generate(
    input_ids, 
    max_length=100,
    num_return_sequences=1,
    temperature=1.0,
    top_k=50,
    top_p=0.95,
    do_sample=True
)

generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Create a shopping list based on my inputs in python. [2, 3, 4, 5, 6] Of course, let's tackle this together! ```python
import os
import time
import ctypes
import time.sleep(60)
def my_list_in_python(input):
    input_list = []
    for item in input_list:
        time.sleep(0.1)
