In [1]:
from datasets import load_dataset

dataset = load_dataset("flytech/python-codes-25k" , split='train[:10000]')
dataset

Downloading readme:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 26.4M/26.4M [00:00<00:00, 60.0MB/s]
Downloading data: 100%|██████████| 25.4M/25.4M [00:00<00:00, 77.8MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['instruction', 'text', 'output', 'input'],
    num_rows: 10000
})

This is a Cleaned Python Dataset Covering 25,000 Instructional Tasks
Overview
The dataset has 4 key features (fields): instruction, input, output, and text.
It's a rich source for Python codes, tasks, and extends into behavioral aspects.

1. Dataset Statistics
* Total Entries: 24,813
* Unique Instructions: 24,580
* Unique Inputs: 3,666
* Unique Outputs: 24,581
* Unique Texts: 24,813
* Average Tokens per example: 508
2. Features
* instruction: The instructional task to be performed / User input
* input: Very short, introductive part of AI response or empty
* output: Python code that accomplishes the task
* text: All fields combined together

In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [3]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [4]:
tokenized_datasets

Dataset({
    features: ['instruction', 'text', 'output', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

In [5]:
from transformers import Trainer , TrainingArguments , AutoTokenizer
from transformers import DataCollatorForLanguageModeling
import torch

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

2024-05-25 18:00:15.119372: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-25 18:00:15.119474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-25 18:00:15.239997: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
torch.cuda.empty_cache()
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for saved model and logs
    num_train_epochs=3,     # Number of training epochs
    per_device_train_batch_size=8,  # Batch size for training 
    per_device_eval_batch_size=8,   # Batch size for evaluation
    learning_rate=2e-5,            # Learning rate
    warmup_steps=500,              # Number of warmup steps (optional)
    save_strategy="epoch",        # Save checkpoint after each epoch
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,0.0525
1000,0.0004
1500,0.0001
2000,0.0001
2500,0.0001
3000,0.0001
3500,0.0001


TrainOutput(global_step=3750, training_loss=0.007127240062505007, metrics={'train_runtime': 2041.3198, 'train_samples_per_second': 14.696, 'train_steps_per_second': 1.837, 'total_flos': 7896144384000000.0, 'train_loss': 0.007127240062505007, 'epoch': 3.0})

In [7]:
model.save_pretrained('./fine_tuned_bert')
tokenizer.save_pretrained('./fine_tuned_bert')


('./fine_tuned_bert/tokenizer_config.json',
 './fine_tuned_bert/special_tokens_map.json',
 './fine_tuned_bert/vocab.txt',
 './fine_tuned_bert/added_tokens.json',
 './fine_tuned_bert/tokenizer.json')

In [11]:
from transformers import AutoTokenizer

fine_tuned_model = AutoModelForMaskedLM.from_pretrained('./fine_tuned_bert')
fine_tuned_tokenizer = AutoTokenizer.from_pretrained('./fine_tuned_bert')

prompt = "Create a to do list"

input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors='pt')

output = fine_tuned_model.generate(
    input_ids, 
    max_length=100,
    num_return_sequences=1,
    temperature=1.0,
    top_k=50,
    top_p=0.95,
    do_sample=True
)

generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


create a to do list.......................... 2005. 2004.... 2010.... 2007.... 2010.... 2010.... 2010. 2010.... 2010.... 2010.... 2010. 2010.... 2010. 2010.... 2010.... 2010.. 2010.... 2010. 2010.... 2010.... 2010.... 2010. 2010.... 2010.. 2010.... 2010. 2010.. 2010.... 2010.... 2010....
