In [61]:
import config, logging, os, random, tempfile, time, yaml
import datasets
import lamini
import torch
import transformers

lamini.api_key = os.getenv("4ebe383bfcbbfe832151b39255909ed120aa75e9e1e907d20be586c80cf7e2cd")

from lamini import Lamini
from utilities import *
from torch import nn, optim
from torch import functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments

In [62]:
logger = logging.getLogger(__name__)
global_config = None

use_hf = True  # True
dataset_path = 'lamini_docs.jsonl'
model_name = 'EleutherAI/pythia-410m'
training_config = {
    'model': {
        'pretrained_name': model_name,
        'mx_length': 2048},
    'datasets': {
        'use_hf': use_hf,
        'path': dataset_path},
    'verbose': True
}

In [63]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)
print(train_dataset, test_dataset)

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
}) Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 140
})


In [64]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug('Select GPU device')
    device = torch.device('cuda')
else:
    logger.debug('Select CPU device')
    device = torch.device('cpu')
base_model.to(device)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [65]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    # Tokenize the input text
    input_ids = tokenizer.encode(text, 
                                 return_tensors='pt', 
                                 max_length=max_input_tokens, 
                                 truncation=True)
    # Generate the output
    device = model.device
    generated_tokens_with_prompt = model.generate(input_ids=input_ids.to(device),
                                                  max_length=max_output_tokens)
    # Decode the output
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt,
                                                        skip_special_tokens=True)
    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(text):]
    return generated_text_answer
                                                  

### Try the base model

In [66]:
test_text = test_dataset[0]['question']
print('Question input (test): ', test_text)
print(f'Correct answer from Lamini docs: {test_dataset[0]["answer"]}')
print('Model\'s answer: ')
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test):  Can Lamini generate technical documentation or user manuals for software projects?
Correct answer from Lamini docs: Yes, Lamini can generate technical documentation and user manuals for software projects. It uses natural language generation techniques to create clear and concise documentation that is easy to understand for both technical and non-technical users. This can save developers a significant amount of time and effort in creating documentation, allowing them to focus on other aspects of their projects.
Model's answer: 


A:

I think you are looking for the Lamini documentation.

A:

I think you are looking for the Lamini documentation.

I think you are looking for the Lamini documentation.

I think you are looking for the Lamini documentation.

I think you are looking for the Lamini documentation.

I think you are looking


### Setup Training

In [67]:
max_steps = 100
trained_model_name = f'lamini_docs_{max_steps}_steps'
output_dir = trained_model_name
training_args = TrainingArguments(
    # lr
    learning_rate=1e-5,

    # Training steps
    num_train_epochs=1,

    # Max steps to train for (each step is a batch of data)
    # Overriddes num_train_epochs, if not -1
    max_steps=max_steps,

    # Batch size for training
    per_device_train_batch_size=1,

    # Directory to save the checkpoints
    output_dir=output_dir,
    
    # Other args
    overwrite_output_dir=True,  # Overwrite the content of the output directory
    disable_tqdm=False,  # Disable tqdm progress bars
    eval_steps=120,  # Number of update steps between two evaluations
    save_steps=120,  # After how many steps to save the model
    warmup_steps=2,  # Number of warmup steps for learning rate scheduler
    per_device_eval_batch_size=1,  # Batch size for evaluation
    evaluation_strategy="steps",
    logging_strategy="steps",
    prediction_loss_only=True,
    logging_steps=1,
    optim='adafactor',
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,

    # Params for early stop
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)



In [68]:
model_flops = (
    base_model.floating_point_ops(
        {
            'input_ids': torch.zeros(
                (1, training_config['model']['mx_length']),
            ),
        }
    )
    * training_args.gradient_accumulation_steps)

print(base_model)
print('Memory footprint', base_model.get_memory_footprint() / 1e9, 'GB')
print('Flops', model_flops / 1e9, 'GFLOPs')

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [69]:
trainer = transformers.Trainer(
    model=base_model,
    # model_flops=model_flops,
    # total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)

max_steps is given, it will override any value given in num_train_epochs


In [70]:
training_output = trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

{'loss': 3.5541, 'grad_norm': 38.851478576660156, 'learning_rate': 5e-06, 'epoch': 0.0}
{'loss': 2.5024, 'grad_norm': 22.19594383239746, 'learning_rate': 1e-05, 'epoch': 0.01}
{'loss': 3.1272, 'grad_norm': 26.380901336669922, 'learning_rate': 9.89795918367347e-06, 'epoch': 0.01}
{'loss': 2.5447, 'grad_norm': 25.621217727661133, 'learning_rate': 9.795918367346939e-06, 'epoch': 0.01}
{'loss': 2.437, 'grad_norm': 18.78028106689453, 'learning_rate': 9.693877551020408e-06, 'epoch': 0.02}
{'loss': 2.2266, 'grad_norm': 25.858356475830078, 'learning_rate': 9.591836734693878e-06, 'epoch': 0.02}
{'loss': 3.0474, 'grad_norm': 32.238948822021484, 'learning_rate': 9.489795918367348e-06, 'epoch': 0.02}
{'loss': 2.2725, 'grad_norm': 22.35288429260254, 'learning_rate': 9.387755102040818e-06, 'epoch': 0.03}
{'loss': 2.0118, 'grad_norm': 21.57244110107422, 'learning_rate': 9.285714285714288e-06, 'epoch': 0.03}
{'loss': 2.3951, 'grad_norm': 26.357877731323242, 'learning_rate': 9.183673469387756e-06, 'epo

In [71]:
save_dir = f'{output_dir}/final'
trainer.save_model(save_dir)
print('Saved model to', save_dir)

Saved model to lamini_docs_100_steps/final


In [72]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_slightly_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [73]:
test_question = test_dataset[0]['question']
print('Question input (test): ', test_question)
print('Finetuned slightly model\'s answer: ')
print(inference(test_question, finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test):  Can Lamini generate technical documentation or user manuals for software projects?
Finetuned slightly model's answer: 
Yes, Lamini can generate technical documentation or user manuals for software projects. It can be used to train models and generate documentation for software projects. It can also be used to train models and generate documentation for hardware projects. Additionally, Lamini can be used to train models and generate documentation for data-intensive applications such as machine learning. Additionally, Lamini can be used to train models and generate documentation for


In [74]:
print('Target answer: ', test_dataset[0]['answer'])

Target answer:  Yes, Lamini can generate technical documentation and user manuals for software projects. It uses natural language generation techniques to create clear and concise documentation that is easy to understand for both technical and non-technical users. This can save developers a significant amount of time and effort in creating documentation, allowing them to focus on other aspects of their projects.
