In [1]:
import datasets
import numpy as np
import torch
import transformers

np.set_printoptions(edgeitems=10, linewidth=70)
torch.set_printoptions(edgeitems=10, linewidth=70)


transformers.logging.set_verbosity_warning()
datasets.logging.set_verbosity_error()

In [2]:
from datasets import load_dataset

raw_datasets = load_dataset("sh0416/ag_news")
raw_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/33.7M [00:00<?, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'description'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['label', 'title', 'description'],
        num_rows: 7600
    })
})

In [3]:
#describe features
raw_dataset = raw_datasets["train"]
raw_dataset.features

{'label': Value('int64'),
 'title': Value('string'),
 'description': Value('string')}

In [4]:
filtered_datasets = raw_datasets.filter(lambda example: example["label"] == 3)
filtered_datasets = filtered_datasets.remove_columns("label")

Filter:   0%|          | 0/120000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "HuggingFaceTB/SmolLM-135M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = (
    tokenizer.eos_token
)  # Needed as SmolLM does not specify padding token.
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [6]:
filtered_datasets["train"]["description"][1]

'Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.'

In [7]:
def tokenize_function(batch):
 return tokenizer(
 batch["description"], truncation=True
 )


In [8]:
tokenized_datasets = filtered_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["description","title"],  # We only need the input_ids and attention_mask
)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1900 [00:00<?, ? examples/s]

In [9]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1900
    })
})

In [10]:
from transformers import DataCollatorForLanguageModeling

# mlm corresponds to masked language modeling
# and we set it to False as we are not training a masked language model
# but a causal language model
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [34]:
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='HuggingFaceTB/SmolLM-135M', vocab_size=49152, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<jupyter_script>', '<empty_output>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|im_end|>", rstri

In [11]:
samples = [tokenized_datasets["train"][i] for i in range(3)]
samples



[{'input_ids': [3733,
   22287,
   731,
   11751,
   29,
   99,
   16549,
   28,
   8990,
   7216,
   506,
   39124,
   76,
   5640,
   282,
   18332,
   29,
   873,
   94,
   747,
   28,
   359,
   6268,
   2654,
   1163,
   30],
  'attention_mask': [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1]},
 {'input_ids': [3733,
   22287,
   731,
   18950,
   5642,
   5512,
   340,
   1019,
   290,
   6404,
   29323,
   5210,
   553,
   253,
   9748,
   327,
   1625,
   876,
   29,
   32888,
   284,
   10322,
   76,
   2651,
   6849,
   409,
   4566,
   281,
   260,
   7991,
   2729,
   28,
   553,
   20151,
   4294,
   76,
   842,
   48200,
   335,
   1372,
   599,
   282,
   260,
   2342,
   30],
  'attention_mask': [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
  

In [12]:
for sample in samples:
    print(f"input_ids shape: {len(sample['input_ids'])}")

input_ids shape: 26
input_ids shape: 45
input_ids shape: 40


In [13]:
out = data_collator(samples)
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([3, 45])
attention_mask shape: torch.Size([3, 45])
labels shape: torch.Size([3, 45])


In [37]:
out = data_collator(samples)
out

{'input_ids': tensor([[ 3733, 22287,   731, 11751,    29,    99, 16549,    28,
          8990,  7216,   506, 39124,    76,  5640,   282, 18332,
            29,   873,    94,   747,    28,   359,  6268,  2654,
          1163,    30,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 3733, 22287,   731, 18950,  5642,  5512,   340,  1019,
           290,  6404, 29323,  5210,   553,   253,  9748,   327,
          1625,   876,    29, 32888,   284, 10322,    76,  2651,
          6849,   409,  4566,   281,   260,  7991,  2729,    28,
           553, 20151,  4294,    76,   842, 48200,   335,  1372,
           599,   282,   260,  2342,    30],
        [ 3733, 22287,   731,  1644,  1807, 17196,  5770,  8055,
         18962,    76, 10097,   260,  4118,   284,   260, 20465,
           327, 17627,   359,  3393,   288,    76, 31603,   690,
           260,  6132,  2342,  1867,  2605,   981, 

In [14]:
from transformers import TrainingArguments
from transformers import Trainer
training_args = TrainingArguments(
    "business-news-generator",
    push_to_hub=True,
    per_device_train_batch_size=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    num_train_epochs=2,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=200,
)

In [15]:
#login to hf hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"].select(range(5000)),
    eval_dataset=tokenized_datasets["test"],
)

  trainer = Trainer(


In [17]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
200,3.0667,3.199134
400,2.7866,3.119412
600,2.5831,3.009943
800,1.6321,3.18091
1000,1.4432,3.158066
1200,1.3911,3.158262


TrainOutput(global_step=1250, training_loss=2.1205936950683593, metrics={'train_runtime': 791.1237, 'train_samples_per_second': 12.64, 'train_steps_per_second': 1.58, 'total_flos': 538339371070464.0, 'train_loss': 2.1205936950683593, 'epoch': 2.0})

In [18]:
trainer.push_to_hub()

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...nerator/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...21448.b3c9889024e4.1090.0: 100%|##########| 8.35kB / 8.35kB            

  ...nerator/model.safetensors:   6%|6         | 33.5MB /  538MB            

CommitInfo(commit_url='https://huggingface.co/kmrao99/business-news-generator/commit/babb4113b6fc6b548b81546ee4586c73fbdf009a', commit_message='End of training', commit_description='', oid='babb4113b6fc6b548b81546ee4586c73fbdf009a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kmrao99/business-news-generator', endpoint='https://huggingface.co', repo_type='model', repo_id='kmrao99/business-news-generator'), pr_revision=None, pr_num=None)

In [19]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="kmrao99/business-news-generator",
    device=device,
)
print(
    pipe("Q1", do_sample=True, temperature=0.1, max_new_tokens=30)[0][
        "generated_text"
    ]
)
print(
    pipe("Wall", do_sample=True, temperature=0.1, max_new_tokens=30)[0][
        "generated_text"
    ]
)
print(
    pipe("Google", do_sample=True, temperature=0.1, max_new_tokens=30)[0][
        "generated_text"
    ]
)

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

Device set to use cuda


Q1000: The US Airways pilots #39; union said yesterday it was ready to accept the pilots #39; proposal to cut
Wall Street stocks closed higher Wednesday, with the Dow Jones Industrial Average rising 10 points to 10,000.10, after
Google's stock price is set at \$85 and its market value is calculated at \$2.6 billion, according to the company's official
