In [1]:
!pip install datasets evaluate transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Choosing a dataset
from datasets import Dataset, load_dataset

# With streaming=True we don't have to download the entire dataset.
raw_datasets = load_dataset(
    "SocialGrep/reddit-wallstreetbets-aug-2021",
    "comments",
    streaming=True,
)

# Take the first 50000 rows of the train split.
raw_datasets_50000 = raw_datasets["train"].take(50000)

In [3]:
# Creating a dataset object

# Collect the "body" field of every streamed row into a plain list.
comments = [row["body"] for row in raw_datasets_50000]

# Wrap the list in a Dataset with a single "comments" column.
dataset = Dataset.from_dict({"comments": comments})
print(dataset)

Dataset({
    features: ['comments'],
    num_rows: 50000
})


In [4]:
# Selecting a checkpoint (name of pretrained model in Huggingface) and appropriate tokenizer
from transformers import AutoTokenizer, GPT2Tokenizer

# Name of the pretrained checkpoint; the tokenizer below is loaded from it.
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer defines no pad token of its own - confirm.
tokenizer.pad_token = tokenizer.eos_token

In [5]:
def tokenize_function(examples):
    """Tokenize the "comments" entry of `examples` with the notebook's tokenizer.

    Truncation is enabled in the tokenizer call. The tokenizer's result is
    returned unchanged.
    """
    texts = examples["comments"]
    return tokenizer(texts, truncation=True)

# Apply the tokenizer in batches and drop the raw text column from the result.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["comments"],
)
tokenized_datasets

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 50000
})

In [6]:
# Setup for training
import torch
from transformers import DataCollatorForLanguageModeling, GPT2LMHeadModel, TrainingArguments, Trainer

# Load the language-model head variant of the chosen checkpoint.
model = GPT2LMHeadModel.from_pretrained(checkpoint)

# mlm=False: the collator is configured without masked-language-modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# fp16 makes the training more memory efficient, but it is only supported on
# CUDA devices, so it is enabled only when a GPU is actually available. This
# keeps the cell runnable on CPU-only machines.
training_args = TrainingArguments(
    output_dir="training",
    per_device_train_batch_size=4,
    max_steps=2000,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [7]:
# Run training with the Trainer built in the setup cell; the returned
# TrainOutput is displayed as this cell's result.
trainer.train()



Step,Training Loss
500,4.2845
1000,4.0578
1500,3.9945
2000,3.9482


TrainOutput(global_step=2000, training_loss=4.071251403808594, metrics={'train_runtime': 276.0265, 'train_samples_per_second': 28.983, 'train_steps_per_second': 7.246, 'total_flos': 222026780160000.0, 'train_loss': 4.071251403808594, 'epoch': 0.16})

In [8]:
# Saving model and tokenizer for inference

# Save the trained model under 'model_2000' and the tokenizer files under 'tokenizer'.
# The tuple of written tokenizer file paths is displayed as this cell's result.
trainer.save_model('model_2000')
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json')

In [10]:
# Testing inference
from transformers import pipeline, set_seed

# Generation draws tokens at random; fixing the seed here makes this cell's
# completions repeatable regardless of what ran before it.
set_seed(42)

# Load the fine-tuned model saved under 'model_2000' and reuse the in-memory tokenizer.
generator = pipeline('text-generation', model='model_2000', tokenizer=tokenizer)

# Produce 10 completions of the prompt, each capped at 60 tokens in total.
generator("Going all in with my", max_length=60, num_return_sequences=10)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Going all in with my wife's boyfriend and all my girlfriends.  I hope i make enough money off the loans to get back on the road with him.  \n\n\nI just hope you like the future and have a strong one. \n\nAnd I can see how you do"},
 {'generated_text': "Going all in with my wife and my daughter. I hope they're okay and will start using a less expensive car now that I'm in the US. \n\nAll of my calls go back to $700. I am at $300 with my wife and daughter. When the sun goes down"},
 {'generated_text': "Going all in with my BTM puts of the week, i dont think its going to hit 1000, so i've only bought 5 bt w/ a 50% daily fee to make sure those get done when I'm fully booked in. \n\nBut   Im sure it could go"},
 {'generated_text': "Going all in with my portfolio, don't be a bit greedy with that, you only get 70k a year that you wouldn't have had in those days..but the portfolio is very similar to your portfolio. It's a lot. Can you say a lot of that about that? \n"},
 {