In [None]:
!pip install datasets evaluate transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.1

# Preprocessing

In [None]:
from datasets import Dataset, load_dataset

In [None]:
# Download the 'comments' configuration of the Reddit crypto dataset.
raw_datasets_crypto = load_dataset(
    path="SocialGrep/reddit-crypto-aug-2021",
    name='comments',
)

Downloading builder script:   0%|          | 0.00/7.96k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading and preparing dataset reddit-crypto-aug-2021/comments to /root/.cache/huggingface/datasets/SocialGrep___reddit-crypto-aug-2021/comments/1.0.0/a1ff130b46f2ea608c366e39d219bd90ff9f856ca60219fac5e0314fed9ff1d9...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/199M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset reddit-crypto-aug-2021 downloaded and prepared to /root/.cache/huggingface/datasets/SocialGrep___reddit-crypto-aug-2021/comments/1.0.0/a1ff130b46f2ea608c366e39d219bd90ff9f856ca60219fac5e0314fed9ff1d9. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Keep only the comments that were posted in the 'cryptocurrency' subreddit.
def _is_cryptocurrency_comment(example):
    """Return True when the comment belongs to the 'cryptocurrency' subreddit."""
    return example['subreddit.name'] == 'cryptocurrency'


# Substrings whose presence disqualifies a comment (bot notices, embedded media, links).
_UNWANTED_MARKERS = ('I am a bot', '![gif]', 'http://', 'https://', '![img]')


def _is_clean_comment(example):
    """Return True for comments that are not placeholders and contain no unwanted marker."""
    body = example['body']
    if body in ['[deleted]', '[removed]']:
        return False
    return not any(marker in body for marker in _UNWANTED_MARKERS)


raw_datasets_cryptocurrency = raw_datasets_crypto['train'].filter(_is_cryptocurrency_comment)

# Clean the dataset.
# Newline symbols are deliberately left in place: we want them to appear in the generated
# text as well, so the generated comments look more realistic.
# The filter is not perfect, but should get rid of 99% of the unnecessary posts.
dataset_cryptocurrency = raw_datasets_cryptocurrency.filter(_is_clean_comment)

Filter:   0%|          | 0/3756097 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3301330 [00:00<?, ? examples/s]

In [None]:
# Shuffle with a fixed seed, then take the 50 000 comments that land at
# shuffled positions 50 000 .. 99 999.
crypto_50000 = (
    dataset_cryptocurrency
    .shuffle(seed=42)
    .select(list(range(50_000, 100_000)))
)

In [None]:
# Keep an on-disk copy of the sampled subset as a safety net, then show its summary.
crypto_50000.save_to_disk(dataset_path='crypto_data')
crypto_50000

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw', 'created_utc', 'permalink', 'body', 'sentiment', 'score'],
    num_rows: 50000
})

# Fine-tuning

In [None]:
from transformers import AutoTokenizer, GPT2Tokenizer

# Identifier of the pretrained checkpoint used for the tokenizer.
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of comments, each wrapped in end-of-sequence tokens.

    Args:
        examples: Batch mapping whose "body" entry is an iterable of comment strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the wrapped comments,
        called with ``truncation=True``.
    """
    eos = tokenizer.eos_token
    # Surround every comment with EOS so comment boundaries are explicit in the token stream.
    wrapped = [eos + text + eos for text in examples["body"]]
    return tokenizer(wrapped, truncation=True)

# Tokenize in batches and drop every original column so that only the tokenizer
# outputs remain. `column_names` is used instead of a hand-written list so the call
# stays in sync with the dataset schema.
crypto_tokenized = crypto_50000.map(
    tokenize_function,
    batched=True,
    remove_columns=crypto_50000.column_names,
)
crypto_tokenized

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 50000
})

In [None]:
from transformers import DataCollatorForLanguageModeling, GPT2LMHeadModel, TrainingArguments, Trainer

class EosAwareCollator(DataCollatorForLanguageModeling):
    """Causal-LM collator that keeps real end-of-sequence tokens in the labels.

    The stock collator sets every label equal to ``pad_token_id`` to -100. Because the
    tokenizer in this notebook uses the EOS token as its pad token, that also hides the
    EOS tokens wrapped around each comment, so the model would never be trained to end
    a comment. Here only the positions that the attention mask marks as padding are
    excluded from the loss.
    """

    def torch_call(self, examples):
        # Let the parent class handle padding and tensor conversion, then rebuild the labels.
        batch = super().torch_call(examples)
        labels = batch["input_ids"].clone()
        labels[batch["attention_mask"] == 0] = -100
        batch["labels"] = labels
        return batch


model = GPT2LMHeadModel.from_pretrained(checkpoint)
data_collator = EosAwareCollator(tokenizer=tokenizer, mlm=False)
# 12 500 steps at a per-device batch size of 4 cover the 50 000 comments once on a single device.
training_args = TrainingArguments(
    "training",
    per_device_train_batch_size=4,
    max_steps=12500,
    fp16=True,  # fp16 makes the training more memory efficient
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=crypto_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Run the fine-tuning loop; the returned TrainOutput (global step, training loss,
# runtime metrics) is displayed as the cell result.
trainer.train()



Step,Training Loss
500,4.6657
1000,4.4049
1500,4.4212
2000,4.3941
2500,4.38
3000,4.3312
3500,4.2952
4000,4.2934
4500,4.2971
5000,4.2933


TrainOutput(global_step=12500, training_loss=4.2670934765625, metrics={'train_runtime': 1532.1362, 'train_samples_per_second': 32.634, 'train_steps_per_second': 8.159, 'total_flos': 1147759953408000.0, 'train_loss': 4.2670934765625, 'epoch': 1.0})

In [None]:
# Persist the fine-tuned model and the tokenizer so they can be reloaded for inference.
trainer.save_model(output_dir='crypto')
tokenizer.save_pretrained(save_directory='tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json')

# Inference

In [None]:
from transformers import pipeline, set_seed

# Text generation samples randomly; fix the seed so the generated comments are reproducible.
set_seed(42)

generator = pipeline('text-generation', model='crypto', tokenizer=tokenizer)
sentences = generator(
    "Bitcoin is going to the",
    max_length=150,
    num_return_sequences=10,
    # Pass the pad token explicitly to avoid the "Setting `pad_token_id` to `eos_token_id`" warning.
    pad_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Show the generated samples: a list of dicts, each holding one sequence under 'generated_text'.
sentences

[{'generated_text': "Bitcoin is going to the moon? I never thought about it, but a bit concerned how much ETH in the long term will keep it there. I haven't even looked past the hype of the project. And I'm not yet convinced that Bitcoin has the same potential as Ethereum, let alone the future. So there you go! \n\nHope there will be more info. Will just keep an eye out, especially for the comments you see above. I would also like to make some predictions if there ever is a crash. If it happens, it is going to show a real bubble when it actually comes out. So some investors will not do it right. \n\nBut it's just a matter of time for them to start coming up"},
 {'generated_text': 'Bitcoin is going to the moon someday 💚🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🔚🆞\U0001f2de🚀🚀🚀🚀🚀🚀🚀🛀🛀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🎆🚀🎆�'},
 {'generated_text': 'Bitcoin is going to the moon in time. Let’s go! 🙏�🏼🏼\u200d♂️\n\nGood luck and keep flying 🙏�🏼🍻🍻� boasts.\n\nI’m sure that’s the only way for us to go to the moon in time 😂😂👍♂️🏼🍻 🍻🍻🍻🍻🍻🍻🍻\u200d