In [None]:
!pip install datasets evaluate transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Loading and preprocessing data

Some datasets cannot be loaded/streamed from Hugging Face because of various errors. These datasets are downloaded with wget instead.

50 000 examples are taken randomly from each dataset for classification training, and a unified dataset is created from them in which each comment is labeled 0, 1, ... according to the order in which the datasets were processed.

Datasets are chosen randomly - mostly according to what was already available.

In [None]:
from datasets import Dataset, load_dataset

## Subreddit: cryptocurrency

It is not possible to use the streaming=True argument to take a smaller number of comments because the dataset contains more than one subreddit. We have to load the entire dataset and then use filter to retrieve the comments belonging to the cryptocurrency subreddit.

In [None]:
# Non-streaming load of the 'comments' configuration; it is filtered by subreddit afterwards.
raw_datasets_crypto = load_dataset(
    path="SocialGrep/reddit-crypto-aug-2021",
    name="comments",
)

Downloading builder script:   0%|          | 0.00/7.96k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading and preparing dataset reddit-crypto-aug-2021/comments to /root/.cache/huggingface/datasets/SocialGrep___reddit-crypto-aug-2021/comments/1.0.0/a1ff130b46f2ea608c366e39d219bd90ff9f856ca60219fac5e0314fed9ff1d9...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/199M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset reddit-crypto-aug-2021 downloaded and prepared to /root/.cache/huggingface/datasets/SocialGrep___reddit-crypto-aug-2021/comments/1.0.0/a1ff130b46f2ea608c366e39d219bd90ff9f856ca60219fac5e0314fed9ff1d9. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Filter out cryptocurrency subreddit comments
def _is_cryptocurrency_comment(row):
    """Return True when the row's 'subreddit.name' is 'cryptocurrency'."""
    return row['subreddit.name'] == 'cryptocurrency'


raw_datasets_cryptocurrency = raw_datasets_crypto['train'].filter(_is_cryptocurrency_comment)


# Clean the dataset
# Newline symbols are deliberately left in: they should show up in generated text too,
# which gives the generated comments a more realistic appearance.
# The filter is not perfect, but should get rid of 99% unnecessary posts.
def _is_clean_cryptocurrency_comment(row):
    """Return True when the body is not a placeholder and contains no bot, media or link marker."""
    text = row['body']
    if text in ['[deleted]', '[removed]']:
        return False
    unwanted_markers = ('I am a bot', '![gif]', 'http://', 'https://', '![img]')
    return not any(marker in text for marker in unwanted_markers)


dataset_cryptocurrency = raw_datasets_cryptocurrency.filter(_is_clean_cryptocurrency_comment)

Filter:   0%|          | 0/3756097 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3301330 [00:00<?, ? examples/s]

In [None]:
# Select (up to) 50 000 random comments

# Never request more rows than survived the filtering.
n_cryptocurrency = min(50_000, len(dataset_cryptocurrency))

comments_cryptocurrency_50000 = dataset_cryptocurrency.shuffle(seed=42).select(range(n_cryptocurrency))

# list(...) guarantees a plain Python list, so it can later be concatenated with `+`.
comments_cryptocurrency = list(comments_cryptocurrency_50000['body'])

# One label (class 0) per collected comment, so comments and labels always have the same length.
labels_cryptocurrency = [0] * len(comments_cryptocurrency)

## Subreddit: meirl

In [None]:
# Stream the 'comments' configuration and keep only the first 250 000 examples of the train split.
raw_dataset_meirl = load_dataset(
    path="SocialGrep/the-reddit-irl-dataset",
    name="comments",
    streaming=True,
)
meirl_train_stream = raw_dataset_meirl['train']
raw_dataset_meirl_250000 = meirl_train_stream.take(250_000)

Downloading builder script:   0%|          | 0.00/7.86k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

In [None]:
# Filter and clean dataset

def _keep_meirl_comment(row):
    """Return True when the body is not a placeholder and contains no link, moderation, media or bot marker."""
    text = row['body']
    if text in ['[removed]', '[deleted]']:
        return False
    unwanted_markers = (
        'http://',
        'https://',
        'All posts must be titled "meirl"',
        '![gif]',
        '![img]',
        'I am a bot',
    )
    return not any(marker in text for marker in unwanted_markers)


dataset_meirl = raw_dataset_meirl_250000.filter(_keep_meirl_comment)

In [None]:
# Select (up to) 50 000 random comments

# NOTE(review): on a streamed dataset shuffle() presumably only mixes examples inside a
# bounded buffer (see its buffer_size argument) - confirm this is random enough.
comments_meirl = [comment['body'] for comment in dataset_meirl.shuffle(seed=42).take(50_000)]

# The stream can yield fewer than 50 000 comments after filtering, so the labels (class 1)
# are derived from what was actually collected instead of a hard-coded count.
labels_meirl = [1] * len(comments_meirl)

## Subreddit: wallstreetbets

In [None]:
# Stream the 'comments' configuration and keep only the first 250 000 examples of the train split.
raw_datasets_wsb = load_dataset(
    path="SocialGrep/reddit-wallstreetbets-aug-2021",
    name="comments",
    streaming=True,
)
wsb_train_stream = raw_datasets_wsb['train']
raw_datasets_wsb_250000 = wsb_train_stream.take(250_000)

Downloading builder script:   0%|          | 0.00/7.96k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

In [None]:
# Filter and clean dataset
def _keep_wsb_comment(row):
    """Return True when the body is not a placeholder and contains no media, link or bot marker."""
    text = row['body']
    if text in ['[removed]', '[deleted]']:
        return False
    # The bot marker here carries a trailing comma, unlike the other subreddit filters.
    unwanted_markers = ('![gif]', 'http://', 'https://', '![img]', 'I am a bot,')
    return not any(marker in text for marker in unwanted_markers)


dataset_wsb = raw_datasets_wsb_250000.filter(_keep_wsb_comment)

In [None]:
# Select (up to) 50 000 random comments

# NOTE(review): on a streamed dataset shuffle() presumably only mixes examples inside a
# bounded buffer (see its buffer_size argument) - confirm this is random enough.
comments_wsb = [comment['body'] for comment in dataset_wsb.shuffle(seed=42).take(50_000)]

# The stream can yield fewer than 50 000 comments after filtering, so the labels (class 2)
# are derived from what was actually collected instead of a hard-coded count.
labels_wsb = [2] * len(comments_wsb)

## Subreddit: antiwork

Streaming the dataset is not possible, and loading the dataset also throws an error for some samples. wget and pandas are used for selecting comments instead.

In [None]:
import pandas as pd

In [None]:
!wget https://exports.socialgrep.com/download/public/the-antiwork-subreddit-dataset-comments.csv.zip
!unzip the-antiwork-subreddit-dataset-comments.csv.zip

In [None]:
# Create dataframe: read the extracted CSV and keep a seeded random sample of 250 000 rows

df_aw_250000 = (
    pd.read_csv('the-antiwork-subreddit-dataset-comments.csv')
    .sample(n=250_000, random_state=42)
)

In [None]:
# Filter and clean dataset

body_aw = df_aw_250000['body']

# Markers of bot messages, links and embedded media.
# They are matched as literal substrings (regex=False). Interpreted as regular expressions,
# '![gif]' and '![img]' mean "'!' followed by one character out of the set", which never
# matches the literal '![gif]' / '![img]' text these filters are meant to remove.
unwanted_markers_aw = ['*I am a bot', 'http://', 'https://', '![gif]', '![img]']

# Start with rows that have a body which is not a removed/deleted placeholder ...
keep_aw = body_aw.notnull() & ~body_aw.isin(['[removed]', '[deleted]'])

# ... then drop every row whose body contains one of the markers.
for marker in unwanted_markers_aw:
    keep_aw &= ~body_aw.str.contains(marker, regex=False, na=False)

df_aw = df_aw_250000[keep_aw]

In [None]:
# Select (up to) 50 000 random comments

# Cap the request at the number of rows that survived the filtering.
n_aw = min(50_000, len(df_aw))

# Column access + tolist() replaces the row-by-row iterrows() loop.
comments_aw = df_aw.sample(n_aw, random_state=42)['body'].tolist()

# One label (class 3) per collected comment, so comments and labels always have the same length.
labels_aw = [3] * len(comments_aw)

## Subreddit: other

This class is made up of comments from randomly sampled subreddits (the previous subreddits cannot be included) and should represent everything else.

In [None]:
# Stream the dataset and keep only the first 250 000 examples of the train split.
raw_dataset_other = load_dataset(path="reddit", streaming=True)
other_train_stream = raw_dataset_other['train']
raw_dataset_other_250000 = other_train_stream.take(250_000)

Downloading builder script:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.79k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.14k [00:00<?, ?B/s]

In [None]:
# Filter and clean dataset

def _keep_other_comment(row):
    """Return True for a clean comment that does not come from one of the four dedicated subreddits.

    The subreddit name is compared case-insensitively, so differently capitalised
    spellings (e.g. 'WallStreetBets') are excluded as well.
    """
    # NOTE(review): assumes this dataset may store subreddit names with their original
    # capitalisation - confirm against the data.
    subreddit = str(row['subreddit']).lower()
    if subreddit in ('cryptocurrency', 'meirl', 'wallstreetbets', 'antiwork'):
        return False

    text = row['body']
    if text in ['[removed]', '[deleted]']:
        return False

    unwanted_markers = ('http://', 'https://', '![gif]', '![img]', 'I am a bot')
    return not any(marker in text for marker in unwanted_markers)


dataset_other = raw_dataset_other_250000.filter(_keep_other_comment)

In [None]:
# Select (up to) 50 000 random comments

# NOTE(review): on a streamed dataset shuffle() presumably only mixes examples inside a
# bounded buffer (see its buffer_size argument) - confirm this is random enough.
comments_other = [comment['body'] for comment in dataset_other.shuffle(seed=42).take(50_000)]

# The stream can yield fewer than 50 000 comments after filtering, so the labels (class 4)
# are derived from what was actually collected instead of a hard-coded count.
labels_other = [4] * len(comments_other)

# Creating train, validation and test datasets from unified dataset

In [None]:
# (comments, labels) pairs in class order 0..4
per_class_data = [
    (comments_cryptocurrency, labels_cryptocurrency),
    (comments_meirl, labels_meirl),
    (comments_wsb, labels_wsb),
    (comments_aw, labels_aw),
    (comments_other, labels_other),
]

# Every comment needs exactly one label; a shortfall in any class would otherwise shift the
# labels of all following classes in the concatenated lists.
for class_comments, class_labels in per_class_data:
    if len(class_comments) != len(class_labels):
        raise ValueError(
            f'Got {len(class_comments)} comments but {len(class_labels)} labels for one of the classes'
        )

comments_all = [comment for class_comments, _ in per_class_data for comment in class_comments]
labels_all = [label for _, class_labels in per_class_data for label in class_labels]

In [None]:
# One row per comment: the text goes to 'comment', its class index to 'label'.
unified_columns = {
    'comment': comments_all,
    'label': labels_all,
}
dataset_unified = Dataset.from_dict(unified_columns)

Dataset will be split 80/10/10

In [None]:
# First cut: 20 % of the data is held out.
dataset_first_split = dataset_unified.train_test_split(test_size=0.2, seed=42)

# Second cut: the held-out part is halved into validation and test.
held_out_split = dataset_first_split['test']
dataset_second_split = held_out_split.train_test_split(test_size=0.5, seed=42)

dataset_train, dataset_val, dataset_test = (
    dataset_first_split['train'],
    dataset_second_split['train'],
    dataset_second_split['test'],
)

In [None]:
# Print a heading followed by the summary of each split.
split_summaries = (
    ('Training dataset', dataset_train),
    ('\nValidation dataset', dataset_val),
    ('\nTest dataset', dataset_test),
)
for heading, split in split_summaries:
    print(heading)
    print(split)

Training dataset
Dataset({
    features: ['comment', 'label'],
    num_rows: 200000
})

Validation dataset
Dataset({
    features: ['comment', 'label'],
    num_rows: 25000
})

Test dataset
Dataset({
    features: ['comment', 'label'],
    num_rows: 25000
})


In [None]:
# Save datasets to disk, one directory per split

for split, directory in (
    (dataset_train, 'train'),
    (dataset_val, 'val'),
    (dataset_test, 'test'),
):
    split.save_to_disk(directory)

Saving the dataset (0/1 shards):   0%|          | 0/200000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

# Training BERT classifier

In [None]:
# Load datasets
from datasets import load_from_disk

# Read the three splits back from their directories.
dataset_train, dataset_val, dataset_test = (
    load_from_disk(directory) for directory in ('train', 'val', 'test')
)

In [None]:
# Loading pretrained BERT model and tokenizer
from transformers import DistilBertTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

checkpoint = 'distilbert-base-uncased'

# The unified dataset only uses the labels 0-4 (cryptocurrency, meirl, wallstreetbets,
# antiwork, other), so the classification head needs exactly five outputs.
NUM_LABELS = 5

tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=NUM_LABELS)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [None]:
def tokenize_function(data):
    """Tokenize the 'comment' entries of `data` with truncation enabled."""
    texts = data['comment']
    return tokenizer(texts, truncation=True)


# Tokenize the three splits in batched mode (train, then validation, then test).
dataset_train_tokenized, dataset_val_tokenized, dataset_test_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (dataset_train, dataset_val, dataset_test)
)

# Collator that is later handed to the Trainer and to the test DataLoader.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [None]:
# Based on: https://huggingface.co/docs/transformers/v4.17.0/en/tasks/sequence_classification

training_args = TrainingArguments(
    # Output directory
    output_dir="./results",
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Training length and the evaluation / logging cadence, all expressed in steps
    max_steps=1000,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_train_tokenized,
    eval_dataset=dataset_val_tokenized,
)

In [None]:
# Run the training; the returned TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss,Validation Loss
500,0.8309,0.796138
1000,0.7662,0.742055


TrainOutput(global_step=1000, training_loss=0.8919328842163086, metrics={'train_runtime': 1184.6469, 'train_samples_per_second': 13.506, 'train_steps_per_second': 0.844, 'total_flos': 1586041805098944.0, 'train_loss': 0.8919328842163086, 'epoch': 0.08})

In [None]:
# Save the fine-tuned classifier under 'classifier_bert_fine_tuned'.
trainer.save_model('classifier_bert_fine_tuned')
# Save the tokenizer files under 'tokenizer'; the written paths are displayed as the cell result.
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json')

## Evaluating on the test data

In [None]:
import evaluate
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Prepare the tokenized test split for a plain PyTorch DataLoader: drop the raw text column
# and rename 'label' to 'labels'.
# The guards make the cell safe to re-run: without them a second run fails because the
# 'comment' column has already been removed.
if 'comment' in dataset_test_tokenized.column_names:
    dataset_test_tokenized = dataset_test_tokenized.remove_columns(['comment'])
if 'label' in dataset_test_tokenized.column_names:
    dataset_test_tokenized = dataset_test_tokenized.rename_column('label', 'labels')

dataloader_test = DataLoader(
    dataset_test_tokenized, batch_size=16, collate_fn=data_collator
)

In [None]:
# Just a check: fetch the first batch and show the shape of every tensor in it
batch = next(iter(dataloader_test))
{name: tuple(tensor.shape) for name, tensor in batch.items()}

In [None]:
accuracy_metric = evaluate.load("accuracy")

model.eval()

# Wrapping the dataloader lets tqdm advance and close the bar on its own.
progress_bar = tqdm(dataloader_test)

# Inference only: without no_grad() an autograd graph is recorded for every batch,
# which costs memory and time for no benefit.
with torch.no_grad():
    for batch in progress_bar:
        batch = batch.to(device)
        logits = model(**batch).logits
        predictions = torch.argmax(logits, dim=-1)

        accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])

# Last expression of the cell, so the accuracy dictionary is displayed.
accuracy_metric.compute()

  0%|          | 0/1563 [00:00<?, ?it/s]

{'accuracy': 0.6974}