# Load data

In [1]:
from datasets import load_dataset

data = load_dataset('reddit')['train']

Using custom data configuration default
Reusing dataset reddit (/home/jhuertas/.cache/huggingface/datasets/reddit/default/1.0.0/98ba5abea674d3178f7588aa6518a5510dc0c6fa8176d9653a3546d5afcb3969)


  0%|          | 0/1 [00:00<?, ?it/s]

In [187]:
import pandas as pd
import numpy as np

big_dataset = pd.DataFrame({'id': data['author'], 'text': data['body'], 'subreddit': data['subreddit']})

In [188]:
value_counts = big_dataset.id.value_counts()
valid_authors = value_counts[value_counts > 2][1:].index.tolist()
big_dataset_valid = big_dataset[big_dataset.id.isin(valid_authors)]
in_test = pd.Series(big_dataset_valid.id.unique()).sample(frac=.1).tolist()


In [190]:
big_dataset_train = big_dataset_valid[~big_dataset_valid.id.isin(in_test)]
big_dataset_test = big_dataset_valid[big_dataset_valid.id.isin(in_test)]

In [183]:
sum(big_dataset_train.id.value_counts() <= 1)

0

In [182]:
sum(big_dataset_test.id.value_counts() <= 1)

0

In [178]:
big_dataset_test.set_index('id').loc['Ilikemesomenofap'].sample(2)

Unnamed: 0_level_0,text,subreddit
id,Unnamed: 1_level_1,Unnamed: 2_level_1
Ilikemesomenofap,I was about to relapse... Have been fantasizin...,NoFap
Ilikemesomenofap,"One of the most common errors of nofappers,to ...",NoFap


In [207]:
import csv
big_dataset_train.to_csv('local_data/reddit_train.csv', index=False, quoting=csv.QUOTE_ALL)
big_dataset_test.to_csv('local_data/reddit_test.csv', index=False, quoting=csv.QUOTE_ALL)

In [208]:
train = pd.read_csv('local_data/reddit_train.csv').sample(frac=1.)
test = pd.read_csv('local_data/reddit_test.csv')

In [210]:
print(sum(big_dataset_train.id.value_counts() <= 1))
print(sum(big_dataset_test.id.value_counts() <= 1))
print(sum(train.id.value_counts() <= 1))
print(sum(test.id.value_counts() <= 1))

0
0
0
0


In [211]:
train

Unnamed: 0,id,text,subreddit
1677882,RA_THROWAWY,Update #1: \n\nUpdate #2: \n\nWe've been datin...,relationships
288581,RockLoi,"I've read a couple of novels like this, but I ...",woahdude
1667413,aprilvu,"Hey guys, so in March my husband and I bought ...",personalfinance
1272215,Tsurii,"Of course it's conspiracy theory stuff, the Ma...",WTF
659148,OneTrickPony82,You need an assumption about draw rate to calc...,chess
...,...,...,...
951636,Canadutchian,"It has worked that way in the past, but I woul...",ClickerHeroes
537450,Misanthropy-Divine,"Regardless of how you perceive someone, we're ...",howtonotgiveafuck
256990,carebanana,I hate HP with every fibre of my being. About ...,AskReddit
1724102,Piercemxpx1,So I haven't had any eventful drug stories rec...,Drugs


# Load data (Local)

In [17]:
import pandas as pd
from data import build_dataset
from transformers import AutoTokenizer

train = pd.read_csv('local_data/reddit_train.csv').sample(frac=1.)
test = pd.read_csv('local_data/reddit_test.csv')

train['unique_id'] = train.index.astype(str)
test['unique_id'] = test.index.astype(str)

BATCH_SIZE = 16384
VALID_BATCH_SIZE = 1000
CHUNK_SIZE = 512
TRAINING_STEPS = 3000
VALIDATION_STEPS = 500
WARMUP_STEPS = 0

train_data = build_dataset(train,
                           steps=TRAINING_STEPS*BATCH_SIZE,
                           batch_size=BATCH_SIZE,
                           num_workers=8, 
                           prefetch_factor=8,
                           max_len=CHUNK_SIZE,
                           tokenizer = AutoTokenizer.from_pretrained('roberta-base'),
                           mode='text')
test_data = build_dataset(test, 
                          steps=VALIDATION_STEPS*VALID_BATCH_SIZE, 
                          batch_size=VALID_BATCH_SIZE, 
                          num_workers=4, 
                          prefetch_factor=4, 
                          max_len=CHUNK_SIZE,
                          tokenizer = AutoTokenizer.from_pretrained('roberta-base'),
                          mode='text')

  train = pd.read_csv('local_data/reddit_train.csv').sample(frac=1.)
  test = pd.read_csv('local_data/reddit_test.csv')


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [18]:
import wandb

from datetime import datetime
from transformers import AutoTokenizer, AutoModel
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning import Trainer

from model_experimental import (ContrastiveLSTMTransformer,
                                )

# Name model
date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
save_name = f'final_{date_time}'
print(f'Saving model to {save_name}')

wandb.login()
wandb_logger = WandbLogger(name=save_name, project="author_profiling_reddit")
checkpoint_callback = ModelCheckpoint('model',
                                      filename=save_name,
                                      monitor=None,
                                      every_n_val_epochs=1,
                                      )
lr_monitor = LearningRateMonitor('step')

# Define training arguments
trainer = Trainer(devices=0,
                  max_steps=3000,
                  accelerator='gpu',
                  log_every_n_steps=1,
                  flush_logs_every_n_steps=500,
                  logger=wandb_logger,
                  precision=16,
                  val_check_interval=250,
                  callbacks=[checkpoint_callback, lr_monitor],
                  )

# Define model
base_transformer = AutoModel.from_pretrained('roberta-large')
train_model = ContrastiveLSTMTransformer(base_transformer,
                                         learning_rate=1e-2,
                                         weight_decay=.01,
                                         num_warmup_steps=0,
                                         num_training_steps=3000,
                                         enable_scheduler=True,
                                         minibatch_size=256,)

trainer.fit(train_model, train_data, test_data)
wandb.finish()

Saving model to final_2022-06-08_15-49-38


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjahuerta92[0m (use `wandb login --relogin` to force relogin)
  rank_zero_deprecation(


ValueError: Mismatch between the requested accelerator type (GPU) and assigned device type (CPU).

In [19]:
!nvidia-smi

Wed Jun  8 15:50:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 8000     Off  | 00000000:37:00.0 Off |                  Off |
| 33%   29C    P8    15W / 260W |   1631MiB / 49152MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro RTX 8000     Off  | 00000000:86:00.0 Off |                  Off |
| 59%   79C    P2   240W / 260W |  41421MiB / 49152MiB |    100%      Default |
|       