In [1]:
# Print the full nvidia-smi status table (driver/CUDA versions, per-GPU memory use).
!nvidia-smi
# List one GPU name per line (CSV, no header) and count the lines, i.e. the number of GPUs.
!nvidia-smi --query-gpu=name --format=csv,noheader | wc -l

Tue Sep 27 17:35:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  TITAN X (Pascal)    Off  | 00000000:03:00.0 Off |                  N/A |
| 27%   46C    P8    11W / 250W |    104MiB / 12188MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro P6000        Off  | 00000000:81:00.0 Off |                  Off |
| 26%   49C    P8    10W / 250W |  15264MiB / 24449MiB |      0%      Default |
|       

In [2]:
import os
import pandas as pd
import tqdm
import math
import torch
from torch.utils.data.dataset import Dataset

In [3]:
# Filesystem layout for this run: raw CSVs are read from <root>/input and
# everything produced is written under <root>/output/<op_folder_name>.

def _resolve(base, *parts):
    """Join ``parts`` onto ``base`` and return the normalised absolute path."""
    return os.path.abspath(os.path.join(base, *parts))


root_folder = '/users/kent/jmaharja/drugAbuse/'
op_folder_name = 'oct2022'

# Bare file names; they are resolved to full paths below.
datafile = '2020_01_01.csv'
testfile = '20161007.csv'
outputfile = 'submission.csv'

# Input side.
input_folder = _resolve(root_folder, 'input')
datafile_path = _resolve(input_folder, datafile)
testfile_path = _resolve(input_folder, testfile)

# Output side: model directory, tokenizer directory and the output CSV.
output_folder = _resolve(root_folder, 'output', op_folder_name)
model_folder = _resolve(output_folder, 'RoBERTaMLM')
tokenizer_folder = _resolve(output_folder, 'TokRoBERTa')
outputfile_path = _resolve(output_folder, outputfile)

In [4]:
# Read only the 'text' column, expose it as 'Tweet', and drop rows with a
# missing value. Built as one chain so the cell has no in-place mutation.
train_df = (
    pd.read_csv(
        datafile_path,
        usecols=['text'],
        skipinitialspace=True,
        lineterminator='\n',
    )
    .rename(columns={'text': 'Tweet'})
    .dropna()
)
train_df.shape

(1115630, 1)

# Train a language model from scratch

In [5]:
# --- Sequence lengths (in tokens) ---
MAX_LEN = 128            # upper bound used when tokenizing; longer inputs are truncated
SUMMARY_LEN = 7          # NOTE(review): not referenced in the visible cells -- confirm it is still needed

# --- Batch sizes (passed to the Trainer as per-device sizes) ---
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8

# --- Optimisation ---
TRAIN_EPOCHS = 1
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01

# --- Reproducibility ---
SEED = 42                # random seed

In [6]:
from transformers import RobertaConfig

# Architecture settings for the RoBERTa model built in the next cell.
# NOTE(review): vocab_size presumably matches the tokenizer stored in
# tokenizer_folder -- confirm against the tokenizer-training notebook.
roberta_settings = {
    'vocab_size': 8192,
    'max_position_embeddings': 514,
    'num_attention_heads': 12,
    'num_hidden_layers': 6,
    'type_vocab_size': 1,
}
config = RobertaConfig(**roberta_settings)

In [7]:
from transformers import RobertaForMaskedLM
# Build the masked-LM model from the config alone (no pretrained checkpoint is
# loaded here) and report its size.
model = RobertaForMaskedLM(config)
parameter_count = model.num_parameters()
print('Num parameters: ', parameter_count)

Num parameters:  49816064


In [8]:
# Report how many CUDA devices PyTorch can see; prints nothing when CUDA is
# unavailable. The model is not wrapped in DataParallel here.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    visible_gpus = torch.cuda.device_count()
    print("Let's use", visible_gpus, "GPUs!")


Let's use 2 GPUs!


In [9]:
from transformers import RobertaTokenizerFast
# Load the previously trained tokenizer from tokenizer_folder.
# `model_max_length` is the documented keyword for the tokenizer's maximum
# sequence length; the earlier `max_len` spelling is only a legacy alias.
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, model_max_length=MAX_LEN)

In [10]:
from sklearn.model_selection import train_test_split

# 90% train / 5% validation / 5% test: hold out 10% of the rows, then cut the
# hold-out in half.
# NOTE: this cell overwrites train_df with its own training portion, so running
# it a second time without reloading the data shrinks the training set again.
RANDOM_SEED = 42
HOLDOUT_FRACTION = 0.1
TEST_SHARE_OF_HOLDOUT = 0.5

train_df, holdout_df = train_test_split(
    train_df,
    test_size=HOLDOUT_FRACTION,
    random_state=RANDOM_SEED,
)
val_df, test_df = train_test_split(
    holdout_df,
    test_size=TEST_SHARE_OF_HOLDOUT,
    random_state=RANDOM_SEED,
)

# Building the training Dataset

In [11]:
class CustomDataset(Dataset):
    """In-memory dataset of token-id sequences for masked-language-model training.

    Every text is tokenized once, up front. Each item is a 1-D ``torch.long``
    tensor of input ids. Sequences are truncated to ``max_length`` but are NOT
    padded here; padding is left to the batch-level data collator.

    Args:
        df: Texts to encode. A pandas Series / NumPy array (anything exposing
            ``tolist()``) or any other iterable of strings.
        tokenizer: Callable tokenizer that accepts a list of strings together
            with ``max_length`` and ``truncation`` keywords and returns a
            mapping with an ``"input_ids"`` entry (one id list per text), e.g.
            a Hugging Face tokenizer.
        max_length: Maximum number of tokens kept per sequence. When omitted,
            the notebook-level ``MAX_LEN`` constant is used.
    """

    # Texts handed to the tokenizer per call. Batched calls avoid one Python
    # round-trip per row, while chunking bounds the size of each call.
    _ENCODE_CHUNK_SIZE = 10_000

    def __init__(self, df, tokenizer, max_length=None):
        if max_length is None:
            # Backward-compatible default: the constant from the config cell.
            max_length = MAX_LEN

        texts = df.tolist() if hasattr(df, 'tolist') else list(df)

        self.examples = []
        for start in range(0, len(texts), self._ENCODE_CHUNK_SIZE):
            chunk = texts[start:start + self._ENCODE_CHUNK_SIZE]
            # Calling the tokenizer directly replaces the per-row encode_plus
            # call. No padding is requested: padding a lone sequence to the
            # "longest" is a no-op, and batches are padded by the collator.
            encoded = tokenizer(chunk, max_length=max_length, truncation=True)
            self.examples.extend(encoded['input_ids'])

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return example ``i`` as an unpadded 1-D tensor of token ids."""
        return torch.tensor(self.examples[i], dtype=torch.long)

In [12]:
# Tokenize the 'Tweet' column of the training and validation frames; each
# becomes its own CustomDataset (train first, then eval).
train_dataset, eval_dataset = (
    CustomDataset(frame['Tweet'], tokenizer)
    for frame in (train_df, val_df)
)

In [13]:
# Sanity check: show the first training row next to its token ids.
# Only the last expression of a cell is displayed, so the row has to be printed
# explicitly; as a bare expression its value was silently discarded.
print(train_df.iloc[0])
train_dataset[0]

tensor([   0,   54,   56,  265,  979,  675,  543, 3326,   94,   30,  933,  402,
        1447,  396, 5212,   18,  933,  402, 1447, 1563,   18,  933,  402, 1447,
        7672,  619,  358, 7074,   18,  933,  402, 1447,  795, 1503, 2240,   18,
         933,  402,  307,    2])

In [14]:
from transformers import DataCollatorForLanguageModeling
# Batch collator configured for masked language modelling with a masking
# probability of 0.15.
mlm_settings = dict(mlm=True, mlm_probability=0.15)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **mlm_settings)

In [15]:
from torch import nn
from transformers import Trainer, TrainingArguments

In [16]:
# Directory that receives checkpoints and the final model.
print(model_folder)

# Training configuration. Hyper-parameters come from the configuration cell.
# `seed` is passed explicitly so that the SEED constant actually controls the
# Trainer instead of being declared and never used.
training_args = TrainingArguments(
    output_dir=model_folder,
    overwrite_output_dir=True,
    evaluation_strategy='epoch',   # evaluate on eval_dataset once per epoch
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    save_steps=8192,               # checkpoint every 8192 optimisation steps
    save_total_limit=1,            # keep only the most recent checkpoint
    seed=SEED,
)

# Trainer wiring: model, arguments, MLM collator and the two datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

/users/kent/jmaharja/drugAbuse/output/oct2022/RoBERTaMLM


In [None]:
# Train the model
# Runs training as configured in `training_args` (TRAIN_EPOCHS epochs, with an
# evaluation pass on `eval_dataset` at the end of each epoch).
trainer.train()

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
***** Running training *****
  Num examples = 1004067
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 31378


Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate with the Trainer's eval_dataset and report perplexity, computed as
# exp(eval_loss) and printed with two decimals.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Save the trained model to model_folder (the same directory that was passed
# to TrainingArguments as output_dir).
trainer.save_model(model_folder)