In [1]:
# Show the driver/CUDA versions and the current state of every visible GPU.
!nvidia-smi
# Count the GPUs: print one GPU name per line (no CSV header) and count the lines.
!nvidia-smi --query-gpu=name --format=csv,noheader | wc -l

Wed Sep 28 12:14:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  TITAN X (Pascal)    Off  | 00000000:03:00.0 Off |                  N/A |
| 23%   28C    P8     9W / 250W |    104MiB / 12188MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro P6000        Off  | 00000000:81:00.0 Off |                  Off |
| 26%   21C    P8     8W / 250W |  15264MiB / 24449MiB |      0%      Default |
|       

In [2]:
import math
import os
import shutil

import pandas as pd
import tqdm

In [3]:
# Central configuration: every input/output location used by the notebook is
# derived here from the project root and the run-specific output folder name.

def _abs_join(*parts):
    """Join path components and return the normalized absolute path."""
    return os.path.abspath(os.path.join(*parts))


# Run-specific sub-folder below <root>/output and the project root itself.
op_folder_name = 'oct2022'
root_folder = '/users/kent/jmaharja/drugAbuse/'

# Bare file names; they are resolved against the folders defined below.
datafile = '2020_01_01.csv'
testfile = '20161007.csv'
outputfile = 'submission.csv'

# Output side: run folder, model folder, tokenizer folder, submission file.
output_folder = _abs_join(root_folder, 'output/' + op_folder_name)
model_folder = _abs_join(output_folder, 'RoBERTaMLM/')
tokenizer_folder = _abs_join(output_folder, 'TokRoBERTa/')
outputfile_path = _abs_join(output_folder, outputfile)

# Input side: data folder plus the train and test CSV files inside it.
input_folder = _abs_join(root_folder, 'input/')
datafile_path = _abs_join(input_folder, datafile)
testfile_path = _abs_join(input_folder, testfile)

In [4]:
# Load only the 'text' column of the primary CSV, expose it under the column
# name 'Tweet', and discard rows that have no text.
train_df = (
    pd.read_csv(
        datafile_path,
        lineterminator='\n',
        skipinitialspace=True,
        usecols=['text'],
    )
    .rename(columns={'text': 'Tweet'})
    .dropna()
)
train_df.shape

(1115630, 1)

In [5]:
# Second tweet file. It lives in the same input folder as the other data
# files, so the path is derived from `input_folder` instead of being
# hard-coded; changing `root_folder` in the config cell now moves it as well.
cleaned_tweets_path = os.path.join(input_folder, '2020_01_31_CleanedTweets.csv')
df2 = pd.read_csv(
    cleaned_tweets_path,
    lineterminator='\n',
    skipinitialspace=True,
)

# Drop the first two columns by position and rename 'text' to 'Tweet' so the
# frame lines up with train_df. Non-inplace calls keep the cell re-runnable.
# NOTE(review): the first two columns are presumably index/ID columns -- confirm.
df2 = df2.drop(columns=df2.columns[[0, 1]]).rename(columns={'text': 'Tweet'})

# Stack both corpora into the frame used for tokenizer training.
df = pd.concat([train_df, df2])
df.shape

(2281116, 1)

# Build a Tokenizer

In [None]:
# Start from an empty <output_folder>/text_split directory: files left over
# from a previous run are removed before new ones are written.
# os/shutil are used instead of `!rm -rf {...}` / `!mkdir {...}` so that paths
# containing spaces or shell metacharacters are handled safely and the cell
# can be re-run without errors.
os.makedirs(output_folder, exist_ok=True)
txt_files_dir = output_folder + "/text_split"

shutil.rmtree(txt_files_dir, ignore_errors=True)  # like `rm -rf`: a missing dir is fine
os.makedirs(txt_files_dir, exist_ok=True)

In [6]:
def column_to_files(column, prefix, txt_files_dir):
    """Write every value of a column to its own UTF-8 text file.

    Files are named ``<n>.txt`` inside ``txt_files_dir``. ``n`` starts at
    ``prefix`` and grows by one per record, so feeding the returned value back
    in as the next call's ``prefix`` avoids overwriting earlier files.

    Args:
        column: Object exposing ``to_list()`` (e.g. a pandas Series) whose
            values are expected to be strings.
        prefix: Integer ID used for the first file.
        txt_files_dir: Existing directory that receives the files.

    Returns:
        The next unused ID, i.e. ``prefix`` plus the number of records
        (records that could not be written still consume an ID).
    """
    i = prefix
    for row in column.to_list():
        file_name = os.path.join(txt_files_dir, str(i) + '.txt')
        try:
            # Encode before opening: a non-string record (e.g. NaN) fails
            # here, so no empty file is left behind for it.
            payload = row.encode('utf-8')
            # The context manager closes the handle even if the write fails.
            with open(file_name, 'wb') as f:
                f.write(payload)
        except Exception as e:
            # Best effort: report the offending record and keep going.
            print(row, e)
        i += 1
    return i

In [None]:
# Flatten every tweet onto a single line before writing it out.
# Series.replace("\n", " ") would only swap values that are *exactly* "\n";
# .str.replace performs the intended substring replacement inside each tweet.
# Rows without text are dropped first, since they cannot be written to a file.
data = df["Tweet"].dropna().str.replace("\n", " ", regex=False)
prefix = 0
# Create one file per tweet; the return value is the next unused file ID.
prefix = column_to_files(data, prefix, txt_files_dir)
print(prefix)

In [7]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import torch
from torch.utils.data.dataset import Dataset

In [8]:
%%time 
paths = [str(x) for x in Path('/users/kent/jmaharja/drugAbuse/output/oct2022/').glob("text_split/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=True)

# Customize training
tokenizer.train(files=paths, vocab_size=8192, min_frequency=2,
                show_progress=True,
                special_tokens=[
                                "<s>",
                                "<pad>",
                                "</s>",
                                "<unk>",
                                "<mask>",
])

CPU times: user 6h 21min 19s, sys: 2h 50min 48s, total: 9h 12min 8s
Wall time: 1h 2min 41s


In [9]:
# Save the tokenizer (vocab.json + merges.txt) to disk. The target directory
# is created first so the save also works on a fresh output folder.
os.makedirs(tokenizer_folder, exist_ok=True)
tokenizer.save_model(tokenizer_folder)

['/users/kent/jmaharja/drugAbuse/output/oct2022/TokRoBERTa/vocab.json',
 '/users/kent/jmaharja/drugAbuse/output/oct2022/TokRoBERTa/merges.txt']

In [15]:
# Display the resolved tokenizer directory path.
tokenizer_folder

'/users/kent/jmaharja/drugAbuse/output/oct2022/TokRoBERTa'

In [18]:
# Configure post-processing: BertProcessing receives the (token, id) pairs for
# "</s>" and "<s>", looked up in the trained vocabulary, so encodings are
# wrapped as "<s> ... </s>".
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [19]:
# Sanity check: encode a short sample sentence with the trained tokenizer.
tokenizer.encode("cook some blue.")

Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [20]:
# special_tokens_mask marks special-token positions with 1 and regular tokens with 0.
tokenizer.encode("cook some blue.").special_tokens_mask

[1, 0, 0, 0, 0, 1]

In [21]:
# Token strings for the same sample; in byte-level BPE a leading 'Ġ' encodes a preceding space.
tokenizer.encode("cook some blue.").tokens

['<s>', 'cook', 'Ġsome', 'Ġblue', '.', '</s>']