In [None]:
import torch
from datasets import Dataset, load_dataset
# Report whether a CUDA device is usable. The device name and index are only
# queried when CUDA is available, because torch.cuda.get_device_name() and
# torch.cuda.current_device() raise on a CPU-only machine and would otherwise
# abort this setup cell.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name())
    print(torch.cuda.current_device())
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from tqdm import tqdm
import evaluate
# Root directory under which the train/val/test splits are saved and reloaded.
path = "Data/splits"
# Checkpoint identifier used for both the tokenizer and the classifier.
model_path = "google-bert/bert-base-uncased"
# Fast BERT tokenizer loaded from the same checkpoint identifier.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

In [None]:
def load_data(path,tokenized=False):
    """Load the saved Arrow splits found under ``path``.

    Args:
        path: Root directory of the saved splits; sub-paths are appended
            with ``+`` and ``/`` separators.
        tokenized: When true, read the files below ``path + '/tokenized'``
            instead of the ones directly below ``path``.

    Returns:
        The result of ``load_dataset('arrow', data_files=...)`` for the three
        split files. The held-out split is keyed ``'val'`` when
        ``tokenized`` is true and ``'validation'`` otherwise.
    """
    if tokenized:
        root = path + '/tokenized'
        held_out_key = 'val'
    else:
        root = path
        held_out_key = 'validation'

    # Key order (train, test, held-out) matches the order the files are listed in.
    data_files = {
        'train': root + '/train_ds/train_ds.arrow',
        'test': root + '/test_ds/test_ds.arrow',
        held_out_key: root + '/val_ds/val_ds.arrow',
    }
    return load_dataset('arrow', data_files=data_files)

### Train-Val-Test split, run only once

In [None]:
def preprocess_function(id,example,df,max_length=512,chunk_size=500,encode=None):
    """Tokenize one row and split an over-long token sequence into chunks.

    Args:
        id: Row identifier copied into every returned triple. (The name
            shadows the builtin ``id``; it is kept so existing callers work.)
        example: Row or mapping providing ``'processed'`` (the text) and
            ``'class'`` (the label).
        df: Unused; kept for call-site compatibility.
        max_length: Longest encoded sequence, special tokens included, that
            is returned as a single unsplit item.
        chunk_size: Number of content tokens per chunk when splitting. Each
            chunk gets the first and last encoded token re-attached, so
            ``chunk_size + 2`` must not exceed ``max_length``.
        encode: Optional callable mapping text to a list of token ids whose
            first and last entries are the special tokens. Defaults to the
            module-level ``tokenizer.encode``.

    Returns:
        A list of ``[tokens, label, id]`` triples: one triple when the
        encoded text fits in ``max_length``, otherwise one per chunk.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 or a chunk plus its
            two special tokens could exceed ``max_length``.
    """
    if chunk_size < 1 or chunk_size + 2 > max_length:
        raise ValueError(
            "chunk_size must be >= 1 and chunk_size + 2 must not exceed max_length"
        )
    if encode is None:
        encode = tokenizer.encode

    text = example['processed']
    label = example['class']
    tokens = encode(text)

    if len(tokens) <= max_length:
        return [[tokens,label,id]]

    # Strip the leading/trailing special tokens, chunk the content tokens,
    # then wrap every chunk in the same special tokens again.
    cls_token, sep_token = tokens[0], tokens[-1]
    content = tokens[1:-1]
    return [
        [[cls_token, *content[start:start+chunk_size], sep_token], label, id]
        for start in range(0, len(content), chunk_size)
    ]


In [None]:
def preprocess(df):
    """Expand every row of ``df`` into token chunks with integer labels.

    Each row is passed to ``preprocess_function``; every returned chunk
    becomes one output row. Labels are mapped ``"ham" -> 0`` and
    ``"spam" -> 1``. A chunk longer than 512 tokens only triggers an
    ``"ERROR"`` print; it is still kept.

    Args:
        df: DataFrame whose rows are accepted by ``preprocess_function``.

    Returns:
        DataFrame with columns ``'text'`` (token chunk) and ``'label'``
        (integer label), indexed by the id returned for each chunk.
    """
    label_to_int = {"ham":0,"spam":1}
    token_chunks, int_labels, row_ids = [], [], []

    for row_id, row in tqdm(df.iterrows()):
        for tokens, label, source_id in preprocess_function(row_id, row, df):
            if len(tokens) > 512:
                print("ERROR")
            token_chunks.append(tokens)
            int_labels.append(label_to_int[label])
            row_ids.append(source_id)

    return pd.DataFrame({'text': token_chunks, 'label': int_labels}, index=row_ids)

In [None]:
# NOTE(security): pd.read_pickle can execute arbitrary code embedded in the
# file; only load pickles that were produced locally or come from a trusted source.
df = pd.read_pickle("Data/full_df.pkl")
# Drop repeated texts before splitting so the same text cannot land in two splits.
df = df.drop_duplicates(subset=['processed'])
X = df['processed']
y = df['class']

# A fixed seed makes the split reproducible on re-run; stratifying on the
# label keeps the class proportions the same in train, validation and test.
SPLIT_SEED = 42
X_tr, X_test, y_tr, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_tr, y_tr, test_size=0.2, random_state=SPLIT_SEED, stratify=y_tr
)

train_df = pd.DataFrame({'processed':X_train,'class':y_train})
val_df = pd.DataFrame({'processed':X_val,'class':y_val})
test_df = pd.DataFrame({'processed':X_test,'class':y_test})

# Tokenize and chunk each split separately, after splitting, so all chunks
# of one source row stay inside the same split.
train_df = preprocess(train_df)
val_df = preprocess(val_df)
test_df = preprocess(test_df)

In [21]:
# Wrap each preprocessed frame in a Dataset tagged with its split name.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame, split=split_name)
    for frame, split_name in (
        (train_df, 'train'),
        (val_df, 'validation'),
        (test_df, 'test'),
    )
)

# Persist the three datasets below the shared split directory.
train_ds.save_to_disk(f"{path}/train_ds")
test_ds.save_to_disk(f"{path}/test_ds")
val_ds.save_to_disk(f"{path}/val_ds")

Saving the dataset (1/1 shards): 100%|██████████| 38822/38822 [00:00<00:00, 538477.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12319/12319 [00:00<00:00, 425667.35 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9005/9005 [00:00<00:00, 664877.70 examples/s]


### Load data

In [22]:
# Reload the untokenized splits from disk.
ds = load_data(path)

# Drop the '__index_level_0__' column from every split
# (presumably the pandas index carried over by Dataset.from_pandas — TODO confirm).
train_ds, test_ds, val_ds = (
    ds[split_name].remove_columns(['__index_level_0__'])
    for split_name in ('train', 'test', 'validation')
)

Generating train split: 38822 examples [00:00, 487676.46 examples/s]
Generating test split: 12319 examples [00:00, 463795.76 examples/s]
Generating validation split: 9005 examples [00:00, 404772.29 examples/s]


### Decoding tokens and building attention masks with the BERT tokenizer

In [25]:
def decode_and_tokenize_function(example):
    """Decode a stored token chunk to text and re-tokenize it for the model.

    Args:
        example: Mapping with a ``'text'`` entry holding a list of token ids.

    Returns:
        The output of calling the module-level ``tokenizer`` on the decoded
        text, truncated to the tokenizer's maximum length.
    """
    tokens = example['text']
    text = tokenizer.decode(tokens,skip_special_tokens=True)
    # Decoding and re-encoding is not guaranteed to reproduce the original
    # number of tokens, so truncate to the tokenizer's maximum length; without
    # it a chunk that grows past the limit would be passed on untruncated.
    return tokenizer(text,padding=True,truncation=True)

In [26]:
# Re-tokenize every split example by example (train, then test, then validation).
tokenized_train_ds, tokenized_test_ds, tokenized_val_ds = (
    split_ds.map(decode_and_tokenize_function)
    for split_ds in (train_ds, test_ds, val_ds)
)

Map: 100%|██████████| 38822/38822 [02:25<00:00, 266.58 examples/s]
Map: 100%|██████████| 12319/12319 [00:47<00:00, 261.91 examples/s]
Map: 100%|██████████| 9005/9005 [00:34<00:00, 262.05 examples/s]


In [27]:
# Persist the tokenized splits below the 'tokenized' sub-directory.
tokenized_train_ds.save_to_disk(f"{path}/tokenized/train_ds")
tokenized_val_ds.save_to_disk(f"{path}/tokenized/val_ds")
tokenized_test_ds.save_to_disk(f"{path}/tokenized/test_ds")

Saving the dataset (1/1 shards): 100%|██████████| 38822/38822 [00:00<00:00, 304783.64 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9005/9005 [00:00<00:00, 257685.30 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12319/12319 [00:00<00:00, 263224.39 examples/s]


### Init Evaluation metrics, Data Collator and Model

In [28]:
# Reload the tokenized splits from disk; with tokenized=True the splits are
# keyed 'train', 'test' and 'val' (not 'validation').
tokenized_ds = load_data(path,tokenized=True)

Generating train split: 38822 examples [00:00, 285674.41 examples/s]
Generating test split: 12319 examples [00:00, 239046.36 examples/s]
Generating val split: 9005 examples [00:00, 351896.06 examples/s]


In [29]:
# Accuracy metric used by compute_metrics during evaluation.
accuracy = evaluate.load("accuracy")
# Collator that pads the examples of a batch using the shared tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes against
        ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

# Label vocabulary: name -> class index, and its inverse.
label2id = {"ham": 0, "spam": 1}
id2label = {index: name for name, index in label2id.items()}

# Sequence classifier loaded from the base checkpoint, with one output per label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
training_args = TrainingArguments(
    # Where checkpoints go; nothing is pushed to the Hub.
    output_dir="model_checkpoints",
    push_to_hub=False,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    # Two examples per device per step, for both training and evaluation.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate once at the end of every epoch.
    evaluation_strategy="epoch",
)

# Assemble the Trainer from the model, arguments, tokenized splits and helpers.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The tokenized dataset keys its held-out split as "val".
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["val"],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/38822 [12:18<?, ?it/s]


In [31]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

  1%|          | 228/38822 [00:41<2:06:31,  5.08it/s]

KeyboardInterrupt: 