In [125]:
import torch
from datasets import Dataset, load_dataset
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())
print(torch.cuda.current_device())
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from tqdm import tqdm
import evaluate
path = 'Data/splits'
model_path = 'google-bert/bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_path)

True
NVIDIA GeForce RTX 2060
0


### Train-Val-Test split, run only once

In [120]:
def preprocess_function(id,example,df):
    text = example['processed']
    label = example['class']
    tokens = tokenizer.encode(text)
    out = []
    if len(tokens) <= 512:
        out.append([tokens,label,id])
        return out
    else:
        cls_token = tokens[0]
        sep_token = tokens[-1]
        tokens = tokens[1:-1] # remove CLS and SEP tokens
        chunks = [tokens[i:i+510] for i in range(0,len(tokens),510)]
        for c in chunks: # add back CLS and SEP tokens
            c.insert(0,cls_token)
            c.append(sep_token)
            out.append([c,label,id])
        return out


In [116]:
def preprocess(df):
    new_df = {'processed':[],'class':[]}
    ids = []
    mapping = {"ham":0,"spam":1}
    for i,row in tqdm(df.iterrows()):
        output = preprocess_function(i,row,df)
        for chunk,label,idx in output:
            new_df['processed'].append(chunk)
            new_df['class'].append(mapping[label])
            ids.append(idx)
    final_df = pd.DataFrame(new_df,index=ids)
    return final_df

In [117]:
df = pd.read_pickle("Data/full_df.pkl")
df = df.drop_duplicates(subset=['processed'])
X = df['processed']
y = df['class']
X_tr, X_test, y_tr, y_test = train_test_split(X,y,test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_tr,y_tr,test_size=0.2)
train_df = pd.DataFrame({'processed':X_train,'class':y_train})
val_df = pd.DataFrame({'processed':X_val,'class':y_val})
test_df = pd.DataFrame({'processed':X_test,'class':y_test})
train_df = preprocess(train_df)
val_df = preprocess(val_df)
test_df = preprocess(test_df)

26078it [02:07, 204.78it/s]
6520it [00:35, 183.66it/s]
8150it [00:38, 210.52it/s]


In [109]:
train_ds = Dataset.from_pandas(train_df,split='train')
val_ds = Dataset.from_pandas(val_df,split='validation')
test_ds = Dataset.from_pandas(test_df,split='test')
train_ds.save_to_disk(path+'/train_ds')
test_ds.save_to_disk(path+'/test_ds')
val_ds.save_to_disk(path+'/val_ds')

Saving the dataset (1/1 shards): 100%|██████████| 39081/39081 [00:00<00:00, 479530.98 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 11308/11308 [00:00<00:00, 559557.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9382/9382 [00:00<00:00, 255789.81 examples/s]


### Load data

In [128]:
def load_data(path,tokenized=False):
    if tokenized:
        full_ds = load_dataset('arrow',data_files={
            'train':path+'/tokenized/train_ds/train_ds.arrow',
            'test':path+'/tokenized/test_ds/test_ds.arrow',
            'val':path+'/tokenized/val_ds/val_ds.arrow'
        })
    else:
        full_ds = load_dataset('arrow',data_files={
            'train':path+'/train_ds/train_ds.arrow',
            'test':path+'/test_ds/test_ds.arrow',
            'validation':path+'/val_ds/val_ds.arrow'
        })
    return full_ds

ds = load_data(path)
train_ds = ds['train']
test_ds = ds['test']
val_ds = ds['validation']
train_ds = train_ds.remove_columns(['__index_level_0__'])
test_ds = test_ds.remove_columns(['__index_level_0__'])
val_ds = val_ds.remove_columns(['__index_level_0__'])

### Decoding of tokens and getting attention maps via BERT Tokenizer

In [113]:
def decode_and_tokenize_function(example):
    tokens = example['processed']
    text = tokenizer.decode(tokens)
    return tokenizer(text,padding=True)

In [119]:
tokenized_train_ds = train_ds.map(decode_and_tokenize_function)

Map: 100%|██████████| 39081/39081 [02:40<00:00, 244.20 examples/s]


In [122]:
tokenized_test_ds = test_ds.map(decode_and_tokenize_function)
tokenized_val_ds = val_ds.map(decode_and_tokenize_function)

Map:   0%|          | 0/11308 [00:00<?, ? examples/s]

Map: 100%|██████████| 11308/11308 [00:41<00:00, 273.37 examples/s]
Map: 100%|██████████| 9382/9382 [00:35<00:00, 263.04 examples/s]


In [123]:
tokenized_train_ds.save_to_disk(path+'/tokenized/train_ds')
tokenized_val_ds.save_to_disk(path+'/tokenized/val_ds')
tokenized_test_ds.save_to_disk(path+'/tokenized/test_ds')

Saving the dataset (1/1 shards): 100%|██████████| 39081/39081 [00:00<00:00, 103684.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9382/9382 [00:00<00:00, 321347.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 11308/11308 [00:00<00:00, 153947.89 examples/s]


In [129]:
tokenized_ds = load_data(path,tokenized=True)

Generating train split: 39081 examples [00:00, 205496.79 examples/s]
Generating test split: 11308 examples [00:00, 125610.62 examples/s]
Generating val split: 9382 examples [00:00, 134823.12 examples/s]


### Init Evaluation metrics, Data Collator and Model

In [126]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

id2label = {
    0:"ham",
    1:"spam",
}

label2id = {
    "ham":0,
    "spam":1,
}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 839kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="model_checkpoints",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)