In [2]:
import torch
from datasets import Dataset, load_dataset
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())
print(torch.cuda.current_device())
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from tqdm import tqdm
import evaluate
path = 'Data/splits'
model_path = 'google-bert/bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_path)

True
NVIDIA GeForce RTX 3080
0


In [3]:
arrow_file_name = "data-00000-of-00001"
def load_data(path,tokenized=False):
    if tokenized:
        full_ds = load_dataset('arrow',data_files={
            'train':path+f'/tokenized/train_ds/{arrow_file_name}.arrow',
            'test':path+f'/tokenized/test_ds/{arrow_file_name}.arrow',
            'val':path+f'/tokenized/val_ds/{arrow_file_name}.arrow'
        })
    else:
        full_ds = load_dataset('arrow',data_files={
            'train':path+f'/train_ds/{arrow_file_name}.arrow',
            'test':path+f'/test_ds/{arrow_file_name}.arrow',
            'validation':path+f'/val_ds/{arrow_file_name}.arrow'
        })
    return full_ds


### Train-Val-Test split, run only once

In [4]:
def preprocess_function(id,example,df):
    text = example['processed']
    label = example['class']
    tokens = tokenizer.encode(text)
    out = []
    if len(tokens) <= 512:
        out.append([tokens,label,id])
        return out
    else:
        cls_token = tokens[0]
        sep_token = tokens[-1]
        tokens = tokens[1:-1] # remove CLS and SEP tokens
        chunks = [tokens[i:i+500] for i in range(0,len(tokens),500)]
        for c in chunks: # add back CLS and SEP tokens
            c.insert(0,cls_token)
            c.append(sep_token)
            out.append([c,label,id])
        return out


In [5]:
def preprocess(df):
    new_df = {'text':[],'label':[]}
    ids = []
    mapping = {"ham":0,"spam":1}
    for i,row in tqdm(df.iterrows()):
        output = preprocess_function(i,row,df)
        for chunk,label,idx in output:
            if len(chunk) > 512: print("ERROR")
            new_df['text'].append(chunk)
            new_df['label'].append(mapping[label])
            ids.append(idx)
    final_df = pd.DataFrame(new_df,index=ids)
    return final_df

In [5]:
df = pd.read_pickle("Data/full_df.pkl")
df = df.drop_duplicates(subset=['processed'])
X = df['processed']
y = df['class']
X_tr, X_test, y_tr, y_test = train_test_split(X,y,test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_tr,y_tr,test_size=0.2)
train_df = pd.DataFrame({'processed':X_train,'class':y_train})
val_df = pd.DataFrame({'processed':X_val,'class':y_val})
test_df = pd.DataFrame({'processed':X_test,'class':y_test})
train_df = preprocess(train_df)
val_df = preprocess(val_df)
test_df = preprocess(test_df)

0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (606 > 512). Running this sequence through the model will result in indexing errors
26078it [00:26, 966.13it/s] 
6520it [00:06, 1057.94it/s]
8150it [00:07, 1045.12it/s]


In [21]:
train_ds = Dataset.from_pandas(train_df,split='train')
val_ds = Dataset.from_pandas(val_df,split='validation')
test_ds = Dataset.from_pandas(test_df,split='test')
train_ds.save_to_disk(path+'/train_ds')
test_ds.save_to_disk(path+'/test_ds')
val_ds.save_to_disk(path+'/val_ds')

Saving the dataset (1/1 shards): 100%|██████████| 38822/38822 [00:00<00:00, 538477.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12319/12319 [00:00<00:00, 425667.35 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9005/9005 [00:00<00:00, 664877.70 examples/s]


### Load data

In [9]:
ds = load_data(path)
train_ds = ds['train']
test_ds = ds['test']
val_ds = ds['validation']
train_ds = train_ds.remove_columns(['__index_level_0__'])
test_ds = test_ds.remove_columns(['__index_level_0__'])
val_ds = val_ds.remove_columns(['__index_level_0__'])

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

### Decoding of tokens and getting attention maps via BERT Tokenizer

In [12]:
def decode_and_tokenize_function(example):
    tokens = example['text']
    text = tokenizer.decode(tokens,skip_special_tokens=True)
    return tokenizer(text,padding=True)

In [13]:
tokenized_train_ds = train_ds.map(decode_and_tokenize_function)
tokenized_test_ds = test_ds.map(decode_and_tokenize_function)
tokenized_val_ds = val_ds.map(decode_and_tokenize_function)

Map:   0%|          | 0/39541 [00:00<?, ? examples/s]

Map:   0%|          | 0/11542 [00:00<?, ? examples/s]

Map:   0%|          | 0/9063 [00:00<?, ? examples/s]

In [14]:
tokenized_train_ds.save_to_disk(path+'/tokenized/train_ds')
tokenized_val_ds.save_to_disk(path+'/tokenized/val_ds')
tokenized_test_ds.save_to_disk(path+'/tokenized/test_ds')

Saving the dataset (0/1 shards):   0%|          | 0/39541 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9063 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11542 [00:00<?, ? examples/s]

### Init Evaluation metrics, Data Collator and Model

In [6]:
tokenized_ds = load_data(path,tokenized=True)

Generating train split: 38822 examples [00:00, 285674.41 examples/s]
Generating test split: 12319 examples [00:00, 239046.36 examples/s]
Generating val split: 9005 examples [00:00, 351896.06 examples/s]


In [7]:
tokenized_ds['train']

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 39541
})

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

id2label = {
    0:"ham",
    1:"spam",
}

label2id = {
    "ham":0,
    "spam":1,
}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base

In [9]:
training_args = TrainingArguments(
    output_dir="model_checkpoints",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    gradient_accumulation_steps=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    push_to_hub=False,
    metric_for_best_model="eval_loss",
    logging_steps=200,
    save_steps=200,
    eval_steps=200
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/38822 [12:18<?, ?it/s]


In [10]:
trainer.train()



  0%|          | 0/3294 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.1446, 'learning_rate': 1.696417729204615e-05, 'epoch': 0.3}
{'loss': 0.0772, 'learning_rate': 1.392835458409229e-05, 'epoch': 0.61}
{'loss': 0.0681, 'learning_rate': 1.0892531876138435e-05, 'epoch': 0.91}


  0%|          | 0/1133 [00:00<?, ?it/s]

{'eval_loss': 0.0559488870203495, 'eval_accuracy': 0.9842215601897827, 'eval_runtime': 76.3828, 'eval_samples_per_second': 118.652, 'eval_steps_per_second': 14.833, 'epoch': 1.0}
{'loss': 0.0408, 'learning_rate': 7.856709168184579e-06, 'epoch': 1.21}
{'loss': 0.031, 'learning_rate': 4.820886460230723e-06, 'epoch': 1.52}
{'loss': 0.028, 'learning_rate': 1.7850637522768672e-06, 'epoch': 1.82}


  0%|          | 0/1133 [00:00<?, ?it/s]

{'eval_loss': 0.05624191835522652, 'eval_accuracy': 0.9866490124682776, 'eval_runtime': 77.8814, 'eval_samples_per_second': 116.369, 'eval_steps_per_second': 14.548, 'epoch': 2.0}
{'train_runtime': 2217.4504, 'train_samples_per_second': 35.663, 'train_steps_per_second': 1.485, 'train_loss': 0.061204043604347586, 'epoch': 2.0}


TrainOutput(global_step=3294, training_loss=0.061204043604347586, metrics={'train_runtime': 2217.4504, 'train_samples_per_second': 35.663, 'train_steps_per_second': 1.485, 'train_loss': 0.061204043604347586, 'epoch': 2.0})

In [11]:
eval_results = trainer.evaluate()

  0%|          | 0/1133 [00:00<?, ?it/s]

In [12]:
eval_results

{'eval_loss': 0.05624191835522652,
 'eval_accuracy': 0.9866490124682776,
 'eval_runtime': 77.9107,
 'eval_samples_per_second': 116.326,
 'eval_steps_per_second': 14.542,
 'epoch': 2.0}

In [None]:
trainer.save_model("./bert_model")