In [22]:
import torch
from datasets import Dataset, load_dataset
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())
print(torch.cuda.current_device())
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import evaluate
import random
import heapq
path = 'Data/splits'
model_path = 'google-bert/bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_path)

True
NVIDIA GeForce RTX 3080
0


In [24]:
arrow_file_name = "data-00000-of-00001"
def load_data(path,tokenized=False):
    if tokenized:
        full_ds = load_dataset('arrow',data_files={
            'train':path+f'/tokenized/train_ds/{arrow_file_name}.arrow',
            'test':path+f'/tokenized/test_ds/{arrow_file_name}.arrow',
            'val':path+f'/tokenized/val_ds/{arrow_file_name}.arrow'
        })
    else:
        full_ds = load_dataset('arrow',data_files={
            'train':path+f'/train_ds/{arrow_file_name}.arrow',
            'test':path+f'/test_ds/{arrow_file_name}.arrow',
            'validation':path+f'/val_ds/{arrow_file_name}.arrow'
        })
    return full_ds


### Train-Val-Test split, run only once

In [40]:
def preprocess_function(id,example):
    text = example['processed']
    label = example['class']
    tokens = tokenizer.encode(text)
    out = []
    if len(tokens) <= 512:
        out.append([tokens,label,id])
        return out
    else:
        cls_token = tokens[0]
        sep_token = tokens[-1]
        tokens = tokens[1:-1] # remove CLS and SEP tokens
        chunks = [tokens[i:i+500] for i in range(0,len(tokens),500)]
        for c in chunks: # add back CLS and SEP tokens
            c.insert(0,cls_token)
            c.append(sep_token)
            out.append([c,label,id])
        return out


In [50]:
def shorten(text):
    words = text.split(" ")
    curr = None
    final = []
    for word in words:
        if word != curr:
            final.append(word)
        curr = word
    return " ".join(final)

In [54]:
def preprocess_function_2(id,example):
    text = example['processed']
    text = shorten(text)
    label = example['class']
    tokens = tokenizer.encode(text)
    cls_token = tokens[0]
    sep_token = tokens[-1]
    first_chunk = tokens[1: 511]
    c = []
    c.append(cls_token)
    c.extend(first_chunk)
    c.append(sep_token)
    return([c, label, id])

In [56]:
def preprocess(df):
    new_df = {'text':[],'label':[],'raw_text':[]}
    ids = []
    mapping = {"ham":0,"spam":1}
    for i,row in tqdm(df.iterrows()):
        new_df["raw_text"].append(row['processed'])
        tokens, label, idx = preprocess_function_2(i,row)
        new_df['text'].append(tokens)
        new_df['label'].append(mapping[label])
        ids.append(idx)
    final_df = pd.DataFrame(new_df,index=ids)
    return final_df

In [57]:
seed = random.seed(37)
df = pd.read_pickle("Data/full_df.pkl")
df = df.drop_duplicates(subset=['processed'])
X = df['processed']
y = df['class']
X_tr, X_test, y_tr, y_test = train_test_split(X,y,test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_tr,y_tr,test_size=0.2)
train_df = pd.DataFrame({'processed':X_train,'class':y_train})
val_df = pd.DataFrame({'processed':X_val,'class':y_val})
test_df = pd.DataFrame({'processed':X_test,'class':y_test})
train_df = preprocess(train_df)
val_df = preprocess(val_df)
test_df = preprocess(test_df)

26078it [00:32, 805.59it/s] 
6520it [00:07, 903.99it/s] 
8150it [00:09, 836.10it/s] 


In [58]:
train_df.head()

Unnamed: 0,text,label,raw_text
trec06p/data/120/299,"[101, 18720, 3501, 2615, 27922, 2243, 1051, 68...",0,pgjvzhk oci phrkignv cjwvdgfibgu ...
trec06c/data/153/236,"[101, 7592, 6203, 2449, 2814, 1045, 2572, 2013...",1,hello dear business friends i am from shenzhen...
trec06p/data/087/127,"[101, 14158, 1996, 6671, 1997, 8714, 16371, 22...",0,observing the transit of mercury num may num b...
trec06p/data/065/177,"[101, 1037, 16371, 2213, 6187, 2050, 16371, 22...",1,a num caa num caa num caa num caa num caa num ...
trec06p/data/111/164,"[101, 2017, 2342, 2000, 16500, 18830, 5004, 20...",0,you need to install cflow then see link im a ...


In [59]:
train_df.groupby(["label"]).count()

Unnamed: 0_level_0,text,raw_text
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19707,19707
1,6371,6371


In [23]:
def cosine_sim(a,b):
    '''
    Computes cosine similarity between b and each row of a
    a: 2d np vector
    b: 1d np vector
    '''
    a_norm = np.linalg.norm(a, axis=1)
    b_norm = np.linalg.norm(b)
    return (a @ b) / (a_norm * b_norm)

In [None]:
class Post(object):
    def __init__(self, id, embedding):
        self.id = id
        self.embedding = embedding
        self.closest_dst = -1 # Cosine similarity is a value from -1 to 1, with similar posts having value close to 1

    def __lt__(self, other):
        return self.closest_dst < other.closest_dst

class Undersample:
    def __init__(self, df):
        self.df = df
        self.posts = [Post(row["id"], row["embedding"]) for index, row in df.iterrows()]
        self.heap = [post for post in self.posts]
        heapq.heapify(self.heap)
        self.selected_ids = []
        self.selected_embeddings = None
    
    def select_furthest(self):
        if len(self.selected_ids) == 0:
            selected = heapq.heappop(self.heap)
            return selected
        checked = []
        heapq.heapify(checked)
        furthest_value = 10
        furthest_post = None
        while len(self.heap) != 0:
            post = heapq.heappop(self.heap)
            if post.closest_dst > furthest_value:
                heapq.heappush(self.heap, post)
                temp = list(heapq.merge(self.heap, checked))
                heapq.heapify(temp)
                self.heap=temp
                return furthest_post
            x = max(cosine_sim(self.selected_embeddings, post.embedding))
            post.closest_dst = x
            if x < furthest_value:
                furthest_value = x
                if furthest_post is not None:
                    heapq.heappush(checked, furthest_post)
                furthest_post = post
            else:
                heapq.heappush(checked, post)
        self.heap = checked
        return furthest_post            

    def select_n(self, n):
        if n>len(self.posts):
            return self.posts
        for i in tqdm(range(n)):
            selected = self.select_furthest()
            reshaped = np.reshape(selected.embedding, (1,len(selected.embedding)))
            if self.selected_embeddings is None:
                self.selected_embeddings = reshaped
            else:
                self.selected_embeddings = np.row_stack([self.selected_embeddings, reshaped])
            self.selected_ids.append(selected.id)
        return self.df[df.id.isin(self.selected_ids)].copy()

In [60]:
train_spam = train_df[train_df.label==1].copy()
train_ham = train_df[train_df.label==0].copy()
undersample_ham = train_ham.sample(len(train_spam), random_state=seed, replace=False)
train_df = pd.concat([undersample_ham, train_spam])

In [61]:
train_ds = Dataset.from_pandas(train_df,split='train')
val_ds = Dataset.from_pandas(val_df,split='validation')
test_ds = Dataset.from_pandas(test_df,split='test')
train_ds.save_to_disk(path+'/train_ds')
test_ds.save_to_disk(path+'/test_ds')
val_ds.save_to_disk(path+'/val_ds')

Saving the dataset (0/1 shards):   0%|          | 0/12742 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8150 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6520 [00:00<?, ? examples/s]

### Load data

In [62]:
ds = load_data(path)
train_ds = ds['train']
test_ds = ds['test']
val_ds = ds['validation']
train_ds = train_ds.remove_columns(['__index_level_0__'])
test_ds = test_ds.remove_columns(['__index_level_0__'])
val_ds = val_ds.remove_columns(['__index_level_0__'])

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

### Decoding of tokens and getting attention maps via BERT Tokenizer

In [63]:
def decode_and_tokenize_function(example):
    tokens = example['text']
    text = tokenizer.decode(tokens,skip_special_tokens=True)
    return tokenizer(text,padding=True)

In [64]:
tokenized_train_ds = train_ds.map(decode_and_tokenize_function)
tokenized_test_ds = test_ds.map(decode_and_tokenize_function)
tokenized_val_ds = val_ds.map(decode_and_tokenize_function)

Map:   0%|          | 0/12742 [00:00<?, ? examples/s]

Map:   0%|          | 0/8150 [00:00<?, ? examples/s]

Map:   0%|          | 0/6520 [00:00<?, ? examples/s]

In [65]:
tokenized_train_ds.save_to_disk(path+'/tokenized/train_ds')
tokenized_val_ds.save_to_disk(path+'/tokenized/val_ds')
tokenized_test_ds.save_to_disk(path+'/tokenized/test_ds')

Saving the dataset (0/1 shards):   0%|          | 0/12742 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6520 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8150 [00:00<?, ? examples/s]

### Init Evaluation metrics, Data Collator and Model

In [66]:
tokenized_ds = load_data(path,tokenized=True)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

In [67]:
tokenized_ds['train']

Dataset({
    features: ['text', 'label', 'raw_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12742
})

In [83]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

id2label = {
    0:"ham",
    1:"spam",
}

label2id = {
    "ham":0,
    "spam":1,
}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
training_args = TrainingArguments(
    output_dir="model_checkpoints",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    optim="adamw_bnb_8bit",
    gradient_accumulation_steps=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="steps",
    push_to_hub=False,
    metric_for_best_model="eval_loss",
    logging_steps=200,
    save_steps=200,
    eval_steps=200,
    save_total_limit=5,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [87]:
trainer.train()

  0%|          | 0/3184 [00:00<?, ?it/s]

{'loss': 0.1321, 'grad_norm': 0.1225442886352539, 'learning_rate': 1.8743718592964826e-05, 'epoch': 0.25}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.11306805908679962, 'eval_accuracy': 0.9664110429447853, 'eval_runtime': 55.3295, 'eval_samples_per_second': 117.84, 'eval_steps_per_second': 14.73, 'epoch': 0.25}
{'loss': 0.1236, 'grad_norm': 9.54425048828125, 'learning_rate': 1.748743718592965e-05, 'epoch': 0.5}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.0986725389957428, 'eval_accuracy': 0.975, 'eval_runtime': 56.1346, 'eval_samples_per_second': 116.149, 'eval_steps_per_second': 14.519, 'epoch': 0.5}
{'loss': 0.083, 'grad_norm': 26.82895278930664, 'learning_rate': 1.6231155778894474e-05, 'epoch': 0.75}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.087204709649086, 'eval_accuracy': 0.9780674846625766, 'eval_runtime': 53.7326, 'eval_samples_per_second': 121.342, 'eval_steps_per_second': 15.168, 'epoch': 0.75}
{'loss': 0.078, 'grad_norm': 0.3467020094394684, 'learning_rate': 1.4974874371859299e-05, 'epoch': 1.0}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.11468417942523956, 'eval_accuracy': 0.9739263803680982, 'eval_runtime': 53.6444, 'eval_samples_per_second': 121.541, 'eval_steps_per_second': 15.193, 'epoch': 1.0}
{'loss': 0.0358, 'grad_norm': 0.015922775492072105, 'learning_rate': 1.3718592964824123e-05, 'epoch': 1.26}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09464883804321289, 'eval_accuracy': 0.9799079754601226, 'eval_runtime': 53.6889, 'eval_samples_per_second': 121.44, 'eval_steps_per_second': 15.18, 'epoch': 1.26}
{'loss': 0.0354, 'grad_norm': 0.013116507790982723, 'learning_rate': 1.2462311557788947e-05, 'epoch': 1.51}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.0857028067111969, 'eval_accuracy': 0.9825153374233129, 'eval_runtime': 53.6629, 'eval_samples_per_second': 121.499, 'eval_steps_per_second': 15.187, 'epoch': 1.51}
{'loss': 0.041, 'grad_norm': 0.05498252436518669, 'learning_rate': 1.120603015075377e-05, 'epoch': 1.76}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09343696385622025, 'eval_accuracy': 0.9809815950920245, 'eval_runtime': 53.6121, 'eval_samples_per_second': 121.614, 'eval_steps_per_second': 15.202, 'epoch': 1.76}
{'loss': 0.0334, 'grad_norm': 0.018323533236980438, 'learning_rate': 9.949748743718594e-06, 'epoch': 2.01}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.08328451216220856, 'eval_accuracy': 0.9831288343558282, 'eval_runtime': 53.6436, 'eval_samples_per_second': 121.543, 'eval_steps_per_second': 15.193, 'epoch': 2.01}
{'loss': 0.0156, 'grad_norm': 0.008698437362909317, 'learning_rate': 8.693467336683418e-06, 'epoch': 2.26}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.08576551824808121, 'eval_accuracy': 0.9837423312883435, 'eval_runtime': 53.6895, 'eval_samples_per_second': 121.439, 'eval_steps_per_second': 15.18, 'epoch': 2.26}
{'loss': 0.0132, 'grad_norm': 0.005109351594001055, 'learning_rate': 7.437185929648242e-06, 'epoch': 2.51}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09594490379095078, 'eval_accuracy': 0.9823619631901841, 'eval_runtime': 53.5878, 'eval_samples_per_second': 121.669, 'eval_steps_per_second': 15.209, 'epoch': 2.51}
{'loss': 0.0141, 'grad_norm': 0.7692071795463562, 'learning_rate': 6.180904522613066e-06, 'epoch': 2.76}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09207481890916824, 'eval_accuracy': 0.9829754601226994, 'eval_runtime': 53.6439, 'eval_samples_per_second': 121.542, 'eval_steps_per_second': 15.193, 'epoch': 2.76}
{'loss': 0.0136, 'grad_norm': 0.0072205448523163795, 'learning_rate': 4.92462311557789e-06, 'epoch': 3.01}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.10394727438688278, 'eval_accuracy': 0.9812883435582822, 'eval_runtime': 53.6334, 'eval_samples_per_second': 121.566, 'eval_steps_per_second': 15.196, 'epoch': 3.01}
{'loss': 0.0057, 'grad_norm': 1.2677863836288452, 'learning_rate': 3.6683417085427137e-06, 'epoch': 3.26}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09566265344619751, 'eval_accuracy': 0.9835889570552148, 'eval_runtime': 53.7206, 'eval_samples_per_second': 121.369, 'eval_steps_per_second': 15.171, 'epoch': 3.26}
{'loss': 0.0028, 'grad_norm': 0.001682559261098504, 'learning_rate': 2.412060301507538e-06, 'epoch': 3.52}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09360352158546448, 'eval_accuracy': 0.9842024539877301, 'eval_runtime': 53.6615, 'eval_samples_per_second': 121.502, 'eval_steps_per_second': 15.188, 'epoch': 3.52}
{'loss': 0.0031, 'grad_norm': 0.001572438981384039, 'learning_rate': 1.155778894472362e-06, 'epoch': 3.77}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.1002873033285141, 'eval_accuracy': 0.9845092024539878, 'eval_runtime': 53.6078, 'eval_samples_per_second': 121.624, 'eval_steps_per_second': 15.203, 'epoch': 3.77}
{'train_runtime': 2079.8779, 'train_samples_per_second': 24.505, 'train_steps_per_second': 1.531, 'train_loss': 0.03964742737349553, 'epoch': 4.0}


TrainOutput(global_step=3184, training_loss=0.03964742737349553, metrics={'train_runtime': 2079.8779, 'train_samples_per_second': 24.505, 'train_steps_per_second': 1.531, 'train_loss': 0.03964742737349553, 'epoch': 4.0})

In [88]:
eval_results = trainer.evaluate()

  0%|          | 0/815 [00:00<?, ?it/s]

In [89]:
eval_results

{'eval_loss': 0.08328451216220856,
 'eval_accuracy': 0.9831288343558282,
 'eval_runtime': 55.2077,
 'eval_samples_per_second': 118.099,
 'eval_steps_per_second': 14.762,
 'epoch': 4.0}

In [90]:
trainer.save_model("models/bert_model5")

In [91]:
tokenized_ds = load_data(path,tokenized=True)
tokenized_test_ds = tokenized_ds['test']
classifier = pipeline('text-classification',model='models/bert_model5', device=torch.cuda.current_device())
def decode_tokens(example):
    tokens = example['text']
    # label_map = {0:"ham",1:"spam"}
    text = tokenizer.decode(tokens,skip_special_tokens=True)
    # label = label_map[example['label']]
    return {'text':text}

tokenized_test_ds = tokenized_test_ds.map(decode_tokens)

In [92]:
from evaluate import evaluator
task_evaluator = evaluator('text-classification')
results = task_evaluator.compute(
    model_or_pipeline=classifier,
    data=tokenized_test_ds,
    metric=evaluate.combine(['accuracy','recall','precision','f1']),
    label_mapping=label2id,
)

In [93]:
results

{'accuracy': 0.9846625766871165,
 'recall': 0.9693928750627195,
 'precision': 0.9679358717434869,
 'f1': 0.9686638255201804,
 'total_time_in_seconds': 92.62441260000196,
 'samples_per_second': 87.98976178338353,
 'latency_in_seconds': 0.011364958601227234}