In [77]:
import torch
from datasets import Dataset, load_dataset
# Report whether a CUDA device is visible and, if so, its name and index.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name())
    print(torch.cuda.current_device())
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import evaluate
import random
# NOTE(review): heapq is not referenced in the cells visible here.
import heapq
# Root directory of the saved dataset splits. It ends with a slash, and later
# cells append '/train_ds' etc., so the resulting paths contain '//'.
path = '/app/Data/'
# Checkpoint id used for both the tokenizer and the sequence classifier.
model_path = 'google-bert/bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_path)

True
NVIDIA GeForce RTX 3080
0


In [78]:
arrow_file_name = "data-00000-of-00001"
def load_data(path,tokenized=False):
    """Load the train/test/validation Arrow shards saved under ``path``.

    Args:
        path: Directory prefix; the split folders are appended to it with a
            leading slash.
        tokenized: When true, read from the ``tokenized`` sub-directory.

    Returns:
        Whatever ``load_dataset('arrow', ...)`` returns for the three files.
        The validation split is keyed ``'val'`` for tokenized data and
        ``'validation'`` otherwise.
    """
    # The two layouts differ only in the sub-directory and the validation key.
    prefix = '/tokenized' if tokenized else ''
    validation_key = 'val' if tokenized else 'validation'
    split_folders = (
        ('train', 'train_ds'),
        ('test', 'test_ds'),
        (validation_key, 'val_ds'),
    )
    data_files = {
        split: path + f'{prefix}/{folder}/{arrow_file_name}.arrow'
        for split, folder in split_folders
    }
    return load_dataset('arrow', data_files=data_files)


### Train-Val-Test split, run only once

In [16]:
def preprocess_function(id,example):
    """Tokenize one row and split it into chunks of at most 502 token ids.

    Args:
        id: Identifier copied unchanged into every returned entry.
        example: Mapping with a ``'processed'`` text and a ``'class'`` label.

    Returns:
        A list of ``[token_ids, label, id]`` entries. Encodings of up to 512
        ids yield one entry; longer ones are cut into 500-id pieces, each
        wrapped with the first and last ids of the full encoding.
    """
    text = example['processed']
    label = example['class']
    token_ids = tokenizer.encode(text)
    if len(token_ids) <= 512:
        return [[token_ids, label, id]]

    # Drop the leading/trailing special ids, chunk the rest, then re-wrap.
    first_id, last_id = token_ids[0], token_ids[-1]
    inner = token_ids[1:-1]
    return [
        [[first_id, *inner[start:start+500], last_id], label, id]
        for start in range(0, len(inner), 500)
    ]


In [17]:
def shorten(text):
    """Collapse runs of identical consecutive words into a single word.

    Words are the pieces obtained by splitting ``text`` on single spaces; the
    kept words are joined back with single spaces.
    """
    words = text.split(" ")
    # Pair every word with its predecessor (None for the first word).
    kept = [
        word
        for previous, word in zip([None] + words[:-1], words)
        if word != previous
    ]
    return " ".join(kept)

In [18]:
def preprocess_function_2(id,example):
    """Tokenize one row and keep only its first 510 content tokens.

    The text is first passed through ``shorten`` and then encoded with the
    module-level ``tokenizer``.

    Args:
        id: Identifier returned unchanged as the third element.
        example: Mapping with a ``'processed'`` text and a ``'class'`` label.

    Returns:
        ``[token_ids, label, id]`` where ``token_ids`` is the first id of the
        encoding, at most 510 of the ids between the first and last, and the
        last id of the encoding (at most 512 ids in total).
    """
    text = shorten(example['processed'])
    label = example['class']
    tokens = tokenizer.encode(text)
    cls_token, sep_token = tokens[0], tokens[-1]
    # Strip both special tokens *before* truncating. Slicing tokens[1:511]
    # directly would keep the trailing SEP of any encoding shorter than 512
    # ids, and the SEP appended below would then be a duplicate.
    content = tokens[1:-1][:510]
    return [[cls_token, *content, sep_token], label, id]

In [19]:
def preprocess(df):
    """Tokenize every row of ``df`` and return the result as a new frame.

    Args:
        df: Frame with a ``'processed'`` text column and a ``'class'`` column
            holding ``"ham"`` or ``"spam"``.

    Returns:
        A DataFrame with columns ``text`` (token ids), ``label`` (0 for ham,
        1 for spam) and ``raw_text`` (the original ``'processed'`` value),
        indexed by the ids returned from ``preprocess_function_2``.
    """
    class_to_int = {"ham":0,"spam":1}
    token_column, label_column, raw_column, row_ids = [], [], [], []
    for position, record in tqdm(df.iterrows()):
        raw_column.append(record['processed'])
        token_ids, class_name, row_id = preprocess_function_2(position, record)
        token_column.append(token_ids)
        label_column.append(class_to_int[class_name])
        row_ids.append(row_id)
    return pd.DataFrame(
        {'text': token_column, 'label': label_column, 'raw_text': raw_column},
        index=row_ids,
    )

In [20]:
# random.seed() returns None, so the seed value is kept in its own variable
# and passed explicitly to scikit-learn; seeding the `random` module alone
# does not make train_test_split reproducible.
seed = 37
random.seed(seed)
# NOTE: read_pickle can execute arbitrary code; only load files produced by
# this project.
df = pd.read_pickle("/app/Data/full_df.pkl")
df = df.drop_duplicates(subset=['processed'])
X = df['processed']
y = df['class']
# 80/20 train+val/test split, then 80/20 train/val split of the remainder.
X_tr, X_test, y_tr, y_test = train_test_split(X,y,test_size=0.2,random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_tr,y_tr,test_size=0.2,random_state=seed)
train_df = pd.DataFrame({'processed':X_train,'class':y_train})
val_df = pd.DataFrame({'processed':X_val,'class':y_val})
test_df = pd.DataFrame({'processed':X_test,'class':y_test})
# Replace each raw split with its tokenized form (text / label / raw_text).
train_df = preprocess(train_df)
val_df = preprocess(val_df)
test_df = preprocess(test_df)

0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors
26078it [00:35, 744.49it/s]
6520it [00:09, 680.85it/s]
8150it [00:11, 694.33it/s]


In [10]:
sentences = list(df.processed)
ids = df.index
# torch.cuda.current_device() raises when no CUDA device is present, so fall
# back to the CPU instead of failing on a CPU-only machine.
embed_device = f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"
# NOTE: the name `model` is rebound to the BERT classifier in a later cell.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=embed_device)

In [11]:
# Encode every de-duplicated text; entry i corresponds to sentences[i].
embeddings = model.encode(sentences)

In [12]:
# One embedding per row, indexed by the same ids as `df` so it can later be
# joined to the splits on the index.
embedding_df = pd.DataFrame({"embedding": [embeddings[i] for i in range(len(ids))]}, index=list(ids))

In [13]:
# Cache the embeddings on disk so the encoding step need not be repeated.
embedding_df.to_pickle("/app/Data/embedding.pkl")

In [13]:
# Reload the cached embeddings written by the previous cell.
# NOTE: read_pickle can execute arbitrary code; only load trusted files.
embedding_df = pd.read_pickle("/app/Data/embedding.pkl")

In [28]:
import jax
import jax.numpy as jnp
# Ask JAX itself for a GPU rather than inferring it from torch: torch can see
# CUDA while the installed JAX build has no GPU backend, in which case
# jax.devices("gpu") raises RuntimeError.
try:
    jax_device = jax.devices("gpu")[0]
except RuntimeError:
    jax_device = jax.devices("cpu")[0]

In [22]:
def cosine_sim(x,y):
    '''
    Computes the cosine similarity between y and each row of x.

    x: 2d array, one vector per row
    y: 1d array

    Returns a 1d array with one similarity per row of x.
    '''
    dot_product = jnp.dot(x, y)

    # Norm of each row of x. Without axis=-1, norm() of a 2d array is the
    # Frobenius norm of the whole matrix, which only rescales the dot
    # products by one constant instead of normalising each row.
    x_norm = jnp.linalg.norm(x, axis=-1)
    y_norm = jnp.linalg.norm(y)

    return dot_product / (x_norm * y_norm)
# JIT-compiled version of cosine_sim, bound to the previously selected jax_device.
cosine_sim_jit = jax.jit(device=jax_device, fun=cosine_sim)

In [65]:
class Post(object):
    """An embedding with its position and its closest-similarity score.

    Instances order by ``closest_dst``: ``a < b`` is true when ``a`` has the
    smaller score.
    """

    def __init__(self, embedding, pos):
        self.embedding = embedding
        # Cosine similarity ranges from -1 to 1 (similar posts are close to
        # 1), so -1 is the lowest possible starting value.
        self.closest_dst = -1
        # Position in self.posts
        self.pos = pos

    def __lt__(self, other):
        return other.closest_dst > self.closest_dst

class Undersample:
    """Greedy diversity-based undersampler over row embeddings.

    The first row is picked at random. Each later pick is the row whose
    highest similarity to any already-selected row is lowest, using
    ``cosine_sim_jit`` as the similarity.

    Args:
        df: Frame with an ``embedding`` column holding one vector per row.
    """

    def __init__(self, df):
        self.df = df
        self.post_ids = list(df.index)
        # Highest similarity seen so far between each row and the selected
        # set. Kept as floats so selected rows can be masked with +inf.
        self.distances = jnp.array([-1.0] * len(self.post_ids))
        # np.vstack is the non-deprecated name of np.row_stack.
        self.embeddings = np.vstack(list(df.embedding))
        self.embeddings = jax.device_put(jnp.array(self.embeddings), device=jax_device)
        self.selected_ids = []
        self.recent = None

    def select_furthest(self):
        """Select one more row and record it in ``selected_ids``.

        Does nothing once every row has been selected.
        """
        if len(self.selected_ids) >= len(self.post_ids):
            return
        if len(self.selected_ids) == 0:
            # Pick random starting point
            recent_pos = random.randint(0, len(self.post_ids)-1)
        else:
            dist_to_recent = cosine_sim_jit(self.embeddings, self.recent)
            self.distances = jnp.maximum(self.distances, dist_to_recent)
            recent_pos = int(jnp.argmin(self.distances))
        self.selected_ids.append(self.post_ids[recent_pos])
        self.recent = self.embeddings[recent_pos]
        # Mask the chosen row so a tie in similarity can never select it
        # again; otherwise select_n could return fewer than n rows.
        self.distances = self.distances.at[recent_pos].set(jnp.inf)
        return

    def select_n(self, n):
        """Select ``n`` more rows and return every selected row of ``df``.

        Returns ``df`` unchanged when ``n`` exceeds the number of rows.
        """
        if n>len(self.post_ids):
            return self.df
        for _ in tqdm(range(n)):
            self.select_furthest()
        return self.df[self.df.index.isin(self.selected_ids)]

In [24]:
# Separate the training rows by class; label 1 is spam and 0 is ham.
train_spam = train_df.loc[train_df["label"] == 1].copy()
train_ham = train_df.loc[train_df["label"] == 0].copy()
# Attach each ham row's embedding by joining on the shared index.
train_ham_embed = train_ham.merge(embedding_df, left_index=True, right_index=True)
# Displayed as the cell output: number of rows per class.
train_df.groupby(["label"]).count()

Unnamed: 0_level_0,text,raw_text
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19745,19745
1,6333,6333


In [66]:
# Keep as many ham rows as there are spam rows so the two classes are balanced.
undersampler = Undersample(train_ham_embed)
train_ham_sampled = undersampler.select_n(len(train_spam))

100%|██████████| 6333/6333 [00:18<00:00, 340.94it/s]


In [81]:
# Balanced training set: all spam rows followed by the sampled ham rows.
# The rows are not shuffled here.
train_new = pd.concat([train_spam[["text", "label"]], train_ham_sampled[["text", "label"]]])
# Validation and test keep every row but only the two columns used later.
val_df = val_df[["text", "label"]]
test_df = test_df[["text", "label"]]

In [82]:
# Convert each split to a Dataset and write it under `path`; load_data()
# reads the Arrow shard from these same folders.
train_ds = Dataset.from_pandas(train_new,split='train')
val_ds = Dataset.from_pandas(val_df,split='validation')
test_ds = Dataset.from_pandas(test_df,split='test')
train_ds.save_to_disk(path+'/train_ds')
test_ds.save_to_disk(path+'/test_ds')
val_ds.save_to_disk(path+'/val_ds')

Saving the dataset (1/1 shards): 100%|██████████| 12666/12666 [00:00<00:00, 333143.46 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8150/8150 [00:00<00:00, 295279.12 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6520/6520 [00:00<00:00, 263457.25 examples/s]


### Load data

In [83]:
# Reload the saved splits and drop the '__index_level_0__' column from each.
ds = load_data(path)
train_ds = ds['train'].remove_columns(['__index_level_0__'])
test_ds = ds['test'].remove_columns(['__index_level_0__'])
val_ds = ds['validation'].remove_columns(['__index_level_0__'])

Generating train split: 12666 examples [00:00, 339743.77 examples/s]
Generating test split: 8150 examples [00:00, 378082.55 examples/s]
Generating validation split: 6520 examples [00:00, 334792.58 examples/s]


### Decoding of tokens and getting attention masks via BERT Tokenizer

In [84]:
def decode_and_tokenize_function(example):
    """Decode stored token ids to text and re-tokenize it.

    Args:
        example: Mapping whose ``'text'`` entry holds token ids.

    Returns:
        The output of calling ``tokenizer`` on the decoded text.
    """
    tokens = example['text']
    text = tokenizer.decode(tokens,skip_special_tokens=True)
    # Decoding and re-encoding is not guaranteed to reproduce the original
    # token count, so cap the result at the 512 ids used elsewhere in this
    # notebook instead of relying on the earlier truncation.
    return tokenizer(text,padding=True,truncation=True,max_length=512)

In [85]:
# Apply decode_and_tokenize_function to every example of each split.
tokenized_train_ds = train_ds.map(decode_and_tokenize_function)
tokenized_test_ds = test_ds.map(decode_and_tokenize_function)
tokenized_val_ds = val_ds.map(decode_and_tokenize_function)

Map: 100%|██████████| 12666/12666 [00:28<00:00, 440.51 examples/s]
Map: 100%|██████████| 8150/8150 [00:19<00:00, 417.80 examples/s]
Map: 100%|██████████| 6520/6520 [00:15<00:00, 413.05 examples/s]


In [86]:
# Write the tokenized splits to the folders read by load_data(path, tokenized=True).
tokenized_train_ds.save_to_disk(path+'/tokenized/train_ds')
tokenized_val_ds.save_to_disk(path+'/tokenized/val_ds')
tokenized_test_ds.save_to_disk(path+'/tokenized/test_ds')

Saving the dataset (1/1 shards): 100%|██████████| 12666/12666 [00:00<00:00, 100646.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6520/6520 [00:00<00:00, 172687.94 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8150/8150 [00:00<00:00, 151182.52 examples/s]


### Init Evaluation metrics, Data Collator and Model

In [66]:
# Tokenized splits; the validation split is keyed 'val' in this mode.
tokenized_ds = load_data(path,tokenized=True)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

In [67]:
# Display the training split's features and row count.
tokenized_ds['train']

Dataset({
    features: ['text', 'label', 'raw_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12742
})

In [83]:
# Collator passed to the Trainer; built from the shared tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Accuracy metric used by compute_metrics.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Each row of the predictions is reduced to the index of its largest value
    before being compared with the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Class index -> name, and the inverse mapping derived from it.
id2label = {0: "ham", 1: "spam"}
label2id = {name: index for index, name in id2label.items()}

# Classifier initialised from the base checkpoint, one output per class.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
training_args = TrainingArguments(
    # Output and checkpointing
    output_dir="model_checkpoints",
    overwrite_output_dir=True,
    save_steps=200,
    save_total_limit=5,
    push_to_hub=False,
    # Optimisation: batches of 8 with 2 accumulation steps per device
    optim="adamw_bnb_8bit",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # Evaluation and logging every 200 steps
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=200,
    per_device_eval_batch_size=8,
    # Reload the checkpoint with the best eval_loss when training ends
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
)

# Train on the tokenized 'train' split and evaluate on the 'val' split.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["val"],
)

In [87]:
# Run the fine-tuning loop configured above.
trainer.train()

  0%|          | 0/3184 [00:00<?, ?it/s]

{'loss': 0.1321, 'grad_norm': 0.1225442886352539, 'learning_rate': 1.8743718592964826e-05, 'epoch': 0.25}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.11306805908679962, 'eval_accuracy': 0.9664110429447853, 'eval_runtime': 55.3295, 'eval_samples_per_second': 117.84, 'eval_steps_per_second': 14.73, 'epoch': 0.25}
{'loss': 0.1236, 'grad_norm': 9.54425048828125, 'learning_rate': 1.748743718592965e-05, 'epoch': 0.5}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.0986725389957428, 'eval_accuracy': 0.975, 'eval_runtime': 56.1346, 'eval_samples_per_second': 116.149, 'eval_steps_per_second': 14.519, 'epoch': 0.5}
{'loss': 0.083, 'grad_norm': 26.82895278930664, 'learning_rate': 1.6231155778894474e-05, 'epoch': 0.75}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.087204709649086, 'eval_accuracy': 0.9780674846625766, 'eval_runtime': 53.7326, 'eval_samples_per_second': 121.342, 'eval_steps_per_second': 15.168, 'epoch': 0.75}
{'loss': 0.078, 'grad_norm': 0.3467020094394684, 'learning_rate': 1.4974874371859299e-05, 'epoch': 1.0}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.11468417942523956, 'eval_accuracy': 0.9739263803680982, 'eval_runtime': 53.6444, 'eval_samples_per_second': 121.541, 'eval_steps_per_second': 15.193, 'epoch': 1.0}
{'loss': 0.0358, 'grad_norm': 0.015922775492072105, 'learning_rate': 1.3718592964824123e-05, 'epoch': 1.26}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09464883804321289, 'eval_accuracy': 0.9799079754601226, 'eval_runtime': 53.6889, 'eval_samples_per_second': 121.44, 'eval_steps_per_second': 15.18, 'epoch': 1.26}
{'loss': 0.0354, 'grad_norm': 0.013116507790982723, 'learning_rate': 1.2462311557788947e-05, 'epoch': 1.51}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.0857028067111969, 'eval_accuracy': 0.9825153374233129, 'eval_runtime': 53.6629, 'eval_samples_per_second': 121.499, 'eval_steps_per_second': 15.187, 'epoch': 1.51}
{'loss': 0.041, 'grad_norm': 0.05498252436518669, 'learning_rate': 1.120603015075377e-05, 'epoch': 1.76}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09343696385622025, 'eval_accuracy': 0.9809815950920245, 'eval_runtime': 53.6121, 'eval_samples_per_second': 121.614, 'eval_steps_per_second': 15.202, 'epoch': 1.76}
{'loss': 0.0334, 'grad_norm': 0.018323533236980438, 'learning_rate': 9.949748743718594e-06, 'epoch': 2.01}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.08328451216220856, 'eval_accuracy': 0.9831288343558282, 'eval_runtime': 53.6436, 'eval_samples_per_second': 121.543, 'eval_steps_per_second': 15.193, 'epoch': 2.01}
{'loss': 0.0156, 'grad_norm': 0.008698437362909317, 'learning_rate': 8.693467336683418e-06, 'epoch': 2.26}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.08576551824808121, 'eval_accuracy': 0.9837423312883435, 'eval_runtime': 53.6895, 'eval_samples_per_second': 121.439, 'eval_steps_per_second': 15.18, 'epoch': 2.26}
{'loss': 0.0132, 'grad_norm': 0.005109351594001055, 'learning_rate': 7.437185929648242e-06, 'epoch': 2.51}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09594490379095078, 'eval_accuracy': 0.9823619631901841, 'eval_runtime': 53.5878, 'eval_samples_per_second': 121.669, 'eval_steps_per_second': 15.209, 'epoch': 2.51}
{'loss': 0.0141, 'grad_norm': 0.7692071795463562, 'learning_rate': 6.180904522613066e-06, 'epoch': 2.76}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09207481890916824, 'eval_accuracy': 0.9829754601226994, 'eval_runtime': 53.6439, 'eval_samples_per_second': 121.542, 'eval_steps_per_second': 15.193, 'epoch': 2.76}
{'loss': 0.0136, 'grad_norm': 0.0072205448523163795, 'learning_rate': 4.92462311557789e-06, 'epoch': 3.01}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.10394727438688278, 'eval_accuracy': 0.9812883435582822, 'eval_runtime': 53.6334, 'eval_samples_per_second': 121.566, 'eval_steps_per_second': 15.196, 'epoch': 3.01}
{'loss': 0.0057, 'grad_norm': 1.2677863836288452, 'learning_rate': 3.6683417085427137e-06, 'epoch': 3.26}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09566265344619751, 'eval_accuracy': 0.9835889570552148, 'eval_runtime': 53.7206, 'eval_samples_per_second': 121.369, 'eval_steps_per_second': 15.171, 'epoch': 3.26}
{'loss': 0.0028, 'grad_norm': 0.001682559261098504, 'learning_rate': 2.412060301507538e-06, 'epoch': 3.52}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.09360352158546448, 'eval_accuracy': 0.9842024539877301, 'eval_runtime': 53.6615, 'eval_samples_per_second': 121.502, 'eval_steps_per_second': 15.188, 'epoch': 3.52}
{'loss': 0.0031, 'grad_norm': 0.001572438981384039, 'learning_rate': 1.155778894472362e-06, 'epoch': 3.77}


  0%|          | 0/815 [00:00<?, ?it/s]

{'eval_loss': 0.1002873033285141, 'eval_accuracy': 0.9845092024539878, 'eval_runtime': 53.6078, 'eval_samples_per_second': 121.624, 'eval_steps_per_second': 15.203, 'epoch': 3.77}
{'train_runtime': 2079.8779, 'train_samples_per_second': 24.505, 'train_steps_per_second': 1.531, 'train_loss': 0.03964742737349553, 'epoch': 4.0}


TrainOutput(global_step=3184, training_loss=0.03964742737349553, metrics={'train_runtime': 2079.8779, 'train_samples_per_second': 24.505, 'train_steps_per_second': 1.531, 'train_loss': 0.03964742737349553, 'epoch': 4.0})

In [88]:
# Evaluate on the Trainer's eval_dataset (the 'val' split).
eval_results = trainer.evaluate()

  0%|          | 0/815 [00:00<?, ?it/s]

In [89]:
# Display the validation metrics.
eval_results

{'eval_loss': 0.08328451216220856,
 'eval_accuracy': 0.9831288343558282,
 'eval_runtime': 55.2077,
 'eval_samples_per_second': 118.099,
 'eval_steps_per_second': 14.762,
 'epoch': 4.0}

In [90]:
# Save the trained model; the inference pipeline below loads it from this path.
trainer.save_model("models/bert_model5")

In [91]:
tokenized_ds = load_data(path,tokenized=True)
tokenized_test_ds = tokenized_ds['test']
# torch.cuda.current_device() raises without CUDA; pipeline() accepts -1 to
# run on the CPU, so fall back to that on a CPU-only machine.
pipeline_device = torch.cuda.current_device() if torch.cuda.is_available() else -1
classifier = pipeline('text-classification',model='models/bert_model5', device=pipeline_device)
def decode_tokens(example):
    """Replace an example's token ids in ``'text'`` with the decoded string."""
    tokens = example['text']
    text = tokenizer.decode(tokens,skip_special_tokens=True)
    return {'text':text}

# The pipeline consumes raw strings, so turn the stored ids back into text.
tokenized_test_ds = tokenized_test_ds.map(decode_tokens)

In [92]:
# NOTE(review): this import sits mid-notebook rather than in the import cell.
from evaluate import evaluator
task_evaluator = evaluator('text-classification')
# Score the saved model on the test split with four combined metrics;
# label_mapping converts the pipeline's "ham"/"spam" labels to 0/1.
results = task_evaluator.compute(
    model_or_pipeline=classifier,
    data=tokenized_test_ds,
    metric=evaluate.combine(['accuracy','recall','precision','f1']),
    label_mapping=label2id,
)

In [93]:
# Display the test-set metrics.
results

{'accuracy': 0.9846625766871165,
 'recall': 0.9693928750627195,
 'precision': 0.9679358717434869,
 'f1': 0.9686638255201804,
 'total_time_in_seconds': 92.62441260000196,
 'samples_per_second': 87.98976178338353,
 'latency_in_seconds': 0.011364958601227234}