In [77]:
import torch
from datasets import Dataset, load_dataset
# Report whether CUDA is usable and, if it is, the device name and its index.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(), torch.cuda.current_device(), sep="\n")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import evaluate
import random
import heapq
# Data directory; the dataset save/load cells append their sub-paths to it.
path = '/app/Data/'
# Model identifier used to load the tokenizer below.
model_path = 'google-bert/bert-base-uncased'
# Shared tokenizer instance used by the preprocessing and re-tokenizing functions.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

True
NVIDIA GeForce RTX 3080
0


In [78]:
arrow_file_name = "data-00000-of-00001"
def load_data(path,tokenized=False):
    """Load the saved train/test/validation Arrow shards as one dataset dict.

    path: directory the datasets were saved under.
    tokenized: when True, read from the ``tokenized`` sub-directory instead.

    Note that the validation split is keyed ``'val'`` for the tokenized data
    but ``'validation'`` for the untokenized data.
    """
    root = path + '/tokenized' if tokenized else path
    validation_key = 'val' if tokenized else 'validation'
    shard = f'{arrow_file_name}.arrow'
    data_files = {
        'train': root + f'/train_ds/{shard}',
        'test': root + f'/test_ds/{shard}',
        validation_key: root + f'/val_ds/{shard}',
    }
    return load_dataset('arrow', data_files=data_files)


### Train-Val-Test split, run only once

In [16]:
def preprocess_function(id,example):
    """Tokenize one row and split it into model-sized chunks.

    id: identifier copied into every returned item.
    example: mapping with the text under 'processed' and the label under 'class'.

    Returns a list of ``[tokens, label, id]`` items. Sequences of at most 512
    tokens give a single item; longer ones are cut into pieces of up to 500
    inner tokens, each wrapped in the original first and last token.
    """
    text = example['processed']
    label = example['class']
    token_ids = tokenizer.encode(text)
    if len(token_ids) <= 512:
        return [[token_ids, label, id]]

    # First/last tokens are the special markers added by the tokenizer.
    cls_token, sep_token = token_ids[0], token_ids[-1]
    inner = token_ids[1:-1]
    return [
        [[cls_token, *inner[start:start + 500], sep_token], label, id]
        for start in range(0, len(inner), 500)
    ]


In [17]:
def shorten(text):
    """Collapse runs of the same space-separated word into one occurrence.

    Only adjacent repeats are removed; a word that reappears later is kept.
    """
    words = text.split(" ")
    # Pair every word with its predecessor (None for the first word).
    predecessors = [None] + words[:-1]
    return " ".join(
        word for prior, word in zip(predecessors, words) if word != prior
    )

In [18]:
def preprocess_function_2(id,example):
    """Tokenize one row, keeping only the first 510 inner tokens.

    id: identifier returned unchanged as the third element.
    example: mapping with the text under 'processed' and the label under 'class'.

    Adjacent repeated words are collapsed with ``shorten`` before encoding.
    Returns ``[tokens, label, id]`` where ``tokens`` is the first token, up to
    510 inner tokens, and the last token of the encoded sequence (at most 512
    ids in total).
    """
    text = shorten(example['processed'])
    label = example['class']
    tokens = tokenizer.encode(text)
    cls_token = tokens[0]
    sep_token = tokens[-1]
    # Strip both special tokens before truncating. Slicing tokens[1:511]
    # directly would sweep the trailing separator of a short sequence into the
    # body and it would then be appended a second time.
    body = tokens[1:-1][:510]
    return [[cls_token, *body, sep_token], label, id]

In [19]:
def preprocess(df):
    """Build the tokenized frame for one split.

    df: frame with a 'processed' text column and a 'class' column whose values
        are "ham" or "spam" (any other value raises KeyError).

    Returns a frame with columns 'text' (token ids), 'label' (0 for ham, 1 for
    spam) and 'raw_text', indexed by the ids returned from
    ``preprocess_function_2``.
    """
    label_ids = {"ham":0,"spam":1}
    columns = {'text':[],'label':[],'raw_text':[]}
    row_ids = []
    for row_key, row in tqdm(df.iterrows()):
        columns["raw_text"].append(row['processed'])
        token_ids, label_name, returned_id = preprocess_function_2(row_key, row)
        columns['text'].append(token_ids)
        columns['label'].append(label_ids[label_name])
        row_ids.append(returned_id)
    return pd.DataFrame(columns, index=row_ids)

In [20]:
# random.seed() returns None, so keep the seed value itself in `seed` and pass
# it to every split; otherwise train_test_split draws a new partition on each run.
seed = 37
random.seed(seed)
# NOTE: unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle("/app/Data/full_df.pkl")
# Drop repeated texts before splitting so the same text cannot land in two splits.
df = df.drop_duplicates(subset=['processed'])
X = df['processed']
y = df['class']
# 80/20 train+val vs test, then 80/20 train vs val.
X_tr, X_test, y_tr, y_test = train_test_split(X,y,test_size=0.2,random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_tr,y_tr,test_size=0.2,random_state=seed)
train_df = pd.DataFrame({'processed':X_train,'class':y_train})
val_df = pd.DataFrame({'processed':X_val,'class':y_val})
test_df = pd.DataFrame({'processed':X_test,'class':y_test})
# Tokenize each split and map the class names to integer labels.
train_df = preprocess(train_df)
val_df = preprocess(val_df)
test_df = preprocess(test_df)

0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors
26078it [00:35, 744.49it/s]
6520it [00:09, 680.85it/s]
8150it [00:11, 694.33it/s]


In [10]:
sentences = list(df.processed)
ids = df.index
# Choose the device by name and fall back to CPU: torch.cuda.current_device()
# fails on a machine without CUDA, which the other cells of this notebook
# explicitly allow for.
model = SentenceTransformer(
    'sentence-transformers/all-mpnet-base-v2',
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [11]:
# Embed every sentence in one call; the next cell pairs embeddings[i] with ids[i].
embeddings = model.encode(sentences)

In [12]:
# One row per post id, each cell holding that post's embedding vector.
embedding_df = pd.DataFrame({"embedding": list(embeddings)}, index=list(ids))

In [13]:
# Persist the embeddings; the following cell reloads them from this file.
embedding_df.to_pickle("/app/Data/embedding.pkl")

In [13]:
# Reload the embeddings written by the previous cell.
# NOTE: unpickling can execute arbitrary code - only load files this notebook produced.
embedding_df = pd.read_pickle("/app/Data/embedding.pkl")

In [28]:
import jax
import jax.numpy as jnp
# Ask JAX itself for a GPU instead of inferring it from torch: torch seeing
# CUDA does not mean a GPU backend is available to JAX. jax.devices() raises
# RuntimeError for a backend it cannot provide, in which case use the CPU.
try:
    jax_device = jax.devices("gpu")[0]
except RuntimeError:
    jax_device = jax.devices("cpu")[0]

In [22]:
def cosine_sim(x,y):
    '''
    Computes the cosine similarity between y and each row of x.

    x: 2d array with one vector per row (a single 1d vector also works)
    y: 1d array
    Returns a 1d array with one similarity per row of x (a scalar for 1d x).
    No epsilon is added, so an all-zero vector yields nan.
    '''
    dot_product = jnp.dot(x, y)

    # Norm of each row of x. Without axis=-1 the norm of a 2d array is the
    # Frobenius norm of the whole matrix, which divides every row by the same
    # constant instead of by its own length.
    x_norm = jnp.linalg.norm(x, axis=-1)
    y_norm = jnp.linalg.norm(y)

    # Compute the cosine similarity
    similarity = dot_product / (x_norm * y_norm)

    return similarity
# Compiled version of cosine_sim bound to the selected device.
# NOTE(review): newer JAX releases may deprecate jit's `device=` argument - confirm against the installed version.
cosine_sim_jit = jax.jit(cosine_sim, device=jax_device)

In [65]:
class Post(object):
    """An embedding with its position and its best similarity seen so far.

    Instances order by ``closest_dst``, so they can be sorted or compared
    with ``<``.
    """
    def __init__(self, embedding, pos):
        # closest_dst starts at -1, the lowest value a cosine similarity can
        # take (similar posts score close to 1).
        # pos is described by the original author as the position in
        # `self.posts`; that container is not visible in this notebook section.
        self.embedding, self.closest_dst, self.pos = embedding, -1, pos

    def __lt__(self, other):
        return other.closest_dst > self.closest_dst

class Undersample:
    """Greedy farthest-point selection of rows based on their embeddings.

    After a random first pick, every step selects the row whose highest cosine
    similarity to any previously selected row is lowest, so the chosen rows
    are spread out over the embedding space.

    df: DataFrame with an ``embedding`` column holding one vector per row; the
        index values serve as row ids.
    """
    def __init__(self, df):
        self.df = df
        self.post_ids = list(df.index)
        # Despite the name this stores similarities: per row, the highest
        # cosine similarity to any selected row so far. -1 is the lowest
        # possible value; it is created as float so the dtype does not change
        # on the first update.
        self.distances = jnp.full(len(self.post_ids), -1.0)
        # np.vstack is the supported spelling of the deprecated np.row_stack alias.
        self.embeddings = np.vstack(list(df.embedding))
        self.embeddings = jax.device_put(jnp.array(self.embeddings), device=jax_device)
        self.selected_ids = []
        # Embedding of the most recently selected row (None before the first pick).
        self.recent = None

    def select_furthest(self):
        """Select one more row and append its id to ``selected_ids``.

        The first call picks a random row (using the ``random`` module). Later
        calls fold the similarities to the previous pick into ``distances``
        and take the row with the smallest value.
        """
        if len(self.selected_ids) == 0:
            # Pick random starting point
            recent_pos = random.randint(0, len(self.post_ids)-1)
        else:
            dist_to_recent = cosine_sim_jit(self.embeddings, self.recent)
            self.distances = jnp.maximum(self.distances, dist_to_recent)
            # int() turns the 0-d array returned by argmin into a plain list index.
            recent_pos = int(jnp.argmin(self.distances))
        self.selected_ids.append(self.post_ids[recent_pos])
        self.recent = self.embeddings[recent_pos]
        return

    def select_n(self, n):
        """Run ``n`` selection steps and return the selected rows of ``df``.

        If ``n`` exceeds the number of rows, ``df`` is returned unchanged.
        Rows come back in the order they have in ``df``, not in selection
        order. Selections accumulate across calls on the same instance.
        """
        if n>len(self.post_ids):
            return self.df
        for _ in tqdm(range(n)):
            self.select_furthest()
        return self.df[self.df.index.isin(self.selected_ids)]

In [24]:
# Separate the two classes and attach each ham row's embedding via the shared index.
train_spam = train_df.loc[train_df["label"] == 1].copy()
train_ham = train_df.loc[train_df["label"] == 0].copy()
train_ham_embed = train_ham.merge(embedding_df, left_index=True, right_index=True)
# Displayed as the cell output: row counts per label.
train_df.groupby(["label"]).count()

Unnamed: 0_level_0,text,raw_text
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19745,19745
1,6333,6333


In [66]:
# Select as many ham rows as there are spam rows so both classes end up the same size.
undersampler = Undersample(train_ham_embed)
train_ham_sampled = undersampler.select_n(len(train_spam))

100%|██████████| 6333/6333 [00:18<00:00, 340.94it/s]


In [81]:
# Keep only the model inputs: spam rows followed by the sampled ham rows.
train_new = pd.concat([
    train_spam.loc[:, ["text", "label"]],
    train_ham_sampled.loc[:, ["text", "label"]],
])
val_df = val_df.loc[:, ["text", "label"]]
test_df = test_df.loc[:, ["text", "label"]]

In [82]:
# Convert each frame to a Dataset and write it under its own directory.
train_ds = Dataset.from_pandas(train_new, split='train')
val_ds = Dataset.from_pandas(val_df, split='validation')
test_ds = Dataset.from_pandas(test_df, split='test')
train_ds.save_to_disk(f'{path}/train_ds')
test_ds.save_to_disk(f'{path}/test_ds')
val_ds.save_to_disk(f'{path}/val_ds')

Saving the dataset (1/1 shards): 100%|██████████| 12666/12666 [00:00<00:00, 333143.46 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8150/8150 [00:00<00:00, 295279.12 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6520/6520 [00:00<00:00, 263457.25 examples/s]


### Load data

In [83]:
# Reload the saved splits and drop the index column from each of them.
ds = load_data(path)
train_ds = ds['train'].remove_columns(['__index_level_0__'])
test_ds = ds['test'].remove_columns(['__index_level_0__'])
val_ds = ds['validation'].remove_columns(['__index_level_0__'])

Generating train split: 12666 examples [00:00, 339743.77 examples/s]
Generating test split: 8150 examples [00:00, 378082.55 examples/s]
Generating validation split: 6520 examples [00:00, 334792.58 examples/s]


### Decoding tokens back to text and re-tokenizing with the BERT tokenizer to obtain attention masks

In [84]:
def decode_and_tokenize_function(example):
    """Turn stored token ids back into text and tokenize that text again.

    example: mapping whose 'text' entry is a list of token ids.

    Returns the tokenizer's output for the decoded text, with special tokens
    dropped during decoding and re-added by the tokenizer call.
    """
    tokens = example['text']
    text = tokenizer.decode(tokens,skip_special_tokens=True)
    # Decoding and re-encoding is not guaranteed to give back the same number
    # of tokens, so cap the result at the 512-token limit used when the ids
    # were first produced.
    return tokenizer(text,padding=True,truncation=True,max_length=512)

In [85]:
# Apply the decode/re-tokenize step to every example of each split
# (the mapped function is written for one example at a time).
tokenized_train_ds = train_ds.map(decode_and_tokenize_function)
tokenized_test_ds = test_ds.map(decode_and_tokenize_function)
tokenized_val_ds = val_ds.map(decode_and_tokenize_function)

Map: 100%|██████████| 12666/12666 [00:28<00:00, 440.51 examples/s]
Map: 100%|██████████| 8150/8150 [00:19<00:00, 417.80 examples/s]
Map: 100%|██████████| 6520/6520 [00:15<00:00, 413.05 examples/s]


In [86]:
# Write the tokenized splits under the `tokenized` sub-directory.
tokenized_train_ds.save_to_disk(f'{path}/tokenized/train_ds')
tokenized_val_ds.save_to_disk(f'{path}/tokenized/val_ds')
tokenized_test_ds.save_to_disk(f'{path}/tokenized/test_ds')

Saving the dataset (1/1 shards): 100%|██████████| 12666/12666 [00:00<00:00, 100646.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6520/6520 [00:00<00:00, 172687.94 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8150/8150 [00:00<00:00, 151182.52 examples/s]
