# Preprocess Dataset: Text to Embedding
- ref: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [None]:
# import package
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModel
from transformers import AutoTokenizer, BertModel, BertTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

In [None]:
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Dataset

In [None]:
# Load the tab-separated train/test files, then preview each frame.
csv_options = {'sep': '\t', 'encoding': 'utf-8'}
train_df = pd.read_csv('dataset/train.csv', **csv_options)
test_df = pd.read_csv('dataset/test.csv', **csv_options)

print(f"Training data shape: {train_df.shape}")
print(train_df.head())

print(f"Testing data shape: {test_df.shape}")
print(test_df.head())   # no labels

In [None]:
# EDA

# Report how many missing values each column has.
print("Null values in training data:")
print(train_df.isnull().sum())

# Show the distinct label values to spot anything unexpected.
print("Unique labels in training data:")
print(train_df['label'].unique())

# Locate rows whose label is the literal string 'label'.
is_label_literal = train_df['label'] == 'label'
print("Rows with label 'label':")
print(train_df[is_label_literal])

# Drop those rows. The explicit .copy() makes the result an independent frame,
# so the column assignment below does not raise SettingWithCopyWarning.
train_df = train_df[~is_label_literal].copy()

# Store labels as int type.
train_df['label'] = train_df['label'].astype(int)

In [None]:
# Renumber the rows from 0 and discard the old index.
train_df = train_df.reset_index(drop=True)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
print("train info:")
train_df.info()
print(train_df.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
print("test info:")
test_df.info()
print(test_df.head())

In [None]:
# Hold out 20% of the training rows for validation (fixed random_state for repeatability).
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Show the size of each split first, then a preview of each.
for split_frame in (train_df, val_df):
    print(split_frame.shape)

for split_frame in (train_df, val_df):
    print(split_frame.head())

## Tokenizer
At this stage, we tokenize the text into input IDs and attention masks, which the model later turns into embeddings.

We choose the `distilbert-base-uncased` model on Hugging Face for this task.

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenize every split with the same settings: truncate to at most 512 tokens
# and pad the sequences within each call.
tokenize_options = {'max_length': 512, 'truncation': True, 'padding': True}
train_encodings = tokenizer(train_df['text'].tolist(), **tokenize_options)
val_encodings = tokenizer(val_df['text'].tolist(), **tokenize_options)
test_encodings = tokenizer(test_df['text'].tolist(), **tokenize_options)

In [None]:
# List the fields the tokenizer produced (e.g. input IDs and attention masks).
encoding_fields = train_encodings.keys()
print(encoding_fields)

In [None]:
# Show the first training example's token IDs, then its attention mask.
for field_name in ('input_ids', 'attention_mask'):
    print(train_encodings[field_name][0])

# Generate Embeddings

In [None]:
# The checkpoint is a DistilBERT one, so let AutoModel pick the architecture
# from the checkpoint's own config instead of forcing it into the BERT class.
model = AutoModel.from_pretrained("distilbert/distilbert-base-uncased")

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Place the model on the selected device.
model = model.to(device)

In [None]:
# create a dataset class
class TokenDataset(Dataset):
    """Dataset pairing tokenizer encodings with one label per sample.

    ``encodings`` must be indexable by 'input_ids' and 'attention_mask', each
    giving a per-sample sequence; ``labels`` is an indexable, sized collection.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build the two tensors in a fixed order: token IDs first, then the mask.
        token_ids, mask = (
            torch.tensor(self.encodings[field_name][idx])
            for field_name in ('input_ids', 'attention_mask')
        )
        return token_ids, mask, self.labels[idx]

In [None]:
# Wrap each split's encodings in a TokenDataset. The test split gets an array
# of zeros as labels, one per test row.
train_labels = train_df['label'].tolist()
val_labels = val_df['label'].tolist()
placeholder_labels = np.zeros(len(test_df))

train_dataset = TokenDataset(train_encodings, train_labels)
val_dataset = TokenDataset(val_encodings, val_labels)
test_dataset = TokenDataset(test_encodings, placeholder_labels)

In [None]:
# Build one loader per split with identical settings; shuffle stays off so the
# batches follow the dataset order.
loader_options = {'batch_size': 16, 'shuffle': False, 'num_workers': 4}
train_loader = DataLoader(train_dataset, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

# Number of batches in each loader.
for split_loader in (train_loader, val_loader, test_loader):
    print(len(split_loader))

In [None]:
def encodings_to_embeddings(loader, model):
    """Run ``model`` over ``loader`` and mean-pool the token states per sample.

    The mean is taken only over positions where ``attention_mask`` is non-zero,
    so padding tokens do not dilute the embedding.

    Args:
        loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches of tensors.
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``last_hidden_state``.

    Returns:
        A DataFrame with an 'embeddings' column (one NumPy vector per sample)
        and a 'labels' column, in the order the loader produced the samples.
    """
    model.eval()

    # Run on the device the model's weights are on, rather than relying on a
    # module-level ``device`` variable; fall back to CPU for a parameterless model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    embeddings_record = []
    labels_record = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(loader):
            input_ids = input_ids.to(model_device)
            attention_mask = attention_mask.to(model_device)
            outputs = model(input_ids, attention_mask=attention_mask)
            hidden_states = outputs.last_hidden_state

            # Assumes hidden_states is (batch, seq_len, hidden) and
            # attention_mask is (batch, seq_len) -- TODO confirm for other models.
            token_mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            summed = (hidden_states * token_mask).sum(dim=1)
            # Clamp so a row with an all-zero mask does not divide by zero.
            token_counts = token_mask.sum(dim=1).clamp(min=1)
            embeddings = summed / token_counts

            embeddings_record.extend(embeddings.cpu().numpy())
            # Labels are only recorded, never fed to the model, so they are
            # not moved to the model's device.
            labels_record.extend(labels.cpu().numpy())

    return pd.DataFrame({'embeddings': embeddings_record, 'labels': labels_record})

In [None]:
# Embed the train, validation and test splits, in that order.
train_embeddings_df, val_embeddings_df, test_embeddings_df = (
    encodings_to_embeddings(split_loader, model)
    for split_loader in (train_loader, val_loader, test_loader)
)

# Write out embeddings to CSV

In [None]:
# Preview the first rows of each embeddings frame.
for embeddings_frame in (train_embeddings_df, val_embeddings_df, test_embeddings_df):
    print(embeddings_frame.head())

In [None]:
# Write each embeddings frame to its own tab-separated file, without the index.
export_targets = (
    (train_embeddings_df, 'dataset/train_embeddings.csv'),
    (val_embeddings_df, 'dataset/val_embeddings.csv'),
    (test_embeddings_df, 'dataset/test_embeddings.csv'),
)
for embeddings_frame, csv_path in export_targets:
    embeddings_frame.to_csv(csv_path, sep='\t', index=False)

# Import dataset for future use

In [None]:
import ast

# Read the exported training embeddings back in and preview them.
sample = pd.read_csv('dataset/train_embeddings.csv', encoding='utf-8', sep='\t')
print(sample.head())

# Inspect the first stored embedding and its Python type.
raw_embedding = sample['embeddings'][0]
print(raw_embedding)
print(type(raw_embedding))  # string

# Function to convert the string representation of the array to a list of floats
def convert_str_to_float_list(s):
    """Parse a bracketed string of numbers into a list of floats.

    Accepts values separated by any whitespace (spaces, tabs, newlines) and/or
    commas, with optional surrounding whitespace, e.g. ``"[1.0 2.5\\n -3.0]"``
    or ``"[1.0, 2.5]"``.

    Args:
        s: String holding the numbers, optionally wrapped in ``[`` and ``]``.

    Returns:
        The parsed values as a list of floats; an empty list for ``"[]"`` or ``""``.

    Raises:
        ValueError: If any token is not a valid float literal.
    """
    # Trim outer whitespace first so the bracket strip sees the brackets.
    inner = s.strip().strip('[]')
    # Treat commas as separators, then split on any run of whitespace; split()
    # with no argument never yields empty tokens.
    tokens = inner.replace(',', ' ').split()
    return [float(token) for token in tokens]

# Parse every stored embedding string into a list of floats.
sample['embeddings'] = sample['embeddings'].apply(convert_str_to_float_list)

# Inspect the first embedding again after parsing.
parsed_embedding = sample['embeddings'][0]
print(parsed_embedding)
print(type(parsed_embedding))  # list (of floats)
