# Preprocess the dataset, make it word-embedding compatible

In [18]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.model_selection import train_test_split
import os as os
import re 
import csv

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import time

from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit

In [4]:
# Project root that every relative data path in this notebook is resolved against.
# Set the ITLIGENCE_DIR environment variable to point at your own checkout instead of
# editing this cell; the fallback is the original hard-coded location.
path = os.environ.get("ITLIGENCE_DIR", "/mnt/c/Users/Johan/Documents/ITligence")
os.chdir(path)

In [20]:
def txt_to_csv(input_txt_file, output_csv_file, encoding='utf-8'):
    """Convert a "<label> <text>" text file into a two-column CSV.

    Each input line is stripped and must start with an integer label followed by
    whitespace and the text. Lines that do not match this shape (blank lines,
    lines without a leading number) are skipped.

    Args:
        input_txt_file: Path of the text file to read.
        output_csv_file: Path of the CSV file to write; it is overwritten.
        encoding: Text encoding used for both files. Defaults to UTF-8 so the
            result does not depend on the platform's locale.

    Returns:
        None. The CSV is written with a ``label,text`` header row.
    """
    line_pattern = re.compile(r'^(\d+)\s+(.*)')

    # Parse everything before opening the output so that converting a file
    # in place (same input and output path) still works.
    with open(input_txt_file, 'r', encoding=encoding) as txt_file:
        matches = (line_pattern.match(line.strip()) for line in txt_file)
        rows = [match.groups() for match in matches if match]

    # newline='' lets the csv module control line endings itself.
    with open(output_csv_file, 'w', newline='', encoding=encoding) as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['label', 'text'])
        csv_writer.writerows(rows)

# Usage example: convert the raw "<label> <text>" training file into a CSV stored
# next to it. Both paths are relative to the working directory set via os.chdir above.
input_txt_file = "data/uncleaned_data/JerryWeiAIData/train_orig.txt"  # Replace with your input text file path
output_csv_file = "data/uncleaned_data/JerryWeiAIData/train_orig.csv"  # Replace with your desired output CSV file path

txt_to_csv(input_txt_file, output_csv_file)

In [22]:
# Load the CSV produced by txt_to_csv; it carries the 'label' and 'text' columns.
JerryWeiData = pd.read_csv("data/uncleaned_data/JerryWeiAIData/train_orig.csv")

Pipeline to extract embeddings

In [32]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Object exposing ``last_hidden_state`` with one embedding
            per token position.
        attention_mask: Mask whose non-zero entries mark the positions that
            should contribute to the average.

    Returns:
        One pooled embedding per sequence: the mask-weighted sum over dimension 1
        divided by the mask total (clamped so an all-zero mask does not divide by zero).
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask over the embedding dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / mask_total

def get_embeddings(sentences, model, tokenizer, batch_size, embedding_type='sentence'):
    """Embed sentences in batches and return sentence and/or token embeddings.

    Args:
        sentences: Sliceable, sized collection of strings (e.g. a list).
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with ``last_hidden_state``.
        tokenizer: Callable returning a mapping that contains ``attention_mask``
            when invoked with ``padding=True, truncation=True, return_tensors='pt'``.
        batch_size: Number of sentences per forward pass; must be at least 1.
        embedding_type: ``'sentence'``, ``'token'`` or ``'both'``.

    Returns:
        * ``'sentence'``: L2-normalized mean-pooled embeddings, one row per sentence.
        * ``'token'``: per-token hidden states, right-padded with zeros to the
          longest tokenized sentence; every padding position is zero.
        * ``'both'``: the tuple ``(sentence_embeddings, token_embeddings)``.

    Raises:
        ValueError: If ``embedding_type`` is not one of the supported values,
            ``batch_size`` is smaller than 1, or ``sentences`` is empty.
    """
    if embedding_type not in ('sentence', 'token', 'both'):
        # Fail before doing any work instead of silently returning None at the end.
        raise ValueError(
            f"embedding_type must be 'sentence', 'token' or 'both', got {embedding_type!r}"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if len(sentences) == 0:
        raise ValueError("sentences must contain at least one item")

    want_sentence = embedding_type in ('sentence', 'both')
    want_token = embedding_type in ('token', 'both')

    sentence_chunks = []
    token_chunks = []

    start_time = time.perf_counter()

    for start in range(0, len(sentences), batch_size):
        batch_sentences = sentences[start:start + batch_size]
        # Pad only to the longest sentence of this batch; the global maximum is
        # recovered from the batches afterwards, so the corpus is tokenized once.
        encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
        attention_mask = encoded_input['attention_mask']

        with torch.no_grad():
            model_output = model(**encoded_input)

        if want_sentence:
            sentence_chunks.append(mean_pooling(model_output, attention_mask))

        if want_token:
            hidden_states = model_output.last_hidden_state
            # The model also emits vectors for padding tokens. Zero them so the
            # result does not depend on how sentences were grouped into batches
            # and matches the zero padding added across batches below.
            token_mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            token_chunks.append(hidden_states * token_mask)

    sentence_embeddings = None
    token_embeddings = None

    if want_sentence:
        # Concatenate the batches, then L2-normalize each sentence vector.
        sentence_embeddings = F.normalize(torch.cat(sentence_chunks, dim=0), p=2, dim=1)

    if want_token:
        # Right-pad the sequence dimension of every batch to the longest batch.
        max_seq_length = max(chunk.shape[1] for chunk in token_chunks)
        token_embeddings = torch.cat(
            [
                F.pad(chunk, (0, 0, 0, max_seq_length - chunk.shape[1]), 'constant', 0)
                for chunk in token_chunks
            ],
            dim=0,
        )

    time_taken = time.perf_counter() - start_time
    print(f"Generated Embeddings. Time Taken: {time_taken} Seconds.")

    if embedding_type == 'sentence':
        return sentence_embeddings
    if embedding_type == 'token':
        return token_embeddings
    return sentence_embeddings, token_embeddings
    
# Usage
# Embed the first 100 texts as a quick run of the pipeline. Row 0 is a real sample
# (read_csv already consumed the header), so the slice starts at 0; .iloc makes the
# positional slicing explicit.
sentences = JerryWeiData["text"].iloc[:100].tolist()

# Load model and tokenizer from HuggingFace Hub; both must come from the same checkpoint
MODEL_NAME = 'sentence-transformers/paraphrase-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Set batch size
batch_size = 8  # Adjust as needed

# Choose the type of embeddings: 'sentence', 'token', 'both'
# ('both' is required here because the call below unpacks two return values)
embedding_type = 'both'

# Get embeddings
sentence, tokens = get_embeddings(sentences, model, tokenizer, batch_size, embedding_type)

Generated Embeddings. Time Taken: 0.8318467140197754 Seconds.


In [38]:
# Convert the token-level embeddings returned by get_embeddings into a NumPy array.
# NOTE(review): assumes `tokens` lives on the CPU — confirm if the model is ever moved to a GPU.
np_embedding = np.array(tokens)
#pd_embedding = pd.DataFrame(np_embedding)

In [39]:
#pd_embedding.to_csv("BiasData_UnmodifiedEmbedding_paraphrase-MiniLM-L6-v2.csv")
# Persist the token embeddings as .npy in the current working directory.
# NOTE(review): the later load cells read "data/uncleaned_data/npy_BiasData_UnmodifiedEmbedding_..."
# (different folder and file name); presumably that file was produced separately — confirm.
np.save("npy_BiasData_UnmodifiedTokenEmbedding_paraphrase-MiniLM-L6-v2.npy", np_embedding)

In [42]:
# Persist the pooled sentence embeddings in the current working directory.
# NOTE(review): `sentence` is passed as returned by get_embeddings, without the explicit
# np.array conversion used for the token embeddings — confirm this is intended.
np.save("npy_BiasData_1-100UnmodifiedSentenceEmbedding_paraphrase-MiniLM-L6-v2.npy", sentence)

# Load Data & Create Dataset

In [11]:
# Load the previously exported artifacts from disk (paths are relative to the working directory).
# Embeddings stored as a NumPy array:
np_embedding = np.load("data/uncleaned_data/npy_BiasData_UnmodifiedEmbedding_paraphrase-MiniLM-L6-v2.npy")
# The same-named embeddings export in CSV form:
csv_embedding = pd.read_csv("data/uncleaned_data/csv_BiasData_UnmodifiedEmbedding_paraphrase-MiniLM-L6-v2.csv")
# Labels and raw text:
JerryWeiData = pd.read_csv("data/uncleaned_data/JerryWeiAIData/train_orig.csv")
# NOTE(review): this file has the same name as the CSV written by the merge cell below, which
# writes to the working directory instead; presumably it was moved to data/cleaned_data by hand — confirm.
merged_embedding = pd.read_csv("data/cleaned_data/csv_MergedDataset_paraphrase-MiniLM-L6-v2.csv")

In [7]:
# Put the label/text frame and the embedding frame side by side, then discard the
# raw text and the "Unnamed: 0" column before exporting the merged table.
merged_dataset = (
    pd.concat([JerryWeiData, csv_embedding], axis=1)
    .drop(columns=["text", "Unnamed: 0"])
)
merged_dataset.to_csv("csv_MergedDataset_paraphrase-MiniLM-L6-v2.csv")

NameError: name 'JerryWeiData' is not defined

# Reduce Dataset Size to 100 points per class (1,100 points for the 11 classes)

In [15]:
# Load the embeddings (NumPy array) and the merged CSV that provides the 'label' column.
# Paths are relative to the working directory.
np_embedding = np.load("data/uncleaned_data/npy_BiasData_UnmodifiedEmbedding_paraphrase-MiniLM-L6-v2.npy")
labels_csv = pd.read_csv("data/cleaned_data/csv_MergedDataset_paraphrase-MiniLM-L6-v2.csv")

In [17]:
# Extract the class labels as a NumPy array; the bare last expression displays its shape.
labels_array = np.array(labels_csv["label"])
labels_array.shape

(253781,)

Create Reduced Dataset for Smooth Training

In [39]:
def subsample_per_class(embeddings, labels, samples_per_class, random_state=42):
    """Draw the same number of samples from every class.

    Args:
        embeddings: Array whose first axis indexes samples.
        labels: 1-D array of class labels aligned with ``embeddings``.
        samples_per_class: Number of samples to keep for each distinct label.
        random_state: Seed handed to the splitter so the selection is repeatable.

    Returns:
        Tuple ``(reduced_embeddings, reduced_labels)`` with the selected rows,
        grouped class by class in the order given by ``np.unique(labels)``.

    Raises:
        ValueError: If the two arrays differ in length or a class has fewer
            than ``samples_per_class`` samples.
    """
    if embeddings.shape[0] != labels.shape[0]:
        raise ValueError("Embeddings and labels must have the same number of samples")

    selected_indices = []
    for cls in np.unique(labels):
        cls_indices = np.flatnonzero(labels == cls)
        available = cls_indices.size

        if available < samples_per_class:
            raise ValueError(
                f"Class {cls!r} has only {available} samples; "
                f"{samples_per_class} were requested"
            )

        if available == samples_per_class:
            # Nothing to sample; the splitter cannot return a full-size train set.
            selected_indices.append(cls_indices)
            continue

        # Within a single class, the stratified splitter is used as a seeded random
        # subsample. Only the labels drive the split, so a placeholder stands in for
        # X and the embedding matrix is not copied per class.
        splitter = StratifiedShuffleSplit(
            n_splits=1, train_size=samples_per_class, random_state=random_state
        )
        picked, _ = next(splitter.split(np.zeros(available), labels[cls_indices]))
        selected_indices.append(cls_indices[picked])

    selected_indices = np.concatenate(selected_indices)
    return embeddings[selected_indices], labels[selected_indices]


# Define the desired number of samples per class
desired_samples_per_class = 100

# Get unique classes
unique_classes = np.unique(labels_array)

reduced_embeddings, reduced_labels = subsample_per_class(
    np_embedding, labels_array, desired_samples_per_class
)

# Check the class balance with np.unique rather than np.bincount, so the check also
# holds when labels are not contiguous integers starting at 0.
_, reduced_class_counts = np.unique(reduced_labels, return_counts=True)
assert len(reduced_labels) == desired_samples_per_class * len(unique_classes)
assert np.all(reduced_class_counts == desired_samples_per_class)

print(f"Reduced embeddings shape: {reduced_embeddings.shape}")
print(f"Reduced labels distribution: {reduced_class_counts}")


Reduced embeddings shape: (1100, 384)
Reduced labels distribution: [100 100 100 100 100 100 100 100 100 100 100]


In [44]:
# Persist the class-balanced subset: embeddings and their labels go to two separate
# .npy files, saved in the same row order.
np.save("data/cleaned_data/npy_BiasData_100-per-class-SentenceEmbedding_paraphrase-MiniLM-L6-v2.npy", reduced_embeddings)
np.save("data/cleaned_data/npy_BiasData_100-per-class-labels_paraphrase-MiniLM-L6-v2.npy", reduced_labels)