# Preprocess the dataset, make it word-embedding compatible

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.model_selection import train_test_split
import os as os
import re 
import csv

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import time

from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Project root that every relative "data/..." path in this notebook is resolved against.
# The ITLIGENCE_ROOT environment variable overrides the machine-specific default, so the
# notebook can run elsewhere without editing this cell; when it is unset the original
# path is used unchanged.
path = os.environ.get("ITLIGENCE_ROOT", "/mnt/c/Users/Johan/Documents/ITligence")  # Replace with your desired path
os.chdir(path)

In [20]:
def txt_to_csv(input_txt_file, output_csv_file, encoding='utf-8'):
    """Convert a "<label> <text>" text file into a two-column CSV file.

    Each input line is stripped of surrounding whitespace and must start with
    one or more digits (the label), followed by whitespace and the text.
    Lines that do not match this shape (including blank lines) are skipped.

    Parameters
    ----------
    input_txt_file : str or path-like
        Text file to read, one sample per line.
    output_csv_file : str or path-like
        CSV file to create or overwrite. It starts with the header row
        ``label,text``. Must not be the same file as ``input_txt_file``
        because input is streamed while the output is being written.
    encoding : str, optional
        Text encoding used for both files. Defaults to UTF-8 so the result
        does not depend on the platform's locale encoding.

    Returns
    -------
    None
    """
    # Label = leading run of digits, text = everything after the whitespace gap.
    line_pattern = re.compile(r'^(\d+)\s+(.*)')

    # The input is opened first so that a missing input file does not leave an
    # empty output file behind. Lines are streamed instead of loaded at once.
    with open(input_txt_file, 'r', encoding=encoding) as txt_file, \
            open(output_csv_file, 'w', newline='', encoding=encoding) as csv_file:
        csv_writer = csv.writer(csv_file)
        # Write the header
        csv_writer.writerow(['label', 'text'])

        for line in txt_file:
            match = line_pattern.match(line.strip())
            if match:
                label, text = match.groups()
                csv_writer.writerow([label, text])

# Usage example: convert the raw "<label> <text>" training file into a CSV with
# 'label' and 'text' columns. Both paths are relative to the current working directory.
input_txt_file = "data/uncleaned_data/JerryWeiAIData/train_orig.txt"  # Replace with your input text file path
output_csv_file = "data/uncleaned_data/JerryWeiAIData/train_orig.csv"  # Replace with your desired output CSV file path

# Writes (or overwrites) output_csv_file; returns nothing.
txt_to_csv(input_txt_file, output_csv_file)

In [3]:
# Load the labelled sentences from the CSV written by txt_to_csv (columns: 'label', 'text').
JerryWeiData = pd.read_csv("data/uncleaned_data/JerryWeiAIData/train_orig.csv")

# Reduce Dataset size

In [7]:
# Class labels as a NumPy array, in the same row order as JerryWeiData.
labels_array = np.array(JerryWeiData["label"])

In [9]:
# Every column except 'label', kept as a DataFrame (not a Series).
texts = JerryWeiData.drop("label", axis = 1)

First approach: extract only the most "important" sentences of each class (those with the highest summed TF-IDF score), keeping the same number of samples per class.

In [43]:
# Sanity check: display the column names of the loaded frame.
JerryWeiData.columns

Index(['label', 'text'], dtype='object')

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Work on the full labelled frame (this is an alias, not a copy).
df = JerryWeiData

# Upper bound on how many sentences are kept for each class.
desired_samples_per_class = 10000

# One vectorizer instance, refit from scratch on every class inside the loop.
vectorizer = TfidfVectorizer(stop_words='english')

# Accumulators for the selected sentences and their labels (kept in step).
reduced_sentences = []
reduced_labels = []

# Classes are visited in order of first appearance in the 'label' column.
for label in df['label'].unique():
    # Sentences belonging to the current class (original row index preserved).
    class_sentences = df.loc[df['label'] == label, 'text']

    # TF-IDF weights computed on this class only.
    tfidf_matrix = vectorizer.fit_transform(class_sentences)

    # A sentence's score is the sum of the TF-IDF weights of its terms.
    sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    # Pair each sentence with its score and rank from highest to lowest.
    temp_df = pd.DataFrame(
        {'text': class_sentences, 'score': sentence_scores}
    ).sort_values(by='score', ascending=False)

    # Keep at most desired_samples_per_class of the best-scoring sentences.
    top_sentences = temp_df['text'].head(desired_samples_per_class).tolist()

    # Record the winners together with one label entry per kept sentence.
    reduced_sentences += top_sentences
    reduced_labels += [label] * len(top_sentences)

# Assemble the reduced dataset.
reduced_df = pd.DataFrame({'text': reduced_sentences, 'label': reduced_labels})

print(f"Reduced dataset shape: {reduced_df.shape}")
print(f"Reduced labels distribution:\n{reduced_df['label'].value_counts()}")


Reduced dataset shape: (110000, 2)
Reduced labels distribution:
0     10000
1     10000
2     10000
3     10000
4     10000
5     10000
6     10000
7     10000
8     10000
9     10000
10    10000
Name: label, dtype: int64


Second Approach (not desirable)

In [15]:
import numpy as np
import pandas as pd

# Ensure that texts is a numpy array for consistency with the rest of the code
if isinstance(texts, pd.DataFrame):
    texts = texts.values

# Define the desired number of samples per class
desired_samples_per_class = 1000

# Seeded generator so that the random subsample is the same on every run.
sampling_rng = np.random.default_rng(42)

# Get unique classes
unique_classes = np.unique(labels_array)

# Initialize lists to hold the sampled rows and labels
reduced_embeddings_list = []
reduced_labels_list = []

# Iterate over each class to sample the desired number of examples
for cls in unique_classes:
    # Row positions of all samples for the current class
    cls_indices = np.where(labels_array == cls)[0]

    # Sample without replacement when the class is large enough; fall back to
    # sampling with replacement (duplicates) only when it is too small.
    sampled_indices = sampling_rng.choice(
        cls_indices,
        size=desired_samples_per_class,
        replace=len(cls_indices) < desired_samples_per_class,
    )

    # Select the rows and labels at the sampled positions
    reduced_embeddings_list.append(texts[sampled_indices])
    reduced_labels_list.append(labels_array[sampled_indices])

# Concatenate the per-class pieces to form the final reduced arrays
reduced_embeddings = np.concatenate(reduced_embeddings_list)
reduced_labels = np.concatenate(reduced_labels_list)

# Ensure the reduced data has desired_samples_per_class samples for every class.
# NOTE(review): the bincount check assumes the labels are the contiguous
# non-negative integers 0..K-1 -- confirm before reusing with other label sets.
assert len(reduced_labels) == desired_samples_per_class * len(unique_classes)
assert all(np.bincount(reduced_labels) == desired_samples_per_class)

print(f"Reduced embeddings shape: {reduced_embeddings.shape}")
print(f"Reduced labels distribution: {np.bincount(reduced_labels)}")


Reduced embeddings shape: (11000, 1)
Reduced labels distribution: [1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000]


In [19]:
# Persist the current reduced_embeddings / reduced_labels arrays as .npy files.
# NOTE(review): if reduced_embeddings is an object-dtype array of strings (as produced by
# sampling rows of `texts`), np.save pickles it and np.load will need allow_pickle=True -- confirm.
np.save("data/cleaned_data/JerrYWeiData_texts_reduced_1000_samples.npy", reduced_embeddings)
np.save("data/cleaned_data/JerrYWeiData_labels_reduced_1000_samples.npy", reduced_labels)

Pipeline to extract embeddings

In [6]:
import torch
import torch.nn.functional as F
import time
from transformers import AutoTokenizer, AutoModel

# Prefer CUDA when PyTorch reports it as available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Mean pooling that ignores padded positions via the attention mask
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by the attention mask.

    ``model_output.last_hidden_state`` is multiplied by the mask (broadcast over
    the last dimension), summed over dimension 1 and divided by the mask sum.
    The divisor is clamped to at least 1e-9, so a fully masked sequence yields
    zeros instead of a division by zero.

    Assumes ``last_hidden_state`` is (batch, seq_len, hidden) and
    ``attention_mask`` is (batch, seq_len) -- TODO confirm for other models.
    """
    hidden_states = model_output.last_hidden_state
    # Expand the mask to the shape of the hidden states so it can weight them.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

# Function to get embeddings
def get_embeddings(sentences, model, tokenizer, batch_size, embedding_type='sentence'):
    """Embed ``sentences`` with ``model`` in mini-batches.

    Parameters
    ----------
    sentences : sequence of str
        Texts to embed. They are sliced into chunks of ``batch_size``.
    model : callable
        Called as ``model(**encoded_batch)``; the result must expose
        ``last_hidden_state``. Batches are moved to the device of the model's
        first parameter (CPU if the model exposes no parameters).
    tokenizer : callable
        Called as ``tokenizer(batch, padding=True, truncation=True,
        return_tensors='pt')``; must return a mapping of tensors that contains
        ``'attention_mask'`` when sentence embeddings are requested.
    batch_size : int
        Number of sentences per forward pass; must be at least 1.
    embedding_type : {'sentence', 'token', 'both'}
        Which embeddings to compute.

    Returns
    -------
    tuple
        ``(sentence_embeddings, token_embeddings)``. An output that was not
        requested is the string ``"empty"``.

        * sentence embeddings: ``mean_pooling`` of every batch, concatenated
          along dim 0 and L2-normalised along dim 1, on the CPU.
        * token embeddings: ``last_hidden_state`` of every batch, zero-padded
          along dim 1 to the longest batch and concatenated along dim 0, on
          the CPU.

    Raises
    ------
    ValueError
        If ``embedding_type`` is not one of the supported values, if
        ``batch_size`` is smaller than 1, or if ``sentences`` is empty.
    """
    # Fail fast instead of running the whole model and returning nothing.
    if embedding_type not in ('sentence', 'token', 'both'):
        raise ValueError(
            f"embedding_type must be 'sentence', 'token' or 'both', got {embedding_type!r}"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
    if len(sentences) == 0:
        raise ValueError("sentences must not be empty")

    want_sentence = embedding_type in ('sentence', 'both')
    want_token = embedding_type in ('token', 'both')

    # Send every batch to wherever the model's weights live, rather than relying
    # on a notebook-global device variable that may be out of sync with the model.
    parameters = getattr(model, 'parameters', None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    sentence_embeddings = []
    token_embeddings = []

    start_time = time.perf_counter()

    # Each batch is padded to its own longest sequence. The corpus is NOT
    # tokenized as a whole beforehand: that would allocate one
    # (num_sentences x max_len) tensor per tokenizer output just to learn a length
    # that can be read off the batch outputs afterwards.
    for start in range(0, len(sentences), batch_size):
        batch_sentences = sentences[start:start + batch_size]
        encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
        encoded_input = {key: val.to(target_device) for key, val in encoded_input.items()}

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Results are moved to the CPU right away so they do not accumulate on the device.
        if want_sentence:
            pooled = mean_pooling(model_output, encoded_input['attention_mask'])
            sentence_embeddings.append(pooled.cpu())

        if want_token:
            token_embeddings.append(model_output.last_hidden_state.cpu())

    end_time = time.perf_counter()

    if want_sentence:
        # Stack the batches, then scale every row to unit L2 norm.
        sentence_embeddings = F.normalize(torch.cat(sentence_embeddings, dim=0), p=2, dim=1)

    if want_token:
        # Batches differ in sequence length; zero-pad the end of dim 1 up to the
        # longest batch so they can be concatenated.
        max_seq_length = max(batch.shape[1] for batch in token_embeddings)
        token_embeddings = torch.cat(
            [
                F.pad(batch, (0, 0, 0, max_seq_length - batch.shape[1]), 'constant', 0)
                for batch in token_embeddings
            ],
            dim=0,
        )

    time_taken = end_time - start_time
    print(f"Generated Embeddings. Time Taken: {time_taken} Seconds.")

    if embedding_type == 'sentence':
        return sentence_embeddings, "empty"
    if embedding_type == 'token':
        return "empty", token_embeddings
    return sentence_embeddings, token_embeddings

# Usage: embed the sentences of the reduced dataset with BERT.
# Sentences we want embeddings for (the 'text' column of reduced_df as a plain list)
sentences = reduced_df["text"].to_list()

# Alternative input kept for reference (disabled):
#reduced_embeddings.flatten().tolist()

# Load model and tokenizer from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Move model to the selected device (CUDA if available, otherwise CPU)
model.to(device)

# Set batch size (number of sentences per forward pass)
batch_size = 8  # Adjust as needed

# Choose the type of embeddings: 'sentence', 'token', 'both'
embedding_type = 'sentence'

# Get embeddings. With embedding_type='sentence' the second return value is the
# placeholder string "empty", so token_embeddings is not a tensor here.
sentence_embeddings, token_embeddings = get_embeddings(sentences, model, tokenizer, batch_size, embedding_type)


Using device: cuda
Generated Embeddings. Time Taken: 551.7480616569519 Seconds.


In [38]:
#np_embedding = np.array(tokens)
#pd_embedding = pd.DataFrame(np_embedding)

In [10]:
# Persist the sentence embeddings returned by get_embeddings.
# NOTE(review): sentence_embeddings is presumably a CPU torch tensor that np.save converts to an array -- confirm.
np.save("data/cleaned_data/npy_BiasData_10000_sentence_embedding_BERT.npy", sentence_embeddings)

# Persist the matching labels, taken from reduced_df in the same row order as the embedded sentences.
np.save("data/cleaned_data/npy_BiasData_10000_labels_BERT.npy", np.array(reduced_df['label']))

# Saving of token-level embeddings is disabled:
#np.save("npy_BiasData_all_token_embedding_BERT", tokens)

Perform K-Means clustering on Embeddings (the code below uses KMeans, not KNN)

In [42]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
# Halve the dataset: cluster the embeddings with K-Means and keep one centroid per cluster.
# NOTE: this cell overwrites reduced_embeddings / reduced_labels, so re-running it
# halves the data again; re-create the inputs first if a re-run is needed.
num_clusters = round(reduced_embeddings.shape[0] / 2)  # original size divided by two

# Apply K-Means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(reduced_embeddings)

# Get cluster centers (centroids) as representative embeddings
cluster_centers = kmeans.cluster_centers_

# For each cluster center, find the index of the closest original embedding.
# Distances are computed for a slice of centers at a time, so only a
# (distance_chunk_size x n_samples) matrix is in memory instead of the full
# (n_clusters x n_samples) one; the resulting indices are the same.
distance_chunk_size = 1024
closest_indices = np.concatenate([
    np.argmin(cdist(cluster_centers[start:start + distance_chunk_size], reduced_embeddings), axis=1)
    for start in range(0, cluster_centers.shape[0], distance_chunk_size)
])

# Assign the label of the closest original embedding to each cluster center
cluster_center_labels = reduced_labels[closest_indices]

# Output the reduced embeddings and their corresponding labels
reduced_embeddings = cluster_centers
reduced_labels = cluster_center_labels

# Load Data & Create Dataset

In [11]:
# Load the paraphrase-MiniLM-L6-v2 artefacts: the embeddings as .npy and as CSV,
# the original labelled sentences, and the previously merged label + embedding CSV.
np_embedding = np.load("data/uncleaned_data/npy_BiasData_UnmodifiedEmbedding_paraphrase-MiniLM-L6-v2.npy")
csv_embedding = pd.read_csv("data/uncleaned_data/csv_BiasData_UnmodifiedEmbedding_paraphrase-MiniLM-L6-v2.csv")
JerryWeiData = pd.read_csv("data/uncleaned_data/JerryWeiAIData/train_orig.csv")
merged_embedding = pd.read_csv("data/cleaned_data/csv_MergedDataset_paraphrase-MiniLM-L6-v2.csv")

In [7]:
# Put the labels next to the embedding columns. pd.concat(axis=1) aligns the two frames
# on their row index, so both must have been loaded in the same row order.
# The raw 'text' column and the stray 'Unnamed: 0' column are dropped.
merged_dataset = (
    pd.concat([JerryWeiData, csv_embedding], axis=1)
    .drop(columns=["text", "Unnamed: 0"])
)
# Write to data/cleaned_data/, which is where the other cells of this notebook read this
# file from (previously it was written to the working directory instead).
merged_dataset.to_csv("data/cleaned_data/csv_MergedDataset_paraphrase-MiniLM-L6-v2.csv")

NameError: name 'JerryWeiData' is not defined

# Reduce Dataset Size to 5000 points per class

In [15]:
# Load the unmodified MiniLM sentence embeddings and the merged CSV that carries the 'label' column.
np_embedding = np.load("data/uncleaned_data/npy_BiasData_UnmodifiedEmbedding_paraphrase-MiniLM-L6-v2.npy")
labels_csv = pd.read_csv("data/cleaned_data/csv_MergedDataset_paraphrase-MiniLM-L6-v2.csv")

In [17]:
# Extract the labels as a NumPy array and display its shape as a sanity check.
labels_array = np.array(labels_csv["label"])
labels_array.shape

(253781,)

Create Reduced Dataset for Smooth Training

In [47]:
# Ensure the embeddings and labels have the same number of samples
assert np_embedding.shape[0] == labels_array.shape[0], "Embeddings and labels must have the same number of samples"

# Define the desired number of samples per class
desired_samples_per_class = 5000

# Get unique classes
unique_classes = np.unique(labels_array)

# Initialize lists to hold the reduced embeddings and labels
reduced_embeddings_list = []
reduced_labels_list = []

# Iterate over each class to sample desired_samples_per_class examples
for cls in unique_classes:
    # Get the indices of all samples for the current class
    cls_indices = np.where(labels_array == cls)[0]
    cls_embeddings = np_embedding[cls_indices]
    cls_labels = labels_array[cls_indices]
    
    # Draw a seeded random subset of this class via StratifiedShuffleSplit.
    # NOTE(review): cls_labels holds a single class here, so stratification has no effect;
    # this is effectively a seeded random subsample -- confirm that is the intent.
    # NOTE(review): presumably requires every class to have more than
    # desired_samples_per_class rows -- TODO confirm.
    sss = StratifiedShuffleSplit(n_splits=1, train_size=desired_samples_per_class, random_state=42)
    train_index, _ = next(sss.split(cls_embeddings, cls_labels))
    
    # Select the reduced embeddings and labels based on the indices
    reduced_embeddings_list.append(cls_embeddings[train_index])
    reduced_labels_list.append(cls_labels[train_index])

# Concatenate the lists to form the final reduced arrays
reduced_embeddings = np.concatenate(reduced_embeddings_list)
reduced_labels = np.concatenate(reduced_labels_list)

# Ensure the reduced data has desired_samples_per_class samples for every class.
# NOTE(review): the bincount check assumes labels are the contiguous integers 0..K-1 -- confirm.
assert len(reduced_labels) == desired_samples_per_class * len(unique_classes)
assert all(np.bincount(reduced_labels) == desired_samples_per_class)

print(f"Reduced embeddings shape: {reduced_embeddings.shape}")
print(f"Reduced labels distribution: {np.bincount(reduced_labels)}")


Reduced embeddings shape: (55000, 384)
Reduced labels distribution: [5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000]


In [48]:
# Persist the per-class subsample (embeddings and matching labels) as .npy files.
np.save("data/cleaned_data/npy_BiasData_5000-per-class-SentenceEmbedding_paraphrase-MiniLM-L6-v2.npy", reduced_embeddings)
np.save("data/cleaned_data/npy_BiasData_5000-per-class-labels_paraphrase-MiniLM-L6-v2.npy", reduced_labels)