In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, models

  from .autonotebook import tqdm as notebook_tqdm
2024-06-11 19:58:13.236374: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-11 19:58:13.982898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/regy/miniconda3/envs/tf/lib/
2024-06-11 19:58:13.983018: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/regy/miniconda3/envs/tf/lib/


In [2]:
# Read the book-metadata CSV and take the free-text column as a plain Python list
df = pd.read_csv('data_book_metadata.csv')
sentences = df['Metadata'].tolist()

# Tokenizer for the same checkpoint name that is passed to the tokenization call below
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Untruncated token counts (special tokens included), used only for the statistics
# printed below. Nothing is truncated here, so over-long texts are counted in full.
token_lengths = list(
    map(len, (tokenizer.encode(text, add_special_tokens=True) for text in sentences))
)

# Summary statistics over the untruncated lengths
average_token_length = np.mean(token_lengths)
max_token_length = np.max(token_lengths)

print(f'Average token length: {average_token_length}')
print(f'Maximum token length: {max_token_length}')

# Batch-tokenize for training: cap every sequence at 512 tokens and pad the batch
# to a common length so the result can be returned as PyTorch tensors.
inputs = tokenizer(
    sentences,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Keep the two tensors the later cells consume under their own names
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']

Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors


Average token length: 198.1050326073582
Maximum token length: 2061


In [3]:
# Define a custom dataset
class TextDataset(Dataset):
    """Map-style dataset that serves row-aligned token ids, masks and targets.

    Args:
        input_ids: Indexable, sized container of token-id rows.
        attention_mask: Indexable, sized container of mask rows, one per
            ``input_ids`` row.
        targets: Indexable, sized container of regression targets, one per
            ``input_ids`` row.

    Raises:
        ValueError: If the three containers do not all have the same length.
    """

    def __init__(self, input_ids, attention_mask, targets):
        # The dataset length is taken from input_ids, so a shorter or longer
        # companion container would either fail mid-epoch or silently pair
        # rows with the wrong mask/target. Reject that up front.
        n_ids, n_mask, n_targets = len(input_ids), len(attention_mask), len(targets)
        if not (n_ids == n_mask == n_targets):
            raise ValueError(
                "input_ids, attention_mask and targets must have the same length, "
                f"got {n_ids}, {n_mask} and {n_targets}"
            )
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.targets = targets

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict with the keys
        ``'input_ids'``, ``'attention_mask'`` and ``'targets'``."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'targets': self.targets[idx]
        }

In [4]:
# Generate random target embeddings for demonstration purposes (replace with actual targets).
# A seeded generator makes the placeholder targets identical after a kernel restart.
target_rng = np.random.default_rng(42)
target_embeddings = target_rng.random((len(sentences), 256), dtype=np.float32)

# Split ids, masks and targets in ONE call so all three are indexed by the same
# permutation. Splitting the masks in a separate call only stays aligned as long
# as both calls happen to share test_size and random_state.
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_targets, val_targets) = train_test_split(
    input_ids, attention_mask, target_embeddings,
    test_size=0.1, random_state=42,
)

# Cheap invariants: each split must keep ids, masks and targets row-aligned
assert len(train_inputs) == len(train_masks) == len(train_targets)
assert len(val_inputs) == len(val_masks) == len(val_targets)

# Create PyTorch datasets
train_dataset = TextDataset(train_inputs, train_masks, train_targets)
val_dataset = TextDataset(val_inputs, val_masks, val_targets)

# Create DataLoaders. The training shuffle gets its own seeded generator so the
# batch order is reproducible; validation is never shuffled.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [5]:
# Fetch the pre-trained encoder; it uses the same checkpoint name as the tokenizer
model = AutoModel.from_pretrained(
    'sentence-transformers/all-MiniLM-L6-v2',
)

# Define the fine-tuning model
class FineTuningModel(nn.Module):
    """Encoder followed by a linear projection of the first-token hidden state.

    Args:
        base_model: Module called as ``base_model(input_ids=..., attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        output_dim: Size of the projected embedding. Defaults to 256, the value
            that was previously hard-coded.

    The projection's input width is read from ``base_model.config.hidden_size``
    when that attribute exists, so a different encoder can be swapped in without
    editing this class. When it does not exist the width falls back to 384.
    """

    def __init__(self, base_model, output_dim=256):
        super(FineTuningModel, self).__init__()
        self.base_model = base_model
        # 384 is the hidden size of all-MiniLM-L6-v2; used only as a fallback
        # for base models that do not carry a `config.hidden_size`.
        base_config = getattr(base_model, 'config', None)
        hidden_size = getattr(base_config, 'hidden_size', 384)
        self.dense = nn.Linear(hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        """Return the projected embedding for each sequence in the batch.

        Only position 0 of ``last_hidden_state`` is used; ``attention_mask`` is
        forwarded to the base model and not otherwise consulted here.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE(review): this pools by taking the first token only. Confirm that
        # is intended for this checkpoint rather than a mask-weighted mean.
        cls_token = outputs.last_hidden_state[:, 0, :]
        return self.dense(cls_token)

# Wrap the pre-trained encoder in the projection head defined above
fine_tuned_model = FineTuningModel(model)

# SGD with momentum over every parameter of the wrapped model (encoder and head)
optimizer = optim.SGD(
    fine_tuned_model.parameters(),
    lr=1e-4,
    momentum=0.9,
)

# Mean-squared error between projected embeddings and target embeddings
criterion = nn.MSELoss()

In [6]:
# Training function
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable ``criterion(outputs, targets)``. The weighting
            below assumes it returns a per-batch mean, as ``nn.MSELoss()`` does
            by default.
        device: Device every batch tensor is moved to.

    Returns:
        float: Loss averaged over samples rather than over batches, so a
        shorter final batch is not over-weighted. ``nan`` if the dataloader
        yields no samples (previously this raised ``ZeroDivisionError``).
    """
    model.train()
    loss_sum = 0.0
    sample_count = 0
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size to recover the per-sample mean
        batch_size = targets.size(0)
        loss_sum += loss.item() * batch_size
        sample_count += batch_size

    if sample_count == 0:
        return float('nan')
    return loss_sum / sample_count

In [7]:
# Evaluation function
def evaluate_epoch(model, dataloader, criterion, device):
    """Compute the mean loss over ``dataloader`` without updating the model.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'``.
        criterion: Loss callable ``criterion(outputs, targets)``. The weighting
            below assumes it returns a per-batch mean, as ``nn.MSELoss()`` does
            by default.
        device: Device every batch tensor is moved to.

    Returns:
        float: Loss averaged over samples rather than over batches, so a
        shorter final batch is not over-weighted. ``nan`` if the dataloader
        yields no samples (previously this raised ``ZeroDivisionError``).
    """
    model.eval()
    loss_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)

            # Weight each batch mean by its size to recover the per-sample mean
            batch_size = targets.size(0)
            loss_sum += loss.item() * batch_size
            sample_count += batch_size

    if sample_count == 0:
        return float('nan')
    return loss_sum / sample_count

In [8]:
# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
fine_tuned_model.to(device)

epochs = 2
best_val_loss = float('inf')
# NOTE: the counter grows by at most one per epoch, so with epochs=2 a patience
# of 5 can never be reached. Raise `epochs` (or lower the patience) for early
# stopping to have any effect.
early_stopping_patience = 5
early_stopping_counter = 0
best_checkpoint_path = 'fine_tuned_all_mini_lm_l6_v2.pt'
best_checkpoint_saved = False  # guards against loading a stale file from an earlier run

for epoch in range(epochs):
    train_loss = train_epoch(fine_tuned_model, train_dataloader, optimizer, criterion, device)
    val_loss = evaluate_epoch(fine_tuned_model, val_dataloader, criterion, device)
    print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}')

    if val_loss < best_val_loss:
        # New best validation loss: reset the patience counter and checkpoint
        best_val_loss = val_loss
        early_stopping_counter = 0
        torch.save(fine_tuned_model.state_dict(), best_checkpoint_path)
        best_checkpoint_saved = True
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_patience:
        print("Early stopping")
        break

# Leave the in-memory model at its best validation epoch. Without this, the
# model keeps the last-epoch weights, which may be worse than the checkpoint.
if best_checkpoint_saved:
    fine_tuned_model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))

Training: 100%|██████████| 9143/9143 [16:05<00:00,  9.47it/s]
Evaluating: 100%|██████████| 1016/1016 [00:34<00:00, 29.58it/s]


Epoch 1/2, Train Loss: 0.10871803536758529, Val Loss: 0.08430421510784644


Training: 100%|██████████| 9143/9143 [16:02<00:00,  9.50it/s]
Evaluating: 100%|██████████| 1016/1016 [00:34<00:00, 29.37it/s]


Epoch 2/2, Train Loss: 0.08619009844455992, Val Loss: 0.08381352628632559


In [9]:
# Persist only the weights (state dict) of the fine-tuned model
torch.save(
    obj=fine_tuned_model.state_dict(),
    f='fine_tuned_embedding.pt',
)

In [10]:
# Serialise the whole module object, not just its weights.
# NOTE(review): this pickles the module itself, so loading the file needs the
# FineTuningModel class to be defined in the loading process. Only load such
# files from sources you trust, and prefer the state-dict file for sharing.
torch.save(
    obj=fine_tuned_model,
    f='fine_tuned_embedding',
)

In [11]:
import json

# Function to convert state dictionary to a JSON-compatible format
def state_dict_to_json_compatible(state_dict):
    """Return a new dict mirroring ``state_dict`` with tensors turned into lists.

    Every value that is a ``torch.Tensor`` is replaced by its ``.tolist()``
    result; any other value is copied across unchanged. Key order follows the
    input mapping.
    """
    return {
        name: entry.tolist() if isinstance(entry, torch.Tensor) else entry
        for name, entry in state_dict.items()
    }

# Function to save the model's state dictionary to a JSON file
def save_model_to_json(model, file_path):
    """Dump ``model.state_dict()`` to ``file_path`` as JSON.

    The state dict is first passed through ``state_dict_to_json_compatible``
    and the result is written with ``json.dump``. The file is opened in text
    write mode, so an existing file at ``file_path`` is overwritten.
    """
    # Convert before opening the file so a conversion failure leaves no file behind
    serialisable_weights = state_dict_to_json_compatible(model.state_dict())

    with open(file_path, 'w') as handle:
        json.dump(serialisable_weights, handle)

# Export the fine-tuned model's weights as a JSON file
save_model_to_json(
    model=fine_tuned_model,
    file_path='fine_tune_embedding.json',
)