In [None]:
!pip install faiss-gpu
!pip install transformers torch faiss
!pip install blobfile tiktoken

In [None]:
import json
import os

import faiss
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Load the model

In [None]:
# Run on CUDA when PyTorch can see a GPU, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

In [None]:
# Load the tokenizer and model, either from the Hugging Face Hub or from a Kaggle input directory
#model_path = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"  # Replace with your actual dataset name and model directory
model_path = 'meta-llama/Llama-3.2-1b-instruct'

# Read the Hub access token from the environment instead of hard-coding it in
# the notebook (set HF_TOKEN before starting the kernel). When the variable is
# unset, None is passed and the library falls back to its own stored credentials.
access_token = os.environ.get("HF_TOKEN") or None

# `token=` is the current name of the deprecated `use_auth_token=` argument.
# `torch_dtype` is a model option, so it is only passed to the model loader.
tokenizer = AutoTokenizer.from_pretrained(model_path, token=access_token)
# Reuse EOS as the padding token so that `padding=True` works when tokenizing
# (presumably the checkpoint defines no pad token of its own -- TODO confirm)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModel.from_pretrained(model_path, torch_dtype=torch.float16, token=access_token)
model.to(device)  # Move the model to the selected device (GPU when available)

In [None]:
import json

path_to_json = '/kaggle/input/financial-times/merged_output.json'
# Load the JSON corpus. The encoding is pinned to UTF-8 (the JSON standard
# encoding) so decoding does not depend on the platform's locale default.
with open(path_to_json, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the document id (DOCNO) and body (TEXT) of every record;
# a record missing either key raises KeyError.
extracted_data = [
    {
        "DOCNO": entry["DOCNO"],
        "TEXT": entry["TEXT"]
    }
    for entry in data
]

## Generate embeddings

In [None]:
# Encoding approach adapted from: https://huggingface.co/castorini/repllama-v1-7b-lora-doc
def encode_documents(document):
    """Embed one document, or a batch of documents, as L2-normalised dense vectors.

    Each document is represented by the final-layer hidden state of its last
    non-padding token. Uses the notebook-level ``tokenizer``, ``model`` and
    ``device``.

    Args:
        document: a single text (``str``) or a list of texts.

    Returns:
        numpy.ndarray: a 1-D vector for a single string (unchanged behaviour),
        or a 2-D array with one row per document when a list is given.
    """
    is_single = isinstance(document, str)
    inputs = tokenizer(document, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        # Hidden states for every token of every document in the batch
        hidden = model(**inputs).last_hidden_state

        # Locate the last real (non-pad) token of each row. Scanning the
        # reversed attention mask for its first 1 works for both left- and
        # right-padded batches; with no padding it is simply the last position.
        mask = inputs["attention_mask"].long()
        last_idx = mask.shape[1] - 1 - mask.flip(dims=[1]).argmax(dim=1)
        rows = torch.arange(hidden.shape[0], device=hidden.device)
        embeddings = hidden[rows, last_idx]

        # Unit-normalise every embedding vector
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)

    torch.cuda.empty_cache()  # Release cached GPU blocks between calls
    result = embeddings.cpu().numpy()
    # A bare string keeps the original contract: a single 1-D vector
    return result[0] if is_single else result


# Extract texts and document IDs for encoding
texts = [entry["TEXT"] for entry in extracted_data]
doc_ids = [entry["DOCNO"] for entry in extracted_data]


# Embeddings are written to the CSV one document at a time, so the full
# embedding matrix never has to be held in memory.
output_file_path = './full_llama_document_embeddings.csv'

# Open the file once in write mode: re-running this cell starts a fresh file
# instead of appending a second header and a duplicate copy of every row to
# the output of a previous run.
with open(output_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    for i, (doc_id, text) in enumerate(tqdm(zip(doc_ids, texts), total=len(texts))):
        # Encode the current document
        embedding = encode_documents(text)

        # One-row frame: the index is the DOCNO, the columns are the embedding dimensions
        df_row = pd.DataFrame(embedding.reshape(1, -1), index=[doc_id])

        # Emit the header line only before the first row
        df_row.to_csv(csv_file, header=(i == 0))

print(f"Document embeddings have been saved to {output_file_path}.")