In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import json
import pandas as pd
import json
from tqdm import tqdm

In [2]:

# Load the merged JSON dump that holds the documents to embed.
path_to_json = '/kaggle/input/financial-times/merged_output.json'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default encoding.
with open(path_to_json, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the two fields used downstream: the document id (DOCNO) and the
# document body (TEXT). A record missing either key raises KeyError.
extracted_data = [
    {"DOCNO": entry["DOCNO"], "TEXT": entry["TEXT"]}
    for entry in data
]

In [7]:


# Sentence-Transformers checkpoints to embed the documents with. Each name is
# prefixed with `sentence-transformers/` when the model is loaded below.
models = [
    'multi-qa-distilbert-dot-v1',
    'multi-qa-MiniLM-L6-dot-v1',
    'multi-qa-mpnet-base-cos-v1',
    'all-mpnet-base-v2',
    'all-distilroberta-v1',
    'all-MiniLM-L12-v2',
    'all-MiniLM-L6-v2',
    'multi-qa-distilbert-cos-v1',
    'multi-qa-MiniLM-L6-cos-v1',
    'multi-qa-mpnet-base-dot-v1',
    'distiluse-base-multilingual-cased-v1',
    'distiluse-base-multilingual-cased-v2',
]

# Split the records into two parallel lists: the texts to encode and their
# document ids, so that doc_ids[i] labels the embedding of sentences[i].
sentences = [entry["TEXT"] for entry in extracted_data]
doc_ids = [entry["DOCNO"] for entry in extracted_data]

# Encode all texts once per model and write one CSV file per model.
for model_name in tqdm(models, desc="Processing models"):
    # Load the checkpoint for this iteration.
    model = SentenceTransformer(f'sentence-transformers/{model_name}')

    # Encode every text, showing a nested progress bar for this model.
    embeddings = model.encode(sentences, show_progress_bar=True)

    # Tabulate the embeddings and put DOCNO in front as the first column.
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df.insert(0, 'DOCNO', doc_ids)

    # The file is named after the model and written relative to the current
    # working directory; the DataFrame index is not written.
    output_file = f'{model_name}_FT_embeddings.csv'
    embeddings_df.to_csv(output_file, index=False)

    # tqdm.write emits the message without disturbing the active outer
    # progress bar, which a bare print() inside a tqdm loop can garble.
    tqdm.write(f"Saved embeddings for {model_name}")
