In [1]:
import openai
import os
from openai import OpenAI
from dotenv import load_dotenv
import json
import numpy as np
import faiss

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Configure the module-level OpenAI key from the environment.
# The value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY')
# Function to generate embeddings using the OpenAI API
def _embedding_terms(value):
    """Normalise a metadata field to a list of string terms.

    ``None`` yields an empty list, a bare string is kept as a single term
    (joining a string directly would split it into individual characters),
    and any other iterable is converted element by element with ``str``.
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    return [str(item) for item in value]


def get_embedding(text, tags, type, model="text-embedding-3-large"):
    """Embed an article's text together with its tags and type.

    Args:
        text: Article body; newlines are replaced by spaces before embedding.
        tags: A string, an iterable of terms, or None.
        type: A string, an iterable of terms, or None. (The name shadows the
            builtin but is kept so existing keyword callers keep working.)
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the API response
        (presumably a list of floats - confirm against the client version).
    """
    text = text.replace("\n", " ")
    # Join all parts with single spaces. Plain concatenation fused the last
    # word of the text with the first tag and the last tag with the type.
    parts = [text, *_embedding_terms(tags), *_embedding_terms(type)]
    combine = " ".join(part for part in parts if part)
    return openai.embeddings.create(input=[combine], model=model).data[0].embedding

# Read a UTF-8 encoded JSON file and hand back the decoded object
def load_json(file_path):
    """Return the parsed contents of the JSON file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# Serialise an object as pretty-printed JSON into a UTF-8 file
def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as JSON, replacing any existing file.

    Output is indented by four spaces and non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=4)

# Source articles and the destination for the copy that carries embeddings
file_path = 'APP/data/KAR/KAR.json'
file_path_new = 'APP/data/KAR/KAR_embedded.json'
data = load_json(file_path)

# Each article is expected to provide 'text' plus 'tags' and 'type' under
# 'metadata'; the embedding is stored on the article itself.
for article in data:
    article_text = article['text']
    article_metadata = article['metadata']
    article_tags, article_type = article_metadata['tags'], article_metadata['type']
    article['embedding'] = get_embedding(article_text, article_tags, article_type)

# Persist the enriched articles to the new file; the input file is left untouched
save_json(data, file_path_new)

# Build the FAISS index from the embeddings stored on the articles.
# This section used to appear twice verbatim, which rebuilt the same index and
# rewrote the same file a second time; it now runs once.

# Stack the per-article embeddings into one float32 matrix (one row per article)
embeddings = np.array([article['embedding'] for article in data], dtype='float32')

# The index dimension is read from shape[1], so fail with a clear message when
# there is nothing to index instead of an opaque IndexError.
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        "No embeddings to index: expected a non-empty 2-D array, "
        f"got shape {embeddings.shape}"
    )

# Create a base index - L2 distance
base_index = faiss.IndexFlatL2(embeddings.shape[1])

# Wrap it in an IndexIDMap so vectors can be stored under explicit IDs
index = faiss.IndexIDMap(base_index)

# IDs for the articles: their position in `data`, as int64
ids = np.arange(len(data), dtype='int64')

# Add vectors and their IDs to the index
index.add_with_ids(embeddings, ids)

# Save the index to disk
faiss.write_index(index, "APP/data/KAR/KAR.index")


In [15]:
import json

def append_attribute_to_all_entries(file_path, new_attribute_name, new_attribute_value):
    """Set one attribute in the ``metadata`` dict of every entry of a JSON file.

    The file must contain a JSON list whose entries are objects with a
    ``metadata`` object. If the top-level value is not a list, or any entry
    does not satisfy that shape, an error is printed and the file is left
    unmodified. On success the file is rewritten in place and a confirmation
    message is printed.

    Args:
        file_path: Path of the JSON file to read and overwrite.
        new_attribute_name: Key to set inside each entry's ``metadata``;
            an existing key of the same name is overwritten.
        new_attribute_value: Value stored under that key.

    Returns:
        None in every case; the outcome is reported via ``print``.
    """
    # Read the existing JSON data from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    if not isinstance(json_data, list):
        print("Error: JSON data is not a list of entries.")
        return

    for entry in json_data:
        # Check the entry type first: a string or number entry would otherwise
        # raise TypeError on the membership test or on entry['metadata']
        # instead of reaching the error report below.
        metadata = entry.get('metadata') if isinstance(entry, dict) else None
        if not isinstance(metadata, dict):
            print(f"Error: 'metadata' key not found or is not a dictionary in entry {entry}.")
            return
        metadata[new_attribute_name] = new_attribute_value

    # Only reached when every entry was valid, so the file is never
    # written in a half-updated state.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(json_data, file, indent=4, ensure_ascii=False)

    print(f"Appended attribute '{new_attribute_name}' with value '{new_attribute_value}' to the 'metadata' in all entries of the JSON file.")

# Example usage: set metadata['Gesetzestext'] = 'PReg' on every entry of the file
file_path = 'Pausenreglement.json'  # Path to your JSON file
new_attribute_name, new_attribute_value = 'Gesetzestext', 'PReg'

append_attribute_to_all_entries(
    file_path=file_path,
    new_attribute_name=new_attribute_name,
    new_attribute_value=new_attribute_value,
)


Appended attribute 'Gesetzestext' with value 'PReg' to the 'metadata' in all entries of the JSON file.
