In [1]:
# --- Imports ------------------------------------------------------------------
# Standard library
import os
import os.path

# Third-party
import openai
from dotenv import load_dotenv

# NOTE(review): these are the pre-0.10 ("legacy") llama_index import paths. The
# ImportError recorded below this cell ("cannot import name 'ServiceContext'
# from 'llama_index' (unknown location)") looks like what a newer llama-index
# release produces after its package layout changed -- TODO confirm the
# installed version and either pin a legacy release or migrate these imports.
from llama_index import (
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
# Fixed: this name was misspelled as `SentenceSplitterpip` (a stray "pip" was
# appended), while the index-building cell instantiates `SentenceSplitter`.
from llama_index.text_splitter import SentenceSplitter

# --- Configuration ------------------------------------------------------------
# load_dotenv() copies the entries of a local .env file into the process
# environment, so the key never has to be hard-coded in the notebook.
load_dotenv()

# os.getenv returns None when the variable is not set. Nothing in the visible
# cells passes this value on explicitly -- presumably the OpenAI client reads
# OPENAI_API_KEY from the environment itself; verify before relying on it.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

ImportError: cannot import name 'ServiceContext' from 'llama_index' (unknown location)

In [9]:
# Directory the vector index is persisted to between runs. It is used for the
# existence check, for persist() and for the reload, so the three can no longer
# drift apart (the original relied on persist()'s implicit default location
# happening to match the hard-coded "./storage" string).
PERSIST_DIR = "./storage"

# Smaller chunks might give more precise hits, larger chunks carry more context.
# (Per the original author's note, chunk_size=1024 / chunk_overlap=20 are also
# the library defaults.)
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

# ServiceContext bundles the customisation options used by the index.
service_context = ServiceContext.from_defaults(text_splitter=text_splitter)


def filename_fn(filename):
    """Return the metadata dict stored with a loaded file: its file name."""
    return {"file_name": filename}


if not os.path.exists(PERSIST_DIR):
    # First run: read the documents, tagging each with its file name as metadata.
    documents = SimpleDirectoryReader(
        "data",
        file_metadata=filename_fn,
        filename_as_id=True,  # path to doc as doc_id
    ).load_data()

    # Build the index with the customised service context ...
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)

    # ... and store it so later runs can skip the (expensive) embedding step.
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # Later runs: reload the persisted index. The same service_context is passed
    # again so the reloaded index is configured like the freshly built one; the
    # original omitted it, so a reloaded index fell back to default settings.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context, service_context=service_context)

In [8]:
import logging
import sys
# Send log records to stdout so they appear in the notebook output.
# Use level=logging.DEBUG for verbose output, or level=logging.INFO for less.
#
# basicConfig(stream=...) already installs one stdout handler on the root
# logger. The addHandler(StreamHandler(sys.stdout)) call that used to follow
# attached a second one, so every record was printed twice (and once more for
# each re-run of the cell). force=True (Python 3.8+) replaces any handlers
# already on the root logger, which keeps this cell idempotent.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [7]:
# Query-engine settings; per the original note, similarity_top_k is the knob
# for getting more context into the answer.
engine_options = {
    "similarity_top_k": 5,
    "response_mode": "tree_summarize",
    "streaming": True,
}
query_engine = index.as_query_engine(**engine_options)

# Single-shot Q&A. (The original cell also kept a commented-out
# `query_engine.chat(...)` call with the same question as a chat-bot variant.)
response = query_engine.query("How is Israel breaking international law?")
print(response)

Israel is breaking international law by building settlements in the West Bank, including in East Jerusalem. This violates the Hague Regulations, which prohibit the confiscation of private property, and the Fourth Geneva Convention, which prohibits the destruction of private or state property except where necessary for military operations. The settlements also involve the unlawful appropriation of property by an occupying power, which is considered "pillage" and is prohibited by international law. Additionally, transferring the occupying power's civilians into the occupied territory is prohibited. These actions constitute war crimes under the Rome Statute of the International Criminal Court. The expansion of settlements has also resulted in the reduction of land available to Palestinians for herding and agriculture, leading to increased dependency on humanitarian assistance. Settler violence and the destruction of Palestinian-owned crops and olive trees have further harmed the livelihoo

In [11]:
# Text file the generated answer is saved to (relative to the working directory).
file_path = "israel int law violations.txt"

# Open in write mode with an explicit encoding. Without it Python uses the
# platform's default encoding, which can raise UnicodeEncodeError on non-ASCII
# characters (e.g. typographic quotes) in the model's answer.
with open(file_path, 'w', encoding="utf-8") as file:
    # str() yields the answer text of the response object.
    file.write(str(response))

## Cleaning Texts with regex

In [20]:
import re

def clean_text(text):
    """Strip bracketed references and unwrap hard line breaks.

    Two passes are applied, in this order:

    1. Every ``[...]`` span that sits on a single line (footnote markers,
       references, etc.) is removed. The surrounding whitespace is left as is.
    2. Every *isolated* newline -- one that is neither preceded nor followed
       by another newline -- is replaced with a space. Runs of two or more
       newlines (paragraph breaks) are kept untouched.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Cleaning already-cleaned text returns it unchanged.
    """
    # Remove text inside square brackets (footnotes, references, etc.)
    text = re.sub(r'\[.*?\]', '', text)

    # Replace isolated line breaks with a space. The look-behind matters: with
    # only the look-ahead, the *second* newline of a "\n\n" paragraph break also
    # matched (it is followed by text, not by a newline), so paragraph breaks
    # became "\n " and a second pass merged the paragraphs completely.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # Optional rules for download boilerplate lines, currently disabled:
    # text = re.sub(r'^This content downloaded from .*$', '', text, flags=re.MULTILINE)
    # text = re.sub(r'^All use subject to .*$', '', text, flags=re.MULTILINE)

    return text

# Directory path containing files to be cleaned
directory_path = './documents/'

# NOTE: files are cleaned in place, so the raw text is not kept anywhere.
# Iterate through all files in the directory
for filename in os.listdir(directory_path):
    # Process only .txt files (change as needed)
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(directory_path, filename)

    # Read with an explicit encoding instead of the platform default, so the
    # result does not depend on the machine the notebook runs on.
    # NOTE(review): assumes the documents are UTF-8 -- confirm for this corpus.
    with open(file_path, 'r', encoding='utf-8') as file:
        text_from_file = file.read()

    # Clean the text using the function
    cleaned_text = clean_text(text_from_file)

    # Only rewrite files whose content actually changed; re-running the cell
    # then leaves already-clean files untouched.
    if cleaned_text != text_from_file:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)

In [None]:
# Get Metadata from texts

In [None]:
from llama_index import SimpleDirectoryReader

def extract_metadata(file_path: str):
    """Parse the ``key: value`` header at the top of a document file.

    The header is expected at the very beginning of the file, one pair per
    line, and is terminated by a line consisting of ``Begin!``, for example::

        author: John Doe
        date: 2022-01-01
        source: Newspaper
        weblink: www.newspaper.com
        Begin!

    Only the first ``': '`` on a line separates key from value, so a value may
    itself contain ``': '``. Blank header lines are skipped. Reading stops at
    the ``Begin!`` line, so the document body is never parsed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A dict mapping each header key to its value (both strings).

    Raises:
        ValueError: If a non-blank line before ``Begin!`` has no ``': '``
            separator.
    """
    metadata = {}
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the documents are UTF-8 -- confirm for this corpus.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate lazily: only the header has to be read, not the whole file.
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if line == 'Begin!':
                break
            if not line:
                continue
            # partition() splits on the first separator only; the previous
            # split(': ') broke on values that themselves contained ': '.
            key, separator, value = line.partition(': ')
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'key: value' "
                    f"before 'Begin!', got {line!r}"
                )
            metadata[key] = value

    return metadata

# Reader over ./documents, with extract_metadata passed as its file_metadata hook.
reader = SimpleDirectoryReader(
    input_dir="./documents",
    file_metadata=extract_metadata,
)

# Read every document the reader finds.
documents = reader.load_data()