In [1]:
# Move to the project folder (where the DOCX file is located) so that relative
# paths used later in the notebook resolve against it.
# NOTE(review): this looks like a Google Drive path inside Colab — presumably
# Drive has to be mounted before this cell runs; confirm.
%cd /content/drive/MyDrive/job_tasks/ds_task

/content/drive/MyDrive/job_tasks/ds_task


# Installing the necessary dependencies

In [3]:
# Pinned client libraries: `openai` (embeddings and completions) and the
# Pinecone gRPC client (vector index).
# NOTE(review): pinecone-datasets is not imported in the visible cells — confirm it is needed.
!pip install -qU \
    openai==0.27.7 \
    "pinecone-client[grpc]"==2.2.1 \
    pinecone-datasets=='0.5.0rc11' \
    tqdm

In [None]:
# python-docx provides the `docx` module used to read the source document.
# NOTE(review): these packages are unpinned, and sentence_transformers / transformers
# are not imported in the visible cells — confirm they are needed.
!pip install sentence_transformers python-docx transformers

# Reading data from the DOCX file and splitting it into chunks

In [7]:
# Function to read text from a DOCX file

from docx import Document

def read_docx(file_path):
    """Return the text of every paragraph in a DOCX file as one string.

    Args:
        file_path: Path passed to ``Document`` to open the file.

    Returns:
        The paragraph texts in document order, each followed by a newline.
    """
    paragraphs = Document(file_path).paragraphs
    return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)

In [8]:
# Function to split long documents into smaller parts
def split_text_into_chunks(plain_text, max_chars=2000):
    """Group the lines of ``plain_text`` into chunks of roughly ``max_chars`` characters.

    Lines are joined with a single space and are never split internally, so a
    single line longer than ``max_chars`` becomes its own (oversized) chunk.
    Chunks that are empty or whitespace-only are never emitted.

    Args:
        plain_text: Text to split; lines are separated by newline characters.
        max_chars: Upper bound used when deciding whether another line still
            fits into the chunk being built.

    Returns:
        A list of non-empty chunk strings with surrounding whitespace removed.
    """
    if not plain_text:
        return []

    text_chunks = []
    current_chunk = ""
    for line in plain_text.split("\n"):
        if len(current_chunk) + len(line) + 1 <= max_chars:
            current_chunk += line + " "
            continue
        # The line does not fit: close the chunk built so far. It can be blank
        # (e.g. when the very first line is already too long), so only keep it
        # when it actually contains text.
        finished_chunk = current_chunk.strip()
        if finished_chunk:
            text_chunks.append(finished_chunk)
        current_chunk = line + " "

    # Flush whatever is left, again skipping blank remainders.
    last_chunk = current_chunk.strip()
    if last_chunk:
        text_chunks.append(last_chunk)
    return text_chunks

In [9]:
# Location of the DOCX file, relative to the current working directory
file_path = 'DataLaw.docx'

# Load the document text, then break it into chunks using the default size limit
doc_text = read_docx(file_path)
chunks = split_text_into_chunks(doc_text)

# Show every chunk, numbered from 1
for chunk_number, chunk_text in enumerate(chunks, 1):
    print(f"Chunk {chunk_number}: {chunk_text}\n")

Chunk 1: Disclaimer: All of the translations contained on this website are unofficial. Only the original Slovene texts of the laws and regulations have legal effect, and the translations are to be used solely as reference materials to aid in the understanding of Slovene laws and regulations. The Government of the Republic of Slovenia is not responsible for the accuracy, reliability or currency of the translations provided on this website, or for any consequence resulting from the use of information on this website. For all purposes of interpreting and applying law to any legal issue or dispute, users should consult the original Slovene texts published in the Official Gazette of the Republic of Slovenia. The unofficial consolidated version of the Agricultural Land Act comprises: -         Agricultural Land Act – ZKZ (Official Gazette of the Republic of Slovenia [Uradni list RS], No. 59/96 of 25 October 1996), -         Decision abrogating the provision of paragraph two of Article 124 of

In [10]:
# Display the full list of chunks as the cell output
chunks

['Disclaimer: All of the translations contained on this website are unofficial. Only the original Slovene texts of the laws and regulations have legal effect, and the translations are to be used solely as reference materials to aid in the understanding of Slovene laws and regulations. The Government of the Republic of Slovenia is not responsible for the accuracy, reliability or currency of the translations provided on this website, or for any consequence resulting from the use of information on this website. For all purposes of interpreting and applying law to any legal issue or dispute, users should consult the original Slovene texts published in the Official Gazette of the Republic of Slovenia. The unofficial consolidated version of the Agricultural Land Act comprises: -         Agricultural Land Act – ZKZ (Official Gazette of the Republic of Slovenia [Uradni list RS], No. 59/96 of 25 October 1996), -         Decision abrogating the provision of paragraph two of Article 124 of the Ag

# Preparing Pinecone index and OpenAI model `text-embedding-ada-002`

In [11]:
import os
import pinecone

from getpass import getpass

# Initialize the connection to Pinecone (get an API key at app.pinecone.io).
# The key is read from the PINECONE_API_KEY environment variable, falling back
# to an interactive prompt, so the secret is never stored in the notebook.
# NOTE: a key that was ever committed in a notebook should be revoked and rotated.
api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# The environment name is shown next to the API key in the Pinecone console
env = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")

pinecone.init(api_key=api_key, environment=env)
pinecone.whoami()

  from tqdm.autonotebook import tqdm


WhoAmIResponse(username=None, user_label=None, projectname='r2dzhjm')

In [12]:
# Name of the Pinecone index that the following cell creates (if missing) and connects to
index_name = 'index1'

In [13]:
# Create the index only when it is not there yet (expected on the very first run)
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        index_name,
        metric='cosine',
        dimension=1536,  # vector size of text-embedding-ada-002
    )

# Open a gRPC connection to the index and show its current statistics
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00116,
 'namespaces': {'': {'vector_count': 116}},
 'total_vector_count': 116}

In [14]:
import openai

import os
from getpass import getpass

# Get an API key from platform.openai.com. It is read from the OPENAI_API_KEY
# environment variable, falling back to an interactive prompt, so the secret is
# never stored in the notebook.
# NOTE: a key that was ever committed in a notebook should be revoked and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Embedding model used for both the stored chunks and the incoming queries
embed_model = "text-embedding-ada-002"

# Defining the function to load the data into the index

In [15]:
def addData(chunks, index, batch_size=100):
    """Embed text chunks with the OpenAI embedding model and upsert them into the index.

    Each chunk is stored with its real embedding, so that similarity search
    against a query embedding can tell chunks apart; storing one constant
    placeholder vector for every chunk makes all chunks equally similar to
    any query. The chunk text is kept in the vector metadata under the
    ``'context'`` key.

    Vector ids continue from the index's current ``total_vector_count``, so
    calling this function again with the same chunks adds them a second time.
    Vectors stored earlier with placeholder embeddings are not replaced and
    should be deleted from the index separately.

    Args:
        chunks: Iterable of raw text strings; blank entries are skipped.
        index: Pinecone index object providing ``describe_index_stats`` and ``upsert``.
        batch_size: Number of chunks embedded and upserted per request.

    Returns:
        None.
    """
    # Blank chunks carry no information worth embedding or retrieving.
    texts = [chunk for chunk in chunks if chunk and chunk.strip()]
    if not texts:
        return

    total_vector_count = index.describe_index_stats()['total_vector_count']

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        res = openai.Embedding.create(input=batch, engine=embed_model)
        # Order the results by their 'index' field so every embedding lines up
        # with the chunk it was computed from.
        ordered = sorted(res['data'], key=lambda item: item['index'])
        vectors = [
            (
                str(total_vector_count + start + offset),
                item['embedding'],
                {'context': text},
            )
            for offset, (text, item) in enumerate(zip(batch, ordered))
        ]
        # One upsert per batch instead of one network round trip per chunk.
        index.upsert(vectors=vectors)

In [16]:
# Upload every chunk to the Pinecone index
addData(chunks, index)

# Defining the retrieval, prompt-building, and answer-generation functions

In [17]:
def find_match(query):
    """Return the stored text of the top three index matches for ``query``.

    The query is embedded with ``embed_model`` and used to search the
    module-level ``index``; each match's ``'context'`` metadata is returned.

    Args:
        query: Question text to embed and search for.

    Returns:
        A list with the ``'context'`` metadata string of every match.
    """
    embedding_response = openai.Embedding.create(input=[query], engine=embed_model)
    query_vector = embedding_response['data'][0]['embedding']
    search_result = index.query(query_vector, top_k=3, include_metadata=True)
    return [match['metadata']['context'] for match in search_result['matches']]

In [18]:
def create_prompt(contexts, query, limit=3750):
    """Build a completion prompt from retrieved contexts and the user's question.

    Contexts are added in the given order for as long as their joined length
    (separators included) stays below ``limit`` characters; the first context
    that would reach the limit and everything after it are left out. An empty
    ``contexts`` list, or a first context that is already too long, yields a
    prompt with an empty context section instead of raising.

    Args:
        contexts: Context strings, most relevant first.
        query: The user's question.
        limit: Character budget for the joined contexts.

    Returns:
        The full prompt string.
    """
    separator = "\n\n---\n\n"
    prompt_start = "Answer the question based on the context below.\n\nContext:\n"
    prompt_end = f"\n\nQuestion: {query}\nAnswer:"

    selected = []
    joined_length = 0
    for context in contexts:
        # Length the joined text would have with this context appended.
        candidate_length = joined_length + len(context)
        if selected:
            candidate_length += len(separator)
        if candidate_length >= limit:
            break
        selected.append(context)
        joined_length = candidate_length

    return prompt_start + separator.join(selected) + prompt_end

In [19]:
def generate_answer(prompt, model='gpt-3.5-turbo-instruct', max_tokens=400):
    """Generate an answer for ``prompt`` with an OpenAI completion model.

    Args:
        prompt: Full prompt text, including context and question.
        model: Completion model to call. The default replaces
            ``text-davinci-003``, which OpenAI has retired; pass another
            completions-capable model name to override it.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The text of the first completion choice with surrounding whitespace removed.
    """
    res = openai.Completion.create(
        # `model=` targets the /v1/completions endpoint rather than the
        # legacy per-engine route.
        model=model,
        prompt=prompt,
        temperature=0,  # deterministic output for question answering
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return res['choices'][0]['text'].strip()

In [24]:
def user_query(query):
    """Answer ``query`` end to end: retrieve contexts, build the prompt, generate.

    The retrieved contexts are printed after the answer has been generated.

    Args:
        query: The user's question.

    Returns:
        The generated answer text.
    """
    contexts = find_match(query)
    answer = generate_answer(create_prompt(contexts, query))
    print(contexts)
    return answer

# Example

In [26]:
# Example usage: ask one question and show the generated answer
query = "can i build a monument in an agricultural land"
result = user_query(query)
print("Generated Answer:", result, sep="\n")

['This Act regulates the protection and management of agricultural land by laying down its classification, use and cultivation, agricultural land transactions and lease arrangements, agricultural operations and common pasture. The provisions of this Act shall also apply mutatis mutandis to forests unless otherwise provided by an Act. Article 1a The goals of this Act shall be the following: -         to preserve and improve production potential and increase agricultural land area intended for food production; -         to foster the sustainable management of fertile soil; -         to foster landscape preservation and preserve and develop rural areas. Article 1b In order to achieve the goals referred to in the preceding Article, the Government of the Republic of Slovenia (hereinafter: Government) shall adopt agricultural land policy measures. The subject of agricultural land policy measures shall be agricultural land as referred to in paragraph two of Article 2 of his Act. The funds for

# Suggestions for improvement
1. ***Fine-tuning Semantic Search Models:***

    Fine-tune the text-embedding-ada-002 model on domain-specific data for better contextual understanding and improved matching.

2. ***Optimizing Query Embeddings:***

    Explore different methods for generating query embeddings without tokenizers to enhance the relevance of retrieved contexts.

3. ***Handling Large Text Chunks:***

    Implement chunking or summarization techniques for large text inputs to improve the matching process.

4. ***Efficiency in Prompt Generation:***

    Optimize the prompt creation process to ensure all relevant contexts are included within the specified limit without losing crucial information.

5. ***Combining Multiple Models:***

    Experiment with the integration of multiple models to enhance the quality and diversity of generated answers.

# Alternative approach

1. ***Preprocessing:***

- Text Cleaning: Remove noise, formatting, and irrelevant information from the input texts.
- Entity Extraction: Identify and extract relevant entities or keywords from both queries and contexts.

2. ***Semantic Matching:***

- Use a dedicated semantic matching model (BERT, RoBERTa) to perform similarity checks between the query and context embeddings.

3. ***Result Fusion:***

- Combine outputs from different models (e.g., text-embedding-ada-002, BERT) to create a diverse set of matched contexts.
- Implement a fusion mechanism to aggregate results from multiple models and prioritize relevant contexts.

4. ***Answer Generation:***

- Utilize a diverse set of language models (e.g., GPT-3, T5) to generate answers based on the fused contexts.

5. ***Feedback Mechanism:***

- Implement a feedback loop to refine the system based on user interactions or feedback.

This alternative approach aims to leverage preprocessing for better data refinement, diverse model integration for improved matching, and fusion techniques to enhance the quality of generated answers.