In [5]:
import re

# Read the whole corpus into memory as one string.
# The encoding is given explicitly so decoding does not depend on the platform's
# locale default. NOTE(review): assumes law.txt is UTF-8 encoded — confirm.
with open('../data/law.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Cleanup rules applied in order; each entry is (regex, replacement, re flags or None).
patterns = [
    # A line made up only of digits: keep the number, turn the surrounding
    # newlines into spaces
    (r'\n([0-9]+)\n', r' \1 ', None),

    # Every remaining newline becomes a single space
    (r'\n', ' ', None),

    # Replace apostrophes that do not have a word character on both sides with a
    # space; apostrophes inside words (like in "they're") are kept
    (r"(?<!\w)'|'(?!\w)", ' ', None),

    # Drop "(see chapter n)" references, matched case-insensitively
    (r'\(see chapter [0-9]+\)', '', re.IGNORECASE),

    # Drop "(Chapter n)" references, matched case-insensitively
    (r'\(Chapter [0-9]+\)', '', re.IGNORECASE),

    # Drop the literal "(Law 101)" (case-sensitive)
    (r'\(Law 101\)', '', None),
]

for regex, substitute, flag_bits in patterns:
    # A None entry means "no flags", which the re module expresses as 0
    text = re.compile(regex, flag_bits or 0).sub(substitute, text)

# Preview only the first 1000 characters of the cleaned text
print(text[:1000])

LegalEaseAI is an advanced web application that leverages artificial intelligence to assist users with various legal tasks. It is designed to simplify complex legal processes, save users time and effort, and provide legal advice, case analysis, summaries, and predictions. LegalEaseAI uses AI-powered algorithms to interpret legal terms and context, automate tasks such as analyzing legal documents and offering insights, and predict case outcomes. It aims to streamline legal procedures, make legal information accessible, and cater to a broad user base including law students, legal professionals, businesses, and individuals. LegalEaseAI is a comprehensive tool that utilizes AI to provide accurate legal assistance and simplify the legal landscape.  Can you introduce yourself?I'm LegalEaseAI, a web-based platform that uses AI to make your legal tasks easier. I provide legal advice, analyze and summarize legal cases, and predict possible outcomes based on the patterns I identify. My mission i

In [6]:
# `text` is a single string at this point (no documents have been built yet), so
# iterating over it yields characters: this displays only its first character.
next(char for char in text)

'L'

In [7]:
from haystack.document_stores import PineconeDocumentStore

import os

# Connect to the Pinecone index used as the document store.
# The API key is read from the PINECONE_API_KEY environment variable instead of
# being committed with the notebook; a KeyError is raised if it is not set.
# The key that used to be hard-coded here should be treated as leaked and revoked.
document_store = PineconeDocumentStore(
    environment='us-west4-gcp-free',
    api_key=os.environ['PINECONE_API_KEY'],
    index='text',
    similarity="cosine",
    embedding_dim=768
)

In [8]:
# Sanity check before ingestion: how many documents the store currently holds.
document_store.get_document_count()

0

In [9]:
# Sanity check before ingestion: how many embeddings the store currently holds.
document_store.get_embedding_count()

0

In [10]:
import tiktoken

# Shared tokenizer used to measure text length in tokens
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return how many cl100k_base tokens ``text`` encodes to.

    ``disallowed_special=()`` is passed so the encoder does not reject text
    that happens to contain special-token strings.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration: lengths are measured with tiktoken_len, so chunk_size
# and chunk_overlap are counted in tokens rather than characters.
# NOTE(review): the cleaning cell replaces every newline with a space, so the
# "\n\n" and "\n" separators presumably never match in `text` — confirm.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=350,
    chunk_overlap=15,
)

In [12]:
from haystack import Document
from tqdm.auto import tqdm

# Cut the cleaned corpus into chunks with the configured splitter
text_chunks = text_splitter.split_text(text)

# Wrap each chunk in a haystack Document, recording its position as "chunk_id";
# tqdm displays progress while the list is built
docs = [
    Document(content=chunk, meta={"chunk_id": idx})
    for idx, chunk in enumerate(tqdm(text_chunks))
]

# Persist all documents in the document store
document_store.write_documents(docs)


  0%|          | 0/2836 [00:00<?, ?it/s]

Writing Documents:   0%|          | 0/2836 [00:00<?, ?it/s]

In [13]:
# Check how many documents the store holds now that the chunks have been written.
document_store.get_document_count()

2836

In [14]:
# Check the embedding count after writing documents and before update_embeddings
# is called further down.
document_store.get_embedding_count()

0

In [15]:
from sentence_transformers import SentenceTransformer

# Small smoke test: embed three toy sentences with a sentence-transformers model
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here',
    'and finally a third chunk of text'
]

# No explicit .to("cuda"): hard-coding CUDA raises on machines without a GPU.
# When no device is given, SentenceTransformer chooses one itself (a GPU if one
# is usable, otherwise the CPU).
embed = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = embed.encode(texts)
# Display (number of embedded texts, embedding dimension)
len(embeddings), len(embeddings[0])

  return self.fget.__get__(instance, owner)()


(3, 384)

In [16]:
from haystack.nodes import EmbeddingRetriever

# Retriever that embeds text with a sentence-transformers model and is bound to
# the document store.
# NOTE(review): the model's output size presumably matches the store's
# embedding_dim=768 — confirm.
retriever = EmbeddingRetriever(
    embedding_model="flax-sentence-embeddings/all_datasets_v3_mpnet-base",
    model_format="sentence_transformers",
    document_store=document_store,
)

In [17]:
# Compute embeddings for the stored documents with the retriever, 64 documents per batch
document_store.update_embeddings(retriever, batch_size=64)

Updating Embedding:   0%|          | 0/2836 [00:00<?, ? docs/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# from haystack.pipelines import DocumentSearchPipeline
# from haystack.utils import print_documents

# search_pipe = DocumentSearchPipeline(retriever)
# result = search_pipe.run(
#     query="Can the goverment make me work?",
#     params={"Retriever": {"top_k": 2}}
# )

# print_documents(result)

NameError: name 'retriever' is not defined

In [None]:
from haystack.nodes import Seq2SeqGenerator

# Answer generator built from the "vblagoje/bart_lfqa" model; it is combined with
# the retriever in the QA pipeline below.
generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")

In [None]:
from haystack.pipelines import GenerativeQAPipeline

# Generative QA pipeline combining the generator with the retriever.
# Both `generator` and `retriever` must already be defined by earlier cells.
pipe = GenerativeQAPipeline(generator, retriever)

In [None]:
import random

# Ask the pipeline one question, with top_k=3 for the retriever and top_k=1 for
# the generator
result = pipe.run(
    query="whats legal ease ai",
    params={"Retriever": {"top_k": 3}, "Generator": {"top_k": 1}},
)

top_answer = result['answers'][0]
# Highest of the document scores recorded in the answer's metadata
max_score = max(top_answer.meta['doc_scores'])

# Canned replies used when the best document score is too low
response_list = [
    "It seems this is beyond my knowledge, or I didn't understand your question. Could you please rephrase it?",
    "I'm afraid that's beyond my comprehension, or perhaps I misunderstood your question. Would you mind rephrasing it?",
    "It appears I'm either not equipped to answer this, or I didn't grasp your question. Could you reword it please?",
    "This might be outside my expertise, or maybe your question wasn't clear. Could you reformulate it?"
]

# Below a best score of 0.6, print a random fallback reply instead of the generated answer
print(random.choice(response_list) if max_score < 0.6 else top_answer.answer)


NameError: name 'pipe' is not defined