In [1]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import clean_wiki_text
import pandas as pd
# import pdfplumber
# import docx
import os
import json

# Initialize FAISS document store.
# "Flat" is the FAISS index factory string for an exact (brute-force) index.
# NOTE(review): no sql_url / index path is passed, so the store falls back to
# Haystack's defaults — presumably a local SQLite file that outlives the
# kernel. Confirm that re-running this cell (and the write below) does not
# duplicate documents or clash with a previously created database.
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

# Helper function to read different file formats and handle nested folders
def load_documents(dir_path="documents/"):
    """Recursively read supported files under ``dir_path`` into document dicts.

    Supported extensions (case-insensitive): pdf, docx, csv, xlsx, json, txt.
    Files with any other extension, no extension, or empty extracted text are
    skipped. A file that fails to parse is reported on stdout and skipped, so
    one bad file does not abort the whole load.

    Args:
        dir_path: Root directory to walk. A trailing path separator is allowed.

    Returns:
        A list of dicts, one per loaded file, shaped as
        ``{"content": <cleaned text>, "meta": {"name", "folder_context", "path"}}``
        where ``name`` is the file name, ``folder_context`` is the name of the
        directory containing the file, and ``path`` is that directory's path
        (the directory, not the full file path).
    """
    docs = []

    for root, _, files in os.walk(dir_path):  # Recursively traverse directories
        # normpath strips a trailing separator first; without it
        # basename("documents/") is "" and top-level files lose their context.
        folder_context = os.path.basename(os.path.normpath(root))
        for filename in files:
            file_path = os.path.join(root, filename)
            # splitext only reports a real extension, so an extensionless file
            # that happens to be called "txt" or "csv" is not mistaken for one.
            ext = os.path.splitext(filename)[1].lstrip(".").lower()
            text = ""

            try:
                # NOTE: the `pdfplumber` and `docx` imports at the top of this
                # notebook are commented out; until they are re-enabled the
                # pdf/docx branches raise NameError, which is reported below.
                if ext == "pdf":
                    with pdfplumber.open(file_path) as pdf:
                        # Extract each page once, then drop pages with no text.
                        page_texts = (page.extract_text() for page in pdf.pages)
                        text = "\n".join(t for t in page_texts if t)

                elif ext == "docx":
                    doc = docx.Document(file_path)
                    text = "\n".join(paragraph.text for paragraph in doc.paragraphs)

                elif ext == "csv":
                    df = pd.read_csv(file_path)
                    text = df.to_string(index=False)

                elif ext == "xlsx":
                    df = pd.read_excel(file_path)
                    text = df.to_string(index=False)

                elif ext == "json":
                    # Explicit UTF-8: the platform default (e.g. cp1252 on
                    # Windows) would mis-decode non-ASCII JSON.
                    with open(file_path, "r", encoding="utf-8") as json_file:
                        json_data = json.load(json_file)
                    # ensure_ascii=False keeps readable text instead of
                    # \uXXXX escapes in the indexed content.
                    text = json.dumps(json_data, indent=4, ensure_ascii=False)

                elif ext == "txt":
                    # errors="replace" keeps a mostly-valid file usable rather
                    # than dropping it over a single undecodable byte.
                    with open(file_path, "r", encoding="utf-8", errors="replace") as text_file:
                        text = text_file.read()

                # Clean and add document with folder context
                if text:
                    docs.append({
                        "content": clean_wiki_text(text),
                        "meta": {"name": filename, "folder_context": folder_context, "path": root}
                    })

            except Exception as e:
                # Deliberate best-effort: report the failure and keep going.
                print(f"Error processing {file_path}: {e}")

    return docs

# Convert documents and write them to the document store
docs = load_documents("documents/")
document_store.write_documents(docs)

# Load retriever model and tokenizer
# Dense Passage Retrieval uses two separate encoders: one for the query and
# one for the passages. GPU use is requested here.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True
)

# Update embeddings to enable fast retrieval
# Must run after write_documents and after the retriever exists, since the
# retriever is what gets passed in to compute the embeddings.
document_store.update_embeddings(retriever)

# Load the pre-trained model and tokenizer
# NOTE(review): neither object is passed to the reader or pipeline below —
# FARMReader loads the same checkpoint itself by name. `model` is only
# referenced by the later Trainer cell; `tokenizer` is not referenced anywhere
# else in this notebook as shown.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")

# Initialize the FARMReader
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True)

# Build QA pipeline
# NOTE(review): this rebinds the name `pipeline`, shadowing the `pipeline`
# function imported from transformers in the import block. answer_query()
# reads this global, so rename both together if the shadowing is removed.
pipeline = ExtractiveQAPipeline(reader, retriever)

# Function to answer queries
def answer_query(query, retriever_top_k=10, reader_top_k=3):
    """Run the extractive QA pipeline for ``query`` and print each answer.

    Args:
        query: The natural-language question to ask.
        retriever_top_k: Number of documents the retriever passes to the
            reader. Defaults to 10, the previously hard-coded value.
        reader_top_k: Number of answers the reader returns. Defaults to 3,
            the previously hard-coded value.

    Returns:
        The list stored under ``"answers"`` in the pipeline's result, in the
        order the pipeline returned it.

    Side effects:
        Prints the answer text, score, folder context and path for every
        answer to stdout.
    """
    prediction = pipeline.run(
        query=query,
        params={
            "Retriever": {"top_k": retriever_top_k},
            "Reader": {"top_k": reader_top_k},
        },
    )
    answers = prediction["answers"]
    for answer in answers:
        # An answer may carry no metadata at all; fall back to an empty dict
        # so the placeholders below are printed instead of raising.
        meta = answer.meta or {}
        folder_context = meta.get("folder_context", "N/A")
        file_path = meta.get("path", "Unknown path")
        print(f"Answer: {answer.answer}, Score: {answer.score}\nFolder Context: {folder_context}\nFile Path: {file_path}\n")
    return answers

# Query Example
# NOTE(review): the reader is extractive — it returns text spans copied from
# the documents, so a "How many ..." question yields a number that appears in
# the text, not a computed count of matching rows. Treat the scores with that
# in mind.
query = "How many children's social care Overall effectiveness Required improvement?"
answers = answer_query(query)

# answer_query() has already printed each answer; this second pass repeats
# them with the surrounding context span the answer was extracted from.
for idx, answer in enumerate(answers):
    print(f"Answer {idx + 1}: {answer.answer}\nContext: {answer.context}\nFolder Context: {answer.meta.get('folder_context')}\nScore: {answer.score}\n")


  from .autonotebook import tqdm as notebook_tqdm
Writing Documents: 10000it [00:00, 476138.49it/s]       
Documents Processed: 10000 docs [00:00, 11289.06 docs/s]     
Inferencing Samples:   0%|          | 0/14 [00:00<?, ? Batches/s]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:   7%|▋         | 1/14 [00:06<01:26,  6.62s/ Batches]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:  14%|█▍        | 2/14 [00:13<01:23,  6.97s/ Batches]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:  21%|██▏       | 3/14 [00:21<01:18,  7.17s/ Batches]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:  29%|██▊       | 4/14 [00:27<01:09,  6.91s/ Batches]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:  36%|███▌      | 5/14 [00:34<01:01,  6

Answer: 63, Score: 0.9733178615570068
Folder Context: health
File Path: documents/health

Answer: 10014460, Score: 0.9580560922622681
Folder Context: health
File Path: documents/health

Answer: 10021739, Score: 0.9542986154556274
Folder Context: health
File Path: documents/health

Answer 1: 63
Context:      Private           5        Full inspection      04/08/2016     10015063      Requires improvement                                    NaN         
Folder Context: health
Score: 0.9733178615570068

Answer 2: 10014460
Context:         Private           6        Full inspection      19/05/2016     10014460      Requires improvement                                    NaN      
Folder Context: health
Score: 0.9580560922622681

Answer 3: 10021739
Context:         Private           4        Full inspection      07/09/2016     10021739      Requires improvement                                    NaN      
Folder Context: health
Score: 0.9542986154556274






In [2]:
# Same question as before with a column hint appended to the query text.
# Depends on answer_query() (and the pipeline behind it) from the first cell.
query = "How many children's social care Overall effectiveness Required improvement? hint check Overall effectiveness column"
answers = answer_query(query)

Inferencing Samples:   0%|          | 0/14 [00:00<?, ? Batches/s]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:   7%|▋         | 1/14 [00:06<01:21,  6.26s/ Batches]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:  14%|█▍        | 2/14 [00:12<01:13,  6.16s/ Batches]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:  21%|██▏       | 3/14 [00:19<01:10,  6.43s/ Batches]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:  29%|██▊       | 4/14 [00:26<01:06,  6.64s/ Batches]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:  36%|███▌      | 5/14 [00:32<00:59,  6.61s/ Batches]'segment_ids' is not None, but DistilBert does not use them. They will be ignored.
Inferencing Samples:  43%|████▎     | 6/14 [00:39<00:53,  6.64s/ Batches

Answer: 10014110, Score: 0.942974328994751
Folder Context: health
File Path: documents/health

Answer: 10013541, Score: 0.9376291036605835
Folder Context: health
File Path: documents/health

Answer: 10014322, Score: 0.8968779444694519
Folder Context: health
File Path: documents/health






In [None]:
from transformers import Trainer, TrainingArguments

# Define training arguments
# Checkpoints go to ./results; evaluation is configured to run once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# `model` is the question-answering model loaded in the first cell.
# NOTE(review): `train_dataset` and `eval_dataset` are not defined anywhere in
# this notebook as shown, so this cell raises NameError until they are
# provided. Also, the QA pipeline's reader was created from the checkpoint
# name rather than from this `model` object, so fine-tuning here does not
# change the pipeline's answers unless the reader is rebuilt from the result.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your custom training data
    eval_dataset=eval_dataset     # Your custom evaluation data
)

trainer.train()


from fastapi import FastAPI
from pydantic import BaseModel

# FastAPI application object; the endpoint below is registered on it.
app = FastAPI()

# Request body schema for the POST /query/ endpoint.
class QueryRequest(BaseModel):
    # The natural-language question to hand to answer_query().
    question: str

@app.post("/query/")
def get_answer(request: QueryRequest):
    # Endpoint: run the QA pipeline on the posted question and return only
    # the answer texts (scores, contexts and metadata are not included).
    # Documented with comments rather than a docstring so the generated
    # OpenAPI description of this route is left exactly as it was.
    results = answer_query(request.question)
    answer_texts = []
    for item in results:
        answer_texts.append(item.answer)
    return {"answers": answer_texts}

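# Run the app from a shell (not from this notebook), after saving the code
# above as app.py so that the "app:app" module:attribute reference resolves:
# uvicorn app:app --reload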


In [1]:
# Environment check: show the pip version and the environment it belongs to.
pip -V

pip 24.2 from c:\Users\haris\anaconda3\envs\new_env_name\lib\site-packages\pip (python 3.9)

Note: you may need to restart the kernel to use updated packages.
