# Generative AI with Haystack on Azure 
This notebook shows an example of how to integrate AI models with your existing data on Azure. 

### Steps: 
* Access PDF data from Azure storage
* Extract text and store it in a vector database (OpenSearch)
* Show basic Haystack flow
* Prompt with open source AI model (LLM)
* Prompt with OpenAI (GPT-4) on Azure


## Imports

In [None]:
from tqdm.auto import tqdm
from pathlib import Path
import os
import tempfile
import boto3
# Azure imports
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azureml.fsspec import AzureMachineLearningFileSystem
from requests_aws4auth import AWS4Auth
# Haystack imports
from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore
from haystack.nodes import (
    QuestionGenerator,
    EmbeddingRetriever, 
    BM25Retriever, 
    FARMReader, 
    PDFToTextConverter, 
    PreProcessor, 
    PromptModel,
    PromptNode, 
    PromptTemplate, 
    AnswerParser
)
from haystack.pipelines import (
    QuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
    ExtractiveQAPipeline,
    Pipeline
)
from haystack.schema import Document
from haystack.utils import print_questions, print_answers

# Huggingface transformers library
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Switch off parallelism in the Hugging Face tokenizers library for this session
# (presumably to avoid fork-related warnings inside the notebook -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Retrieve data from Azure storage 


In [None]:
# Authenticate to the Azure ML workspace using the default credential chain
credential = DefaultAzureCredential()
# Check that the credential can actually obtain a management-plane token;
# the returned token itself is not used.
credential.get_token("https://management.azure.com/.default")
# Workspace client built from configuration
# NOTE(review): from_config presumably reads a local workspace config file -- confirm
ml_client = MLClient.from_config(credential=credential)

In [None]:
# Identify the Azure Machine Learning workspace and the datastore holding the data.
# Replace the placeholders with your own values.
subscription = "<subscription-id>"
resource_group = "<resource_group>"
workspace = "<workspace_name>"
datastore_name = "<datastore_name>"

# Long-form datastore URI, assembled segment by segment
uri = (
    f"azureml://subscriptions/{subscription}"
    f"/resourcegroups/{resource_group}"
    f"/workspaces/{workspace}"
    f"/datastores/{datastore_name}"
)

In [None]:
# Instantiate a file-system view of the datastore identified by the URI built above
fs = AzureMachineLearningFileSystem(uri)

In [None]:
# List the contents of the 'upload' folder, where the source data is expected to live
fs.ls('upload')

In [None]:
# Register the uploaded file as a versioned data asset so that other services
# can address it by name and version.
VERSION = "1"
data_asset_name = "<DataAssetName>"
file_path = f"azureml://datastores/{datastore_name}/paths/upload/<file_name>"

# Describe the asset: a single file referenced by its datastore URI
pdf_data = Data(
    name=data_asset_name,
    version=VERSION,
    description="<description>",
    type=AssetTypes.URI_FILE,
    path=file_path,
)

# Create (or update) the asset in the workspace
ml_client.data.create_or_update(pdf_data)

In [None]:
# Access the newest version of the data asset and derive its path inside the datastore
def _version_sort_key(version):
    """Return a key that orders purely numeric versions by numeric value.

    Asset versions are strings, so a plain ``max`` ranks "9" above "10".
    Non-numeric versions are compared lexicographically among themselves and
    rank above numeric ones.
    """
    text = str(version)
    if text.isdecimal():
        return (0, int(text), "")
    return (1, 0, text)


data_assets = ml_client.data.list(name=data_asset_name)
data_versions = [asset.version for asset in data_assets]
if not data_versions:
    # Fail with a message that names the asset instead of max()'s generic error
    raise ValueError(f"No versions found for data asset {data_asset_name!r}")
latest_version = max(data_versions, key=_version_sort_key)
azure_ml_asset = ml_client.data.get(data_asset_name, version=latest_version)
# Keep what follows 'datastores/' in the asset path, then drop its first two
# components so that only the path relative to the datastore remains
# (assumes the form '<datastore>/paths/<relative path>', as used when the asset was created).
data_path_parts = Path(azure_ml_asset.path.split('datastores/')[1]).parts
data_path = '/'.join(data_path_parts[2:])

## Setup Haystack

In [None]:
## Instantiate in-memory document store if not using OpenSearch. In production, you would use a persistent vector database such as OpenSearch
# document_store = InMemoryDocumentStore(use_bm25=True)

In [None]:
# Build SigV4 request signing for the AWS-hosted OpenSearch domain
region = '<region>'
service = 'es'  # service name used when signing requests
credentials = boto3.Session().get_credentials()
if credentials is None:
    # boto3 returns None when no credential source is configured
    raise RuntimeError("No AWS credentials found; configure them before connecting to OpenSearch")
# Temporary credentials (assumed role, SSO, instance profile) are only valid
# together with their session token; the token is None for long-lived keys.
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token,
)
host = "<open_search_host_url>"

In [None]:
# Connect to the OpenSearch document store over HTTPS, signing requests with awsauth
# (no basic-auth username/password is supplied).
document_store = OpenSearchDocumentStore(
    scheme="https",
    host=host,
    port=443,
    verify_certs=True,
    aws4auth=awsauth,
    username=None,
    password=None,
    similarity="cosine",
)

In [None]:
# PDF-to-text converter restricted to English, with numeric tables removed
converter = PDFToTextConverter(valid_languages=["en"], remove_numeric_tables=True)

In [None]:
# Download the PDF into a throwaway directory and convert it to text documents.
# The directory (and the downloaded file) is removed when the block exits.
with tempfile.TemporaryDirectory() as tempdir:
    fs.download(rpath=data_path, lpath=tempdir, recursive=False)
    docs = converter.convert(file_path=Path(tempdir, "<file_name>"), meta=None)

In [None]:
# Inspect the raw text extracted from the first converted document
print(docs[0].content)

## Preprocess documents for efficient querying

This is a default usage of the PreProcessor.
Here, it performs cleaning of consecutive whitespaces
and splits a single large document into smaller documents.
https://docs.haystack.deepset.ai/docs/preprocessor

In [None]:
# Clean the extracted text and split it into chunks of about 200 words,
# respecting sentence boundaries, for more efficient retrieval
preprocessor = PreProcessor(
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=False,
)
processed_docs = preprocessor.process(docs)
print(f"n_files_input: {len(docs)}\nn_docs_output: {len(processed_docs)}")

In [None]:
# Number of chunks produced by the preprocessor
len(processed_docs)

In [None]:
# Show the first chunk object as displayed by the notebook
processed_docs[0]

In [None]:
# Show only the text content of the first chunk
print(processed_docs[0].content)

## Write docs to document store

In [None]:
## To reset your DocumentStore

# document_store.delete_documents()

In [None]:
# Load the preprocessed chunks into the document store
document_store.write_documents(processed_docs)

In [None]:
# Set up the dense retriever and its sentence-transformers embedding model.
# This only creates the retriever; the embeddings are written to the store
# by the update_embeddings call that follows.
embedding_retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/all-mpnet-base-v2",
    document_store=document_store,
)


In [None]:

# Update the stored documents' embeddings using the embedding retriever
document_store.update_embeddings(embedding_retriever)

## Q/A Pipeline (no Gen AI)

In [None]:
# Basic Haystack workflow: retriever -> reader, combined in a pipeline

# Retriever: a fast and simple algorithm to identify the most promising candidate documents
retriever = BM25Retriever(document_store=document_store)

# Reader: a powerful but slower neural network trained for QA
model_name = "deepset/roberta-base-squad2"
reader = FARMReader(model_name_or_path=model_name)

# Pipeline: combines all the components
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)


In [None]:
# Ask a question with the extractive pipeline only: no prompt node (no LLM/Gen AI),
# using the sparse BM25 retriever. Replace the placeholder with a real query.
question = "<query>"
prediction = pipe.run(query=question)
# Print the answers contained in the prediction
print_answers(prediction)

In [None]:
# Ask the same question again, this time retrieving with the embedding retriever
pipe = ExtractiveQAPipeline(reader=reader, retriever=embedding_retriever)
prediction = pipe.run(query=question)
print_answers(prediction)

## Generate Questions
This will take some time, especially with many documents in the document store. Using a GPU should speed up the process.

In [None]:
# Initialize the question generator with its default settings
question_generator = QuestionGenerator()

In [None]:
# Generate questions from each document in the document store.
# The pipeline only wraps the question generator, so no reader model is
# loaded in this cell.
q_results = []
q_pipeline = QuestionGenerationPipeline(question_generator)
for idx, document in enumerate(tqdm(document_store)):
    print(f"\n * Generating questions for document {idx} ...\n")
    # Run the pipeline on one document at a time and keep each raw result
    result = q_pipeline.run(documents=[document])
    q_results.append(result)

In [None]:
# Display the collected question-generation results
q_results

In [None]:
# Generate question/answer pairs for each document: the generator proposes
# questions and the reader is used to answer them.
qag_results = []
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
qag_pipeline = QuestionAnswerGenerationPipeline(
    question_generator=question_generator,
    reader=reader,
)
for idx, document in enumerate(tqdm(document_store)):
    print(f"\n * Generating questions and answers for document {idx}: {document.content}...\n")
    result = qag_pipeline.run(documents=[document])
    qag_results.append(result)


## Use LLM - Gen AI - text generation models from Huggingface
https://docs.haystack.deepset.ai/docs/prompt_node

In [None]:
# from haystack.nodes import PromptNode
# from transformers import AutoModelForCausalLM
# from transformers import AutoTokenizer

# model = AutoModelForCausalLM.from_pretrained(
#     'mosaicml/mpt-7b-instruct',
#     trust_remote_code=True
# )

# tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")


### Test with Google Flan T5 (multi-task, text-to-text model) - HF

In [None]:
# Load the Flan-T5 tokenizer and model weights by model name
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Question passed to the generative pipelines below; replace the placeholder
query = "<query>"

In [None]:
# (Re-)create the dense retriever used by the generative pipelines;
# it is bound to the document store and the sentence-transformers embedding model
embedding_retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/all-mpnet-base-v2",
    document_store=document_store,
)


In [None]:
# Custom prompt: asks the model to synthesize a short answer from the retrieved
# paragraphs; the output is parsed with AnswerParser
user_prompt = PromptTemplate(prompt="""Synthesize a comprehensive answer from the following topk most relevant paragraphs and the given question. 
                             Provide a clear and concise response that summarizes the key points and information presented in the paragraphs. 
                             Your answer should be in your own words and be no longer than 100 words. 
                             \n\n Paragraphs: {join(documents)} \n\n Question: {query} \n\n Answer:""",
                             output_parser=AnswerParser())

# Prompt node reusing the already-loaded model and tokenizer
# ("mosaicml/mpt-7b-instruct" would be a larger example)
prompt_node = PromptNode(
    model_name_or_path="google/flan-t5-small",
    default_prompt_template=user_prompt,
    model_kwargs={"model": model, "tokenizer": tokenizer},
)

In [None]:
# Retrieval-augmented pipeline: Query -> retriever -> prompt node
pipe = Pipeline()
pipe.add_node(name="retriever", component=embedding_retriever, inputs=["Query"])
pipe.add_node(name="prompt_node", component=prompt_node, inputs=["retriever"])

In [None]:
# Run the pipeline with a single retrieved document and list the generated answers
output = pipe.run(query=query, params={"retriever": {"top_k": 1}})
[candidate.answer for candidate in output["answers"]]

## Use OpenAI GPT-4

In [None]:
from dotenv import load_dotenv

# Load settings from a .env file; the AZURE_* variables are read from the environment below
# NOTE(review): presumably looks for a .env file near the notebook -- confirm
load_dotenv()

In [None]:
# Read the Azure OpenAI connection settings from the environment and fail fast
# if any is missing, rather than passing None on to the prompt model.
_required_settings = ("AZURE_API_KEY", "AZURE_DEPLOYMENT_NAME", "AZURE_BASE_URL")
_missing_settings = [name for name in _required_settings if not os.environ.get(name)]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

api_key = os.environ["AZURE_API_KEY"]
deployment_name = os.environ["AZURE_DEPLOYMENT_NAME"]
base_url = os.environ["AZURE_BASE_URL"]

# GPT-4 prompt model served from the Azure deployment named above
azure_prompt = PromptModel(
    model_name_or_path="gpt-4",
    api_key=api_key,
    model_kwargs={
        "api_version": "2023-07-01-preview",
        "azure_deployment_name": deployment_name,
        "azure_base_url": base_url,
        "max_tokens": 2000
    },
)
# Prompt node using the same answer-synthesis template as the open-source model
azure_prompt_node = PromptNode(azure_prompt, default_prompt_template=user_prompt)

In [None]:
# Same retrieval-augmented pipeline, now with the Azure-hosted prompt node
pipe = Pipeline()
pipe.add_node(name="retriever", component=embedding_retriever, inputs=["Query"])
pipe.add_node(name="prompt_node", component=azure_prompt_node, inputs=["retriever"])

In [None]:
# Run the pipeline, passing the five top-ranked retrieved documents to the prompt node
output = pipe.run(query=query, params={"retriever": {"top_k": 5}})

In [None]:
# Text of the first generated answer
output["answers"][0].answer