# Comparer le score self check sur le map_reduce et le refine

# Prelude

In [17]:
# Endpoint and credentials of the OpenAI-compatible API.
# The base URL falls back to a default; OPENAI_API_KEY is mandatory and a
# missing variable raises KeyError here.
import os
OPENAI_API_BASE= os.environ.get("OPENAI_API_BASE", "https://api-ai.numerique-interieur.com/v1") 
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# NOTE(review): several names below are imported more than once
# (load_summarize_chain, WebBaseLoader, PromptTemplate); the last import of
# each name is the one that stays bound.
from langchain_openai import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import WebBaseLoader
from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate


# Chat model shared by the summarization and self-check cells below:
# model name "mixtral", temperature 0, served from OPENAI_API_BASE.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE, temperature=0, model="mixtral")

from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
#from langchain.llms.openai import OpenAI
from langchain.chains.summarize import load_summarize_chain


In [2]:
# Scratch check: nested lists compare element by element, so a single
# differing inner value (3 vs 5) makes the whole comparison False.
[[1, 2, 3], [3, 4]]==[[1, 2, 3], [5, 4]]

False

# Self-check implementation (RAG chain and SelfCheckGPT class)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain_core.prompts import PromptTemplate

# Prompt asking the model for a bare Yes/No verdict on one sentence.
# Placeholders: {context} receives the retrieved passages, {text} the
# sentence being checked.
template_selfcheck = """Is the sentence true, according to the context provided below? Answer only by Yes or No, without justification
####
CONTEXT: {context}

#####
SENTENCE: {text}

Answer:"""

# Prompt object built from the template above; its input variables are
# derived from the {context} and {text} placeholders.
custom_rag_prompt_selfcheck = PromptTemplate.from_template(template_selfcheck)


def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = []
    for document in docs:
        contents.append(document.page_content)
    separator = "\n\n"
    return separator.join(contents)


# Module-level self-check pipeline: the input sentence is sent to the
# retriever (whose documents are flattened by format_docs) to build the
# "context" value, and is passed through unchanged as "text"; both fill the
# prompt, which is sent to `llm`, and the reply is parsed to a plain string.
# NOTE(review): `retriever` is not defined at module level in the visible
# cells (this cell is marked as never executed); the SelfCheckGPT class
# below builds its own retriever instead — confirm this cell is still needed.
rag_chain_selfcheck = (
    {"context": retriever | format_docs, "text": RunnablePassthrough()}
    | custom_rag_prompt_selfcheck
    | llm
    | StrOutputParser()
)

In [15]:
from dataclasses import dataclass
from functools import cached_property
import logging
import re
import statistics


@dataclass
class SelfCheckGPT:
    """Score how well the sentences of a text are supported by reference documents.

    The reference documents are split into chunks, embedded and indexed in a
    Chroma vector store when the instance is created. Each sentence of a
    candidate text is then checked by asking ``llm`` whether it is true
    according to the most similar chunks.

    Attributes:
        llm: Chat model placed in the self-check chain.
        documents: Reference documents; passed to the text splitter's
            ``split_documents`` (the notebook passes LangChain ``Document``
            objects).
        retriever: Set by ``__post_init__``; similarity retriever over the
            indexed chunks.
    """

    # "Model" is only a placeholder forward reference; no such name is
    # defined in the visible file.
    llm: "Model"
    documents: list

    @staticmethod
    def parse_response(response: str) -> float:
        """Map a model answer to a numeric verdict.

        The decision is taken on the first alphabetic word of the answer
        (case-insensitive), so "Yes.", "yes, because ..." and "No" are
        recognised, while answers such as "Not enough information" or
        "None of the above" are treated as undecided instead of "no".

        Args:
            response: Raw text returned by the model.

        Returns:
            1 for "yes", 0 for "no", 0.5 for anything else (a warning with
            the normalized answer is logged in that case).
        """
        response = response.strip().lower()
        first_word_match = re.match(r"[^a-z]*([a-z]+)", response)
        first_word = first_word_match.group(1) if first_word_match else ""
        if first_word == "yes":
            return 1
        if first_word == "no":
            return 0
        # Undecided answer: keep a trace so that prompt problems are visible.
        logging.warning("%s", response)
        return 0.5

    @staticmethod
    def format_docs(docs):
        """Join the ``page_content`` of each document with a blank line."""
        return "\n\n".join(doc.page_content for doc in docs)

    @staticmethod
    def _ensure_punkt() -> None:
        """Download the NLTK ``punkt`` tokenizer data only if it is missing."""
        import nltk

        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt")

    def __post_init__(self):
        """Index ``documents`` and create the retriever used by the chain."""
        # Original author's note: "more than 13 min" (presumably the time
        # needed to fetch/load this embedding model — confirm).
        embeddings = HuggingFaceEmbeddings(model_name="OrdalieTech/Solon-embeddings-base-0.1")
        # Chunks of at most 1000 characters (length_function=len) with a
        # 50-character overlap.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=50,
            keep_separator=True,
            length_function=len,
        )
        splits = text_splitter.split_documents(self.documents)
        vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
        # Each sentence is checked against its 3 most similar chunks.
        self.retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    @cached_property
    def rag_prompt(self):
        """Prompt asking for a Yes/No verdict on ``{text}`` given ``{context}``."""
        template_selfcheck = ("""Is the sentence true, according to the context provided below? Answer only by Yes or No, without justification
####
CONTEXT: {context}

#####
SENTENCE: {text}

Answer:""")
        return PromptTemplate.from_template(template_selfcheck)

    @cached_property
    def rag_chain_selfcheck(self):
        """Chain: retrieve context for a sentence, fill the prompt, call the model.

        The input sentence is used both as the retrieval query ("context")
        and, unchanged, as the sentence to verify ("text"). The output is
        the model's answer as a plain string.
        """
        return (
            {"context": self.retriever | SelfCheckGPT.format_docs, "text": RunnablePassthrough()}
            | self.rag_prompt
            | self.llm
            | StrOutputParser()
        )

    def get_array(self, text: str) -> list[float]:
        """Return one verdict per sentence of ``text``.

        Args:
            text: Candidate text (for example a generated summary).

        Returns:
            A list with one ``parse_response`` value (1, 0 or 0.5) per
            sentence found by NLTK's sentence tokenizer, in order.
        """
        import nltk  # local import: only needed when a text is scored

        SelfCheckGPT._ensure_punkt()
        sentences = nltk.tokenize.sent_tokenize(text)
        # parse_response normalizes the answer itself, so the raw model
        # output is passed straight through.
        return [
            SelfCheckGPT.parse_response(self.rag_chain_selfcheck.invoke(sentence))
            for sentence in sentences
        ]

    def eval_text(self, text: str) -> float:
        """Return the mean of the per-sentence verdicts of ``text``.

        Raises:
            statistics.StatisticsError: If no sentence is found in ``text``
                (the mean of an empty list is undefined).
        """
        return statistics.mean(self.get_array(text=text))

In [2]:
import statistics
# Scratch check: statistics.mean accepts a list mixing ints and floats.
statistics.mean([1, 2.4, 3, 4, 5])

3.08

# Benchmark map_reduce vs refine

In [18]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_text_splitters import CharacterTextSplitter

# Map step: prompt asking the model for the main themes of a set of
# documents; {docs} is the only placeholder.
# NOTE(review): map_prompt / map_chain are not referenced by the other
# visible cells (summarize_text uses load_summarize_chain instead).
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
# Chain binding the shared `llm` to the map prompt.
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [7]:
from urllib.parse import urlparse, urlsplit
# Scratch check on a deliberately malformed URL.
parsed_url = urlparse("httzaps:/ez/www.exaezample.com/path?query=value#fragment")
# `scheme and netloc` is truthy only when both parts are non-empty; the
# recorded output for this input is '' (falsy).
parsed_url.scheme and parsed_url.netloc

''

In [19]:
def summarize_text(text, chain_type="map_reduce"):
    """Summarize ``text`` with a LangChain summarize chain.

    Args:
        text: Raw text to summarize.
        chain_type: Strategy forwarded to ``load_summarize_chain``; this
            notebook compares ``"map_reduce"`` and ``"refine"``.

    Returns:
        The summary string produced by the chain.
    """
    # Split the text into chunks using CharacterTextSplitter's defaults.
    chunks = CharacterTextSplitter().split_text(text)

    # Wrap each chunk in a Document, the input type expected by the chain.
    docs = [Document(page_content=chunk) for chunk in chunks]

    # Build the summarize chain around the module-level `llm`.
    chain = load_summarize_chain(llm, chain_type=chain_type)

    # `Chain.run` is deprecated; `invoke` returns a dict whose
    # "output_text" entry holds the same summary string.
    return chain.invoke({"input_documents": docs})["output_text"]


In [20]:
from pathlib import Path
# Load the source text from camus.txt (path relative to the working directory).
# NOTE(review): read_text() without `encoding=` uses the platform default
# encoding — pass it explicitly once the file's encoding is confirmed.
text = Path("camus.txt").read_text()
# Summarize the same text with both strategies so they can be compared below.
map_reduce = summarize_text(text, chain_type="map_reduce")
refine = summarize_text(text, chain_type="refine")



In [21]:
# Display the summary produced with chain_type="map_reduce".
map_reduce

" In 1957, Albert Camus accepted the Nobel Prize in Literature, expressing gratitude and humility. He discussed the writer's role as a witness to the human condition, particularly during conflicts and oppression, highlighting the significance of truth and freedom. Camus cautioned writers against becoming preachers or moralizers, acknowledging his own limitations and dedicating the prize to less recognized peers. He reaffirmed his commitment to artistic integrity and truth-telling."

In [22]:
# Display the summary produced with chain_type="refine".
refine

' Albert Camus, accepting the Nobel Prize in Literature in 1957, expresses his gratitude and humility for the honor. He reflects on the role of the writer as a witness to the human condition, particularly during times of conflict and oppression. He emphasizes the importance of truth and freedom, and encourages writers to resist the temptation to become preachers or moralizers. Camus acknowledges his own limitations and debts, dedicating the prize to those who share in the same struggle but have not received similar recognition. He concludes by reaffirming his commitment to artistic integrity and truth-telling.'

In [23]:
# Build the checker with the full source text as the only reference document
# (indexing happens in SelfCheckGPT.__post_init__).
i = SelfCheckGPT(llm, documents=[Document(page_content=text)])
# Per-sentence verdicts for the "refine" summary.
# NOTE(review): the recorded output contains -1, a value that
# SelfCheckGPT.parse_response as defined in this file never returns
# (it yields 1, 0 or 0.5); the output presumably comes from an earlier
# version of the class — re-run to refresh.
i.get_array(refine)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1, 1, 1, 1, -1]

In [24]:
# Per-sentence verdicts for the "map_reduce" summary, using the same checker.
i.get_array(map_reduce)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1, 1, -1, -1]

In [1]:
# Requires the `datasets` package (install with: poetry add datasets).
from datasets import load_dataset

# Fetch the Alpaca-formatted CNN/DailyMail dataset from the Hugging Face Hub.
# NOTE(review): the recorded output shows a download of roughly 1 GB that
# stops while generating the train split, and `dataset` is not used by any
# other visible cell.
dataset = load_dataset("ZhongshengWang/Alpaca-cnn-dailymail")

Downloading readme:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/36.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/574226 [00:00<?, ? examples/s]

: 

In [1]:
# Page to load as an additional source document.
url = "https://www.jesuismort.com/tombe/albert-camus#biographie"
from langchain_community.document_loaders import UnstructuredURLLoader
# Per the recorded ImportError, this loader needs the `unstructured`
# package to be installed (`pip install unstructured`).
loader = UnstructuredURLLoader(urls=[url])

ImportError: unstructured package not found, please install it with `pip install unstructured`