# RAG application using open-source models
Code adapted from https://github.com/svpino/llm

In [1]:
!pip install pypdf
!pip install langchain_community
!pip install langchain[docarray]
!pip  install pydantic==1.10.9
!pip install langchain
!pip install pandas
!pip install chromadb
!pip install -qU langchain-text-splitters
!pip install rank_bm25



: 

In [1]:
import os
import pandas as pd
#from dotenv import load_dotenv

#load_dotenv()

# Read the key from the environment so it never lives in the notebook itself;
# the value is None when the variable is not set. It is only referenced by the
# `gpt*` branch of the model-selection cell.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Model tag used to build the LLM and the embeddings.
# Alternatives tried previously: "gpt-3.5-turbo", "mixtral:8x7b", "llama3:8b".
MODEL = "llama3:8b-instruct-q4_0"

# Report being analysed; REPORTING_YEAR and COMPANY_NAME are substituted into
# the question-answering prompt.
REPORTING_YEAR = "2023"
COMPANY_NAME = "Kering"
COMPANY_DESCRIPTION = ""


# Load the model and embeddings

In [3]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama

#from langchain_openai.embeddings import OpenAIEmbeddings
#from langchain_openai.chat_models import ChatOpenAI

# Build the LLM and the embedding backend from the MODEL tag: every tag that
# does not start with "gpt" is served through Ollama, with the same tag used
# for generation and for embeddings.
if not MODEL.startswith("gpt"):
    model = Ollama(model=MODEL)
    embeddings = OllamaEmbeddings(model=MODEL, show_progress=True, num_gpu=1)
else:
    # NOTE(review): ChatOpenAI and OpenAIEmbeddings come from the imports that
    # are commented out above; re-enable them before selecting a "gpt" model.
    model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)
    embeddings = OpenAIEmbeddings()

#model.invoke("Tell me a joke")

In [None]:
#model.invoke("Raconte-moi une blague")

In [4]:
from langchain_core.output_parsers import StrOutputParser

# Minimal chain: whatever the model produces is piped into a string output parser.
parser = StrOutputParser()
chain = model | parser
#chain.invoke("Tell me a joke")

In [5]:
from langchain.prompts import PromptTemplate

# Question-answering prompt. Placeholders: COMPANY_NAME, REPORTING_YEAR,
# context and question.
template = """
You are an expert in ecology and environmental issues. Your job is to examinate environmental claims of {COMPANY_NAME} based on their financial and sustainability report for the year {REPORTING_YEAR}. You will always reply in French. Answer the question based on the context below and following the given answer format. Always add to your answer actual quotes of the full sentences you used as a source. If you can't answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)

# Render the prompt once with dummy context/question as a sanity check; being
# the last expression of the cell, the rendered text is shown as its output.
prompt.format(
    context="Here is some context",
    question="Here is a question",
    COMPANY_NAME=COMPANY_NAME,
    REPORTING_YEAR=REPORTING_YEAR,
)

'\nYou are an expert in ecology and environmental issues. Your job is to examinate environmental claims of Kering based on their financial and sustainability report for the year 2023. You will always reply in French. Answer the question based on the context below and following the given answer format. Always add to your answer actual quotes of the full sentences you used as a source. If you can\'t answer the question, reply "I don\'t know".\n\nContext: Here is some context\n\nQuestion: Here is a question\n'

# Zero-shot Frame elements extraction

In [138]:
# Output folder to be used as input of answers from the RAG model
output_folder = "output"

# Read the Excel file with the answers
df_frames = pd.read_excel("df_frames_sent.xlsx")

# Print the column / dtype summary of the loaded dataframe
df_frames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  20 non-null     object
 1   Question            20 non-null     object
 2   RSEframe            20 non-null     object
 3   FrameNet            20 non-null     object
 4   Definition          20 non-null     object
 5   CoreFE              20 non-null     object
 6   NonCoreFE           20 non-null     object
 7   MultiQuestion       20 non-null     bool  
 8   QuestionsMultiples  10 non-null     object
 9   AnswerTemplate      20 non-null     object
 10  AnswExamples        20 non-null     object
 11  Relevantinfo        8 non-null      object
 12  UnknownExample      20 non-null     object
 13  Example_1_Kering    3 non-null      object
 14  Example_2_Kering    3 non-null      object
 15  Example_3_Kering    3 non-null      object
 16  Commentaires        3 non-nu

In [155]:
# Build and run a chain on the notebook's model to extract the frame elements and trigger word from one sentence of df_frames
def model_frames(sent, frame_description, frame_name, frame_elements):
    """Ask the model for the frame elements, their indexes and the trigger word of a sentence.

    The four arguments are substituted into a French instruction prompt. The
    rendered prompt is printed, sent through ``prompt | model | parser`` (using
    the notebook-level ``model`` and ``parser``), and the reply is printed and
    returned.

    Args:
        sent: Sentence to analyse.
        frame_description: Description of the semantic frame.
        frame_name: Name of the frame (the action) to look for.
        frame_elements: Frame-element categories the model may pick from.

    Returns:
        The value produced by the chain for this prompt.
    """
    # Every value stays a template placeholder and is filled in by the prompt
    # template itself. Formatting the frame fields into the text *before*
    # PromptTemplate parses it would turn any "{" or "}" they contain into
    # bogus template variables.
    # The prompt text starts directly with the instruction: no stray leading
    # quote character.
    template_fr = (
        """
    Vous êtes un expert en linguistique, rôles sémantiques et sémantique de cadres (cadre sémantique = semantic frame, a kind of event). Un élément de cadre (Frame element) est un rôle sémantique, par example: Agent, Récipient, Instrument, Lieu, Cause. Votre travail comprend 3 tâches à faire sur la phrase "{sentence}": 1, extraire les éléments de l'action "{frame_name}" (description = "{frame_description}") qui sont présents dans la phrase seulement parmi les catégories d'éléments suivantes "{frame_elements}"; 2, indiquez la position de début (start_index = starting index in the Sentence string) et de fin (end_index = ending index in the Sentence string) de chaque élément comme si c'était une entité nommée spaCy (['Lieu', 21, 33]) ; 3, extrayez le mot déclencheur (a noun or non-auxiliary verb in the sentence but never a preposition nor a grammar word, it can be simple or compound and it triggers the action related to the Frame_name "{frame_name}") en fonction de la description du cadre sémantique. Vous répondrez toujours en français. Si aucun élément de cadre sémantique n'est présent dans la phrase, répondez "Je ne sais pas".
    """
        "\nFormat de réponse attendu: Task1_FrameElements = ['element_category':'cited element from sentence', 'element2_category':'second element']. Task2_Indexes = [[21,25], [35, 74]]. Task3_TriggerWord = 'cited trigger word'"
    )

    inputs = {
        "sentence": sent,
        "frame_name": frame_name,
        "frame_description": frame_description,
        "frame_elements": frame_elements,
    }

    prompt = PromptTemplate.from_template(template_fr)
    # Show the exact prompt that is about to be sent.
    print(prompt.format(**inputs))

    # The input dict is fed straight to the prompt, so no itemgetter mapping
    # step is needed in front of it.
    chain = prompt | model | parser

    result = chain.invoke(inputs)
    print(result)
    return result


In [156]:
# Function to apply the model_frames function to each row in the dataframe df_frames
def extract_frame_elements(df):
    """Run ``model_frames`` on every row of ``df`` and return a copy holding the replies.

    For each row, the 'Sentence', 'FrameDefinition', 'Frame' and
    'FrameElements' values are passed to ``model_frames``. Every reply is
    appended to ``output/resultFrames.txt`` as a tab-separated
    ``index, Frame, reply`` line and stored, as a string, in the 'Reponse'
    column of the returned dataframe.

    Args:
        df: Dataframe with the four columns listed above.

    Returns:
        A copy of ``df`` with a 'Reponse' column; ``df`` itself is not modified.
    """
    log_path = 'output/resultFrames.txt'
    # Opening in append mode does not create missing folders, so make sure the
    # target folder exists first.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    responses = []
    # Explicit UTF-8: the prompt asks for French replies, which may contain
    # accented characters the platform default encoding cannot represent.
    with open(log_path, 'a', encoding='utf-8') as f:
        for index, row in df.iterrows():
            result = str(
                model_frames(row['Sentence'], row['FrameDefinition'], row['Frame'], row['FrameElements'])
            )

            # NOTE(review): a reply containing tabs or newlines spans several
            # fields/lines in this log; it is written unchanged.
            f.write(f"{index}\t{row['Frame']}\t{result}\n")
            responses.append(result)

    # Assign by position in one go: a per-label `.at[index, ...]` write would
    # overwrite every row sharing a duplicated index label.
    df_results = df.copy()
    df_results['Reponse'] = responses
    return df_results

# Apply the extract_frame_elements function to df_frames
df_results = extract_frame_elements(df_frames)

# Export df_results to Excel with headers
df_results.to_excel("df_frame_results.xlsx", index=False)

# Preview the results: only the last expression of a cell is displayed, so the
# preview has to come after the export.
df_results.head()