In [1]:
import os
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY_4")
azure_openai_api_endpoint = os.getenv("AZURE_OPENAI_API_ENDPOINT_4")
deployment_name = os.getenv("AZURE_DEPLOYMENT_NAME_4")

In [2]:
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain_community.document_loaders.dataframe import DataFrameLoader
# from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
from IPython.display import display, Markdown
from langchain_openai import AzureOpenAIEmbeddings

In [3]:
llm = AzureChatOpenAI(api_key=azure_openai_api_key,
                        api_version="2023-12-01-preview",
                        azure_endpoint=azure_openai_api_endpoint,
                        model=deployment_name,
                        temperature=0
                        )

  llm = AzureChatOpenAI(api_key=azure_openai_api_key,


In [33]:
import pandas as pd

df = pd.read_csv('gutenberg_ebooks.csv')
df.head(1)

Unnamed: 0,Ebook ID,Author,Title,Credits,Summary,Language,LoC Class,Subject,Subject_2,Subject_3,Subject_4,Category,EBook-No.,Release Date,Most Recently Updated,Copyright Status,Downloads
0,8600,"Zola, Émile, 1840-1902",L'Assommoir,"John Bickers, Dagny and David Widger","""L'Assommoir"" by Émile Zola is a novel written...",English,PQ: Language and Literatures: Romance literatu...,Domestic fiction,Married women -- Fiction,Paris (France) -- Fiction,Working class women -- Fiction,Text,8600,"Apr 27, 2006","Sep 15, 2022",Public domain in the USA.,455 downloads in the last 30 days.


In [34]:
df['Subjects'] = df[['Subject', 'Subject_2', 'Subject_3', 'Subject_4']].apply(lambda x: ', '.join(x.dropna()), axis=1)
df = df.drop(columns=['Subject', 'Subject_2', 'Subject_3', 'Subject_4'])
df = df.rename(columns={'LoC Class': 'Library of Congress Classification'})
len(df)

19929

In [35]:
df = df[~df['Summary'].isna()]
len(df)

19110

In [36]:
df = df[df['Language']=='English']
len(df)

16585

In [37]:
df = df[df['Author']!='Various']
len(df)

15527

In [38]:
df.reset_index(inplace=True)
df.tail(1)

Unnamed: 0,index,Ebook ID,Author,Title,Credits,Summary,Language,Library of Congress Classification,Category,EBook-No.,Release Date,Most Recently Updated,Copyright Status,Downloads,Subjects
15526,19927,19999,"Trowbridge, J. T. (John Townsend), 1827-1916",The Drummer Boy,Produced by David Edwards and the Online Distr...,"""The Drummer Boy"" by J. T. Trowbridge is a his...",English,PZ: Language and Literatures: Juvenile belles ...,Text,19999,"Dec 3, 2006",,Public domain in the USA.,76 downloads in the last 30 days.,"United States -- History -- Civil War, 1861-18..."


In [39]:
df = df.iloc[:2000]
df .to_csv('gutenberg_ebooks_modified.csv')

In [4]:
loader = CSVLoader(file_path='gutenberg_ebooks_modified.csv', encoding='utf-8')
documents = loader.load()
print(len(documents))
documents[0]

2000


Document(metadata={'source': 'gutenberg_ebooks_modified.csv', 'row': 0}, page_content=': 0\nindex: 0\nEbook ID: 8600\nAuthor: Zola, Émile, 1840-1902\nTitle: L\'Assommoir\nCredits: John Bickers, Dagny and David Widger\nSummary: "L\'Assommoir" by Émile Zola is a novel written during the late 19th century, an era characterized by the realism movement in literature. The book explores the struggles of Gervaise, a laundress trying to build a life for herself and her children amidst the oppressive and often brutal conditions of working-class Paris. The story highlights themes of poverty, domestic strife, and the impact of alcoholism on individuals and families.  The opening of "L\'Assommoir" presents Gervaise in a state of despair, anxiously awaiting the return of Lantier, her partner, who has been increasingly unfaithful and irresponsible. The narrative depicts her emotional turmoil as she reflects on their meager living conditions and the struggles of single motherhood. Gervaise\'s observat

In [6]:
embeddings = AzureOpenAIEmbeddings(openai_api_key=azure_openai_api_key,
                                    azure_deployment='text-embedding-3-large',
                                    azure_endpoint=azure_openai_api_endpoint,
                                    openai_api_version="2023-05-15",
                                    chunk_size=512
)


vectorstore = FAISS.from_documents(documents, embeddings)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x7f41ed2c8f70>

In [48]:
vectorstore.save_local('faiss_vector_store')

In [53]:
vectorstore = FAISS.load_local('faiss_vector_store', embeddings=embeddings, allow_dangerous_deserialization=True)
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.1})
docs = retriever.invoke('When did the author of "L\'assommoir" die?')
docs

[Document(metadata={'source': 'gutenberg_ebooks_modified.csv', 'row': 0}, page_content=': 0\nindex: 0\nEbook ID: 8600\nAuthor: Zola, Émile, 1840-1902\nTitle: L\'Assommoir\nCredits: John Bickers, Dagny and David Widger\nSummary: "L\'Assommoir" by Émile Zola is a novel written during the late 19th century, an era characterized by the realism movement in literature. The book explores the struggles of Gervaise, a laundress trying to build a life for herself and her children amidst the oppressive and often brutal conditions of working-class Paris. The story highlights themes of poverty, domestic strife, and the impact of alcoholism on individuals and families.  The opening of "L\'Assommoir" presents Gervaise in a state of despair, anxiously awaiting the return of Lantier, her partner, who has been increasingly unfaithful and irresponsible. The narrative depicts her emotional turmoil as she reflects on their meager living conditions and the struggles of single motherhood. Gervaise\'s observa

In [54]:
prompt_template = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.

Question: {question}

Context: {context}

Answer:
"""

In [61]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate


prompt = ChatPromptTemplate.from_template(prompt_template)

# Pour faire une string contenant tous les documents retournés par le retriever, séparés par 2 retours à la ligne
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

qa_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(), # la question sera parsée comme le premier argument lors de l'invocation
    }
    | prompt
    | llm
    | StrOutputParser()
)

response = qa_chain.invoke('What are the main characters of "L\'Assommoir"?')

In [60]:
df.iloc[0]['Summary']

'"L\'Assommoir" by Émile Zola is a novel written during the late 19th century, an era characterized by the realism movement in literature. The book explores the struggles of Gervaise, a laundress trying to build a life for herself and her children amidst the oppressive and often brutal conditions of working-class Paris. The story highlights themes of poverty, domestic strife, and the impact of alcoholism on individuals and families.  The opening of "L\'Assommoir" presents Gervaise in a state of despair, anxiously awaiting the return of Lantier, her partner, who has been increasingly unfaithful and irresponsible. The narrative depicts her emotional turmoil as she reflects on their meager living conditions and the struggles of single motherhood. Gervaise\'s observations of the bustling street below reveal the harsh realities faced by workers in the city, and through her interactions with Lantier, the text illustrates the complexities of love, frustration, and survival in a difficult soci

In [None]:
df.iloc[0]

In [62]:
response

'The main characters of "L\'Assommoir" are Gervaise, a laundress, and Lantier, her partner.'