In [1]:
# Credits : 
# https://github.com/lovelynrose/chatgpt/blob/main/mapreduce_keyword_extract.ipynb

In [2]:
from langchain import PromptTemplate
# from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain, StuffDocumentsChain
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.docstore.document import Document
import os
# import promptlayer
import pandas as pd

In [3]:
from pathlib import Path
import json

In [4]:
import langchain
langchain.__version__

'0.1.7'

# Parameters

In [5]:
%env OPENAI_API_BASE=https://api-vicuna.c0.ns1lab.net/v1

env: OPENAI_API_BASE=https://api-vicuna.c0.ns1lab.net/v1


In [6]:
%env OPENAI_API_KEY=""

env: OPENAI_API_KEY=""


In [7]:
%env LANGCHAIN_TRACING_V2=true

env: LANGCHAIN_TRACING_V2=true


In [8]:
%env LANGCHAIN_ENDPOINT=https://api.smith.langchain.com

env: LANGCHAIN_ENDPOINT=https://api.smith.langchain.com


In [9]:
# Never store the LangSmith key in the notebook: read it from the environment,
# or ask for it without echo so it ends up neither in the source nor in the output.
# NOTE(review): the key that used to be hard-coded in this cell must be considered
# leaked and should be revoked.
import os
from getpass import getpass

if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass("LangSmith API key: ")


In [10]:
%env LANGCHAIN_PROJECT=keyword-extraction

env: LANGCHAIN_PROJECT=keyword-extraction


In [11]:
# Chat model shared by the map and reduce chains, created with temperature 0.
# No endpoint or key is passed here; OPENAI_API_BASE / OPENAI_API_KEY are set
# by the %env cells above.
# Left disabled from an earlier attempt: model_kwargs={"batch_size": 1}
llm = ChatOpenAI(
    temperature=0,
)

In [12]:
# Root folder that is scanned (recursively) for the JSON documents.
# Defaults to the original location; set the IGA_BF_DATA_PATH environment
# variable to run the notebook against a different folder.
DATA_PATH = Path(os.environ.get("IGA_BF_DATA_PATH", "/iga-bf"))

In [13]:
# from langsmith import Client
# import os
# client = Client()
# try:
#     client.delete_project(project_name=os.environ['LANGCHAIN_PROJECT'])
# except:
#     pass

# Read Data in from json documents

The documents from which we want to extract acronyms are stored as JSON files under `DATA_PATH`. Each file is read into a list of records; the `content`, `title` and `author` fields of each record are used below.

In [14]:
# Load every JSON file found under DATA_PATH (searched recursively).
# The paths are sorted because glob order depends on the filesystem; a stable
# order keeps later positional slices (e.g. docs_splitted[0:10]) reproducible.
corpus = []
for json_path in sorted(DATA_PATH.glob('**/*.json')):
    with json_path.open(encoding="UTF-8") as source:
        corpus.append(json.load(source))

In [15]:
# Tabular view of the loaded JSON records.
# NOTE(review): the next cell reads the 'content', 'title' and 'author' columns --
# confirm that every JSON file provides these keys.
df = pd.DataFrame(corpus)

In [16]:
# Wrap each row in a LangChain Document: the text to analyse goes into
# page_content, while title and author are carried along as metadata.
# The metadata is converted to a plain dict explicitly rather than passing a
# pandas Series and relying on the Document model to coerce it.
docs = [
    Document(
        page_content=row['content'],
        metadata=row[['title', 'author']].to_dict(),
    )
    for _, row in df.iterrows()
]

# Map Prompt and Chain

The following prompt is used to develop the "map" step of the Map-Reduce chain. This prompt is run on each individual document chunk, and is used to extract the acronyms (together with their meaning) that appear in that chunk.

In [143]:
# --- Map step ---
# Instruction sent for each document on its own: the document text replaces
# {docs} and the model is asked (in French) to list every acronym with its meaning.
map_template = """Tu dois extraire les acronymes dans le texte délimité par ```
Sous la forme d'une liste qui contient l'acronyme et sa signification.

Les acronymes sont toujours entre () ou entre "" et en majuscule. Leur signification n'est souvent pas loin.

```{docs}```

"""
# Turn the raw template string into a PromptTemplate instance.
map_prompt = PromptTemplate.from_template(map_template)
# Chain pairing the map prompt with the shared LLM (verbose output enabled).
map_chain = LLMChain(
    prompt=map_prompt,
    llm=llm,
    verbose=True,
)

# Reduce Prompt and Chain

The following prompt is for the "reduce" step of the algorithm. It operates on the entire set of outputs produced by the "map" chain. In our example, the map chain yields a list of acronyms with their meanings for the chunk it was run on. These lists are then grouped together and passed to this reduce chain, which is designed to take the global output of the map step and reduce it to a final, cleaned list of unique acronyms with their meanings.

In [144]:
# --- Reduce step ---
# Prompt that receives the acronym lists produced by the map step (inserted in
# place of {doc_keywords}) and asks the model, in French, to drop entries whose
# initials do not match the acronym, normalise the format and remove duplicates.
# The three steps are numbered 1, 2, 3.
reduce_template = """Ce qui suit est une liste d'acronymes avec leur signification: 

{doc_keywords}

Réalise les étapes intermédiaires suivantes : 
1. Elimine la ligne si la première lettre de chacun des mots qui compose la signification ne correspond pas avec l'acronyme.
2. Nettoie la liste pour qu'elle soit sous la forme acronyme : signification.
3. Elimine les doublons

Puis retourne la liste
"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt, verbose=True)

# Takes a list of documents, combines them into a single string and passes it
# to reduce_chain through the "doc_keywords" prompt variable.
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name="doc_keywords",
    verbose=True,
)

# Combines and iteratively reduces the mapped documents.
reduce_documents_chain = ReduceDocumentsChain(
    # The output of this chain is the final result.
    combine_documents_chain=combine_documents_chain,
    # Used when the documents exceed the context for the stuff chain;
    # the same chain is reused for collapsing.
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=3000,
    verbose=True,
)

In [145]:
# Complete pipeline: apply map_chain to every document, then hand the mapped
# results to reduce_documents_chain to obtain the combined answer.
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain, applied to each document.
    llm_chain=map_chain,
    # Name of the variable in map_chain's prompt that receives the document.
    document_variable_name="docs",
    # Reduce chain, applied to the outputs of the map step.
    reduce_documents_chain=reduce_documents_chain,
    # Do not include the results of the individual map steps in the output.
    return_intermediate_steps=False,
    verbose=True,
)

# Run Map-Reduce Chain

In this step we run the actual MapReduceDocumentsChain. The documents read from the JSON files are first split into chunks; the "map" step is then performed on each chunk of the selected sample. Once the "map" step has completed on all of them, the final "reduce" step runs and outputs the final list of acronyms.

In [146]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks of up to 5000 characters, with no
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=5000,
)
docs_splitted = text_splitter.split_documents(documents=docs)

In [147]:
len(docs_splitted)

209

In [148]:
len(docs_splitted[0].page_content)

5000

In [151]:
# Run the map-reduce pipeline on the first 10 chunks only.
# The chunks are passed under the chain's "input_documents" key explicitly.
keywords = map_reduce_chain.invoke({"input_documents": docs_splitted[0:10]})



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mTu dois extraire les acronymes dans le texte délimité par ```
Sous la forme d'une liste qui contient l'acronyme et sa signification.

Les acronymes sont toujours entre () ou entre "" et en majuscule. Leur signification n'est souvent pas loin.

```BF2018-05-18017R -Eau potable en Guadeloupe.pub Les bonnes feuilles de l’IGA n° 2018‐05 | Juillet 2018 L e service public de l’eau potable en Guadeloupe est en situation de crise sévère, de caractère systémique, avec la multiplication des coupures, générant des risques pour la santé publique et des fortes contraintes économiques. Un plan d’action ambitieux est donc indispensable et urgent. La gravité de la situation appelle une solution forte et globale, avec des mesures devant être efficaces à court terme, mais devant également préserver l’avenir et préparer le retour à une situation normale A la lumière d

BadRequestError: Error code: 400 - {'object': 'error', 'message': 'Server disconnected', 'code': 50001}

In [None]:
print(keywords['output_text'])

In [1]:
# Credits : 
# https://github.com/lovelynrose/chatgpt/blob/main/mapreduce_keyword_extract.ipynb

In [2]:
from langchain import PromptTemplate
# from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain, StuffDocumentsChain
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.docstore.document import Document
import os
# import promptlayer
import pandas as pd

In [3]:
from pathlib import Path
import json

In [4]:
import langchain
langchain.__version__

'0.1.7'

# Parameters

In [5]:
%env OPENAI_API_BASE=https://api-vicuna.c0.ns1lab.net/v1

env: OPENAI_API_BASE=https://api-vicuna.c0.ns1lab.net/v1


In [6]:
%env OPENAI_API_KEY=""

env: OPENAI_API_KEY=""


In [7]:
%env LANGCHAIN_TRACING_V2=true

env: LANGCHAIN_TRACING_V2=true


In [8]:
%env LANGCHAIN_ENDPOINT=https://api.smith.langchain.com

env: LANGCHAIN_ENDPOINT=https://api.smith.langchain.com


In [9]:
# Never store the LangSmith key in the notebook: read it from the environment,
# or ask for it without echo so it ends up neither in the source nor in the output.
# NOTE(review): the key that used to be hard-coded in this cell must be considered
# leaked and should be revoked.
import os
from getpass import getpass

if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass("LangSmith API key: ")


In [10]:
%env LANGCHAIN_PROJECT=keyword-extraction

env: LANGCHAIN_PROJECT=keyword-extraction


In [11]:
# Chat model shared by the map and reduce chains, created with temperature 0.
# No endpoint or key is passed here; OPENAI_API_BASE / OPENAI_API_KEY are set
# by the %env cells above.
# Left disabled from an earlier attempt: model_kwargs={"batch_size": 1}
llm = ChatOpenAI(
    temperature=0,
)

In [12]:
# Root folder that is scanned (recursively) for the JSON documents.
# Defaults to the original location; set the IGA_BF_DATA_PATH environment
# variable to run the notebook against a different folder.
DATA_PATH = Path(os.environ.get("IGA_BF_DATA_PATH", "/iga-bf"))

In [13]:
# from langsmith import Client
# import os
# client = Client()
# try:
#     client.delete_project(project_name=os.environ['LANGCHAIN_PROJECT'])
# except:
#     pass

# Read Data in from json documents

The documents from which we want to extract acronyms are stored as JSON files under `DATA_PATH`. Each file is read into a list of records; the `content`, `title` and `author` fields of each record are used below.

In [14]:
# Load every JSON file found under DATA_PATH (searched recursively).
# The paths are sorted because glob order depends on the filesystem; a stable
# order keeps later positional slices (e.g. docs_splitted[0:5]) reproducible.
corpus = []
for json_path in sorted(DATA_PATH.glob('**/*.json')):
    with json_path.open(encoding="UTF-8") as source:
        corpus.append(json.load(source))

In [15]:
# Tabular view of the loaded JSON records.
# NOTE(review): the next cell reads the 'content', 'title' and 'author' columns --
# confirm that every JSON file provides these keys.
df = pd.DataFrame(corpus)

In [16]:
# Wrap each row in a LangChain Document: the text to analyse goes into
# page_content, while title and author are carried along as metadata.
# The metadata is converted to a plain dict explicitly rather than passing a
# pandas Series and relying on the Document model to coerce it.
docs = [
    Document(
        page_content=row['content'],
        metadata=row[['title', 'author']].to_dict(),
    )
    for _, row in df.iterrows()
]

# Map Prompt and Chain

The following prompt is used to develop the "map" step of the Map-Reduce chain. This prompt is run on each individual document chunk, and is used to extract the acronyms (together with their meaning) that appear in that chunk.

In [143]:
# --- Map step ---
# Instruction sent for each document on its own: the document text replaces
# {docs} and the model is asked (in French) to list every acronym with its meaning.
map_template = """Tu dois extraire les acronymes dans le texte délimité par ```
Sous la forme d'une liste qui contient l'acronyme et sa signification.

Les acronymes sont toujours entre () ou entre "" et en majuscule. Leur signification n'est souvent pas loin.

```{docs}```

"""
# Turn the raw template string into a PromptTemplate instance.
map_prompt = PromptTemplate.from_template(map_template)
# Chain pairing the map prompt with the shared LLM (verbose output enabled).
map_chain = LLMChain(
    prompt=map_prompt,
    llm=llm,
    verbose=True,
)

# Reduce Prompt and Chain

The following prompt is for the "reduce" step of the algorithm. It operates on the entire set of outputs produced by the "map" chain. In our example, the map chain yields a list of acronyms with their meanings for the chunk it was run on. These lists are then grouped together and passed to this reduce chain, which is designed to take the global output of the map step and reduce it to a final, cleaned list of unique acronyms with their meanings.

In [144]:
# --- Reduce step ---
# Prompt that receives the acronym lists produced by the map step (inserted in
# place of {doc_keywords}) and asks the model, in French, to drop entries whose
# initials do not match the acronym, normalise the format and remove duplicates.
# The three steps are numbered 1, 2, 3.
reduce_template = """Ce qui suit est une liste d'acronymes avec leur signification: 

{doc_keywords}

Réalise les étapes intermédiaires suivantes : 
1. Elimine la ligne si la première lettre de chacun des mots qui compose la signification ne correspond pas avec l'acronyme.
2. Nettoie la liste pour qu'elle soit sous la forme acronyme : signification.
3. Elimine les doublons

Puis retourne la liste
"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt, verbose=True)

# Takes a list of documents, combines them into a single string and passes it
# to reduce_chain through the "doc_keywords" prompt variable.
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name="doc_keywords",
    verbose=True,
)

# Combines and iteratively reduces the mapped documents.
reduce_documents_chain = ReduceDocumentsChain(
    # The output of this chain is the final result.
    combine_documents_chain=combine_documents_chain,
    # Used when the documents exceed the context for the stuff chain;
    # the same chain is reused for collapsing.
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=3000,
    verbose=True,
)

In [145]:
# Complete pipeline: apply map_chain to every document, then hand the mapped
# results to reduce_documents_chain to obtain the combined answer.
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain, applied to each document.
    llm_chain=map_chain,
    # Name of the variable in map_chain's prompt that receives the document.
    document_variable_name="docs",
    # Reduce chain, applied to the outputs of the map step.
    reduce_documents_chain=reduce_documents_chain,
    # Do not include the results of the individual map steps in the output.
    return_intermediate_steps=False,
    verbose=True,
)

# Run Map-Reduce Chain

In this step we run the actual MapReduceDocumentsChain. The documents read from the JSON files are first split into chunks; the "map" step is then performed on each chunk of the selected sample. Once the "map" step has completed on all of them, the final "reduce" step runs and outputs the final list of acronyms.

In [146]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks of up to 5000 characters, with no
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=5000,
)
docs_splitted = text_splitter.split_documents(documents=docs)

In [147]:
len(docs_splitted)

209

In [148]:
len(docs_splitted[0].page_content)

5000

In [None]:
# Run the map-reduce pipeline on the first 5 chunks only.
# The chunks are passed under the chain's "input_documents" key explicitly.
keywords = map_reduce_chain.invoke({"input_documents": docs_splitted[0:5]})



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mTu dois extraire les acronymes dans le texte délimité par ```
Sous la forme d'une liste qui contient l'acronyme et sa signification.

Les acronymes sont toujours entre () ou entre "" et en majuscule. Leur signification n'est souvent pas loin.

```BF2018-05-18017R -Eau potable en Guadeloupe.pub Les bonnes feuilles de l’IGA n° 2018‐05 | Juillet 2018 L e service public de l’eau potable en Guadeloupe est en situation de crise sévère, de caractère systémique, avec la multiplication des coupures, générant des risques pour la santé publique et des fortes contraintes économiques. Un plan d’action ambitieux est donc indispensable et urgent. La gravité de la situation appelle une solution forte et globale, avec des mesures devant être efficaces à court terme, mais devant également préserver l’avenir et préparer le retour à une situation normale A la lumière d

In [None]:
print(keywords['output_text'])