In [None]:
import itertools
import os

import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate

from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

In [None]:
# Load the three per-period workbooks (before / during / after COVID), in that order.
before_covid_umap_df, covid_umap_df, after_covid_umap_df = (
    pd.read_excel(workbook)
    for workbook in (
        'before_covid_umap_df(20) (2).xlsx',
        'covid_umap_df(20) (2).xlsx',
        'after_covid_umap_df(20) (2).xlsx',
    )
)

In [None]:
# Chat model shared by the map and reduce chains.
# The API key is read from the OPENAI_API_KEY environment variable so that no
# credential is ever typed into (and saved with) the notebook; a missing
# variable fails immediately with a KeyError naming it.
llm = ChatOpenAI(
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [None]:
# Prompt for the map step: each chunk of abstracts is bound to {docs} and the
# model is asked for the overarching themes, in Korean, in at most three sentences.
map_template = """The following is a set of documents
{docs}
Based on this list of docs, you should identify the main themes in Korean.
Instead of simply listing the content, please provide the overarching themes based on the technologies and fields that run through each docs in three sentences or less.

This is a sample answer : 주어진 문서는 초지 이용 기술 개발과 체험관광형 목장 이용 모델 설정을 통해 초지의 부가가치 창출과 방문객 만족도 향상을 도모하고자 하는 연구 내용을 담고 있습니다.

Helpful Answer:"""

prompt_template = PromptTemplate.from_template(map_template)

# LLM call applied to every chunk during the map step.
map_chain = LLMChain(prompt=prompt_template, llm=llm)

In [None]:
# Prompt for the reduce step: the per-chunk summaries are bound to {docs} and
# condensed into one final theme summary, again in Korean.
reduce_template = """The following is set of summaries in Korean:
{docs}
Take these and distill it into a final, consolidated summary of the main themes in Korean.
Instead of simply listing the content, please provide the overarching themes based on the technologies and fields that run through each docs in three sentences or less.

This is a sample answer : 주어진 문서는 초지 이용 기술 개발과 체험관광형 목장 이용 모델 설정을 통해 초지의 부가가치 창출과 방문객 만족도 향상을 도모하고자 하는 연구 내용을 담고 있습니다.

Helpful Answer:"""

reduce_prompt = PromptTemplate.from_template(template=reduce_template)

In [None]:
# LLM call that runs the reduce prompt.
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

# Places the incoming summaries into the prompt's {docs} slot and runs reduce_chain.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="docs",
    llm_chain=reduce_chain,
)

# The same stuffing chain is used both as the final combine step and as the
# collapse step (per the original note: for when the documents exceed the
# context of StuffDocumentsChain). token_max, the maximum number of tokens per
# document group, is not passed, so the library default applies.
reduce_documents_chain = ReduceDocumentsChain(
    collapse_documents_chain=combine_documents_chain,
    combine_documents_chain=combine_documents_chain,
)

In [None]:
# Full pipeline: map_chain is applied to every document, then
# reduce_documents_chain combines the per-document results.
map_reduce_chain = MapReduceDocumentsChain(
    reduce_documents_chain=reduce_documents_chain,
    llm_chain=map_chain,
    # Name of the prompt variable the document text is bound to.
    document_variable_name="docs",
    # Do not include the map-step results in the output.
    return_intermediate_steps=False,
)


In [None]:
# Splitter for the abstracts; sizes are measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    length_function=len,
    chunk_overlap=20,
    chunk_size=1000,
)

In [None]:
# Period frames in processing order: before, during and after COVID.
df_lst = [before_covid_umap_df, covid_umap_df, after_covid_umap_df]

# One independent (initially empty) list of summaries per period frame.
sum_up_lst2 = [[] for _ in range(3)]

In [None]:
# Summarise every cluster of every period frame.
# sum_up_lst2[idx][k] ends up holding the summary of cluster k of df_lst[idx].
for idx, frame in enumerate(df_lst):
    # Empty this period's slot first so re-running the cell does not append a
    # second copy of every summary; results are still stored one by one, so
    # partial progress survives a failure part-way through.
    sum_up_lst2[idx].clear()

    # Distinct labels, computed once instead of rescanning the whole column
    # on every pass of the loop below.
    cluster_ids = frame['cluster'].unique()

    # Walk the labels 0, 1, 2, ... and stop at the first one that is absent.
    # NOTE(review): labels after a gap, and negative labels, are never
    # summarised — confirm the labels are contiguous from 0.
    cluster_num = 0
    while cluster_num in cluster_ids:
        abstracts = frame.loc[frame['cluster'] == cluster_num, '초록(국문)']
        # Empty cells are read as NaN (a float), which the text splitter
        # cannot handle, so rows without an abstract are left out.
        cluster = abstracts.dropna().astype(str).tolist()
        split_docs = text_splitter.create_documents(cluster)

        result = map_reduce_chain.run(split_docs)
        sum_up_lst2[idx].append(result)

        cluster_num += 1

In [None]:
# Spot-check: second summary stored for the first frame in df_lst (before_covid_umap_df).
sum_up_lst2[0][1]

In [None]:
# All summaries stored for the third frame in df_lst (after_covid_umap_df).
sum_up_lst2[2]

In [None]:
# Load the workbook that is summarised per cluster in the following cell.
all_df = pd.read_excel('all_df (2).xlsx')

In [None]:


all_lst = [all_df]
all_result = []  # all_result[k] holds the summary of cluster k

# The positional index from enumerate() was never used, so iterate the frames directly.
for frame in all_lst:
    # Distinct labels, computed once instead of rescanning the whole column
    # on every pass of the loop below.
    cluster_ids = frame['cluster'].unique()

    # Walk the labels 0, 1, 2, ... and stop at the first one that is absent.
    # NOTE(review): labels after a gap, and negative labels, are never
    # summarised — confirm the labels are contiguous from 0.
    cluster_num = 0
    while cluster_num in cluster_ids:
        abstracts = frame.loc[frame['cluster'] == cluster_num, '초록(국문)']
        # Empty cells are read as NaN (a float), which the text splitter
        # cannot handle, so rows without an abstract are left out.
        cluster = abstracts.dropna().astype(str).tolist()
        split_docs = text_splitter.create_documents(cluster)

        result = map_reduce_chain.run(split_docs)
        all_result.append(result)

        cluster_num += 1
        

In [None]:
# Display every summary produced from all_df, in the order the clusters were processed.
all_result

In [None]:
# Number of summaries produced, i.e. how many clusters the loop processed.
len(all_result)