In [36]:
import itertools
import os

import pandas as pd

from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [37]:
# Load the three per-period workbooks (before / during / after COVID), in that
# order. Later cells read the 'cluster' and '초록(국문)' columns of each table.
before_covid_umap_df, covid_umap_df, after_covid_umap_df = (
    pd.read_excel(workbook)
    for workbook in (
        'before_covid_umap_df(20) (2).xlsx',
        'covid_umap_df(20) (2).xlsx',
        'after_covid_umap_df(20) (2).xlsx',
    )
)

In [38]:
# Chat model shared by the map and reduce chains; temperature=0 keeps the
# summaries as repeatable as the API allows.
# The API key is read from the OPENAI_API_KEY environment variable so that no
# credential (or empty placeholder) is stored in the notebook. If the variable
# is not set, this cell fails immediately with a KeyError.
llm = ChatOpenAI(
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [39]:
# Map-step prompt: the documents are substituted for {docs} and the model is
# asked to name, in Korean, the technology/field themes that run through them
# rather than merely listing their content.
map_template = """The following is a set of documents
{docs}
Based on this list of docs, you should identify the main themes in Korean.
Instead of simply listing the content, please provide the overarching themes based on the technologies and fields that run through each docs.
Helpful Answer:"""

prompt_template = PromptTemplate.from_template(map_template)

# LLM call used for the map step of the map-reduce chain.
map_chain = LLMChain(prompt=prompt_template, llm=llm)

In [40]:
# Reduce-step prompt: the Korean summaries are substituted for {docs} and the
# model is asked to condense them into at most three Korean sentences.
reduce_template = """The following is set of summaries in Korean:
{docs}
Take these and distill it into a final, consolidated summary of the main themes three sentences or less in Korean. 
Helpful Answer:"""

reduce_prompt = PromptTemplate.from_template(template=reduce_template)

In [41]:
# LLM call for the reduce step, driven by reduce_prompt.
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

# Joins a list of documents into one string, places it in the prompt's
# "docs" variable, and runs reduce_chain on the result.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="docs",
    llm_chain=reduce_chain,
)

# Combines and iteratively reduces the mapped documents. The same stuffing
# chain serves both as the final combine step and as the collapse step used
# when the documents exceed the context for a single StuffDocumentsChain call.
# token_max is not set here, so the library default applies.
reduce_documents_chain = ReduceDocumentsChain(
    collapse_documents_chain=combine_documents_chain,
    combine_documents_chain=combine_documents_chain,
)

In [42]:
# Full pipeline: map_chain is applied over the documents, then the mapped
# outputs are merged by reduce_documents_chain. Documents are injected into the
# map prompt through its "docs" variable. Intermediate map outputs are not
# returned; only the final combined text is.
map_reduce_chain = MapReduceDocumentsChain(
    document_variable_name="docs",
    llm_chain=map_chain,
    reduce_documents_chain=reduce_documents_chain,
    return_intermediate_steps=False,
)


In [43]:
# Splitter for the abstracts: chunk size 1000 with an overlap of 20, both
# measured with len() (i.e. in characters). add_start_index=True is requested
# so each chunk carries its start offset in the source text.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    length_function=len,
    chunk_overlap=20,
    chunk_size=1000,
)

In [44]:
# Period tables in processing order: before, during, and after COVID.
df_lst = [before_covid_umap_df, covid_umap_df, after_covid_umap_df]

# Summaries produced with the current prompts: one separate, initially empty
# list per table in df_lst, at the same index.
sum_up_lst2 = [[] for _ in df_lst]

In [45]:
# Summarise every cluster of every period table with the map-reduce chain and
# store the results in sum_up_lst2[idx], one entry per cluster in ascending
# cluster-id order.
for idx, period_df in enumerate(df_lst):
    # Empty the target list first so that re-running this cell replaces the
    # previous summaries instead of appending duplicates.
    sum_up_lst2[idx].clear()

    # Enumerate the cluster ids that are actually present instead of counting
    # up from 0 until the first missing id: a gap in the labels no longer
    # silently drops every later cluster. Negative labels are skipped, as they
    # were by the original counter that started at 0.
    cluster_ids = sorted(
        label for label in period_df['cluster'].dropna().unique() if label >= 0
    )

    for cluster_num in cluster_ids:
        # Missing abstracts (NaN) are dropped and the rest coerced to str,
        # because the text splitter only accepts strings.
        abstracts = (
            period_df.loc[period_df['cluster'] == cluster_num, '초록(국문)']
            .dropna()
            .astype(str)
            .tolist()
        )
        split_docs = text_splitter.create_documents(abstracts)

        if not split_docs:
            # Nothing to summarise for this cluster: record an empty summary
            # (keeping one entry per cluster) without calling the LLM.
            sum_up_lst2[idx].append('')
            continue

        result = map_reduce_chain.run(split_docs)
        sum_up_lst2[idx].append(result)

In [None]:
# Display all collected summaries: one inner list per table in df_lst.
sum_up_lst2

In [None]:
# Display the summaries for df_lst[0] (before_covid_umap_df).
sum_up_lst2[0]

In [48]:
# Display the first stored summary for df_lst[0] (before_covid_umap_df).
sum_up_lst2[0][0]

'주어진 문서를 기반으로 한국어의 주요 테마는 다음과 같습니다:\n\n1. 고부가 신규상품 개발에 따른 브랜드인지도 향상\n2. 총생산(GRDP) 증대\n3. 창업 및 기존 기업의 매출증대로 인해 총생산 증대\n4. 3차 산업의 활성화 토대\n5. 역내 제품 고부가가치화를 통한 고급시장 선점 확대\n6. 창업 기업 증가로 인한 고용 및 세수 증대\n7. 지역 인견 산업의 세계적 명품화에 따른 인구유입증가 효과\n\n이를 요약하면 다음과 같습니다:\n고부가 신규상품 개발을 통한 브랜드인지도 향상, 총생산 증대, 창업 및 기업 매출증대로 인한 총생산 증대, 3차 산업 활성화, 역내 제품 고부가가치화를 통한 고급시장 선점 확대, 창업 기업 증가로 인한 고용 및 세수 증대, 지역 인견 산업의 세계적 명품화에 따른 인구유입증가 효과.'

In [None]:
# Display the summaries for df_lst[1] (covid_umap_df).
sum_up_lst2[1]

In [None]:
# Display the summaries for df_lst[2] (after_covid_umap_df).
sum_up_lst2[2]