In [302]:
import pandas as pd
import pymongo
from transformers import ElectraTokenizer
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever
from haystack.nodes import TransformersReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers

In [318]:
class SyncStore:  # to be moved to news_DB
    """Mirror the MongoDB ``pension.news`` collection into Elasticsearch.

    Instantiating the class performs the whole sync: every news record is read
    from MongoDB into a DataFrame and the Elasticsearch ``document`` index is
    replaced with one document per usable article.

    Attributes:
        client: the open ``pymongo.MongoClient`` (closed in ``__del__``).
        mongo_data: the cursor returned by ``find()``; it has already been
            consumed by the DataFrame constructor.
        news_df: DataFrame holding the fetched news records.
    """

    def __init__(self):
        """Connect to the default local MongoDB, load the news and sync them."""
        self.client = pymongo.MongoClient()
        self.mongo_data = self.client['pension']['news'].find()
        self.news_df = pd.DataFrame(self.mongo_data)

        self.update_document_store(self.news_df)

    def __del__(self):
        # __init__ can fail before `client` is assigned (or the instance can be
        # created without __init__), so do not assume the attribute exists.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()

    def update_document_store(self, df):
        """Replace the contents of the Elasticsearch index with the rows of ``df``.

        Args:
            df: DataFrame with ``article``, ``title``, ``subject`` and ``link``
                columns. Rows whose ``article`` is missing, not a string, or
                blank are skipped.

        Raises:
            KeyError: if a non-empty ``df`` lacks one of the required columns.
                The existing index is left untouched in that case.
        """
        # Convert every row BEFORE touching Elasticsearch: if a row is malformed
        # we fail here and the previously indexed documents survive.
        news_list = []
        for record in df.to_dict('records'):
            article = record['article']
            if not isinstance(article, str):
                continue  # e.g. NaN/None for records without an article body
            article = article.strip()
            if not article:
                continue
            news_list.append({
                'content': article,
                'meta': {
                    'title': record['title'],
                    'subject': record['subject'],
                    'link': record['link'],
                },
            })

        # NOTE(review): connection credentials are hard-coded; consider reading
        # them from the environment before sharing this notebook.
        document_store = ElasticsearchDocumentStore(host='localhost', username='root', password='1111', index='document')
        document_store.delete_documents()
        document_store.write_documents(news_list)
        print('MongoDB - ElasticSearch 연동이 완료되었습니다.')
        
# Instantiating SyncStore runs the MongoDB -> Elasticsearch sync immediately,
# because its __init__ calls update_document_store().
sync = SyncStore()



MongoDB - ElasticSearch 연동이 완료되었습니다.


In [309]:
class QAmodel:  # prints only the answer to the question (one answer)
    """Extractive QA over the Elasticsearch ``document`` index, single answer.

    ``get_answer`` prompts for a question, runs a BM25 retriever and a
    KoELECTRA reader, and prints the sentence(s) that contain the best answer.
    """

    def build_QA_model(self):
        """Assemble the retriever + reader pipeline.

        Returns:
            An ``ExtractiveQAPipeline`` bound to the local ``document`` index.
        """
        # The reader is given its tokenizer by name below, so no separate
        # tokenizer object is created here (the one built previously was unused).
        # NOTE(review): connection credentials are hard-coded; consider reading
        # them from the environment before sharing this notebook.
        document_store = ElasticsearchDocumentStore(host='localhost', username='root', password='1111', index='document')
        retriever = BM25Retriever(document_store=document_store)
        reader = TransformersReader(model_name_or_path='monologg/koelectra-small-v2-distilled-korquad-384',
                                    tokenizer='monologg/koelectra-small-v2-discriminator',
                                    context_window_size=500,
                                    max_seq_len=500,
                                    doc_stride=300)

        return ExtractiveQAPipeline(reader, retriever)

    def input_question(self):
        """Prompt for a question and run it through the pipeline.

        Returns:
            The prediction returned by ``pipe.run``; its ``'answers'`` entry
            holds at most one answer.
        """
        query = input('질문을 입력하세요: ')
        # Build the pipeline once per instance instead of once per question.
        pipe = getattr(self, '_pipe', None)
        if pipe is None:
            pipe = self._pipe = self.build_QA_model()
        prediction = pipe.run(query=query,
                              params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 1}})

        return prediction

    @staticmethod
    def _sentences_around_answer(result, window=100):
        """Return the sentences of ``result.context`` that carry the answer.

        A sentence qualifies when it contains the answer text and lies entirely
        inside ``context[start - window : start + window]``, where ``start`` is
        the answer's first offset in the context. When no sentence qualifies
        (for example the sentence is longer than the window) the bare answer is
        returned so the caller always has something to show.
        """
        answer = result.answer
        if not answer:
            return []
        context = result.context
        offsets = result.offsets_in_context
        if not context or not offsets:
            return [answer]

        anchor = offsets[0].start
        cut = context[max(anchor - window, 0):anchor + window]
        matches = [line for line in context.split('. ')
                   if line in cut and answer in line]
        if not matches:
            return [answer]
        # The last fragment of a context keeps its own period; do not double it.
        return [line if line.endswith('.') else line + '.' for line in matches]

    def get_answer(self):
        """Ask for a question and print the sentence(s) containing the best answer."""
        prediction = self.input_question()
        answers = prediction.get('answers') or []
        sentences = self._sentences_around_answer(answers[0]) if answers else []
        if not sentences:
            print('답변을 찾지 못했습니다.')
            return

        for sentence in sentences:
            print(sentence, end=' ')

In [1]:
class NewsSearcher:  # provides answers to the question plus source article links (up to 3)
    """Extractive QA over the ``document`` index that also cites its sources.

    ``get_answer`` prompts for a question and prints, for each of the top
    answers coming from a distinct article, the sentence(s) containing the
    answer followed by the article title and link.
    """

    # Number of answers requested from the reader and shown to the user.
    TOP_K = 3

    def build_QA_model(self):
        """Assemble the retriever + reader pipeline.

        Returns:
            An ``ExtractiveQAPipeline`` bound to the local ``document`` index.
        """
        # The reader is given its tokenizer by name below, so no separate
        # tokenizer object is created here (the one built previously was unused).
        # NOTE(review): connection credentials are hard-coded; consider reading
        # them from the environment before sharing this notebook.
        document_store = ElasticsearchDocumentStore(host='localhost', username='root', password='1111', index='document')
        retriever = BM25Retriever(document_store=document_store)
        reader = TransformersReader(model_name_or_path='monologg/koelectra-small-v2-distilled-korquad-384',
                                    tokenizer='monologg/koelectra-small-v2-discriminator',
                                    context_window_size=500,
                                    max_seq_len=500,
                                    doc_stride=300)

        return ExtractiveQAPipeline(reader, retriever)

    def input_question(self):
        """Prompt for a question and run it through the pipeline.

        Returns:
            The prediction returned by ``pipe.run``; its ``'answers'`` entry
            holds at most ``TOP_K`` answers.
        """
        query = input('질문을 입력하세요: ')
        # Build the pipeline once per instance instead of once per question.
        pipe = getattr(self, '_pipe', None)
        if pipe is None:
            pipe = self._pipe = self.build_QA_model()
        prediction = pipe.run(query=query,
                              params={"Retriever": {"top_k": 10}, "Reader": {"top_k": self.TOP_K}})

        return prediction

    @staticmethod
    def _sentences_around_answer(result, window=100):
        """Return the sentences of ``result.context`` that carry the answer.

        A sentence qualifies when it contains the answer text and lies entirely
        inside ``context[start - window : start + window]``, where ``start`` is
        the answer's first offset in the context. When no sentence qualifies
        (for example the sentence is longer than the window) the bare answer is
        returned so the caller always has something to show.
        """
        answer = result.answer
        if not answer:
            return []
        context = result.context
        offsets = result.offsets_in_context
        if not context or not offsets:
            return [answer]

        anchor = offsets[0].start
        cut = context[max(anchor - window, 0):anchor + window]
        matches = [line for line in context.split('. ')
                   if line in cut and answer in line]
        if not matches:
            return [answer]
        # The last fragment of a context keeps its own period; do not double it.
        return [line if line.endswith('.') else line + '.' for line in matches]

    def get_answer(self):
        """Ask for a question and print each distinct-source answer with its article."""
        prediction = self.input_question()
        answers = prediction.get('answers') or []
        seen_links = set()

        # Iterate over what the reader actually returned; it may be fewer than TOP_K.
        for result in answers[:self.TOP_K]:
            sentences = self._sentences_around_answer(result)
            if not sentences:
                continue

            title = result.meta['title']
            link = result.meta['link']
            if link in seen_links:
                continue  # only show one answer per article

            print('=====추천 답변=====')
            for sentence in sentences:
                print(sentence, end=' ')

            print()
            print(title)
            print(link)
            seen_links.add(link)

        if not seen_links:
            print('답변을 찾지 못했습니다.')

In [312]:
# QAmodel defines no __init__, so this only creates the object; the pipeline
# is built later, when a question is asked.
qa = QAmodel()

In [314]:
# Prompts for a question on stdin and prints the sentence(s) containing the top answer.
qa.get_answer()

질문을 입력하세요: 20대를 위한 연금 상품이 있나요?


INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0


20대를 위한 TDF2055도 나왔다. 

In [325]:
# NewsSearcher defines no __init__, so this only creates the object; the
# pipeline is built later, when a question is asked.
ns = NewsSearcher()

In [326]:
# Prompts for a question on stdin and prints each answer followed by the
# title and link of the article it came from.
ns.get_answer()

질문을 입력하세요: 연금저축 세제 한도는 얼마인가요?


INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0


=====추천 답변=====
그런데 연금저축의 세액공제 한도는 연간 400만원이다. 
7월 디폴트옵션 도입되는 퇴직연금… 연금 백만장자 나올까
https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=004&oid=009&aid=0004915568
