# Step 1 - Install Dependencies

In [None]:
!pip install pycoingecko requests tiktoken cohere openai langchain apify-client langchainhub pymupdf lark

In [None]:
import os

# Register every service credential in the process environment in one call.
# Replace each placeholder with a real secret before running the notebook.
os.environ.update(
    {
        # Apify (Google search scraper and website content crawler)
        "APIFY_API_TOKEN": "YOUR_APIFY_API_TOKEN",
        # OpenAI chat model
        "OPENAI_API_KEY": "YOUR_OPENAI_API_KEY",
        # Vectara corpus
        "VECTARA_CUSTOMER_ID": "YOUR_VECTARA_CUSTOMER_ID",
        "VECTARA_CORPUS_ID": "YOUR_VECTARA_CORPUS_ID",
        "VECTARA_API_KEY": "YOUR_VECTARA_API_KEY",
    }
)

In [None]:
import numpy as np
import datetime as dt
import openai
import re
import time

# Step 2 - Import Data

## Data Source 1/3: APIFY Google Search Result

In [None]:
# Search phrase sent to the Google search scraper; it is also stored as the
# "keyword" document metadata when the crawled pages are indexed.
keyword_queries = "South Korean semiconductor challenges"

In [None]:
from apify_client import ApifyClient

# Authenticate against Apify with the token exported to the environment.
client = ApifyClient(os.getenv("APIFY_API_TOKEN"))

# Actor input: the keyword query, one page per query with resultsPerPage=100,
# and a custom page function that records each page's <title> text.
run_input = dict(
    queries=keyword_queries,
    maxPagesPerQuery=1,
    resultsPerPage=100,
    customDataFunction=(
        "async ({ input, $, request, response, html }) => {\n"
        "  return {\n"
        "    pageTitle: $('title').text(),\n"
        "  };\n"
        "};"
    ),
)

# Start the Google search scraper Actor and wait for the run to finish.
run = client.actor("apify/google-search-scraper").call(run_input=run_input)

# Obtain an iterator over the items stored in the run's default dataset.
loader = client.dataset(run["defaultDatasetId"]).iterate_items()

In [None]:
# Materialise the dataset iterator so its items can be indexed repeatedly.
temp = list(loader)
results = []

print(temp)

In [None]:
# The 'organicResults' list of the first dataset item.
# NOTE(review): raises IndexError when the scraper returned no items.
data = temp[0]['organicResults']
data  # last expression of the cell: shown as the notebook output

In [None]:
# Dates of the organic results that actually carry a 'date' key.
dates = [
    hit['date']
    for hit in temp[0]['organicResults']
    if 'date' in hit
]
print(dates)

In [None]:
from datetime import datetime

In [None]:
# Keep only the search results dated 2023.
# - Entries whose 'date' is missing, None or empty are skipped instead of
#   crashing on the slice/parse.
# - Only a trailing 'Z' suffix is stripped before parsing (older
#   datetime.fromisoformat versions reject it); timestamps without that suffix
#   are no longer truncated by a blind [:-1].
filtered_data = [
    item
    for item in data
    if item.get('date')
    and datetime.fromisoformat(item['date'].rstrip('Z')).year == 2023
]

In [None]:
# Split the filtered hits into parallel lists of page URLs and dates.
urls = [hit['url'] for hit in filtered_data]
dates = [hit['date'] for hit in filtered_data]

In [None]:
# Show the URL list and the date list, one per line.
print(urls, dates, sep="\n")

In [None]:
# Smallest date among the filtered results; the dates are compared as strings.
# NOTE(review): min() raises ValueError when no result passed the 2023 filter.
min_date = min(dates)
print(min_date)

## Data Source 2/3: APIFY Web Contents Crawling

In [None]:
from langchain.utilities import ApifyWrapper
from langchain_core.documents.base import Document
import os

# Create the LangChain Apify wrapper. No token is passed explicitly here;
# presumably it is read from the APIFY_API_TOKEN environment variable -- confirm.
apify = ApifyWrapper()

# One start-URL entry per search-result URL collected earlier.
startUrls = [{"url": url} for url in urls]

#print(startUrls)

# Prepare the Actor input: crawl the start URLs through the Apify proxy,
# drop the page elements matched by the CSS selector below, and click
# elements that are marked aria-expanded="false".
run_input = {
    "startUrls": startUrls,
    "includeUrlGlobs": [],
    "excludeUrlGlobs": [],
    "initialCookies": [],
    "proxyConfiguration": { "useApifyProxy": True },
    "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
[role=\"alert\"],
[role=\"banner\"],
[role=\"dialog\"],
[role=\"alertdialog\"],
[role=\"region\"][aria-label*=\"skip\" i],
[aria-modal=\"true\"]""",
    "clickElementsCssSelector": "[aria-expanded=\"false\"]",
}

# Run the Actor and wait for it to finish. Each dataset item is mapped to a
# Document: the page text (or "" when the text is falsy) becomes the content
# and the page URL is stored as the "source" metadata.
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input=run_input,
    dataset_mapping_function=lambda item: Document(
        page_content=item["text"] or "", metadata={"source": item["url"]}
    ),
)

## Data Source 3/3: Vectara Initialization

In [None]:
from langchain.vectorstores import Vectara

# Handle to the Vectara corpus, configured from the environment variables.
vectara = Vectara(
    vectara_customer_id=os.environ.get("VECTARA_CUSTOMER_ID"),
    vectara_corpus_id=os.environ.get("VECTARA_CORPUS_ID"),
    vectara_api_key=os.environ.get("VECTARA_API_KEY"),
)

In [None]:
# Load the crawled pages as documents from the loader returned by the crawler call.
documents = loader.load()

In [None]:
# Index the crawled documents in Vectara, tagging them with the category,
# the search keyword and the earliest result date as document metadata.
# NOTE(review): the result is bound to `vectera` (sic), not `vectara`; the
# later cells keep querying the `vectara` handle created earlier.
vectera = Vectara.from_documents(
    documents,
    embedding=None,
    doc_metadata=dict(
        category="knowledgebase",
        keyword=keyword_queries,
        date=min_date,
    ),
)

# Step 3 - Vectara 리트리버 및 Chain&Agent 생성

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.agents import AgentType
from langchain.agents import initialize_agent, Tool

## Vectara 리트리버 생성

### Vectara Metadata Filter 테스트

In [None]:
query = "반도체시장 전망에 대해 알려주세요."
selected_keywords = ['Semiconductor industry outlook']

# Build the quoted, comma-separated value list used inside the IN clause when
# several keywords are selected.
keywords_str = ', '.join(f"'{kw}'" for kw in selected_keywords)

# Compose the metadata filter expression.
filters = f"doc.keyword IN ({keywords_str})"
# Print the expression that was just built; printing `filter` would show the
# Python builtin instead of this string.
print(filters)

In [None]:
# Single-keyword case: collapse the keyword list into one plain string.
# NOTE(review): with several keywords this joins them with spaces.
keyword_string = ' '.join(selected_keywords)

In [None]:
# Experiment: a Vectara metadata filter using IN did not work.
# NOTE(review): the expression below has no parenthesised value list, unlike
# the `filters` string built in the earlier cell -- confirm the IN syntax
# Vectara expects.
found_docs = vectara.similarity_search(
    query, n_sentence_context=0, filter="doc.keyword IN 'Semiconductor'"
)

# Alternative using the generated IN expression:
# found_docs = vectara.similarity_search(
#     query, n_sentence_context=0, filter=filters
# )

found_docs  # last expression of the cell: shown as the notebook output

In [None]:
# Experiment: a Vectara metadata filter using "=" is applied as expected.
query="2024년 반도체시장 전망에 대해 알려주세요."
# Equality filter on the single keyword string.
filters = f"doc.keyword = '{keyword_string}'"

found_docs = vectara.similarity_search(
    query, n_sentence_context=0, filter=filters
)

# Variant with a hard-coded filter and k=10:
# found_docs = vectara.similarity_search(
#     query, k=10, n_sentence_context=0, filter="doc.keyword = 'Semiconductor industry outlook'"
# )

In [None]:
# Text of the first returned hit (IndexError when the search returned nothing).
print(found_docs[0].page_content)

In [None]:
# Dump the content and metadata of every hit, followed by two blank lines.
for hit in found_docs:
    print(f"Page Content: {hit.page_content}\nMetadata: {hit.metadata}\n\n")

In [None]:
# Retriever restricted to documents whose `keyword` metadata matches.
# The filter must be a plain string expression; wrapping it in braces builds
# a one-element set, which is not a filter expression.
search_kwargs = {
    "filter": "doc.keyword = 'Semiconductor industry outlook'",
}
retriever = vectara.as_retriever(search_type="similarity", search_kwargs=search_kwargs)

In [None]:
# Run a query through the retriever and show the related documents it returns.
query = "2024 반도체 시장 전망"
retriever.get_relevant_documents(query)

### RetrievalQA vectara 리트리버 filter 테스트

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# Set up the chat model (temperature 0, streaming enabled).
# Alternative model: ChatOpenAI(model_name="gpt-4-0613", temperature=0)
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0,
    streaming=True,
)

# The metadata filter is a plain string expression; wrapping it in braces
# builds a one-element set instead of a filter string.
search_kwargs = {
    "filter": "doc.keyword = 'Semiconductor market trends'",
}

# Question-answering chain over a retriever that applies the filter above.
qa_filter = RetrievalQA.from_llm(
    llm=llm,
    retriever=vectara.as_retriever(search_type="similarity", search_kwargs=search_kwargs),
)
qa_filter({"query": "2024반도체 시장 전망"})

## 프롬프트 생성

In [None]:
# Prompt that asks the model to quote the passages relevant to a question.
question_prompt_template = (
    "Use the following portion of a long document to see if any of the text is relevant to answer the question.\n"
    "Return any relevant text verbatim.\n"
    "{context}\n"
    "Question: {question}\n"
    "Relevant text, if any:"
)

# The template is filled with the document portion and the user question.
QUESTION_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=question_prompt_template,
)

## RetrievalQAWithSources Chain 생성

In [None]:
# Options forwarded to the map_reduce chain: verbose output and the custom
# question prompt.
chain_type_kwargs = {"verbose": True, "question_prompt": QUESTION_PROMPT}

# Question answering with sources on top of the filtered Vectara retriever.
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)
response = qa({"question": "반도체 시장에서 우려되는 점 5가지", "verbose": True})

# Separator line, then the response keys, the answer and its sources.
print("*********************************")
print(response.keys())
print(response["answer"], response["sources"], sep="\n")

## RetrievalQA Chain 생성

In [None]:
# Create the RetrievalQA chain used as the agent's knowledge-base tool.
# It uses the "stuff" chain type with the custom QUESTION_PROMPT and reads the
# user question from the "query" input key.

knowledgeBase_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    input_key="query",
    verbose=True,
    chain_type_kwargs={
        "prompt": QUESTION_PROMPT
    },
)

In [None]:
# Ask the knowledge-base chain a question, requesting only the chain outputs.
query = "반도체 시장에서 우려되는 점 알려주세요."
response = knowledgeBase_qa({"query": query},return_only_outputs=True)

In [None]:
# Inspect which keys the chain response contains.
response.keys()

In [None]:
def run_qa_chain(query):
    """Run the knowledge-base QA chain and return its outputs as a string.

    Args:
        query: Question passed to the chain under the "query" key.

    Returns:
        ``str()`` of the mapping returned by ``knowledgeBase_qa`` when it is
        called with ``return_only_outputs=True``.
    """
    return str(knowledgeBase_qa({"query": query}, return_only_outputs=True))

## Tool 생성

In [None]:
# Tools available to the agent. Only the knowledge-base tool is registered;
# the Google search wrapper below is left disabled.
# search = GoogleSearchAPIWrapper()

tools = [
    Tool(
        name='Knowledge Base',
        func=run_qa_chain,
        description='''
            use this tool when answering general knowledge queries to get
            more information about the topic
            ''',
        # The tool output is returned to the caller as-is.
        return_direct=True,
    ),
]

## Agent Template 생성

In [None]:
from langchain.memory import ConversationBufferMemory
from langchain.prompts import MessagesPlaceholder

In [None]:
# Instruction template for the agent: consult the knowledge base first and
# answer in Korean. {memory} carries the chat history and {human_input} the
# latest user message.
agent_template = """You are an expert at extracting information from your knowledge base.
When searching for the first time, a search is performed in the knowledge base,
and if there are insufficient or no results, a Google search is performed and the results are displayed.
Be sure to answer in Korean!
{memory}
Human: {human_input}
Chatbot:"""

agent_prompt = PromptTemplate(input_variables=["memory", "human_input"], template=agent_template)

# Conversation history is stored under the "memory" key as message objects.
# NOTE(review): confirm that ConversationBufferMemory actually uses the
# `prompt` argument; it is kept here unchanged.
agent_memory = ConversationBufferMemory(memory_key="memory", prompt=agent_prompt, return_messages=True)

# The placeholder name must equal the memory_key above, otherwise the stored
# history can never be injected into the prompt.
# NOTE(review): confirm the agent type in use honours extra_prompt_messages.
agent_kwargs = {
    "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
}

## Initialize Agent

In [None]:
# Build the agent from the tool list, the chat model and the conversation
# memory defined in the earlier cells.
# NOTE(review): max_execution_time=1 looks very small for a run that calls an
# LLM and a retrieval tool -- confirm the intended unit and value.
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    agent_kwargs=agent_kwargs,
    memory=agent_memory,
    max_execution_time=1,
    early_stopping_method="generate",
)

In [None]:
# Ask the agent a question (Korean: "Tell me the concerns in the semiconductor market.").
agent.run("반도체 시장에서 우려되는 점 알려주세요.")

In [None]:
# Call the agent with an explicit input mapping and keep the full response.
response = agent({"input": "2024년 반도체 시장의 전망과 예측"})

In [None]:
# Fetch the documents the retriever returns for the same question.
source_documents = retriever.get_relevant_documents("2024년 반도체 시장의 전망과 예측")

In [None]:
# Show the metadata of every retrieved source document.
for document in source_documents:
    print(document.metadata)