In [None]:
!pip install pycoingecko requests tiktoken cohere openai pinecone-client langchain apify-client

### 1. IMPORT LIBRARY

In [None]:
import os
import requests
import json
from datetime import datetime

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders.base import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain.utilities import ApifyWrapper
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Vectara
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser

### 2. KEY SETTINGS

In [None]:
from getpass import getpass

# Credentials are never written into the notebook. Each one is taken from the
# process environment when it is already set there; otherwise it is requested
# interactively with getpass, so the value is not echoed or saved in the notebook.
# For non-interactive runs, export these variables before starting the kernel.
REQUIRED_CREDENTIALS = (
    "OPENAI_API_KEY",
    "VECTARA_CUSTOMER_ID",
    "VECTARA_CORPUS_ID",
    "VECTARA_API_KEY",
    "APIFY_API_TOKEN",
)

for credential_name in REQUIRED_CREDENTIALS:
    current_value = os.environ.get(credential_name, "")
    # A value equal to the variable's own name is the placeholder this cell used
    # to assign, not a real credential, so it is treated as "not set".
    if not current_value or current_value == credential_name:
        os.environ[credential_name] = getpass(f"{credential_name}: ")

## 3. Import Data

### 3. Data Source: APIFY Google Search Result

In [None]:
# Search query string; it is passed as the "queries" field of the Apify
# google-search-scraper input further down.
queries = "semiconductor forecast gartner"

In [None]:
from apify_client import ApifyClient

# Client authenticated with the token stored in the APIFY_API_TOKEN environment variable.
client = ApifyClient(os.getenv("APIFY_API_TOKEN"))

# Input for the Google search scraper Actor: maxPagesPerQuery=1 and resultsPerPage=100.
# customDataFunction is a JavaScript snippet handed to the Actor as a string; the
# snippet returns the text of each page's <title> element as `pageTitle`.
run_input = {
    "queries": queries,
    "maxPagesPerQuery": 1,
    "resultsPerPage": 100,
    "customDataFunction": """async ({ input, $, request, response, html }) => {
  return {
    pageTitle: $('title').text(),
  };
};""",
}

# Run the Actor and wait for it to finish.
run = client.actor("apify/google-search-scraper").call(run_input=run_input)
if run is None:
    # call() may return None instead of a run object; fail with a clear message
    # rather than a TypeError on the subscript below.
    raise RuntimeError("Apify Actor 'apify/google-search-scraper' did not return a run object")

# Iterator over the items of the run's default dataset. Nothing is printed here;
# the iterator is consumed in a later cell.
loader = client.dataset(run["defaultDatasetId"]).iterate_items()

In [None]:
# Drain the dataset iterator into a list so its items can be indexed afterwards.
temp = [dataset_item for dataset_item in loader]
results = []

print(temp)

In [None]:
# Gather the organic results of every dataset item (not only the first one), and
# tolerate an empty dataset or an item without an 'organicResults' entry.
# With a single item this yields the same results as temp[0]['organicResults'].
data = [
    result
    for page in temp
    for result in (page.get("organicResults") or [])
]

In [None]:
# Only results dated in this year are kept.
TARGET_YEAR = 2023


def _published_year(item):
    """Return the year of the item's 'date' field, or None when it is missing or unparseable.

    The date is expected to be an ISO-8601 string. A trailing "Z" (UTC designator)
    is rewritten as "+00:00" because datetime.fromisoformat() rejects "Z" before
    Python 3.11; strings without a trailing "Z" are parsed unchanged instead of
    having their last character chopped off.
    """
    raw_date = item.get("date")
    if not isinstance(raw_date, str) or not raw_date:
        return None
    if raw_date.endswith("Z"):
        raw_date = raw_date[:-1] + "+00:00"
    try:
        return datetime.fromisoformat(raw_date).year
    except ValueError:
        # Treat an unparseable date like a missing one: the item is filtered out.
        return None


filtered_data = [item for item in data if _published_year(item) == TARGET_YEAR]

In [None]:
# Keep just the 'url' field of every entry in filtered_data.
urls = [entry['url'] for entry in filtered_data]

### APIFY Google Search Result

In [None]:
# Show the collected URLs as this cell's output.
urls

### 4. INITIALIZE VECTARA

In [None]:
# Vectara client built from the three VECTARA_* environment variables.
vectara = Vectara(
    vectara_customer_id=os.environ.get("VECTARA_CUSTOMER_ID"),
    vectara_corpus_id=os.environ.get("VECTARA_CORPUS_ID"),
    vectara_api_key=os.environ.get("VECTARA_API_KEY"),
)

# Google Search Result [APIFY] - Web Contents Crawler [APIFY] - CHATGPT - VECTARA

### 5. APIFY Web Contents Crawling

In [None]:
def _item_to_document(item):
    """Turn one crawler dataset item into a Document; a falsy 'text' becomes ""."""
    return Document(
        page_content=item["text"] or "",
        metadata={"source": item["url"]},
    )


# LangChain's Apify wrapper, created with its default arguments.
apify = ApifyWrapper()

# One start-URL entry per collected search-result link.
startUrls = [dict(url=link) for link in urls]

# Input for the website content crawler Actor.
run_input = dict(
    startUrls=startUrls,
    includeUrlGlobs=[],
    excludeUrlGlobs=[],
    initialCookies=[],
    proxyConfiguration={ "useApifyProxy": True },
    removeElementsCssSelector="""nav, footer, script, style, noscript, svg,
[role=\"alert\"],
[role=\"banner\"],
[role=\"dialog\"],
[role=\"alertdialog\"],
[role=\"region\"][aria-label*=\"skip\" i],
[aria-modal=\"true\"]""",
    clickElementsCssSelector="[aria-expanded=\"false\"]",
)

# Run the Actor; each dataset item is mapped to a Document by _item_to_document.
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input=run_input,
    dataset_mapping_function=_item_to_document,
)

In [None]:
# Chat model used by the notebook: gpt-3.5-turbo-16k with temperature 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)

In [None]:
# Load the crawl results through the loader returned by apify.call_actor.
# Presumably a list of Documents built by the dataset mapping function — TODO confirm.
documents = loader.load()

### 6. VECTARA DATA 저장

In [None]:
# Store the crawled documents in Vectara and keep the resulting vector store.
# NOTE(review): embedding=None is passed explicitly and the `vectara` client created
# earlier is not used here; presumably credentials come from the VECTARA_*
# environment variables — confirm.
vectorstore = Vectara.from_documents(documents, embedding=None)

In [None]:
# Conversation buffer stored under the "chat_history" key, returned as message objects.
# NOTE(review): the retrieval chain built later in this notebook does not reference it.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [None]:
# Retriever over the Vectara vector store.
# NOTE(review): confirm that lambda_val / k / filter passed directly as keyword
# arguments are honored by the installed LangChain version (some versions may
# expect them inside search_kwargs) — TODO verify.
retriever = vectorstore.as_retriever(lambda_val=0.025, k=5, filter=None)

In [None]:
# This rebinding of `llm` used to drop temperature=0 from the earlier "LLM Setting"
# cell, silently switching the chain to the default sampling temperature.
# Keep the same configuration so both definitions agree.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

In [None]:
# Prompt for the retrieval chain. It has two placeholders: {context} (the retrieved
# search results) and {question} (the user's question). Doubled braces such as
# {{number}} are escaped and come out as literal single braces after formatting.
# The context block is closed with a proper </context> tag (it was the malformed
# "<context/>").
template = """
SYSTEM
You are an expert researcher and writer, tasked with answering any question.
Generate a comprehensive and informative, yet concise answer of 250 words or less for the given question based solely on the provided search results (URL and content).
You must only use information from the provided search results. Use an unbiased and journalistic tone. Combine search results together into a coherent answer.
Do not repeat text. Cite search results using [${{number}}] notation. Only cite the most relevant results that answer the question accurately.
Place these citations at the end of the sentence or paragraph that reference them - do not put them all at the end.
If different results refer to different entities within the same name, write separate answers for each entity.
If you want to cite multiple results for the same sentence, format it as `[${{number1}}] [${{number2}}]`.
However, you should NEVER do this with the same number - if you want to cite `number1` multiple times for a sentence, only do `[${{number1}}]` not `[${{number1}}] [${{number1}}]`
You should use bullet points in your answer for readability. Put citations where they apply rather than putting them all at the end.
If there is nothing in the context relevant to the question at hand, just say "Hmm, I'm not sure." Don't try to make up an answer.
Anything between the following `context` html blocks is retrieved from a knowledge bank, not part of the conversation with the user.
You must answer in Korean.

<context>
    {context}
</context>

HUMAN
{question}
  """
prompt = ChatPromptTemplate.from_template(template)

In [None]:
def format_docs(docs):
    """Render retrieved documents as numbered "[n] <source>" entries followed by their text.

    The prompt asks the model to cite results by number and to rely on "URL and
    content"; numbering the entries here gives those citations something to refer
    to, instead of inserting the raw list of Document objects into the prompt.
    """
    return "\n\n".join(
        f"[{position}] {doc.metadata.get('source', '')}\n{doc.page_content}"
        for position, doc in enumerate(docs, start=1)
    )


# The input question is routed two ways: through the retriever (then formatted into
# the numbered context) and unchanged as {question}. The filled prompt goes to the
# chat model, and the reply is reduced to a plain string.
retrieval_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the chain with a Korean question; in English, roughly: "Based on Gartner news,
# I would like to know the state and outlook of the semiconductor market in 2024."
retrieval_chain.invoke("가트너 뉴스 기준으로 2024년 반도체 시장 현황 및 전망이 알고싶어요")