In [None]:
!pip install newsapi-python requests tiktoken cohere openai pinecone-client langchain playwright beautifulsoup4 html2text apify-client

### 1. IMPORT LIBRARY

In [None]:
import os
import requests
import json

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders.base import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain.utilities import ApifyWrapper
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Vectara
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

### 2. KEY SETTINGS

In [None]:
# Service credentials are read from environment variables.
# setdefault() keeps any value that is already exported, so real secrets
# configured outside the notebook are not overwritten. The string on the right
# is only a placeholder fallback: replace it (or export the variable before
# starting Jupyter) or the corresponding service call will fail to authenticate.
os.environ.setdefault("NEWS_API_KEY",        "NEWS_API_KEY")
os.environ.setdefault("OPENAI_API_KEY",      "OPENAI_API_KEY")
os.environ.setdefault("VECTARA_CUSTOMER_ID", "VECTARA_CUSTOMER_ID")
os.environ.setdefault("VECTARA_CORPUS_ID",   "VECTARA_CORPUS_ID")
os.environ.setdefault("VECTARA_API_KEY",     "VECTARA_API_KEY")
# ApifyWrapper() is created later without arguments, so its token has to come
# from the environment as well.
os.environ.setdefault("APIFY_API_TOKEN",     "APIFY_API_TOKEN")

# HTTP headers for NewsAPI requests: the API key is sent as the Authorization header.
headers = {'Authorization': os.getenv("NEWS_API_KEY")}

### 3. NEWSAPI Running

In [None]:
# URL of the NewsAPI "everything" endpoint (the only endpoint defined here).
# It ends with "?"; the query parameters are supplied separately when the
# request is made.
everything = "https://newsapi.org/v2/everything?"

In [None]:
# English-language (US) search query: "semiconductor" combined with any one
# of the market-outlook terms below.
_outlook_terms_en = ('market', 'forecast', 'outlook', 'insight', 'predict', 'status')
keywords_en = 'semiconductor AND (' + ' OR '.join(_outlook_terms_en) + ')'

In [None]:
# Korean-language search query: "반도체" (semiconductor) combined with any one
# of the market-outlook terms below.
_outlook_terms_kr = ('시장', '예측', '전망', '현황')
keywords_kr = '반도체 AND (' + ' OR '.join(_outlook_terms_kr) + ')'

In [None]:
# Candidate news sources.
# NOTE(review): this list is not passed to the request parameters in this
# notebook, and the mixed capitalisation suggests the identifiers were never
# checked against the API — confirm before using it.
sources = [
    'Engadget',
    'Techmeme',
    'Slashdot',
    'financial-post',
    'google-news',
    'reuters',
    'nbc-news',
    'techcrunch',
    'the-wall-street-journal',
]

In [None]:
# Domains to search ("specify links"). "LINKS" looks like a placeholder to be
# replaced with real domain names — TODO confirm.
# NOTE(review): `domains` is not passed to the request parameters in this notebook.
domains = ["LINKS"]

In [None]:
#excludeDomains = ['biztoc.com','techmeme.com']

In [None]:
# Sort order sent with the request as 'sortBy'
# (options noted by the author: relevancy, popularity).
sortby = "relevancy"

In [None]:
# Comma-separated article fields, sent with the request parameters.
searchin = 'title,description'

In [None]:
# Query-string parameters for the NewsAPI /v2/everything request.
# - The API key is deliberately not repeated here: it is already sent in the
#   Authorization header (`headers`), which keeps it out of the request URL
#   and therefore out of logs and `response.url`.
# - The field-restriction parameter is spelled `searchIn` in the NewsAPI docs.
# Optional filter, disabled by default:
#'excludeDomains':excludeDomains,
params = {
    'q': keywords_en,
    'sortBy': sortby,
    'searchIn': searchin,
}

In [None]:
# Query NewsAPI. requests has no default timeout, so one is set explicitly to
# stop the cell from hanging forever if the server never answers.
response = requests.get(url=everything, headers=headers, params=params, timeout=30)

In [None]:
# Decode the JSON body of the response.
output = response.json()

In [None]:
# Collect the URL of every returned article.
# When the payload has no "articles" entry (e.g. the API answered with an
# error), fail with the payload itself instead of a bare KeyError, so the
# reason is visible.
if "articles" not in output:
    raise RuntimeError(f"NewsAPI response contains no articles: {output!r}")
article_urls = [article["url"] for article in output["articles"]]

### 3-1. NEWSAPI Result

In [None]:
# Display the collected article URLs.
article_urls

### 4. INITIALIZE VECTARA

In [None]:
# Vectara client configured from the three VECTARA_* environment variables.
_vectara_settings = {
    "vectara_customer_id": os.getenv("VECTARA_CUSTOMER_ID"),
    "vectara_corpus_id": os.getenv("VECTARA_CORPUS_ID"),
    "vectara_api_key": os.getenv("VECTARA_API_KEY"),
}
vectara = Vectara(**_vectara_settings)

#NEWSAPI - APIFY - CHATGPT - VECTARA

### 5. APIFY Web Contents Crawling

In [None]:
# Crawl the article pages with the "apify/website-content-crawler" actor.
# NOTE(review): ApifyWrapper() gets no arguments here; it presumably picks up
# the Apify token from the environment — confirm.
apify = ApifyWrapper()

# Only the first 10 article URLs are crawled.
urls = article_urls[:10]
startUrls = [dict(url=link) for link in urls]

# CSS selector passed as the actor's "removeElementsCssSelector" input:
# one comma-separated selector list, one entry per line.
_REMOVE_ELEMENTS_SELECTOR = ",\n".join([
    'nav, footer, script, style, noscript, svg',
    '[role="alert"]',
    '[role="banner"]',
    '[role="dialog"]',
    '[role="alertdialog"]',
    '[role="region"][aria-label*="skip" i]',
    '[aria-modal="true"]',
])


def _to_document(item):
    """Build a Document from one dataset item.

    The item's "text" becomes the page content ("" when it is None or empty)
    and its "url" is stored under the "source" metadata key.
    """
    return Document(page_content=item["text"] or "", metadata={"source": item["url"]})


# Input for the actor run.
run_input = {
    "startUrls": startUrls,
    "includeUrlGlobs": [],
    "excludeUrlGlobs": [],
    "initialCookies": [],
    "proxyConfiguration": {"useApifyProxy": True},
    "removeElementsCssSelector": _REMOVE_ELEMENTS_SELECTOR,
    "clickElementsCssSelector": '[aria-expanded="false"]',
}

# Run the actor; the returned loader maps each dataset item through _to_document.
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input=run_input,
    dataset_mapping_function=_to_document,
)

In [None]:
# Load the crawled pages through the loader returned by the Apify actor call.
documents = loader.load()

### 6. VECTARA DATA 저장

In [None]:
# Build a Vectara vector store from the crawled documents. No embedding object
# is supplied (embedding=None) — presumably Vectara embeds the text on its
# side; confirm against the integration docs.
vectorstore = Vectara.from_documents(documents, embedding=None)

In [None]:
# Conversation memory keyed as "chat_history", configured to return messages.
# NOTE(review): `memory` is not used by the retrieval chain defined in this notebook.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [None]:
# Retriever over the Vectara store, requested with lambda_val=0.025, k=5 and no filter.
# NOTE(review): presumably k is the number of results and lambda_val a
# hybrid-search weight — confirm against the Vectara integration docs.
retriever = vectorstore.as_retriever(lambda_val=0.025, k=5, filter=None)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

In [None]:
# System instructions for the answering model. `{context}` receives the
# retrieved search results; doubled braces (`{{number}}`) are escapes that
# render as literal braces in the final prompt.
template = """
You are an expert researcher and writer, tasked with answering any question.
Generate a comprehensive and informative, yet concise answer of 250 words or less for the given question based solely on the provided search results (URL and content).
You must only use information from the provided search results. Use an unbiased and journalistic tone. Combine search results together into a coherent answer.
Do not repeat text. Cite search results using [${{number}}] notation. Only cite the most relevant results that answer the question accurately.
Place these citations at the end of the sentence or paragraph that reference them - do not put them all at the end.
If different results refer to different entities within the same name, write separate answers for each entity.
If you want to cite multiple results for the same sentence, format it as `[${{number1}}] [${{number2}}]`.
However, you should NEVER do this with the same number - if you want to cite `number1` multiple times for a sentence, only do `[${{number1}}]` not `[${{number1}}] [${{number1}}]`
You should use bullet points in your answer for readability. Put citations where they apply rather than putting them all at the end.
If there is nothing in the context relevant to the question at hand, just say "Hmm, I'm not sure." Don't try to make up an answer.
Anything between the following `context` html blocks is retrieved from a knowledge bank, not part of the conversation with the user.
You must answer in Korean.

<context>
    {context}
</context>
"""

# The instructions go in a real system message and the user's question in a
# human message, instead of one human message containing the literal words
# "SYSTEM" and "HUMAN". The prompt still takes the same two inputs:
# `context` and `question`.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{question}"),
])

In [None]:
# OpenAI chat model used to write the answer, configured with temperature 0.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

In [None]:
def _format_docs(docs):
    """Render retrieved documents as a numbered listing for the prompt.

    Each document becomes "[n] <source>" followed by its text, numbered from 1,
    so the "[n]" citations requested by the prompt refer to something concrete.
    Without this step the raw list of Document objects would be interpolated
    into the prompt via str(). The source falls back to "" when a document's
    metadata has no "source" entry.
    """
    return "\n\n".join(
        f"[{n}] {doc.metadata.get('source', '')}\n{doc.page_content}"
        for n, doc in enumerate(docs, start=1)
    )


# Question -> retrieve documents -> format as numbered context -> prompt ->
# chat model -> plain string answer. The question itself is passed through
# unchanged to the prompt's `question` input.
retrieval_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask the chain an English question about the silicon/wafer market.
retrieval_chain.invoke("What about the silicon/wafer market?")

In [None]:
# Ask in Korean (roughly: "I want to know the current state of the global
# semiconductor industry").
retrieval_chain.invoke("세계 반도체 산업의 현황을 알고싶다.")

In [None]:
# Ask in English about the current status of the global semiconductor industry.
retrieval_chain.invoke("I want to know the current status of the global semiconductor industry.")