In [None]:
!pip install cohere tiktoken langchainhub langchain openai chromadb apify-client

# Apify Google Search results scraper

In [None]:
import os
from getpass import getpass

# Do not overwrite a token that is already exported in the environment, and do
# not store the secret in the notebook: ask for it only when it is missing.
if not os.environ.get("APIFY_API_TOKEN"):
    os.environ["APIFY_API_TOKEN"] = getpass("Apify API token: ")

In [None]:
from apify_client import ApifyClient

# Create the Apify API client, authenticating with the token held in the environment
client = ApifyClient(os.environ.get("APIFY_API_TOKEN"))

# Input handed to the Actor: the search query, the paging options, and a
# JavaScript hook that returns the text of the page's <title> element.
run_input = dict(
    queries="2024년 한국 반도체 시장의 전망",
    maxPagesPerQuery=1,
    resultsPerPage=100,
    customDataFunction="""async ({ input, $, request, response, html }) => {
  return {
    pageTitle: $('title').text(),
  };
};""",
)

# Start the Google Search scraper Actor and wait for the run to finish
search_actor = client.actor("apify/google-search-scraper")
run = search_actor.call(run_input=run_input)

# Print every item stored in the run's default dataset (if there are any)
results_dataset = client.dataset(run["defaultDatasetId"])
for item in results_dataset.iterate_items():
    print(item)

In [None]:
# Preview up to five items stored in the run's default dataset.
client.dataset(run["defaultDatasetId"]).list_items(limit=5).items

In [None]:
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain

In [None]:
# News article pages that are loaded and indexed further down in the notebook.
url_list = [
    "https://www.digitaltoday.co.kr/news/articleView.html?idxno=487479",
    "https://m.ddaily.co.kr/page/view/2023091211081680819",
    "https://news.mt.co.kr/mtview.php?no=2023101214124899784",
    "https://www.thelec.kr/news/articleView.html?idxno=23442",
    "https://www.the-stock.kr/news/articleView.html?idxno=18909",
    "https://zdnet.co.kr/view/?no=20231115083845",
]

In [None]:
# One loader configured with every URL in url_list.
loader = WebBaseLoader(web_path=url_list)

In [None]:
# Load each URL with its own loader so that one unreachable page does not
# discard the pages that did load, and so that `data` is always defined for
# the cells below.
data = []
for url in url_list:
    try:
        data.extend(WebBaseLoader(url).load())
    except Exception as exc:
        # Best effort: skip the failing page, but report it instead of hiding it.
        print(f"Skipping {url}: {exc}")

In [None]:
# Number of documents that were loaded.
len(data)

In [None]:
# Character count of the first loaded document's text.
len(data[0].page_content)

In [None]:
import re

# Regex-based helper that takes newline and tab characters out of a string
def remove_newline_tab(text):
    """Return ``text`` with newlines and tabs turned into single spaces.

    Each run of consecutive newline/tab characters is replaced by one space
    instead of being deleted, so the words on either side of a line break
    stay separated. Whitespace left at either end of the result is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string stays empty.
    """
    return re.sub(r'[\n\t]+', ' ', text).strip()

def create_new_doc():
    """Clean the text of every document in the notebook-level ``data`` list.

    Each document's ``page_content`` is passed through ``remove_newline_tab``
    and the result is written back to the same document, so ``data`` is
    modified in place. Nothing is returned.
    """
    for doc in data:
        cleaned_text = remove_newline_tab(doc.page_content)
        doc.page_content = cleaned_text


In [None]:
# Apply the cleanup to every document in `data` (modified in place).
create_new_doc()

In [None]:
# Display the documents held in `data`.
data

In [None]:
import os
from getpass import getpass

# Keep secrets out of the notebook: reuse a key that is already exported and
# prompt for it only when it is missing, instead of overwriting it with a
# placeholder string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# No cell visible in this notebook reads SERPAPI_API_KEY; the placeholder is
# kept only as a fallback so a real key in the environment is never clobbered.
os.environ.setdefault("SERPAPI_API_KEY", "YOUR SERPAPI_KEY")

In [None]:
# LLM
from langchain.chat_models import ChatOpenAI

# Chat model used by the question-answering chain.
# Alternative model: ChatOpenAI(model_name="gpt-4-0613", temperature=0)
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-1106",
    temperature=0,
)

In [None]:
# Vectorstore
# Chroma store persisted under ./chroma_wonik_testxxxx, using OpenAI embeddings.
# NOTE(review): the later cells visible here query `db`, not `vectordb` - confirm it is still needed.
vectordb = Chroma(
    persist_directory="./chroma_wonik_testxxxx",
    embedding_function=OpenAIEmbeddings(),
)

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# CharacterTextSplitter only cuts on "\n\n" by default, and the cleanup step has
# already taken every newline out of the pages, so it would return each page as
# one oversized chunk and chunk_size would have no effect.
# RecursiveCharacterTextSplitter falls back to finer separators (single
# newlines, spaces, then characters), so the 800/200 limits are honoured.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
texts = text_splitter.split_documents(data)


In [None]:
# Embed the chunks with OpenAI and store them in a Chroma DB
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(documents=texts, embedding=embeddings)

In [None]:
# Question that is searched for in the vector store
query = "2024년 한국 반도체 업계 전망"
# Fetch the two most similar chunks
docs = db.similarity_search(query=query, k=2)

In [None]:
# Question-answering chain of type "stuff", driven by the chat model `llm`.
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
# Run the QA chain on the retrieved documents, using the search query as the question.
chain.run(input_documents=docs, question=query)