In [None]:
!pip install apify-client chromadb cohere langchainhub openai langchain pypdf faiss-cpu cohere unstructured_pytesseract unstructured_inference tiktoken

In [None]:
import os

# Fill in a placeholder only when the variable is not already set, so real
# credentials exported before the kernel started are never overwritten by the
# placeholder text. Replace the second argument with a real key (or export the
# variable beforehand) before running the cells below.
# NOTE: a value already present in the environment takes precedence over the
# one written here.
os.environ.setdefault("OPENAI_API_KEY", "OPENAI_API_KEY")
os.environ.setdefault("APIFY_API_TOKEN", "APIFY_API_TOKEN")
os.environ.setdefault("GOOGLE_API_KEY", "GOOGLE_API_KEY")
os.environ.setdefault("GOOGLE_CSE_ID", "GOOGLE_CSE_ID")

# LLM / Embeddings

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models.openai import ChatOpenAI

# OpenAI clients shared by the cells below: an embedding model for the vector
# store and a chat model (temperature 0) for question answering.
embeddings = OpenAIEmbeddings()
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k",
    temperature=0,
)

# Chroma DB

In [None]:
from langchain.vectorstores import Chroma

# Chroma vector store using the shared embedding model and a local persist directory.
# NOTE(review): `vectorstroe` looks like a typo for `vectorstore`; the name is
# left unchanged because other cells may refer to it.
vectorstroe = Chroma(
    persist_directory="./chroma_sample33334444344444",
    embedding_function=embeddings,
)

# APIFY Google Search Result Scraper

In [None]:
from apify_client import ApifyClient

# Search URL
def search_using_apify(api_key, query):
    """Run Apify's Google Search scraper actor and return the organic result URLs.

    Args:
        api_key: Apify API token used to construct the ``ApifyClient``.
        query: Search text passed to the actor as its ``queries`` input.

    Returns:
        list: The ``url`` of every entry listed under ``organicResults`` in the
        dataset items produced by the run, in dataset order. The list is empty
        when the run produced no items or no organic results (previously this
        raised ``IndexError``).
    """
    client = ApifyClient(api_key)
    run_input = {
        "queries": query,
        "maxPagesPerQuery": 1,
        "resultsPerPage": 10,
    }
    run = client.actor('apify/google-search-scraper').call(run_input=run_input)

    results = []
    # Walk every dataset item instead of only the first one, so an empty
    # dataset yields [] and additional items are not silently dropped.
    for page in client.dataset(run["defaultDatasetId"]).iterate_items():
        for hit in page.get("organicResults") or []:
            url = hit.get("url")
            if url:  # skip entries that carry no URL rather than raising KeyError
                results.append(url)
    return results

In [None]:
# Inputs for the Apify search: the search phrase and the API token read from
# the environment (None when the variable is unset).
query = "2024년 반도체 시장 동향."
api_key = os.environ.get("APIFY_API_TOKEN")

# WebBaseLoader

In [None]:
from langchain.document_loaders import WebBaseLoader

# Resolve the search phrase into result URLs, then build a single loader that
# covers all of those pages.
web_urls = search_using_apify(api_key, query)
loader = WebBaseLoader(web_path=web_urls)

In [None]:
# Display the URLs returned by the search so they can be checked before loading.
web_urls

In [None]:
# Load each search result on its own, so one unreachable or malformed page does
# not discard all the others, and so `data` is always defined for the cells
# below even when some (or all) downloads fail.
data = []
failed_urls = []
for url in web_urls:
    try:
        data.extend(WebBaseLoader(url).load())
    except Exception as e:  # best-effort scrape: report the failure and keep going
        failed_urls.append(url)
        print("*****************EXCEPTION*****************")
        print(f"{url}: {e}")

if failed_urls:
    print(f"Skipped {len(failed_urls)} of {len(web_urls)} URLs")

In [None]:
# Sanity check: number of loaded pages, then the character count of the first one.
print(len(data))
first_page = data[0]
print(len(first_page.page_content))

In [None]:
import re

# Remove (or replace) newline and tab characters using a regular expression.
def remove_newline_tab(text, replacement=''):
    r"""Return ``text`` with every run of newline/tab characters replaced.

    Args:
        text: The string to clean.
        replacement: Literal text substituted for each run of consecutive
            ``\n`` / ``\t`` characters. The default ``''`` deletes them, which
            is the original behavior; pass ``' '`` to keep words that were on
            separate lines from being glued together.

    Returns:
        str: The cleaned string.
    """
    # A callable replacement keeps `replacement` literal (no backslash or
    # group-reference processing by re.sub).
    return re.sub(r'[\n\t]+', lambda _match: replacement, text)

def create_new_doc(documents=None):
    """Strip newlines and tabs from each document's ``page_content``, in place.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string
            attribute. When omitted, the notebook-level ``data`` list is used,
            which is what the original zero-argument call did.

    Returns:
        None. The documents are modified in place.
    """
    if documents is None:
        documents = data
    # Apply the regex-based cleanup to the page_content of every document.
    for document in documents:
        document.page_content = remove_newline_tab(document.page_content)

In [None]:
# Strip newlines and tabs from the page_content of every loaded document (in place).
create_new_doc()

# LangChain

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# The cleaning step above removes every newline from the pages, so the
# splitter's default "\n\n" separator would never match and each page would come
# back as one oversized chunk. Splitting on spaces lets the splitter honor the
# requested chunk size and overlap.
text_splitter = CharacterTextSplitter(separator=" ", chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(data)

In [None]:
# Build a Chroma store from the chunks using the shared embedding model and the
# same persist directory as above.
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./chroma_sample33334444344444",
)

# 검증 절차

In [None]:
# Ask the vector store for the two chunks that best match the question.
query = "2024년 반도체 시장 동향에 대해 알려주세요"
docs = db.similarity_search(query=query, k=2)

In [None]:
# Display the first retrieved document.
docs[0]

In [None]:
from langchain.chains.question_answering import load_qa_chain

# Question-answering chain driven by the shared chat model, using the
# "map_reduce" chain type.
chain = load_qa_chain(
    llm=llm,
    chain_type="map_reduce",
)

In [None]:
# Run the QA chain on the retrieved documents with the question and display the result.
chain.run(input_documents=docs, question=query)