In [None]:
!pip install apify-client  chromadb tiktoken cohere langchainhub openai langchain

In [None]:
from langchain.document_loaders.base import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain.utilities import ApifyWrapper
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter



### Reference (LangChain Apify integration): https://python.langchain.com/docs/integrations/tools/apify


In [None]:
import os
from getpass import getpass

# Credentials used by this notebook: OPENAI_API_KEY (for the OpenAI-backed LangChain classes)
# and APIFY_API_TOKEN (read back later via os.getenv when the Apify client is created).
# Secrets must not be written into the notebook itself: a value already present in the
# environment is kept untouched, and a missing one is requested interactively without echo.
for _secret_name in ("OPENAI_API_KEY", "APIFY_API_TOKEN"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")

In [None]:
# Vector store
from langchain.vectorstores import Chroma

# Chroma store using OpenAI embeddings, persisted under ./chroma_db_apify.
# NOTE(review): the later visible cells build and query a separate store (`db`), not this one.
vectorstore = Chroma(
    persist_directory="./chroma_db_apify",
    embedding_function=OpenAIEmbeddings(),
)


### Website Content Crawler input schema: https://apify.com/apify/website-content-crawler/input-schema

## Web scraping with Python
### https://blog.apify.com/web-scraping-python/

## See also: Website Content Crawler Python API client
### https://apify.com/apify/website-content-crawler/api/client/python

In [None]:
# LLM
from langchain.chat_models import ChatOpenAI

# Select the ChatGPT model used by the chain.
# Alternative kept for reference: ChatOpenAI(model_name="gpt-4-0613", temperature=0)
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo-1106",
)

In [None]:
apify = ApifyWrapper()

# Selectors passed as `removeElementsCssSelector`, one per line, joined into the single
# comma/newline-separated string the Actor input carries.
removed_elements_selector = ",\n".join(
    [
        "nav, footer, script, style, noscript, svg",
        '[role="alert"]',
        '[role="banner"]',
        '[role="dialog"]',
        '[role="alertdialog"]',
        '[role="region"][aria-label*="skip" i]',
        '[aria-modal="true"]',
    ]
)

# Actor input for the website content crawler
run_input = {
    "startUrls": [{"url": "https://python.langchain.com/en/latest/"}],
    "includeUrlGlobs": [],
    "excludeUrlGlobs": [],
    "initialCookies": [],
    "proxyConfiguration": {"useApifyProxy": True},
    "removeElementsCssSelector": removed_elements_selector,
    "clickElementsCssSelector": '[aria-expanded="false"]',
}


def _to_document(item):
    """Map one dataset item to a Document: its text (or "" when falsy) with the URL as source."""
    return Document(
        page_content=item["text"] or "",
        metadata={"source": item["url"]},
    )


# Call the Actor with the input above; the returned loader is consumed in the next cell.
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input=run_input,
    dataset_mapping_function=_to_document,
)

In [None]:
# Load the crawl results through the loader returned by call_actor
# (presumably a list of Document objects built by the mapping function — TODO confirm).
document = loader.load()

In [None]:
# Report how many documents were loaded and preview only the first few;
# echoing the entire crawled list floods the notebook output.
print(f"{len(document)} documents loaded")
document[:3]

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split the loaded documents into chunks (chunk_size=800, chunk_overlap=200).
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=800,
)
texts = text_splitter.split_documents(document)

In [None]:
embeddings = OpenAIEmbeddings()

# Store the chunks in a Chroma DB persisted under ./chroma_db3_apify
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./chroma_db3_apify",
)

In [None]:
query = "랭체인 라이브러리에 대해 설명해주세요"
docs = await db.asimilarity_search(query)
print(docs[0].page_content)

In [None]:
# Get a retriever from the Chroma store built above
retriever = db.as_retriever()

In [None]:
# Example: download a prompt from LangChain Hub
# https://smith.langchain.com/hub/rlm/rag-prompt

from langchain import hub

rag_prompt = hub.pull("rlm/rag-prompt")
# Trailing expression so the notebook displays the pulled prompt.
rag_prompt

In [None]:
# Build the RAG chain
from langchain.schema.runnable import RunnablePassthrough


def _format_docs(docs):
    """Join the retrieved documents' page text into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Compose the chain with the pipe operator. The retriever output is formatted into plain
# text first; otherwise the prompt's context would be filled with the raw list of Document
# objects (reprs including metadata) rather than just the page content.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
)

In [None]:
# Ask the RAG chain a question (Korean for "What is langchain?");
# the trailing invoke() result is shown as the cell output.
query = "langchain이 뭐냐"
rag_chain.invoke(query)

In [None]:
# Ask a second question (Korean for "What is a langchain agent?");
# the trailing invoke() result is shown as the cell output.
query = "langchain의 agent가 뭐냐?"
rag_chain.invoke(query)

# Apify Google Search Results Scraper

In [None]:
from apify_client import ApifyClient

# Create the Apify client with the API token read from the environment
client = ApifyClient(os.environ.get("APIFY_API_TOKEN"))

# JavaScript source sent to the Actor as `customDataFunction`
custom_data_function = (
    "async ({ input, $, request, response, html }) => {\n"
    "  return {\n"
    "    pageTitle: $('title').text(),\n"
    "  };\n"
    "};"
)

# Actor input: one Google query, a single results page of up to 100 results
run_input = {
    "queries": "2024년 한국 반도체 시장의 전망",
    "maxPagesPerQuery": 1,
    "resultsPerPage": 100,
    "customDataFunction": custom_data_function,
}

# Run the Actor and wait for it to finish
search_actor = client.actor("apify/google-search-scraper")
run = search_actor.call(run_input=run_input)

# Print every item from the run's default dataset (prints nothing if it is empty)
results_dataset = client.dataset(run["defaultDatasetId"])
for item in results_dataset.iterate_items():
    print(item)