# Step 1 - Install Dependencies

In [None]:
# Install the third-party packages this notebook relies on (IPython shell magic).
# NOTE(review): no versions are pinned. Later cells call `pinecone.init(...)` and import from
# `langchain.utilities` / `langchain.vectorstores`, which presumably exist only in older
# releases of pinecone-client and langchain — confirm and pin versions if imports fail.
!pip install pycoingecko requests tiktoken cohere openai pinecone-client langchain apify-client

In [None]:
import os

# Export the service credentials and settings as environment variables.
# Every value below is a placeholder string; substitute your own before running.
os.environ.update(
    {
        "OPENAI_API_KEY": "OPENAI_API_KEY",
        "APIFY_API_TOKEN": "APIFY_API_KEY",
        "PINECONE_API_KEY": 'PINECONE_API_KEY',
        "PINECONE_ENV": 'PINECONE_ENV',  # gcp-starter
        "INDEX_NAME": 'INDEX_NAME',
    }
)

# Step 2 - Import Data

In [None]:
import numpy as np
import datetime as dt
import openai
import re
import time
import pinecone

### Data Source 1/2: Apify Google Search Results

In [None]:
from apify_client import ApifyClient

# Build the Apify client from the token exported in Step 1. That cell stores the token under
# APIFY_API_TOKEN, so read that name; APIFY_API_KEY (the name this cell used to read, which
# Step 1 never sets) is kept only as a fallback for environments that already export it.
client = ApifyClient(os.getenv("APIFY_API_TOKEN") or os.getenv("APIFY_API_KEY"))

# Input for the Google search Actor: a single query, one result page, up to 100 results,
# plus a custom page function that records each page's <title>.
run_input = {
    "queries": "gartner semiconductor",
    "maxPagesPerQuery": 1,
    "resultsPerPage": 100,
    "customDataFunction": """async ({ input, $, request, response, html }) => {
  return {
    pageTitle: $('title').text(),
  };
};""",
}

# Run the Actor and wait for it to finish
run = client.actor("apify/google-search-scraper").call(run_input=run_input)

# Iterate over the items of the run's default dataset (if there are any).
# Nothing is printed here; the next cell collects and prints the items.
loader = client.dataset(run["defaultDatasetId"]).iterate_items()

In [None]:
# Drain the dataset iterator into a list so the items can be indexed and inspected.
temp = list(loader)
results = []

print(temp)


In [None]:
# Take the organic results from the first dataset item (the search request above asks for a
# single page). Fall back to an empty list when the run produced no items or the item has no
# 'organicResults' key, so an empty search yields no URLs instead of an opaque
# IndexError/KeyError here.
data = temp[0].get('organicResults', []) if temp else []


In [None]:
from datetime import datetime

In [None]:
# Keep only the results dated 2023.
def _published_year(item):
    """Return the year of ``item['date']``, or None if the date is missing or unparseable.

    A trailing 'Z' (UTC designator) is removed before parsing because
    ``datetime.fromisoformat`` does not accept it on older Python versions.
    """
    raw = item.get('date')
    if not isinstance(raw, str) or not raw:
        return None
    # Only strip a real 'Z' suffix; blindly dropping the last character would corrupt
    # dates that do not end in 'Z'.
    iso_text = raw[:-1] if raw.endswith('Z') else raw
    try:
        return datetime.fromisoformat(iso_text).year
    except ValueError:
        return None


filtered_data = [item for item in data if _published_year(item) == 2023]

In [None]:
# URLs of the filtered search results, in their original order.
urls = [entry['url'] for entry in filtered_data]

### Apify Web Content Crawling

In [None]:
from langchain.utilities import ApifyWrapper
from langchain.document_loaders.base import Document
import os


# LangChain wrapper used to launch Apify Actors.
apify = ApifyWrapper()

# One start-URL entry per URL collected from the filtered search results.
startUrls = [{"url": link} for link in urls]

# Uncomment to inspect the crawler input:
#print(startUrls)

# Input for the website content crawler: crawl the start URLs through the Apify proxy,
# strip the listed elements from each page and click the listed elements.
run_input = {
    "startUrls": startUrls,
    "includeUrlGlobs": [],
    "excludeUrlGlobs": [],
    "initialCookies": [],
    "proxyConfiguration": {"useApifyProxy": True},
    "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
[role=\"alert\"],
[role=\"banner\"],
[role=\"dialog\"],
[role=\"alertdialog\"],
[role=\"region\"][aria-label*=\"skip\" i],
[aria-modal=\"true\"]""",
    "clickElementsCssSelector": "[aria-expanded=\"false\"]",
}


def _item_to_document(item):
    """Convert one crawled dataset item into a Document whose source is the page URL."""
    # A missing/empty page text is stored as an empty string rather than None.
    return Document(page_content=item["text"] or "", metadata={"source": item["url"]})


# Run the Actor, wait for it to finish, and get a loader over its dataset.
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input=run_input,
    dataset_mapping_function=_item_to_document,
)

### Data Source 2/2: Pinecone Initialization

### Pinecone Initialization

In [None]:
# Connect to Pinecone and make sure the target index exists.
# The API key, environment and index name come from the environment variables set in Step 1
# (the API key is issued at app.pinecone.io; the environment is shown next to it in the console).
api_key, env, index_name = (
    os.getenv("PINECONE_API_KEY"),
    os.getenv("PINECONE_ENV"),
    os.getenv("INDEX_NAME"),
)

pinecone.init(environment=env, api_key=api_key)
pinecone.whoami()

import time

# Create the index when it is not listed yet (expected on a first run).
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        metric='cosine',
        dimension=1536,  # huggingface:768 openai:1536
    )
    # Poll once per second until the new index reports that it is ready.
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Handle to the index used by the following cells.
index = pinecone.Index(index_name)



In [None]:
# Show the statistics of the connected index; as the last expression of the cell,
# its return value is what the notebook displays.
index.describe_index_stats()

In [None]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter measuring length with len(): chunk_size 1000, chunk_overlap 100, separators
# treated as plain strings rather than regular expressions.
# NOTE(review): `text_splitter` is rebound by a later cell before any visible cell uses it.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=100,
    chunk_size=1000,
)

# Store Vectors (벡터 저장)

In [None]:

from langchain.chat_models import ChatOpenAI

# Chat model handle configured with temperature 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)


In [None]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate

# Load the crawled pages; `loader` is the object returned by apify.call_actor(...) in the
# crawling cell, whose mapping function turns each dataset item into a Document.
docs = loader.load()


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model shared by the indexing and retrieval cells below.
embeddings = OpenAIEmbeddings()

# Splitter measuring length with len(): chunk_size 1500, chunk_overlap 100,
# with add_start_index enabled.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    length_function=len,
    chunk_overlap=100,
    chunk_size=1500,
)



In [None]:
from langchain.vectorstores import Pinecone

# Split the crawled documents into chunks with the splitter configured above.
documents = text_splitter.split_documents(docs)

# Embed the chunks and store them in the Pinecone index named by index_name.
docsearch = Pinecone.from_documents(
    documents,
    embeddings,
    index_name=index_name,
)

In [None]:
# Wrap the existing Pinecone index in a LangChain vector store and expose it as a retriever.
import pinecone

index_name = os.getenv("INDEX_NAME")
index = pinecone.Index(index_name)

# Queries are embedded with embeddings.embed_query.
# NOTE(review): "text" is presumably the metadata key holding each chunk's text — confirm.
vectorstore = Pinecone(index, embeddings.embed_query, "text")

# Retriever using the "mmr" search type with k=2 and fetch_k=4.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=2, fetch_k=4),
)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

In [None]:
# Chat model used as the generation step of the retrieval chain built below.
model = ChatOpenAI(model='gpt-3.5-turbo-16k')

In [None]:
# Prompt text for the question-answering chain.
# {context} and {question} are the two placeholders filled in at run time; the doubled braces
# around "number" are escapes, so they render as literal braces instead of placeholders.
# The retrieved context is wrapped in a properly closed <context>...</context> block, which is
# what the instruction about "`context` html blocks" refers to.
template = """
SYSTEM
You are an expert researcher and writer, tasked with answering any question.
Generate a comprehensive and informative, yet concise answer of 250 words or less for the given question based solely on the provided search results (URL and content).
You must only use information from the provided search results. Use an unbiased and journalistic tone. Combine search results together into a coherent answer.
Do not repeat text. Cite search results using [${{number}}] notation. Only cite the most relevant results that answer the question accurately.
Place these citations at the end of the sentence or paragraph that reference them - do not put them all at the end.
If different results refer to different entities within the same name, write separate answers for each entity.
If you want to cite multiple results for the same sentence, format it as `[${{number1}}] [${{number2}}]`.
However, you should NEVER do this with the same number - if you want to cite `number1` multiple times for a sentence, only do `[${{number1}}]` not `[${{number1}}] [${{number1}}]`
You should use bullet points in your answer for readability. Put citations where they apply rather than putting them all at the end.
If there is nothing in the context relevant to the question at hand, just say "Hmm, I'm not sure." Don't try to make up an answer.
Anything between the following `context` html blocks is retrieved from a knowledge bank, not part of the conversation with the user.
You must answer in Korean.

<context>
    {context}
</context>

HUMAN
{question}
  """
# Build the chat prompt from the template string; it takes `context` and `question` as inputs.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Retrieval pipeline: the incoming question is given to the retriever to fill {context} and is
# passed through unchanged as {question}; the rendered prompt is sent to the chat model and
# the reply is parsed into a plain string.
retrieval_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | prompt | model | StrOutputParser()
)

In [None]:
# Run the chain on a sample question; the prompt instructs the model to answer in Korean.
retrieval_chain.invoke("I want to know Gartner's predictions for the semiconductor industry.")