<a href="https://colab.research.google.com/github/JoseArthurSoares/ProjetoFinal-PLN/blob/main/Vers%C3%A3oFinal_ProjetoPLN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install openai langchain_community pinecone-client
!pip install langchain_community
!pip install pinecone-client
!pip install openai
!pip install -U langchain-openai langchain-pinecone
!pip install transformers sentence-transformers
!pip install llama-stack
!pip install -U bitsandbytes
!pip install huggingface_hub
!pip install langchain
!pip install lark
!pip install lark lark-parser
!pip install --upgrade langchain
import os
import time
import requests
import pandas as pd
from xml.etree import ElementTree
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import DirectoryLoader, CSVLoader
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain_pinecone import PineconeVectorStore
from langchain import LLMChain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.query_constructor.base import get_query_constructor_prompt, StructuredQueryOutputParser
from langchain_community.query_constructors.pinecone import PineconeTranslator
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from huggingface_hub import login



1 - Recuperar dados da API arXiv

In [26]:
# Function to get data from arXiv API
def get_data(query, max_results, max_retries):
    """Fetch paper metadata from the arXiv Atom API.

    Args:
        query: arXiv search expression, sent as the ``search_query`` parameter.
        max_results: Maximum number of entries to request.
        max_retries: Number of HTTP attempts before giving up.

    Returns:
        A list of dicts with the keys ``'title'``, ``'summary'`` and ``'link'``,
        one per ``<entry>`` in the feed. An empty list is returned when every
        attempt failed (or when ``max_retries`` is zero or negative), so callers
        can always iterate over the result.

    Raises:
        xml.etree.ElementTree.ParseError: If the response body is not valid XML.
    """
    url = "http://export.arxiv.org/api/query"
    # Let requests URL-encode the query (spaces, quotes, colons, ...).
    params = {"search_query": query, "start": 0, "max_results": max_results}
    xml_namespace = "{http://www.w3.org/2005/Atom}"

    for attempt in range(max_retries):
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
        except requests.RequestException:
            # Exponential backoff, but do not sleep after the final attempt.
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
            continue

        root = ElementTree.fromstring(response.content)
        # findtext() tolerates entries that lack one of the child elements.
        return [
            {
                'title': entry.findtext(f'{xml_namespace}title', default=''),
                'summary': entry.findtext(f'{xml_namespace}summary', default=''),
                'link': entry.findtext(f'{xml_namespace}id', default=''),
            }
            for entry in root.findall(f'{xml_namespace}entry')
        ]

    return []

2 - Pré-processar os papers

In [27]:
# Preprocess text
def preprocess_text(text):
    """Turn raw text into a list of lemmatized content tokens.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``.
    Tokens shorter than 3 or longer than 15 characters are discarded, as are
    English stop words and single punctuation characters. The remaining
    tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The string to process.

    Returns:
        The list of lemmatized tokens, in their original order.
    """
    words = word_tokenize(text.lower())
    excluded = set(stopwords.words('english') + list(string.punctuation))
    wordnet = WordNetLemmatizer()

    lemmas = []
    for word in words:
        # Length filter first, then the stop-word / punctuation filter.
        if len(word) < 3 or len(word) > 15:
            continue
        if word in excluded:
            continue
        lemmas.append(wordnet.lemmatize(word))
    return lemmas

3 - Salvar os papers no formato CSV

In [28]:
# Save papers to CSV
def save_papers_to_csv(papers, filename):
    """Write papers to a CSV file with a preprocessed summary column.

    Args:
        papers: Iterable of dicts with the keys ``'title'``, ``'link'`` and
            ``'summary'``. A ``None`` summary is treated as empty text.
        filename: Destination CSV path. Its parent directory is created when
            it does not exist yet.

    The CSV has the columns ``title``, ``link`` and ``summary``; the summary
    is the space-joined output of ``preprocess_text``.
    """
    clean_papers = [
        {
            'title': paper['title'],
            'link': paper['link'],
            'summary': ' '.join(preprocess_text(paper['summary'] or '')),
        }
        for paper in papers
    ]

    # Create the directory the file actually lives in, not a fixed 'data' dir.
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    df = pd.DataFrame(clean_papers, columns=['title', 'link', 'summary'])
    df.to_csv(filename, index=False)

4 - Inicializar o banco de dados Pinecone e criar um índice.

In [29]:
# Initialize Pinecone and create index
def initialize_pinecone(api_key, index_name, dimension=4096, metric="cosine",
                        cloud="aws", region="us-east-1", ready_timeout=60):
    """Connect to Pinecone and return a handle to ``index_name``.

    The index is created as a serverless index when it does not exist yet.

    Args:
        api_key: Pinecone API key.
        index_name: Name of the index to open (and create if missing).
        dimension: Vector dimension used when the index is created. It must
            equal the length of the vectors that will be upserted.
        metric: Similarity metric used when the index is created.
        cloud: Cloud provider for the serverless spec.
        region: Cloud region for the serverless spec.
        ready_timeout: Maximum number of seconds to wait for a newly created
            index to report that it is ready.

    Returns:
        The object returned by ``Pinecone.Index(index_name)``.
    """
    pc = Pinecone(api_key=api_key)
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud=cloud, region=region),
        )
        # Creation is asynchronous; poll until the index reports ready so the
        # first upsert does not hit an index that is still initializing.
        # Best effort: after the timeout the handle is returned anyway.
        deadline = time.monotonic() + ready_timeout
        while time.monotonic() < deadline:
            if pc.describe_index(index_name).status["ready"]:
                break
            time.sleep(1)
    return pc.Index(index_name)

5 - Gerar os embeddings

In [30]:
# Generate embeddings
class EmbeddingPooling(nn.Module):
    """Single linear layer that maps vectors from one width to another."""

    def __init__(self, input_dimension, output_dimension):
        """Create the projection layer.

        Args:
            input_dimension: Size of the last dimension of the input.
            output_dimension: Size of the last dimension of the output.
        """
        nn.Module.__init__(self)
        self.fc = nn.Linear(
            in_features=input_dimension,
            out_features=output_dimension,
        )

    def forward(self, x):
        """Apply the linear projection to ``x`` and return the result."""
        projected = self.fc(x)
        return projected

def generate_embeddings(text, model, tokenizer, reducer, cache):
    """Return a reduced embedding for ``text``, memoized in ``cache``.

    The text is tokenized and run through ``model``; the output logits are
    averaged over the sequence dimension and passed through ``reducer``.

    Args:
        text: The string to embed; also used as the cache key.
        model: Callable model whose output exposes a ``logits`` tensor.
        tokenizer: Callable returning a mapping of input tensors when invoked
            with ``return_tensors='pt'``.
        reducer: Callable (typically an ``nn.Module``) applied to the pooled
            logits.
        cache: Mutable mapping from text to the computed NumPy array. It is
            read and updated in place.

    Returns:
        The embedding as a NumPy array.
    """
    if text in cache:
        return cache[text]

    def _first_parameter(module):
        # Used to discover device/dtype; returns None for plain callables.
        parameters = getattr(module, 'parameters', None)
        if parameters is None:
            return None
        return next(iter(parameters()), None)

    inputs = tokenizer(text, return_tensors='pt')

    # The model may live on an accelerator while the tokenizer produces CPU
    # tensors, so move the inputs to wherever the model's weights are.
    model_param = _first_parameter(model)
    if model_param is not None:
        inputs = {name: tensor.to(model_param.device) for name, tensor in inputs.items()}

    reducer_param = _first_parameter(reducer)
    with torch.no_grad():
        outputs = model(**inputs)
        # Average in float32: half-precision logits would otherwise clash
        # with a float32 reducer and lose precision in the mean.
        pooled = outputs.logits.float().mean(dim=1).squeeze()
        if reducer_param is not None:
            pooled = pooled.to(device=reducer_param.device, dtype=reducer_param.dtype)
        reduced_embeddings = reducer(pooled)

    # .cpu() is required before .numpy() when the reducer runs on an accelerator.
    cache[text] = reduced_embeddings.detach().cpu().numpy()
    return cache[text]

6 - Carregar os documentos e inserir no Pinecone

In [31]:
# Load documents and insert into Pinecone
def load_and_insert_docs(loader, pc_index, generate_embeddings, batch_size=100,
                         text_key="content", namespace=None):
    """Embed every loaded document and upsert the vectors in batches.

    Args:
        loader: Object whose ``load()`` returns documents exposing
            ``page_content`` and ``metadata``.
        pc_index: Index handle exposing ``upsert(vectors=...)``.
        generate_embeddings: Callable mapping a text to its embedding.
        batch_size: Number of vectors sent per ``upsert`` call.
        text_key: Metadata key under which the document text is stored, so a
            vector store reading that key can rebuild the document. Pass
            ``None`` to store the loader metadata unchanged.
        namespace: Optional namespace forwarded to ``upsert``. When ``None``
            the argument is omitted from the call.

    Documents with empty content or empty metadata are skipped.
    """
    def _flush(vectors):
        # Only pass ``namespace`` when one was requested.
        if namespace is None:
            pc_index.upsert(vectors=vectors)
        else:
            pc_index.upsert(vectors=vectors, namespace=namespace)

    docs = loader.load()
    batch = []
    for count, doc in enumerate(docs):
        content, metadata = doc.page_content, doc.metadata
        if not content or not metadata:
            continue

        embedding = generate_embeddings(content)
        # Send plain Python floats rather than an array object.
        values = embedding.tolist() if hasattr(embedding, "tolist") else embedding

        # Vector ids are strings; a numeric 'id' in the metadata is converted.
        doc_id = str(metadata.get('id', f"doc_{count}"))

        # Copy so the loader's document metadata is not mutated.
        record_metadata = dict(metadata)
        if text_key is not None:
            record_metadata[text_key] = content

        batch.append({"id": doc_id, "values": values, "metadata": record_metadata})
        if len(batch) >= batch_size:
            _flush(batch)
            batch = []
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Send whatever is left over after the last full batch.
    if batch:
        _flush(batch)

7 - Criar Self-Querying Retriever

In [32]:
# Create Self-Querying Retriever
def create_retriever(pipe, vectorstore, metadata_field_info, examples,
                     document_content_description="Summary of a research paper, along with keywords and a link",
                     k=10):
    """Build a self-querying retriever on top of a Pinecone vector store.

    Args:
        pipe: Runnable language model that receives the query-constructor
            prompt.
        vectorstore: Vector store searched by the retriever.
        metadata_field_info: Attribute descriptions exposed to the query
            constructor.
        examples: Few-shot examples passed to the query-constructor prompt.
        document_content_description: Short description of what the stored
            documents contain.
        k: Number of documents requested from the vector store.

    Returns:
        A ``SelfQueryRetriever`` that uses ``PineconeTranslator`` to turn the
        structured query into a Pinecone filter.
    """
    translator = PineconeTranslator()
    # Advertise the comparators/operators the translator itself declares.
    # Raw Pinecone operators such as "$eq" are the translator's *output*; the
    # query constructor's expression language uses plain names (eq, ne, in, ...).
    constructor_prompt = get_query_constructor_prompt(
        document_content_description,
        metadata_field_info,
        allowed_comparators=translator.allowed_comparators,
        allowed_operators=translator.allowed_operators,
        examples=examples,
    )
    query_constructor = constructor_prompt | pipe | StructuredQueryOutputParser.from_components()
    return SelfQueryRetriever(
        query_constructor=query_constructor,
        vectorstore=vectorstore,
        structured_query_translator=translator,
        search_kwargs={'k': k},
    )


8 - Formatar os documentos

In [33]:
# Format documents
def format_docs(docs):
    """Render retrieved documents as a text block for the prompt context.

    Each document becomes four lines (``Title``, ``Summary``, ``Link``,
    ``Metadata``); documents are separated by a blank line.

    A field is looked up, in order, under its capitalised metadata key, under
    the lower-case metadata key, and finally in a ``"name: value"`` line of
    ``page_content``. When none is found the ``No ...`` placeholder is used.

    Args:
        docs: Iterable of documents exposing ``metadata`` (a dict) and,
            optionally, ``page_content``.

    Returns:
        The formatted string; empty when ``docs`` is empty.
    """
    def _field(doc, name, placeholder):
        metadata = doc.metadata
        for key in (name, name.lower()):
            if key in metadata:
                return metadata[key]
        # Fall back to "name: value" lines in the document text.
        wanted = name.lower()
        for line in (getattr(doc, "page_content", "") or "").splitlines():
            key, sep, value = line.partition(":")
            if sep and key.strip().lower() == wanted:
                return value.strip()
        return placeholder

    return "\n\n".join(
        f"Title: {_field(doc, 'Title', 'No title')}\n"
        f"Summary: {_field(doc, 'Summary', 'No summary')}\n"
        f"Link: {_field(doc, 'Link', 'No link')}\n"
        f"Metadata: {doc.metadata}"
        for doc in docs
    )


9 - Função de recomendação de papers

In [34]:
# Recommend papers
def recommend_papers(query, retriever, rag_chain_with_source):
    """Answer a paper-recommendation query with the RAG chain.

    Args:
        query: The user's question.
        retriever: Retriever used to check whether any document matches.
        rag_chain_with_source: Chain that takes the raw query string as its
            input, performs its own retrieval and produces the answer.

    Returns:
        A fixed message when the query is empty or nothing is retrieved;
        otherwise whatever ``rag_chain_with_source.invoke(query)`` returns.
    """
    if not query:
        return "Por favor, forneça uma consulta."

    # Pre-check so an empty result gets a fixed message rather than an LLM call.
    context = retriever.invoke(query)
    if not context:
        return "Não consegui encontrar nenhum artigo que corresponda à sua consulta."

    # The chain routes its input to both the retriever and the "question"
    # slot and formats the retrieved documents itself, so it must receive the
    # plain query string rather than a pre-built {"context", "question"} dict.
    return rag_chain_with_source.invoke(query)


**EXECUÇÃO PRINCIPAL**

In [None]:
from langchain_core.embeddings import Embeddings

# NLTK resources used by preprocess_text. 'punkt_tab' is what recent NLTK
# releases load for word_tokenize; older releases use 'punkt'.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# --- 1. Fetch papers and write them to CSV -------------------------------
query = "quantum physics OR astrophysics"
csv_path = 'data/arxiv_papers.csv'
papers = get_data(query, max_results=5, max_retries=5)
if not papers:
    # Stop early instead of indexing nothing (or iterating over None).
    raise RuntimeError("No papers could be retrieved from arXiv; nothing to index.")
save_papers_to_csv(papers, csv_path)

# --- 2. Credentials -------------------------------------------------------
# SECURITY: credentials must never be committed with the notebook. Any token
# or API key that was previously hard-coded here should be revoked and
# reissued. Provide them through environment variables (or Colab secrets).
missing_vars = [name for name in ("HF_TOKEN", "PINECONE_API_KEY") if not os.environ.get(name)]
if missing_vars:
    raise RuntimeError(f"Set these environment variables before running: {', '.join(missing_vars)}")

login(token=os.environ["HF_TOKEN"])
index_name = "quantum-astrophysics-listing"
pc_index = initialize_pinecone(api_key=os.environ["PINECONE_API_KEY"], index_name=index_name)

# --- 3. Model, generation pipeline and embedding function ----------------
model_id = "meta-llama/Llama-2-13b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)
hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
huggingface_pipeline = HuggingFacePipeline(pipeline=hf_pipeline)

# Must equal the dimension of the Pinecone index (initialize_pinecone creates
# it with 4096); a different width makes every upsert and query fail.
EMBEDDING_DIM = 4096
# The reducer is an untrained random projection, so seed it: otherwise each
# session projects differently and stored vectors stop matching new queries.
torch.manual_seed(0)
# The reducer consumes pooled logits, whose width is the model's vocabulary size.
reducer = EmbeddingPooling(model.config.vocab_size, EMBEDDING_DIM)
embedding_cache = {}


def embed_text(text):
    """Embed ``text`` with the shared model, tokenizer, reducer and cache."""
    return generate_embeddings(text, model, tokenizer, reducer, embedding_cache)


class PaperEmbeddings(Embeddings):
    """Adapter exposing ``embed_text`` through LangChain's Embeddings interface."""

    def embed_documents(self, texts):
        return [self.embed_query(text) for text in texts]

    def embed_query(self, text):
        return embed_text(text).tolist()


# --- 4. Index the CSV rows ------------------------------------------------
# Load from the same directory the CSV was just written to.
loader = DirectoryLoader(path=os.path.dirname(csv_path), glob="*.csv", loader_cls=CSVLoader, show_progress=True)
load_and_insert_docs(loader, pc_index, embed_text)

# --- 5. Retriever ---------------------------------------------------------
# NOTE(review): these attributes are only filterable if the stored vector
# metadata actually contains them - confirm against what is upserted.
metadata_field_info = [
    AttributeInfo(name="Title", description="The title of the paper", type="string"),
    AttributeInfo(name="Summary", description="The summary of the paper", type="string"),
    AttributeInfo(name="Link", description="The link to the paper", type="string")]
# Examples may only reference attributes declared above; "Keywords" is not one
# of them, so both examples use a pure semantic query.
examples = [
    ("Find papers about black holes.", {"query": "black holes", "filter": 'NO_FILTER'}),
    ("Which papers discuss quantum gravity?", {"query": "quantum gravity", "filter": 'NO_FILTER'})]
# The store needs the index handle (not its name) and an Embeddings object
# (not a bare function). No namespace is given because the vectors are
# upserted without one.
vectorstore = PineconeVectorStore(index=pc_index, embedding=PaperEmbeddings(), text_key="content")
retriever = create_retriever(huggingface_pipeline, vectorstore, metadata_field_info, examples)

# --- 6. RAG chain ---------------------------------------------------------
prompt = ChatPromptTemplate.from_messages([('system', "Your goal is to recommend academic papers to users based on their query and the retrieved context. If a retrieved paper doesn't seem relevant, omit it from your response. If your context is empty or none of the retrieved papers are relevant, do not recommend papers, but instead tell the user you couldn't find any papers that match their query. Aim for three to five paper recommendations, as long as the papers are relevant. You cannot recommend more than five papers. Your recommendation should be relevant, original, and at least two to three sentences long.\n\nYOU CANNOT RECOMMEND A PAPER IF IT DOES NOT APPEAR IN YOUR CONTEXT.\n\n# TEMPLATE FOR OUTPUT\n- **Title of Paper**:\n    - Year of Publication:\n    - Summary:\n    - Link:\n    - (Your reasoning for recommending this paper)\n\nQuestion: {question}\nContext: {context}")])
# Pipe the rendered prompt straight into the LLM. Wrapping the LLM in an
# LLMChain that applies the same prompt a second time would hand the chain an
# already-rendered prompt where it expects the template variables.
rag_chain_from_docs = RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"]))) | prompt | huggingface_pipeline | StrOutputParser()
rag_chain_with_source = RunnableParallel({"context": retriever, "question": RunnablePassthrough()}).assign(answer=rag_chain_from_docs)

# --- 7. Smoke test --------------------------------------------------------
test_query = "Can you recommend any papers about Quantum Field Theory?"
recommendations = recommend_papers(test_query, retriever, rag_chain_with_source)
print(recommendations)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 976.33it/s]
