## Retrieve All PDF Links

In [1]:
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def get_pdf_links(url, timeout=30):
    """Return the absolute URL of every PDF linked from the page at ``url``.

    Args:
        url: Address of the HTML page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute PDF URLs in document order (duplicates are kept).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of silently returning no links.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    pdf_links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        # Test only the URL path so "Report.PDF" and "file.pdf?download=1" both match.
        if urlparse(href).path.lower().endswith('.pdf'):
            # Resolve relative hrefs against the page URL; absolute ones pass through unchanged.
            pdf_links.append(urljoin(url, href))

    return pdf_links

url = "https://www.nasa.gov/nesc/knowledge-products/nesc-technical-bulletins/"

# PDF that is deliberately left out of the corpus.
# NOTE(review): presumably an overview sheet rather than a bulletin — confirm.
summary_pdf = 'https://www.nasa.gov/wp-content/uploads/2023/10/tb-summary-100223.pdf'

# Filter instead of list.remove(): remove() raises ValueError as soon as the
# page no longer links this exact file, which would break a fresh re-run.
pdf_links = [link for link in get_pdf_links(url) if link != summary_pdf]
print(pdf_links)

['https://www.nasa.gov/wp-content/uploads/2015/04/tb-23-06-091923.pdf', 'https://ntrs.nasa.gov/api/citations/20230010417/downloads/TB_23-05_080123%20FINAL.pdf', 'https://ntrs.nasa.gov/api/citations/20230010411/downloads/TB_23-04_r071923_NRB.pdf', 'https://ntrs.nasa.gov/api/citations/20230010357/downloads/TB_23-03v3.pdf', 'https://ntrs.nasa.gov/api/citations/20230001890/downloads/TB_23-02_Safety_031723.pdf', 'https://ntrs.nasa.gov/api/citations/20230001889/downloads/TB_23-01_Pyro%20Best%20Practices_031523.pdf', 'https://ntrs.nasa.gov/api/citations/20220013364/downloads/TB_22-08_Contaminant%20Reduction_083022-FINAL.pdf', 'https://ntrs.nasa.gov/api/citations/20220012296/downloads/TB_22-07_Helium%20Solubility_080922-Final.pdf', 'https://ntrs.nasa.gov/api/citations/20220011709/downloads/TB_22-06_SloshDynamics_080522.pdf', 'https://ntrs.nasa.gov/api/citations/20220011631/downloads/TB_22-05_Margin%20Reductions_080122v5.pdf', 'https://ntrs.nasa.gov/api/citations/20220010362/downloads/TB_22-04_

## Generate Documents

In [3]:
from langchain.document_loaders import PyPDFium2Loader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.auto import tqdm
import time

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Download every PDF, split it into ~1000-character chunks, and collect the
# chunks as {'text', 'source'} dicts for embedding later on.
documents = []

retries = 3      # attempts per PDF before giving up
retry_delay = 5  # seconds to wait between attempts

# Build the splitter once instead of once per attempt.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

for pdf_url in tqdm(pdf_links, desc="Loading PDFs"):
    for attempt in range(retries):
        try:
            pdf_chunks = PyPDFium2Loader(pdf_url).load_and_split(text_splitter)
        except Exception as exc:
            # `Exception` rather than a bare `except:` so Ctrl-C / kernel
            # interrupt still stops the cell instead of being retried.
            if attempt < retries - 1:  # Don't sleep after the last attempt
                time.sleep(retry_delay)
            else:
                print(f"Failed to fetch {pdf_url} after {retries} attempts: {exc!r}")
        else:
            # Keep the source URL next to each chunk so it can be stored as metadata.
            documents.extend(
                {'text': chunk.page_content, 'source': pdf_url}
                for chunk in pdf_chunks
            )
            break  # Success: stop retrying this PDF


Loading PDFs: 100%|██████████| 52/52 [03:24<00:00,  3.93s/it]


## Process Embeddings and Push to Vector DB

In [5]:
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Hugging Face checkpoint that backs the embedding model.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

In [6]:
# Embed only the first two chunks: a quick smoke test that also reveals the
# vector size.
sample_texts = [doc['text'] for doc in documents[:2]]
embeddings = embed_model.embed_documents(sample_texts)

n_embeddings = len(embeddings)
n_dims = len(embeddings[0])
print(f"We have {n_embeddings} doc embeddings, each with a dimensionality of {n_dims}.")

We have 2 doc embeddings, each with a dimensionality of 384.


In [7]:
import pinecone
import os

# Credentials are read from the environment so they never appear in the notebook.
pinecone.init(api_key=os.environ['pinecone_apikey'], environment=os.environ['pinecone_env'])

index_name = "nasa-rag"

# Create the index only when it is not already present.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # The dimension is taken from the sample embeddings computed earlier.
    pinecone.create_index(name=index_name, metric='cosine', dimension=len(embeddings[0]))

In [8]:
# Open a handle to the index and show its current statistics as the cell output.
index = pinecone.Index(index_name)
index_stats = index.describe_index_stats()
index_stats

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [9]:
def process_batch(batch, batch_start=0):
    """Embed one batch of documents and give each a sequential string ID.

    Args:
        batch: List of dicts, each with a 'text' entry holding the chunk text.
        batch_start: Position of the batch's first document in the full
            document list; IDs count up from this value. It is passed in
            explicitly so the function does not depend on a loop variable
            in global scope.

    Returns:
        A tuple ``(ids, embeds)``: the string IDs and the embedding vectors,
        both in the same order as ``batch``.
    """
    texts = [doc['text'] for doc in batch]
    embeds = embed_model.embed_documents(texts)
    ids = [str(i) for i in range(batch_start, batch_start + len(batch))]

    return ids, embeds

batch_size = 32

# Wrap the range with tqdm for progress bar
for batch_start in tqdm(range(0, len(documents), batch_size), desc="Processing batches"):
    batch = documents[batch_start:batch_start + batch_size]
    ids, embeds = process_batch(batch, batch_start)

    # Each vector is an (id, values, metadata) tuple; the document dict itself
    # is the metadata. Materialise the zip so the client gets a real list
    # rather than a one-shot iterator.
    index.upsert(vectors=list(zip(ids, embeds, batch)))

# NOTE(review): the count shown here may lag behind the upserts just issued —
# confirm against Pinecone's consistency documentation.
index.describe_index_stats()


Processing batches: 100%|██████████| 30/30 [03:12<00:00,  6.42s/it]


{'dimension': 384,
 'index_fullness': 0.00832,
 'namespaces': {'': {'vector_count': 832}},
 'total_vector_count': 832}

## Create LLM+RAG Chain

In [10]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone

In [11]:
# The OpenAI key is read from the environment rather than written in the notebook.
llm = OpenAI(openai_api_key=os.environ['openai_apikey'])

# Metadata field that holds each chunk's text.
text_field = 'text'

# Wrap the Pinecone index as a vector store, using the embedding model's
# query encoder.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)
retriever = vectorstore.as_retriever()

rag_model = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)



In [12]:
# Ask the retrieval-backed chain and print only the answer text.
question = 'Explain The HPV Method for UQ'
response = rag_model(question)
answer = response['result']
print(answer)

 The HPV Method for UQ is a method for modeling uncertainty in a structural dynamics system which combines a parametric variation of the HCB FI modal frequencies with a NPV method that randomly varies the HCB mass and stiffness matrices. The NPV component of the HPV method replaces the HCB matrices with an ensemble of random matrices based on RMT, which are real, symmetric, and possess the appropriate sign definiteness to represent structural mass, stiffness, or damping matrices. The HPV method uses uncertainty models for HCB components based on component modal test/analysis correlation results, and the NPV based dispersion of the HCB mass matrix is derived from the test self-orthogonality matrix.


In [13]:
# Baseline for comparison: the same prompt sent straight to the LLM, without
# going through the retrieval chain.
llm('Explain The HPV Method for UQ')

'\n\nThe HPV Method is a statistical technique used for Uncertainty Quantification (UQ). It is a Monte Carlo simulation-based method which combines the concepts of sensitivity analysis and probability distribution of model input uncertainty. The HPV Method uses a combination of Latin hypercube sampling and Monte Carlo simulation to generate a large number of model input combinations. The output of each model run is then analyzed to estimate the probability density function of the model output. The HPV Method is used to identify and understand the sources of input uncertainty that have the greatest impact on the model output. It is also used to identify and quantify the range of possible output values for a given set of model inputs.'