# Chat with PDF

In [8]:
import hashlib

import cassio
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.vectorstores.cassandra import Cassandra
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings

In [2]:
from PyPDF2 import PdfReader

# Setup

## Providing the secrets

In [3]:
from dotenv import load_dotenv
import os

# Populate the process environment from a local .env file (when one is found),
# then read the three secrets this notebook needs. Nothing is hard-coded here.
load_dotenv()
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# os.getenv returns None for an unset variable; fail here with a clear message
# instead of letting a later database or OpenAI call fail with an opaque error.
_required_secrets = {
    "ASTRA_DB_APPLICATION_TOKEN": ASTRA_DB_APPLICATION_TOKEN,
    "ASTRA_DB_ID": ASTRA_DB_ID,
    "OPENAI_API_KEY": OPENAI_API_KEY,
}
_missing_secrets = [name for name, value in _required_secrets.items() if not value]
if _missing_secrets:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_secrets)
        + ". Set them in the environment or in a .env file."
    )

In [4]:
# Open the source document; the path is resolved relative to the notebook's
# working directory.
pdfreader = PdfReader("research_paper.pdf")

In [5]:
# Collect the extracted text of every page, skipping pages that yield nothing
# (extract_text() may return an empty result, e.g. for image-only pages).
page_texts = []
for page in pdfreader.pages:
  content = page.extract_text()
  if content:
    page_texts.append(content)

# Join pages with a newline so the last line of one page is not fused with the
# first line of the next; the text splitter later breaks on '\n'.
raw_text = '\n'.join(page_texts)

In [6]:
# Sanity-check the extraction with a short preview rather than printing the
# entire document into the notebook output.
print(raw_text[:1000])

10466 IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE, VOL. 46, NO. 12, DECEMBER 2024
A Survey on Graph Neural Networks for Time Series:
Forecasting, Classiﬁcation, Imputation, and
Anomaly Detection
Ming Jin ,H u a nY e eK o h , Qingsong Wen , Daniele Zambon , Cesare Alippi , Fellow, IEEE ,
Geoffrey I. Webb , Fellow, IEEE , Irwin King , Fellow, IEEE , and Shirui Pan , Senior Member, IEEE
(Survey Paper)
Abstract —Time series are the primary data type used to record
dynamic system measurements and generated in great volume by
both physical sensors and online processes (virtual sensors). Time
series analytics is therefore crucial to unlocking the wealth of in-
formation implicit in available data. With the recent advancements
in graph neural networks (GNNs), there has been a surge in GNN-
based approaches for time series analysis. These approaches can
explicitly model inter-temporal and inter-variable relationships,
which traditional and other deep neural network-based meth

Connect to the Astra DB database

In [7]:
# Initialise cassio with the Astra DB credentials read from the environment.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

Create the LangChain embedding and LLM objects used later in the notebook

In [9]:
# Both OpenAI clients share the same API key: `embedding` is handed to the
# vector store, `llm` is passed to the index query in the QA loop.
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

Create LangChain vector store

In [10]:
# Vector store over the 'qa_demo' table, using the OpenAI embedding object.
# NOTE(review): session=None / keyspace=None presumably make the store fall
# back to the connection set up by cassio.init() — confirm against the
# LangChain Cassandra documentation.
astra_vector_store = Cassandra(embedding=embedding, table_name='qa_demo', session=None, keyspace=None)

In [11]:
from langchain.text_splitter import CharacterTextSplitter

# Break the document on newlines into overlapping chunks. Sizes are measured
# with len(), i.e. in characters: chunk_size is 800 and chunk_overlap is 200.
text_splitter = CharacterTextSplitter(separator='\n', chunk_size=800, chunk_overlap=200, length_function=len)

# `texts` is the list of chunk strings that gets embedded and stored below.
texts = text_splitter.split_text(raw_text)

In [None]:
# Show only a few chunks: enough to check the split and the overlap between
# neighbouring chunks without flooding the notebook output.
texts[:3]

['10466 IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE, VOL. 46, NO. 12, DECEMBER 2024\nA Survey on Graph Neural Networks for Time Series:\nForecasting, Classiﬁcation, Imputation, and\nAnomaly Detection\nMing Jin ,H u a nY e eK o h , Qingsong Wen , Daniele Zambon , Cesare Alippi , Fellow, IEEE ,\nGeoffrey I. Webb , Fellow, IEEE , Irwin King , Fellow, IEEE , and Shirui Pan , Senior Member, IEEE\n(Survey Paper)\nAbstract —Time series are the primary data type used to record\ndynamic system measurements and generated in great volume by\nboth physical sensors and online processes (virtual sensors). Time\nseries analytics is therefore crucial to unlocking the wealth of in-\nformation implicit in available data. With the recent advancements',
 'series analytics is therefore crucial to unlocking the wealth of in-\nformation implicit in available data. With the recent advancements\nin graph neural networks (GNNs), there has been a surge in GNN-\nbased approaches for time series

## Load dataset to vector store

In [13]:
# Derive each row id from the chunk's content. Without explicit ids every run
# of this cell stores the chunks again under new ids, so re-running it fills
# the table with duplicates that then show up twice in similarity results.
# With content-derived ids a re-run writes to the same rows instead.
chunk_ids = [
    hashlib.sha256(chunk.encode('utf-8', errors='replace')).hexdigest()
    for chunk in texts
]
astra_vector_store.add_texts(texts, ids=chunk_ids)

# These are text chunks of the PDF, not headlines.
print(f"Inserted {len(texts)} chunks")

# Wrap the store so it can be queried with an LLM in the QA loop.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 205 headlines


## Run QA Cycle

In [14]:
# Interactive question/answer loop.
# Each round reads a question, answers it through the vector index with the
# LLM, then lists the four best-matching chunks with their similarity scores.
# Typing 'quit' (any letter case) ends the loop; blank input re-prompts.
first_question = True
while True:
  # The wording of the prompt changes once the first question has been asked.
  query_text = input(
    "Ask a question or type 'quit' to exit: "
    if first_question
    else "What is your next question or type 'quit' to exit: "
  ).strip()

  if query_text.lower() == 'quit':
    break
  if not query_text:
    # Nothing typed: ask again without counting it as a question.
    continue

  first_question = False
  print(f"Question: {query_text}")

  response = astra_vector_index.query(query_text, llm=llm).strip()
  print(f"Answer: {response}")

  print("First documents by relevance:")
  for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
    # Only the first 84 characters of each chunk are shown.
    print(f"{score} \"{doc.page_content[:84]}...\"")

Question: What are GNNs
Answer: GNNs are graph neural networks, which are modern deep learning models used to process graph-structured data. They involve exchanging information across neighboring nodes and rely on the inter-variable dependencies represented by the graph edges. GNNs are defined in the spatial domain and involve transforming the input signal with learnable functions along the dimension of N.
First documents by relevance:
0.9215013677038083 "We introduce graph neural networks as modern deep learning
models to process graph-s..."
0.9214874918418028 "edges. Aware of the different nuances, we deﬁne GNNs in the
spatial domain, which in..."
0.9214850672915897 "We introduce graph neural networks as modern deep learning
models to process graph-s..."
0.921469936265626 "edges. Aware of the different nuances, we deﬁne GNNs in the
spatial domain, which in..."
Question: What are the limitations of GNNs
Answer: The limitations of GNNs include their lack of explicit modeling of spatial