In [38]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS, Chroma
from langchain_openai import OpenAIEmbeddings

In [39]:
import warnings
warnings.filterwarnings('ignore')

In [40]:
from sentence_transformers import SentenceTransformer

In [41]:
import os
import glob
from pathlib import Path

In [42]:
from tqdm import tqdm

In [43]:
data_path = Path(r"C:\Users\Shaaf\Desktop\Data Science\Practice Projects\PDF_Video Reader\PDF_Samples")

In [44]:
pdf_files = list(data_path.rglob("**.pdf"))

In [45]:
pdf_files

[WindowsPath('C:/Users/Shaaf/Desktop/Data Science/Practice Projects/PDF_Video Reader/PDF_Samples/A Handbook of statistical distributions Krishmoorthi.pdf'),
 WindowsPath('C:/Users/Shaaf/Desktop/Data Science/Practice Projects/PDF_Video Reader/PDF_Samples/Analysis_of_Time_Series_An_Introduction.pdf')]

In [46]:
documents = []
for pdf_file in tqdm(pdf_files,desc="Loading PDFs"):
    try:
        loader = PyPDFLoader(str(pdf_file))
        documents.extend(loader.load())
    except Exception as e:
        print(f"[ERROR] Failed to load PDF {pdf_file}: {e}")

Loading PDFs: 100%|██████████| 2/2 [00:29<00:00, 14.91s/it]


In [47]:
print(len(documents))
print(documents[1].page_content[:500])
print(documents[1].metadata)


639
Chapman & Hall/CRC
Taylor & Francis Group
6000 Broken Sound Parkway NW, Suite 300
Boca Raton, FL 33487-2742
© 2006 by Taylor & Francis Group, LLC 
Chapman & Hall/CRC is an imprint of Taylor & Francis Group, an Informa business
No claim to original U.S. Government works
Printed in the United States of America on acid-free paper
10 9 8 7 6 5 4 3 2 1
International Standard Book Number-10: 1-58488-635-8 (Hardcover)
International Standard Book Number-13: 978-1-58488-635-8 (Hardcover)
This book contai
{'producer': 'iText 1.4 (by lowagie.com)', 'creator': 'PyPDF', 'creationdate': '2006-09-19T12:32:54+04:00', 'moddate': '2006-09-19T12:52:37+04:00', 'source': 'C:\\Users\\Shaaf\\Desktop\\Data Science\\Practice Projects\\PDF_Video Reader\\PDF_Samples\\A Handbook of statistical distributions Krishmoorthi.pdf', 'total_pages': 346, 'page': 1, 'page_label': '2'}


In [48]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

In [49]:
chunked_documents= text_splitter.split_documents(documents=documents)

In [50]:
print(f"Original pages: {len(documents)}")
print(f"Chunks created: {len(chunked_documents)}")

print(chunked_documents[0].page_content[:500])
print(chunked_documents[0].metadata)


Original pages: 639
Chunks created: 1491
K. Krishnamoorthy
University of Louisiana at Lafayette
U.S.A.
Handbook of Statistical
Distributions with
Applications
Boca Raton   London   New York
© 2006 by Taylor & Francis Group, LLC
{'producer': 'iText 1.4 (by lowagie.com)', 'creator': 'PyPDF', 'creationdate': '2006-09-19T12:32:54+04:00', 'moddate': '2006-09-19T12:52:37+04:00', 'source': 'C:\\Users\\Shaaf\\Desktop\\Data Science\\Practice Projects\\PDF_Video Reader\\PDF_Samples\\A Handbook of statistical distributions Krishmoorthi.pdf', 'total_pages': 346, 'page': 0, 'page_label': '1'}


In [51]:
embeddings =SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", trust_remote_code=True)

In [52]:
from langchain.embeddings.base import Embeddings

class QwenEmbeddings(Embeddings):
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        return self.model.encode(
            texts,
            normalize_embeddings=True,
            show_progress_bar=False
        ).tolist()

    def embed_query(self, text):
        return self.model.encode(
            text,
            normalize_embeddings=True
        ).tolist()


In [None]:
from sentence_transformers import SentenceTransformer

qwen_model = SentenceTransformer(
    "all-MiniLM-L6-v2",
        trust_remote_code=True
)


embeddings = QwenEmbeddings(qwen_model)


In [54]:
vectorstore = FAISS.from_documents(
    documents=chunked_documents,
    embedding=embeddings)

In [55]:
vectorstore.save_local("faiss_qwen")

In [56]:
##loading the vector store
vectorstore = FAISS.load_local(
    "faiss_qwen",
    embeddings,
    allow_dangerous_deserialization=True
)

In [57]:
querry = "What is this document about?"

In [58]:
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS

qwen_model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    trust_remote_code=True
)

embeddings = QwenEmbeddings(qwen_model)

vectorstore = FAISS.load_local(
    "faiss_qwen",
    embeddings,
    allow_dangerous_deserialization=True
)


In [59]:
query = "what is time series analysis?"

docs = vectorstore.similarity_search(
    query=query,
    k=5
)


In [60]:
docs[0].page_content

"deterministic. But most time series are stochastic in that the future is only \npartly determined by past values, so that exact predictions are impossible and \nmust be replaced by the idea that future values have a probability distribution \nwhich is conditioned by a knowledge of past values. \n1.3 OBJECTIVES OF TIME-SERIES ANALYSIS \nThere are several possible objectives in analysing a time series. These \nobjectives may be classified as description, explanation, prediction and \ncontrol, and will be considered in turn. \n(a) Description \nWhen presented with a time series, the first step in the analysis is usually to plot \nthe data and to obtain simple descriptive measures of the main properties of \nthe series as described in Chapter 2. For example, looking at Figure 1.3 it can \nbe seen that there is a regular seasonal effect, with sales 'high' in winter and \n'low' in summer It also looks as though annual sales are increasing (i e show"

In [61]:
for d in docs:
    print(d.page_content[:])

deterministic. But most time series are stochastic in that the future is only 
partly determined by past values, so that exact predictions are impossible and 
must be replaced by the idea that future values have a probability distribution 
which is conditioned by a knowledge of past values. 
1.3 OBJECTIVES OF TIME-SERIES ANALYSIS 
There are several possible objectives in analysing a time series. These 
objectives may be classified as description, explanation, prediction and 
control, and will be considered in turn. 
(a) Description 
When presented with a time series, the first step in the analysis is usually to plot 
the data and to obtain simple descriptive measures of the main properties of 
the series as described in Chapter 2. For example, looking at Figure 1.3 it can 
be seen that there is a regular seasonal effect, with sales 'high' in winter and 
'low' in summer It also looks as though annual sales are increasing (i e show
Objectives of time-series analysis 5 
in Figure 1.5. A t

In [62]:
context = "\n\n".join(
    [
        f"(Page {d.metadata.get('page','?')}) {d.page_content}"
        for d in docs
    ]
)

In [63]:
context

"(Page 14) deterministic. But most time series are stochastic in that the future is only \npartly determined by past values, so that exact predictions are impossible and \nmust be replaced by the idea that future values have a probability distribution \nwhich is conditioned by a knowledge of past values. \n1.3 OBJECTIVES OF TIME-SERIES ANALYSIS \nThere are several possible objectives in analysing a time series. These \nobjectives may be classified as description, explanation, prediction and \ncontrol, and will be considered in turn. \n(a) Description \nWhen presented with a time series, the first step in the analysis is usually to plot \nthe data and to obtain simple descriptive measures of the main properties of \nthe series as described in Chapter 2. For example, looking at Figure 1.3 it can \nbe seen that there is a regular seasonal effect, with sales 'high' in winter and \n'low' in summer It also looks as though annual sales are increasing (i e show\n\n(Page 14) Objectives of time-

In [64]:
prompt = f"""
You are a policy compliance expert.

Answer the question using ONLY the information provided below.
If the answer is not present in the text, respond with:
"Not specified in the document."

Context:
{context}

Question:
{query}
"""


In [65]:
from langchain_community.llms import Ollama

llm = Ollama(
    model="llama3.1:8b",
    temperature=0
)


In [67]:
response = llm.invoke(prompt)
print(response)

Not specified in the document.

However, based on the provided text, it appears that Time Series Analysis involves:

* Plotting data and obtaining descriptive measures
* Describing the main properties of the series (e.g. regular seasonal effects)
* Predicting future values from past observations
* Accounting for cyclic components at different frequencies through spectral analysis

But a clear definition or comprehensive explanation of what time series analysis is, is not explicitly stated in the provided text.


In [68]:
from sentence_transformers import CrossEncoder

In [69]:
reranker = CrossEncoder("BAAI/bge-reranker-base")

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [101]:
pairs = [(query, doc.page_content) for doc in docs]

In [102]:
pairs

[('what is time series analysis?',
  "deterministic. But most time series are stochastic in that the future is only \npartly determined by past values, so that exact predictions are impossible and \nmust be replaced by the idea that future values have a probability distribution \nwhich is conditioned by a knowledge of past values. \n1.3 OBJECTIVES OF TIME-SERIES ANALYSIS \nThere are several possible objectives in analysing a time series. These \nobjectives may be classified as description, explanation, prediction and \ncontrol, and will be considered in turn. \n(a) Description \nWhen presented with a time series, the first step in the analysis is usually to plot \nthe data and to obtain simple descriptive measures of the main properties of \nthe series as described in Chapter 2. For example, looking at Figure 1.3 it can \nbe seen that there is a regular seasonal effect, with sales 'high' in winter and \n'low' in summer It also looks as though annual sales are increasing (i e show"),
 (

In [103]:
scores = reranker.predict(pairs)

In [104]:
# 4. Sort by score
reranked_docs = sorted(
    zip(docs, scores),
    key=lambda x: x[1],
    reverse=True
)

In [105]:
reranked_docs

[(Document(id='79fe7641-1e48-4034-8275-479f9b379782', metadata={'producer': 'Adobe Acrobat Pro 11.0.20 Paper Capture Plug-in', 'creator': 'PScript5.dll Version 5.2', 'creationdate': '2005-08-18T01:21:09+03:00', 'author': 'dbsubasi', 'moddate': '2017-12-23T12:10:40+05:00', 'title': 'DjVu Document', 'source': 'C:\\Users\\Shaaf\\Desktop\\Data Science\\Practice Projects\\PDF_Video Reader\\PDF_Samples\\Analysis_of_Time_Series_An_Introduction.pdf', 'total_pages': 293, 'page': 14, 'page_label': '15'}, page_content="deterministic. But most time series are stochastic in that the future is only \npartly determined by past values, so that exact predictions are impossible and \nmust be replaced by the idea that future values have a probability distribution \nwhich is conditioned by a knowledge of past values. \n1.3 OBJECTIVES OF TIME-SERIES ANALYSIS \nThere are several possible objectives in analysing a time series. These \nobjectives may be classified as description, explanation, prediction and \

In [74]:
# 5. Select top 3
top_docs = [doc for doc, score in reranked_docs[:3]]

In [75]:
top_docs

[Document(id='79fe7641-1e48-4034-8275-479f9b379782', metadata={'producer': 'Adobe Acrobat Pro 11.0.20 Paper Capture Plug-in', 'creator': 'PScript5.dll Version 5.2', 'creationdate': '2005-08-18T01:21:09+03:00', 'author': 'dbsubasi', 'moddate': '2017-12-23T12:10:40+05:00', 'title': 'DjVu Document', 'source': 'C:\\Users\\Shaaf\\Desktop\\Data Science\\Practice Projects\\PDF_Video Reader\\PDF_Samples\\Analysis_of_Time_Series_An_Introduction.pdf', 'total_pages': 293, 'page': 14, 'page_label': '15'}, page_content="deterministic. But most time series are stochastic in that the future is only \npartly determined by past values, so that exact predictions are impossible and \nmust be replaced by the idea that future values have a probability distribution \nwhich is conditioned by a knowledge of past values. \n1.3 OBJECTIVES OF TIME-SERIES ANALYSIS \nThere are several possible objectives in analysing a time series. These \nobjectives may be classified as description, explanation, prediction and \n

In [107]:
scores = reranker.predict(pairs)

In [108]:
scores

array([0.9914096 , 0.9533738 , 0.90799993, 0.9897343 , 0.79404813],
      dtype=float32)

In [110]:
for i, (doc, score) in enumerate(zip(docs, scores)):
    print(f"\n--- Chunk {i+1} ---")
    print(f"Score: {score:.4f}")
    print(doc.page_content[:])  # first 300 chars



--- Chunk 1 ---
Score: 0.9914
deterministic. But most time series are stochastic in that the future is only 
partly determined by past values, so that exact predictions are impossible and 
must be replaced by the idea that future values have a probability distribution 
which is conditioned by a knowledge of past values. 
1.3 OBJECTIVES OF TIME-SERIES ANALYSIS 
There are several possible objectives in analysing a time series. These 
objectives may be classified as description, explanation, prediction and 
control, and will be considered in turn. 
(a) Description 
When presented with a time series, the first step in the analysis is usually to plot 
the data and to obtain simple descriptive measures of the main properties of 
the series as described in Chapter 2. For example, looking at Figure 1.3 it can 
be seen that there is a regular seasonal effect, with sales 'high' in winter and 
'low' in summer It also looks as though annual sales are increasing (i e show

--- Chunk 2 ---
Score: 0

In [93]:
prompt = f"""
You are a policy compliance expert.

Answer the question using ONLY the information provided below.
If the answer is not present in the text, respond with:
"Not specified in the document."

Context:
{reranked_docs}

Question:
{query}
"""
response = llm.invoke(prompt)
print(response)

Not specified in the document.


In [119]:
page_contents = [doc.page_content for doc, score in reranked_docs]

In [120]:
page_contents

["deterministic. But most time series are stochastic in that the future is only \npartly determined by past values, so that exact predictions are impossible and \nmust be replaced by the idea that future values have a probability distribution \nwhich is conditioned by a knowledge of past values. \n1.3 OBJECTIVES OF TIME-SERIES ANALYSIS \nThere are several possible objectives in analysing a time series. These \nobjectives may be classified as description, explanation, prediction and \ncontrol, and will be considered in turn. \n(a) Description \nWhen presented with a time series, the first step in the analysis is usually to plot \nthe data and to obtain simple descriptive measures of the main properties of \nthe series as described in Chapter 2. For example, looking at Figure 1.3 it can \nbe seen that there is a regular seasonal effect, with sales 'high' in winter and \n'low' in summer It also looks as though annual sales are increasing (i e show",
 'when a variable does not have an inst

In [121]:
context_text = "\n\n---\n\n".join([doc.page_content for doc, score in reranked_docs])

In [122]:
print(context_text)

deterministic. But most time series are stochastic in that the future is only 
partly determined by past values, so that exact predictions are impossible and 
must be replaced by the idea that future values have a probability distribution 
which is conditioned by a knowledge of past values. 
1.3 OBJECTIVES OF TIME-SERIES ANALYSIS 
There are several possible objectives in analysing a time series. These 
objectives may be classified as description, explanation, prediction and 
control, and will be considered in turn. 
(a) Description 
When presented with a time series, the first step in the analysis is usually to plot 
the data and to obtain simple descriptive measures of the main properties of 
the series as described in Chapter 2. For example, looking at Figure 1.3 it can 
be seen that there is a regular seasonal effect, with sales 'high' in winter and 
'low' in summer It also looks as though annual sales are increasing (i e show

---

when a variable does not have an instantaneous val

In [123]:
prompt = f"""
You are a policy compliance expert.

Answer the question using ONLY the information provided below.
If the answer is not present in the text, respond with:
"Not specified in the document."

Context:
{context_text}

Question:
{query}
"""
response = llm.invoke(prompt)
print(response)

Time-series analysis is described as a process that involves several possible objectives, including description, explanation, prediction, and control. It typically begins with plotting the data and obtaining simple descriptive measures to understand the main properties of the series.

More specifically, it is mentioned in Section 1.3 OBJECTIVES OF TIME-SERIES ANALYSIS that time-series analysis has four main objectives:

* Description: Plotting the data and obtaining simple descriptive measures.
* Explanation: Understanding why a particular pattern or trend exists in the data.
* Prediction: Using past observations to forecast future values.
* Control: Identifying opportunities for intervention to influence the behavior of the series.

Additionally, it is mentioned that time-series analysis takes into account the time order of the observations and considers how successive observations are dependent on each other.
