In [100]:
import os
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
import fitz
from llama_index.core.node_parser import SentenceSplitter
import torch
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient  
from qdrant_client.models import VectorParams, Distance
from transformers import pipeline
from huggingface_hub import InferenceClient










In [112]:
# Load environment variables
load_dotenv()
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")

In [None]:
import os
import fitz  # PyMuPDF

# Define the file paths
file_paths = [
    os.path.join( "information", "Investment Management.pdf"),
    os.path.join("information", "Journal of Finance.pdf")
]

# Extract text from both PDFs
entire_text = ""

for file_path in file_paths:
    if os.path.exists(file_path):
        with fitz.open(file_path) as doc:
            entire_text += "\n\n".join(page.get_text() for page in doc)
    else:
        print(f"File not found: {file_path}")  # Debugging statement

# Print only the first 1000 characters to verify extraction
print(entire_text[:1000])


JWPR026-Fabozzi
c01
June 22, 2008
6:54
CHAPTER 1
Portfolio Selection
FRANK J. FABOZZI, PhD, CFA, CPA
Professor in the Practice of Finance, Yale School of Management
HARRY M. MARKOWITZ, PhD
Consultant
FRANCIS GUPTA, PhD
Director, Research, Dow Jones Indexes
Some Basic Concepts
4
Utility Function and Indifference Curves
4
The Set of Efﬁcient Portfolios
and the Optimal Portfolio
4
Risky Assets versus Risk-Free Assets
4
Measuring a Portfolio’s Expected Return
5
Measuring Single-Period Portfolio Return
5
The Expected Return of a Portfolio of Risky
Assets
5
Measuring Portfolio Risk
6
Variance and Standard Deviation
as a Measure of Risk
6
Measuring the Risk of a Portfolio Comprised
of More than Two Assets
8
Portfolio Diversiﬁcation
8
Portfolio Risk and Correlation
9
The Effect of the Correlation of Asset Returns on
Portfolio Risk
9
Choosing a Portfolio of Risky Assets
9
Constructing Efﬁcient Portfolios
10
Feasible and Efﬁcient Portfolios
10
Choosing the Optimal Portfolio in the Efﬁcient Set
1

In [114]:
# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
text_chunks = text_splitter.split_text(entire_text)

llamaindex_splitter = SentenceSplitter(chunk_size=500, chunk_overlap=20)
llamaindex_text_chunks = llamaindex_splitter.split_text(entire_text)

In [115]:
# Load embedding model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "BAAI/bge-small-en-v1.5"
embedding_model = SentenceTransformer(model_name, device=device)
embeddings = embedding_model.encode(text_chunks, show_progress_bar=True)

Batches: 100%|██████████| 7/7 [00:31<00:00,  4.43s/it]


In [116]:
# Set up Qdrant vector database
client = QdrantClient("http://localhost:6333")
collection_name = "qa_index"

# Check if the collection already exists
existing_collections = [c.name for c in client.get_collections().collections]

if collection_name not in existing_collections:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )
    print(f"Collection '{collection_name}' created successfully.")
else:
    print(f"Collection '{collection_name}' already exists.")


Collection 'qa_index' already exists.


In [117]:
# Upload text chunks to Qdrant
ids = list(range(len(text_chunks)))
payload = [{"source": FILE_PATH, "content": text} for text in text_chunks]
client.upload_collection(
    collection_name=collection_name,
    vectors=embeddings,
    payload=payload,
    ids=ids,
    batch_size=256,
)


In [118]:

# Search function
def search(text: str, top_k: int):
    query_embedding = embedding_model.encode(text).tolist()
    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        query_filter=None,  
        limit=top_k
    )
    return search_result

In [None]:
while True:
    question = input("\nEnter your question (or type 'exit' to stop): ")
    
    if question.lower() == "exit":
        print("Goodbye!")
        break

    results = search(question, top_k=5)
    references = [obj.payload["content"] for obj in results]
    context = "\n\n".join(references)


  search_result = client.search(


In [129]:

# Set up Hugging Face inference client
hf_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.1", token=HUGGINGFACE_API_KEY)

# Full prompt formatting
input_text = f"""You are an assistant for giving the objectives of tasks listed in a document. 
Answer the question only according to the given context.

If the question cannot be answered using the context, simply say 'I don't know.' Do not make stuff up.

Context:
{context}

Question: {question}

Answer:"""

In [130]:
response = hf_client.text_generation(input_text, max_new_tokens=200)

print(response)

 A portfolio is a combination of different assets, such as stocks, bonds, and real estate, that are held together to achieve a specific investment objective, such as maximizing expected returns while minimizing risk.
