In [4]:
!pip install openai faiss-gpu sentence-transformers pandas




In [5]:
!pip install --upgrade openai



In [10]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import json

# Load structured financial dataset
df = pd.read_csv("structured_10k.csv")
if df.empty:
    # The index is built from the first row only, so an empty file cannot be indexed.
    raise ValueError("structured_10k.csv contains no rows to index")

# Load pre-trained embeddings model for financial text
model = SentenceTransformer("FinLang/finance-embeddings-investopedia")

# Combine headers and content into a single list for embeddings
columns = df.columns.tolist()
content_list = [f"{col}: {df[col].iloc[0]}" for col in columns]  # Use the first row's content

# Generate embeddings
embeddings = model.encode(content_list)
# FAISS only accepts C-contiguous float32 matrices; enforce that explicitly
# instead of relying on the encoder's default output dtype.
embedding_array = np.ascontiguousarray(embeddings, dtype="float32")

# Initialize FAISS index (exact L2 search, one vector per column/section)
dimension = embedding_array.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)

# Add embeddings to the FAISS index
faiss_index.add(embedding_array)

# Save column names for reverse mapping.
# Keys are strings so the in-memory mapping is identical to what json.load()
# returns later (JSON object keys are always strings); the file written below
# is unchanged by this.
section_mapping = {str(i): col for i, col in enumerate(columns)}

# Save the FAISS index and mapping for reuse
faiss.write_index(faiss_index, "finance_10k_index.faiss")
with open("section_mapping.json", "w") as f:
    json.dump(section_mapping, f)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
!pip install --upgrade --upgrade-strategy eager "regex" "charset-normalizer<4" "idna" "urllib3<3" "certifi" "requests" "anyio<5" "distro<2" "sniffio" "h11<0.15" "httpcore==1.*"  "annotated-types" "typing-extensions<5" "pydantic-core==2.27.1" "pydantic<3" "jiter<1" "tqdm" "colorama" "openai" "tiktoken" "httpx<0.28"



Collecting regex
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting anyio<5
  Downloading anyio-4.7.0-py3-none-any.whl.metadata (4.7 kB)
Collecting pydantic<3
  Downloading pydantic-2.10.3-py3-none-any.whl.metadata (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting httpx<0.28
  Downloading httpx-0.27.2-py3

In [2]:
import os
from getpass import getpass

from openai import OpenAI

# Initialize the client.
# The key is taken from the OPENAI_API_KEY environment variable, falling back to
# an interactive prompt, so the secret is never written into the notebook file.
# (An empty-string key, as previously hard-coded here, cannot authenticate.)
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
)



In [12]:
def rag_pipeline_with_gpt(query, faiss_index, section_mapping, df, model, top_k=3,
                          llm_client=None, llm_model="gpt-4o"):
    """
    Retrieval-Augmented Generation pipeline using FAISS and OpenAI GPT.

    The query is embedded, the ``top_k`` nearest sections are looked up in the
    FAISS index, the first-row text of each matching column in ``df`` is
    assembled into a context block, and the query plus context is sent to a
    chat-completion model.

    :param query: The user query.
    :param faiss_index: The FAISS index for document retrieval.
    :param section_mapping: Mapping of FAISS indices to section names. Keys may
        be strings (as produced by a JSON round trip) or integers.
    :param df: DataFrame containing structured financial data; only the first
        row of each retrieved column is read.
    :param model: Sentence transformer model for embedding the query.
    :param top_k: Number of top results to retrieve.
    :param llm_client: Client exposing ``chat.completions.create``. Defaults to
        the notebook-level ``client``.
    :param llm_model: Chat model ID sent with the completion request.
    :return: The raw chat-completion response object; the generated text is at
        ``response.choices[0].message.content``.
    :raises KeyError: If a retrieved index is present in neither string nor
        integer form in ``section_mapping``.
    """
    # Step 1: Encode the query (FAISS requires a 2-D float32 array)
    query_embedding = np.asarray(model.encode([query]), dtype="float32")

    # Step 2: Search FAISS index
    distances, indices = faiss_index.search(query_embedding, k=top_k)

    # Step 3: Retrieve relevant sections
    context_parts = []
    for idx in indices[0]:
        position = int(idx)
        if position < 0:
            # FAISS pads the result with -1 when the index holds fewer than
            # top_k vectors; those slots carry no section.
            continue
        # JSON-loaded mappings use string keys, freshly built ones may use ints.
        str_key = str(position)
        if str_key in section_mapping:
            section_name = section_mapping[str_key]
        else:
            section_name = section_mapping[position]
        section_content = df[section_name].iloc[0]
        context_parts.append(f"Section: {section_name}\n{section_content}\n\n")
    context = "".join(context_parts)

    # Step 4: Generate a response using GPT
    prompt = f"""Below is a user query paired with retrieved context from financial documents.
    Write a detailed, fact-based response that answers the query.

    ### Query:
    {query}

    ### Context:
    {context}

    ### Response:"""
    # Fall back to the notebook-level client when none is injected.
    llm = llm_client if llm_client is not None else client
    response = llm.chat.completions.create(
        model=llm_model,  # Model ID (ensure you have access)
        messages=[
            {"role": "system", "content": "You are a financial analyst assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    return response


In [13]:
import faiss
import json
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load FAISS index and section mapping
faiss_index = faiss.read_index("finance_10k_index.faiss")
with open("section_mapping.json", "r") as f:
    section_mapping = json.load(f)
# Load structured financial dataset
df = pd.read_csv("structured_10k.csv")

# The query has to be embedded with the same model that produced the indexed
# vectors. Reuse it if an earlier cell already loaded it; otherwise load it here
# so this cell also works from the saved index without re-running the build cell.
if "model" not in globals():
    model = SentenceTransformer("FinLang/finance-embeddings-investopedia")

# Query example
query = "What are the risk factors mentioned in the report?"

# Generate response
response = rag_pipeline_with_gpt(query, faiss_index, section_mapping, df, model)




In [14]:
# Dump the complete ChatCompletion object (metadata included) under a heading.
print("Generated Response:", response, sep="\n")

Generated Response:
ChatCompletion(id='chatcmpl-AbUt4JpVm47uJ51DyPJRCfgvZMtCb', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The report outlines several risk factors that could materially and adversely impact the company's business, reputation, operational results, financial condition, and stock price. Here are the key risk factors mentioned in the context:\n\n1. **Interest Rate Risk**: The company is exposed to fluctuations in U.S. interest rates, which can affect both its investment portfolio and its term debt. An increase in interest rates could lead to a decrease in the fair value of the company's investment portfolio, as well as an uptick in interest expenses related to its term debt. The report notes a hypothetical scenario where a 100 basis point rise in interest rates could cause a decline in the portfolio's fair value by $2,755 million in 2024 and $3,089 million in 2023, and increase the annual interest expense by $139 mi

In [None]:
# Show metadata for the installed openai package (version, location, dependencies).
!pip show openai



Name: openai
Version: 1.57.0
Summary: The official Python library for the openai API
Home-page: https://github.com/openai/openai-python
Author: 
Author-email: OpenAI <support@openai.com>
License: Apache-2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: anyio, distro, httpx, jiter, pydantic, sniffio, tqdm, typing-extensions
Required-by: 


In [15]:
# Keep only the assistant's text from the first completion choice and show it.
generated_content = response.choices[0].message.content
print(f"Generated Content: {generated_content}")


Generated Content: The report outlines several risk factors that could materially and adversely impact the company's business, reputation, operational results, financial condition, and stock price. Here are the key risk factors mentioned in the context:

1. **Interest Rate Risk**: The company is exposed to fluctuations in U.S. interest rates, which can affect both its investment portfolio and its term debt. An increase in interest rates could lead to a decrease in the fair value of the company's investment portfolio, as well as an uptick in interest expenses related to its term debt. The report notes a hypothetical scenario where a 100 basis point rise in interest rates could cause a decline in the portfolio's fair value by $2,755 million in 2024 and $3,089 million in 2023, and increase the annual interest expense by $139 million in 2024 and $194 million in 2023.

2. **Foreign Exchange Rate Risk**: The company is also susceptible to risks from changes in foreign exchange rates, primari

In [18]:

# Question about which region leads international sales.
query = "What region contributes most to international sales?"

# Run retrieval + generation, then display just the assistant's text.
response = rag_pipeline_with_gpt(
    query=query,
    faiss_index=faiss_index,
    section_mapping=section_mapping,
    df=df,
    model=model,
)
generated_content = response.choices[0].message.content
print(f"Generated Content: {generated_content}")



Generated Content: Based on the provided context from the company's financial documents, the company manages its business primarily on a geographic basis with reportable segments including the Americas, Europe, Greater China, Japan, and the Rest of Asia Pacific. While the context does not specify which of these regions contributes the most to international sales, it highlights the segmentation of markets that typically cover international operations.

For a more definitive answer, one would need specific financial data regarding sales distribution among these segments. However, historically for many large multinational companies in the tech industry, the Americas (particularly North America) often make significant contributions to overall sales, but in terms of international regions, Greater China and Europe are usually substantial contributors as well.

To gain a precise understanding of which region contributes most to international sales for the company, it would be advisable to ref

In [19]:
# Two-part question: outsourcing locations and iPhone net sales.
# NOTE(review): the output recorded under this cell shows "Object `Sales` not found."
# and an answer to the previous question, so it looks stale — re-run to confirm.
query = "Where is outsourcing located currently? iPhone Net Sales?"

# Run retrieval + generation, then display just the assistant's text.
response = rag_pipeline_with_gpt(
    query=query,
    faiss_index=faiss_index,
    section_mapping=section_mapping,
    df=df,
    model=model,
)
generated_content = response.choices[0].message.content
print(f"Generated Content: {generated_content}")


Object `Sales` not found.
Generated Content: Based on the information provided in the context, Apple Inc. manages its business primarily on a geographic basis and has identified several reportable segments: the Americas, Europe, Greater China, Japan, and Rest of Asia Pacific. Each of these segments provides similar hardware and software products and services, and they are managed separately to align with the location of the company's customers and distribution partners as well as the unique market dynamics of each region. 

To determine which region contributes most to Apple's international sales, we need to look at the sales data across these segments. However, the context provided does not include specific sales figures for each geographic segment. Typically, Apple's Greater China region, comprising China mainland, Hong Kong, and Taiwan, has been one of the significant contributors to its international sales owing to its large consumer base and growing middle class. Europe also plays