# Financial Insights Generator


## Installing dependencies / packages


In [1]:
!pip install pypdf
!pip install -q transformers einops accelerate langchain bitsandbytes
!pip install sentence_transformers
!pip install llama_index
!pip install llama-index-llms-huggingface
!pip install llama-index-embeddings-langchain
!pip install -U sec-edgar-downloader
!pip install Flask==3.0.0
!pip install flask-ngrok
!pip install pyngrok==7.1.2

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m276.5/290.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m867.6/867.6 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00

## Logging into huggingface to access the Llama model

In [2]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; in a notebook this renders the login widget shown in
# the cell output. The Llama checkpoint loaded later may require an account with granted access.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt




In [4]:
from sec_edgar_downloader import Downloader    # Required to download the SEC 10-K filings for a company

In [5]:
# Instruction text passed to the LLM as its system prompt.
system_prompt = """
You are an insights generation system.
Your goal is to generate output based on instructions and context provided
"""
# Template wrapped around every query before it is sent to the model.
# Llama-2 chat checkpoints are fine-tuned on the `[INST] ... [/INST]` instruction format.
# The previous `<|USER|>...<|ASSISTANT|>` markers belong to a different model family (StableLM)
# and are treated as ordinary text by Llama 2.
query_wrapper_prompt = SimpleInputPrompt("[INST] {query_str} [/INST]")

## Getting the pre-trained Llama2 LLM



In [7]:
import torch

# Build the LLM wrapper around the Llama-2 7B chat checkpoint.
llm = HuggingFaceLLM(
    # The checkpoint and its tokenizer may need access approval on the Hugging Face Hub;
    # the approval is tied to the account used for login() earlier in the notebook.
    model_name="meta-llama/Llama-2-7b-chat-hf",
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",
    # Load the model in 8-bit precision with float16 as the torch dtype to reduce memory use.
    # NOTE(review): the load output reports `load_in_8bit` as deprecated in favour of passing
    # a `BitsAndBytesConfig` through `quantization_config`.
    model_kwargs={
        "torch_dtype": torch.float16,
        "load_in_8bit": True,
    },
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=256,
    # Temperature rescales the next-token probabilities, i.e. how "creative" sampling is.
    # Sampling is switched off here (do_sample=False), so the temperature value is expected
    # to have no effect on the generated text.
    generate_kwargs={
        "temperature": 0.0,
        "do_sample": False,
    },
)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [8]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext
from llama_index.embeddings.langchain import LangchainEmbedding

# Sentence-transformers encoder, loaded through LangChain's Hugging Face embeddings class ...
sentence_encoder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
# ... and wrapped in the LlamaIndex adapter so it can be used as the embedding model.
embed_model = LangchainEmbedding(sentence_encoder)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Bundle the LLM, the embedding model and the chunk size into one service context
# that is handed to the vector index when documents are indexed.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=1024,
)

  service_context=ServiceContext.from_defaults(


## Importing libraries needed to run the flask server on ngrok

In [10]:
from pyngrok import ngrok
from flask_ngrok import run_with_ngrok
from flask import Flask, render_template, request
import os
from getpass import getpass

# The ngrok auth token is a secret and must not be stored in the notebook.
# It is taken from the NGROK_AUTHTOKEN environment variable when set; otherwise the user is
# prompted for it (the input is not echoed). The previous placeholder line had no right-hand
# side at all and therefore raised a SyntaxError when the cell was run.
ngrok_key = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok auth token: ")
# Local port the Flask server listens on and the tunnel forwards to.
port = 5000
# SEC EDGAR downloader, constructed with a requester name and contact e-mail address.
dl = Downloader("Student", "bhardwajmeher01@gmail.com")

## Downloading the SEC 10-K filings

In [12]:
import os
import shutil
def getting_data_and_storing_it(code):
  """Download a company's 10-K filings and stage them in one flat folder for indexing.

  The filings are fetched with the module-level downloader ``dl`` into
  ``/content/sec-edgar-filings/<code>/10-K/<accession-number>/`` and every ``.txt``
  file found there is moved into ``/content/data/``.

  The downloader stores each filing in its own accession-number folder under the same
  file name, so the accession folder name is prefixed to the staged file name. Without
  the prefix every move would overwrite the previous one and only a single, arbitrary
  filing would survive. ``.txt`` files left in the staging folder by an earlier call are
  removed first, so the folder only ever holds the filings of the requested company.

  Args:
    code: Ticker symbol of the company, e.g. ``"AAPL"``.

  Raises:
    FileNotFoundError: If the expected download folder does not exist after the download.
  """
  dl.get("10-K", code, after="1995-01-01")
  root_dir = f"/content/sec-edgar-filings/{code}/10-K/"  # Set pattern in which the downloaded filings appear
  dest_dir = "/content/data/"
  # Listed before the staging folder is touched, so a failed download leaves it unchanged.
  root_contents = os.listdir(root_dir)
  os.makedirs(dest_dir, exist_ok=True)

  # Drop filings staged by a previous call so different companies are never mixed.
  for stale_name in os.listdir(dest_dir):
    stale_path = os.path.join(dest_dir, stale_name)
    if stale_name.endswith('.txt') and os.path.isfile(stale_path):
      os.remove(stale_path)

  for item in root_contents:
    item_path = os.path.join(root_dir, item)
    if not os.path.isdir(item_path):
      continue
    for folder_item in os.listdir(item_path):
      # The downloader saves the filings in .txt format.
      if folder_item.endswith('.txt'):
        source_path = os.path.join(item_path, folder_item)
        # Prefix with the accession folder name to keep staged file names unique.
        dest_path = os.path.join(dest_dir, f"{item}_{folder_item}")
        shutil.move(source_path, dest_path)



## Setting up the flask server and setting the routes

In [20]:
# Register the auth token with pyngrok, then open a tunnel to the local port.
ngrok.set_auth_token(ngrok_key)
# Keep and show the tunnel address; previously the value was computed and thrown away,
# so the address of this tunnel was never visible to the user.
public_url = ngrok.connect(port).public_url
print(f" * pyngrok tunnel: {public_url}")
# Create the Flask application and register it with flask_ngrok.
app = Flask(__name__)
run_with_ngrok(app)

@app.route("/")
def index():
  """Serve the landing page rendered from the 'index.html' template."""
  # The default templates folder may have to be changed if the file lives in a different folder.
  landing_page = render_template('index.html')
  return landing_page

@app.route("/result", methods=['POST'])
def result():
  """Handle the company form: fetch its 10-K filings, index them and answer four fixed questions.

  Reads the 'companies' field of the submitted form, maps it to a ticker symbol
  (anything other than 'google' or 'microsoft' falls back to 'AAPL'), downloads and stages
  the filings, builds a vector index over them and runs the four queries in order.

  Returns:
    The rendered 'result.html' template. Its ``response`` argument is a dict whose key 0
    holds the selected option and whose keys 1-4 hold the query responses.
  """
  ticker_by_company = {'google': 'GOOG', 'microsoft': 'MSFT'}
  # The questions, in the order their answers are stored under keys 1 to 4.
  prompts = (
    "Examine the company's cash flow statement from the latest 10K. What insights can be learnt regarding cash flow management? write very briefly in numeric terms.",
    "Extract key financial ratios such as ROE, ROA, and current ratio of the Company from their most recent 10K filing, along with the net income and profitability margins, and their general impact. Write very briefly in a paragraph.",
    "Identify significant risks faced by the company as outlined in their 10K filings. write in less than 5 lines.",
    "Identify strategic initiatives and future outlook provided by the management of the company in their 10K filing. Write in less than 5 lines.",
  )

  selected_option = request.form['companies']
  code = ticker_by_company.get(selected_option, 'AAPL')
  getting_data_and_storing_it(code)

  # Load the staged filings and build the vector index the query engine retrieves from.
  documents = SimpleDirectoryReader("/content/data/").load_data()
  vector_index = VectorStoreIndex.from_documents(documents, service_context=service_context)
  query_engine = vector_index.as_query_engine()

  results = {0: selected_option}
  for slot, prompt in enumerate(prompts, start=1):
    results[slot] = query_engine.query(prompt)
  return render_template('result.html', response=results)
# Start the Flask server only when this code runs as the entry point.
if __name__ == "__main__":
  app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://af6e-34-91-140-129.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [03/May/2024 18:14:37] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [03/May/2024 18:14:38] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
