In [None]:
#!pip install openai langchain langgraph langchain-openai langchain-core langchain-community langchain-text-splitters langchain-elasticsearch "google-cloud-storage<3.0.0" beautifulsoup4 firebase-admin

In [None]:
#Libraries
import os
import pandas as pd
import requests
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_elasticsearch import ElasticsearchStore


### Model LLM

In [None]:
#Read OPENAI API
# The key is loaded from a local file and exported through the environment
# variable OPENAI_API_KEY for the OpenAI clients created later.
# .strip() removes surrounding whitespace: a key file saved with a trailing
# newline would otherwise carry "\n" inside the key value.
with open('keys/api_openai.txt', 'r') as file:
    api_openai = file.read().strip()
os.environ['OPENAI_API_KEY'] = api_openai

In [None]:
# Chat LLM client: pinned GPT-4.1 snapshot with temperature 0
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4.1-2025-04-14",
)

In [None]:
# Embedding client used to vectorise the transcript chunks before indexing
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

### RAG

In [None]:
#Ninjas API
# The key is sent verbatim in the 'X-Api-Key' request header further down, so
# surrounding whitespace (typically a trailing newline in the key file) must be
# removed; header values containing a newline are rejected by requests.
with open('keys/api_ninja.txt', 'r') as file:
    api_ninjas = file.read().strip()

In [None]:
# Part 1: Ingesting

# Load the company list and keep only the ticker column
df = pd.read_csv('SP500_Dataset/sp500_companies.csv')
df_summarized = df[['Symbol']]
# Reporting periods to fetch, as (year, quarter) string pairs
year_quarters = [('2024', '1'), ('2024', '2'), ('2024', '3'), ('2024', '4'), ('2025', '1')]
# Cartesian product: every ticker is paired with every (year, quarter) period
periods = pd.DataFrame(year_quarters, columns=['Year', 'Quarter'])
df_summarized = df_summarized.merge(periods, how='cross')
df_summarized.head(10)

Unnamed: 0,Symbol,Year,Quarter
0,AAPL,2024,1
1,AAPL,2024,2
2,AAPL,2024,3
3,AAPL,2024,4
4,AAPL,2025,1
5,NVDA,2024,1
6,NVDA,2024,2
7,NVDA,2024,3
8,NVDA,2024,4
9,NVDA,2025,1


In [None]:
# loop each row
# Downloads one earnings-call transcript per (Symbol, Year, Quarter) row of
# df_summarized and stores it as <ticker>_<year>_<quarter>.txt.
# Rows without a usable transcript are reported and skipped; nothing is written
# for them, so a file never contains another company's transcript.
transcript_dir = '/content/sample_data/transcript_v2/'
transcript_api_url = 'https://api.api-ninjas.com/v1/earningstranscript'
# Create the output folder up front so open(..., 'w') cannot fail on a fresh runtime.
os.makedirs(transcript_dir, exist_ok=True)

count = 0  # number of transcript files actually written
for index, row in df_summarized.iterrows():
  ticker = row['Symbol']
  year = row['Year']
  quarter = row['Quarter']
  file_name = transcript_dir + ticker + '_' + str(year) + '_' + str(quarter) + '.txt'
  # call API
  # params= lets requests URL-encode the values; timeout keeps a stalled
  # connection from blocking the whole loop.
  try:
    response = requests.get(
        transcript_api_url,
        params={'ticker': ticker, 'year': year, 'quarter': quarter},
        headers={'X-Api-Key': api_ninjas},
        timeout=30,
    )
  except requests.RequestException as exc:
    print("Error in row:", file_name, "- request failed:", exc)
    continue
  if response.status_code != requests.codes.ok:
    print("Error in row:", file_name, "- HTTP status", response.status_code)
    continue
  # Transcripts
  try:
    transcript_txt = response.json()["transcript"]
  except (ValueError, KeyError, TypeError):
    # ValueError: body is not JSON; KeyError: no "transcript" field;
    # TypeError: payload is not a mapping (e.g. a list).
    print("Error in row:", file_name)
    continue
  if not isinstance(transcript_txt, str) or not transcript_txt:
    # Nothing usable to store for this row.
    print("Error in row:", file_name)
    continue
  with open(file_name, 'w') as f:
    f.write(transcript_txt)
  count += 1

Error in row: /content/sample_data/transcript_v2/BRK-B_2024_1.txt
Error in row: /content/sample_data/transcript_v2/BRK-B_2024_2.txt
Error in row: /content/sample_data/transcript_v2/BRK-B_2024_3.txt
Error in row: /content/sample_data/transcript_v2/BRK-B_2024_4.txt
Error in row: /content/sample_data/transcript_v2/BRK-B_2025_1.txt
Error in row: /content/sample_data/transcript_v2/GD_2024_4.txt
Error in row: /content/sample_data/transcript_v2/TDG_2024_4.txt
Error in row: /content/sample_data/transcript_v2/CEG_2024_4.txt
Error in row: /content/sample_data/transcript_v2/ADSK_2025_1.txt
Error in row: /content/sample_data/transcript_v2/LULU_2025_1.txt
Error in row: /content/sample_data/transcript_v2/KR_2025_1.txt
Error in row: /content/sample_data/transcript_v2/HES_2024_1.txt
Error in row: /content/sample_data/transcript_v2/HES_2024_2.txt
Error in row: /content/sample_data/transcript_v2/HES_2024_3.txt
Error in row: /content/sample_data/transcript_v2/HES_2024_4.txt
Error in row: /content/sample_

In [None]:
# Part 2: Chunking
# Splits every saved transcript into overlapping chunks, tags each chunk with
# company / year / quarter taken from the file name, and indexes the chunks in
# Elasticsearch.

transcript_dir = '/content/sample_data/transcript_v2/'
index_name = "indx_project"

#Chunking
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1800, chunk_overlap=250)

#ElasticSearch - had to interrupt due to time processing 2.53 GB storage
# The store (and its client connection) is loop-invariant, so it is built once
# here instead of once per file.
db = ElasticsearchStore(
    index_name=index_name,
    embedding=embeddings,
    es_url="<your_instance_url>",
    es_user="<user>",
    es_password="<password>",
)

indexed_files = 0  # number of files that contributed at least one chunk
#loop each file
for filename in sorted(os.listdir(transcript_dir)):
  file_path = os.path.join(transcript_dir, filename)
  if not os.path.isfile(file_path):
    continue
  # Expected name: <ticker>_<year>_<quarter>.txt. Splitting from the right keeps
  # tickers that themselves contain '_' or '.' intact.
  stem, extension = os.path.splitext(filename)
  name_parts = stem.rsplit("_", 2)
  if extension != '.txt' or len(name_parts) != 3:
    print("Skipping unexpected file:", file_path)
    continue
  company, year, quarter = name_parts
  #text loader
  loader = TextLoader(file_path)
  doc = loader.load()
  #generate chunks
  schunk = text_splitter.split_documents(doc)
  if not schunk:
    # Empty transcript: nothing to embed or index.
    continue
  #add Metadata
  for chunk in schunk:
    chunk.metadata['company'] = company
    chunk.metadata['quarter'] = quarter
    chunk.metadata['year'] = year
  db.add_documents(schunk)
  indexed_files += 1

# Refresh once at the end so the new chunks become searchable; skipped when
# nothing was indexed.
if indexed_files:
  db.client.indices.refresh(index=index_name)