In [2]:
import os
import logging
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from google.cloud import documentai_v1beta3 as documentai
from qdrant_client import QdrantClient, models
from settings import env_settings
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): `from settings import env_settings` runs *before* this call; if
# `settings` reads os.environ at import time, values from .env would not be
# visible to it yet -- confirm how `settings` loads its configuration.
load_dotenv()


# Module-level logger; used by EthanRAG for progress and error messages.
logger = logging.getLogger(__name__)

# Connection/configuration values copied from `env_settings` so the rest of the
# notebook can refer to short module-level names.
QDRANT_URL = env_settings.QDRANT_URL
QDRANT_API_KEY = env_settings.QDRANT_API_KEY
OPENAI_API_KEY = env_settings.OPENAI_API_KEY
GCP_PROJECT_ID = env_settings.GCP_PROJECT_ID
GCP_LOCATION = env_settings.GCP_LOCATION
GCP_PROCESSOR_ID = env_settings.GCP_PROCESSOR_ID
GCP_PROCESSOR_VERSION = env_settings.GCP_PROCESSOR_VERSION

# The Google clients in this notebook are constructed without explicit
# credentials, so the credentials file path is exported through the environment
# (presumably picked up via Application Default Credentials -- confirm).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = env_settings.GOOGLE_APPLICATION_CREDENTIALS


class EthanRAG:
    """Helper that owns a shared Qdrant client and extracts text from PDFs.

    The Qdrant client is stored on the class, so every instance reuses the
    connection created by the first instantiation. PDF text extraction is
    delegated to Google Document AI using the module-level ``GCP_*`` settings.
    """

    # Shared by all instances; populated by the first ``__init__`` call.
    _qdrant_client: "QdrantClient | None" = None

    def __init__(self):
        """Create the shared Qdrant client if it does not exist yet."""
        if EthanRAG._qdrant_client is None:
            EthanRAG._qdrant_client = self.connect_qdrant()

    @staticmethod
    def connect_qdrant() -> QdrantClient:
        """Return a new ``QdrantClient`` built from ``QDRANT_URL`` and ``QDRANT_API_KEY``."""
        return QdrantClient(
            api_key=QDRANT_API_KEY,
            url=QDRANT_URL,
        )

    @staticmethod
    def _extract_text_sync(file_path: str) -> str:
        """Blocking implementation: read the PDF and run it through Document AI.

        Kept separate so ``extract_text_from_pdf`` can run it in a worker
        thread instead of on the event loop.
        """
        # Document AI processors are regional: requests must go to the endpoint
        # of the location the processor lives in (required for anything
        # other than the default "us" location).
        client = documentai.DocumentProcessorServiceClient(
            client_options={
                "api_endpoint": f"{GCP_LOCATION}-documentai.googleapis.com"
            }
        )
        name = client.processor_version_path(
            GCP_PROJECT_ID, GCP_LOCATION, GCP_PROCESSOR_ID, GCP_PROCESSOR_VERSION
        )
        with open(file_path, "rb") as pdf_file:
            raw_document = documentai.RawDocument(
                content=pdf_file.read(), mime_type="application/pdf"
            )
        request = documentai.ProcessRequest(
            name=name,
            raw_document=raw_document,
            process_options=documentai.ProcessOptions(
                ocr_config=documentai.OcrConfig(enable_native_pdf_parsing=True)
            ),
        )
        result = client.process_document(request=request)
        return result.document.text

    @staticmethod
    async def extract_text_from_pdf(file_path: str) -> str:
        """Extract the full text of a PDF using Google Document AI.

        Args:
            file_path: Path of the PDF file to read from local disk.

        Returns:
            The ``text`` field of the processed Document AI document.

        Raises:
            Exception: Any error from reading the file or from the Document AI
                call is logged with its traceback and re-raised unchanged.
        """
        import asyncio  # local import: only this coroutine needs it

        try:
            logger.info("Processing with Google Document AI: %s", file_path)
            # The file read and the synchronous RPC would otherwise block the
            # running event loop (the notebook kernel's loop), so push them to
            # the default thread-pool executor.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                None, EthanRAG._extract_text_sync, file_path
            )
        except Exception as e:
            # logger.exception keeps the original message and adds the traceback.
            logger.exception("Error extracting text from PDF: %s", e)
            raise

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ethan_rag = EthanRAG()

text = await ethan_rag.extract_text_from_pdf(
    file_path="K:\\EthanAI\\ethan-ai-rag\\uploads\\bala_monthly_report_dec_24-1-5.pdf",
)

In [9]:
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core import Document

In [10]:
# `Document` here is the llama_index class imported in the previous cell; it
# shadows the langchain_core `Document` imported at the top of the notebook.
source_metadata = {"filename": "bala_monthly_report_dec_24-1-5.pdf"}
documents = [Document(text=text, metadata=source_metadata)]

In [6]:
# Sentence-window parser; the surrounding-sentence context is stored in each
# node's metadata under the "window" key.
# NOTE(review): window_size=3 is presumably the number of neighbouring
# sentences kept around each sentence -- confirm against the llama_index docs.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
)

In [11]:
# Run the sentence-window parser over the single extracted-PDF document and
# keep the resulting nodes for later use.
nodes = node_parser.get_nodes_from_documents(documents)

In [40]:
# Hard-coded sample analysis text; it is interpolated into `summary_prompt`
# below and is not derived from the PDF processed earlier in this notebook.
final_response = """
Final Plan to Assess the Impact of Current Tariffs on US Companies
Steps Completed:
Understand the Objective: Clarified the user's question about the impact of tariffs from the Trump administration on U.S. companies.
Research Current Tariffs: Identified tariffs still in effect, including those on solar panels, steel, aluminum, and Chinese goods.
Identify Affected Industries: Recognized key industries impacted by these tariffs, such as technology, automotive, manufacturing, solar energy, and consumer electronics.
Analyze Financial Reports: Reviewed financial metrics of major companies in affected industries, focusing on revenue, profit margins, and cost structures.
Remaining Steps:
Summarize Findings: Compile a comprehensive summary of the impact of tariffs on U.S. companies, highlighting key insights from the financial analysis and industry research.
Final Summary:
The tariffs imposed during the Trump administration, which remain in effect, continue to impact several key industries in the United States. These include the technology, automotive, manufacturing, solar energy, and consumer electronics sectors.
Technology Industry: Companies like Apple have shown resilience with strong profit margins despite increased costs from tariffs on Chinese electronics.
Automotive Industry: Tesla faces challenges with earnings and free cash flow, likely due to tariffs on steel and aluminum, which increase production costs.
Manufacturing and Solar Energy: Companies in these sectors, such as Ford and SunPower, are also affected by tariffs, impacting their cost structures and competitiveness.
Overall, while some companies have managed to maintain growth and profitability, others face significant financial pressures due to increased costs and supply chain disruptions. The ongoing tariffs continue to shape the strategic decisions and financial health of U.S. companies across these industries.
This analysis provides a clear understanding of the current economic landscape for U.S. companies affected by tariffs, offering insights into their financial performance and strategic challenges.
"""

In [41]:
# Prompt template for the podcast-style summary; `{final_response}` is the only
# placeholder and is filled in immediately below.
SUMMARY_PROMPT_TEMPLATE = """
You are a podcast host summarizing a recent analysis. Create a concise, engaging summary (2-3 sentences) 
of the following content for a podcast audience. Use a friendly and conversational tone, 
and avoid technical jargon unless necessary also add some anecdotes or examples to make it funny and relatable: 

{final_response}

Summary:
"""

summary_prompt = SUMMARY_PROMPT_TEMPLATE.format(final_response=final_response)

In [42]:
from langchain_openai import ChatOpenAI
from settings import env_settings

In [43]:
# Chat model used to turn the analysis text into a short podcast summary.
llm_settings = {
    "model": "gpt-4o",
    "api_key": env_settings.OPENAI_API_KEY,
}
llm = ChatOpenAI(**llm_settings)

In [44]:
# Send the prompt to the chat model; top-level ``await`` relies on the
# notebook's running event loop.
summary_response = await llm.ainvoke(summary_prompt)
# The message content is passed to Text-to-Speech as `text` in a later cell.
# NOTE(review): presumably a plain string for this model -- confirm.
summary_text = summary_response.content

In [45]:
from gtts import gTTS
from google.cloud import texttospeech

In [46]:
# --- Describe what to synthesize -------------------------------------------
synthesis_input = texttospeech.SynthesisInput(text=summary_text)
voice = texttospeech.VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    name="en-US-Chirp3-HD-Charon",
)
audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
)

# --- Call the Text-to-Speech service ---------------------------------------
client = texttospeech.TextToSpeechClient()
response = client.synthesize_speech(
    input=synthesis_input,
    voice=voice,
    audio_config=audio_config,
)

# --- Persist the MP3 bytes to the current working directory ----------------
with open("output.mp3", "wb") as out:
    out.write(response.audio_content)
print('Audio content written to file "output.mp3"')

Audio content written to file "output.mp3"
