# LLM CASE STUDY
# NAMES: SOURABH R KAKRANNAYA, Vallapuri Jagapathi
# SRN: PES1UG22AM164, PES1UG22AM183

## INGESTION AND PREPROCESSING

In [None]:
import os
import logging
import re
import pdfplumber
import pandas as pd
from tabula import read_pdf  # Library used to extract tables from PDFs

# Configure logging to display info-level messages
logging.basicConfig(level=logging.INFO)

# Directory to save extracted text and tables
output_dir = "/kaggle/working/"
os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists

def clean_text(text):
    """
    Cleans extracted text by:
    - Removing page numbers (e.g., "Page X")
    - Removing non-ASCII characters
    - Normalizing whitespace

    Args:
        text (str): Raw text extracted from PDF.

    Returns:
        str: Cleaned and formatted text.
    """
    text = re.sub(r"Page\s?\d+", "", text)         # Remove "Page X"
    text = re.sub(r"[^\x00-\x7F]+", " ", text)     # Remove non-ASCII characters
    text = re.sub(r"\s+", " ", text).strip()       # Normalize whitespace
    return text

def extract_text(pdf_path):
    """
    Extracts text content from a given PDF file.

    Args:
        pdf_path (str): Path to the input PDF file.

    Returns:
        str: Extracted text from the PDF with page numbers preserved.
    """
    full_text = []
    
    with pdfplumber.open(pdf_path) as pdf:  # Open PDF file
        for page_num, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text() or ""  # Extract text from page
            cleaned_text = clean_text(page_text)  # Clean extracted text
            full_text.append(f"Page {page_num}:\n{cleaned_text}\n")  # Preserve page numbers
    
    return "\n".join(full_text)  # Return extracted text as a single string

def extract_tables(pdf_path):
    """
    Extracts tables from a given PDF file and saves them as CSV files.

    Args:
        pdf_path (str): Path to the input PDF file.

    Returns:
        list: A list of paths to the saved CSV files.
    """
    table_paths = []

    try:
        tables = read_pdf(pdf_path, pages="all", multiple_tables=True)  # Extract tables from all pages

        if not tables:
            logging.warning(f"No tables found in {pdf_path}")  # Log warning if no tables are found

        for i, table in enumerate(tables):
            # Generate output CSV file path
            output_csv_path = os.path.join(output_dir, f"{os.path.basename(pdf_path).replace('.pdf', '')}_table_{i + 1}.csv")
            table.to_csv(output_csv_path, index=False)  # Save table as CSV
            table_paths.append(output_csv_path)  # Store CSV path

    except Exception as e:
        logging.error(f"Error extracting tables from {pdf_path}: {e}")  # Log error if table extraction fails

    return table_paths  # Return list of extracted table paths

def process_pdf(pdf_path):
    """
    Main function to extract both text and tables from a PDF and save the results.

    Args:
        pdf_path (str): Path to the input PDF file.
    """
    logging.info(f"Processing: {pdf_path}")  # Log start of processing

    # Extract and save text
    extracted_text = extract_text(pdf_path)
    text_output_path = os.path.join(output_dir, f"{os.path.basename(pdf_path).replace('.pdf', '')}.txt")
    
    with open(text_output_path, "w", encoding="utf-8") as text_file:
        text_file.write(extracted_text)  # Save extracted text to file
    
    logging.info(f"Text saved to: {text_output_path}")  # Log saved text file

    # Extract and save tables
    table_paths = extract_tables(pdf_path)
    logging.info(f"Extracted {len(table_paths)} tables from: {pdf_path}")  # Log table count

    if not table_paths:
        logging.warning(f"No tables were extracted from {pdf_path}")  # Log warning if no tables are extracted

# Path to the input PDF file
pdf_path = "/kaggle/input/combined/combined_document_10.pdf"

# Run the PDF processing pipeline
process_pdf(pdf_path)


## RAG CONSTRUCTION:

In [7]:
import os
import logging
import pandas as pd
import faiss
import torch
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from huggingface_hub import login

login(token="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")



logging.basicConfig(level=logging.INFO)

# Define paths
output_dir = "/kaggle/working/"
text_file_path = os.path.join(output_dir, "combined_document_10.txt")

# 1. Load extracted text
with open(text_file_path, "r", encoding="utf-8") as file:
    extracted_text = file.read()

# 2. Load extracted table data from CSVs and convert to text
table_texts = []
for filename in os.listdir(output_dir):
    if filename.startswith("combined_document_10_table_") and filename.endswith(".csv"):
        csv_path = os.path.join(output_dir, filename)
        try:
            df = pd.read_csv(csv_path)
            table_text = f"Table: {filename}\n" + df.to_string(index=False) + "\n"
            table_texts.append(table_text)
        except Exception as e:
            logging.error(f"Error reading {csv_path}: {e}")

# Combine extracted text and table data
full_text = extracted_text + "\n\n" + "\n\n".join(table_texts)

# 3. Initialize HuggingFace Embeddings (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": device})

# 4. Split the text and tables into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
text_chunks = text_splitter.split_text(full_text)

# 5. Create FAISS index from combined text & tables
faiss_index_path = os.path.join(output_dir, "faiss_index")
vector_store = FAISS.from_texts(text_chunks, embedding_model)
vector_store.save_local(faiss_index_path)


vector_store = FAISS.load_local(faiss_index_path, embedding_model, allow_dangerous_deserialization=True)

# 7. Load LLM (LLaMA-2 or similar)
pipe = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",  
)

# Wrap the pipeline for LangChain
llm = HuggingFacePipeline(pipeline=pipe)


# 8. Build Retrieval-Augmented Generation (RAG) Pipeline
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vector_store.as_retriever())

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": device})


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Device set to use cuda
  llm = HuggingFacePipeline(pipeline=pipe)


## TRYING OUT QUERIES:

In [18]:
queries = [
    # 1. Apple's R&D Spending (2018 vs. 2017)
    "How much did Apple spend on Research and Development in fiscal year 2018, and how did it change compared to 2017?",

    # 2. iPhone Sales (Units) in 2018 vs. 2017
    "What were the total iPhone sales figures (in units) for Apple in 2018, and how did this compare to the previous year?",

    # 3. Microsoft Share Repurchase (2016)
    "How many shares did Microsoft repurchase in fiscal year 2016, and what was the total amount spent?",

    # 4. Apple's Net Sales in Americas (2018)
    "What was Apple's net sales figure for the Americas region in 2018, and what percentage of total net sales did this represent?",

    # 5. Microsoft's LinkedIn Acquisition Date
    "When did Microsoft acquire LinkedIn Corporation according to the quarterly information?",

    # 6. Microsoft's Dividend per Share (September 2015)
    "What was the dividend per share declared by Microsoft in September 2015?",

    # 7. Factors for iPad Sales Increase (2018 vs. 2017)
    "What factors contributed to the increase in iPad net sales during 2018 compared to 2017?",

    # 8. Apple's Services Segment Contribution (2018)
    "How much did Apple's Services segment contribute to total net sales in 2018, and what was the year-over-year growth percentage?",

    # 9. Components of Microsoft's 'Other Income (Expense), Net' (2018)
    "What were the main components of Microsoft's 'Other Income (Expense), Net' for fiscal year 2018?",

    # 10. Apple's Gross Margin Projection (Q1 2019)
    "What was Apple's gross margin percentage range anticipated for the first quarter of 2019?"
]

for q in queries:
    response = qa_chain.run(q)
    ans = response.find("Helpful Answer")
    print(response[ans:])
    print("-" * 80)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Helpful Answer: In fiscal year 2018, Apple spent $11,988 million on
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Helpful Answer: The total iPhone sales figures (in units) for Apple in 2018 were 2
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Helpful Answer: Microsoft repurchased 148 million shares in fiscal year 2016, and
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Helpful Answer: In 2018, Apple's net sales for the Americas region were $1
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Helpful Answer: Microsoft acquired LinkedIn Corporation on December 8, 2016.

Full Answer
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Helpful Answer: The dividend per share declared by Microsoft in September 2015 was $0.3
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Helpful Answer: The increase in iPad net sales during 2018 compared to 2017 was
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Helpful Answer: In 2018, Apple's Services segment contributed $37,190
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Helpful Answer: The main components of Microsoft's 'Other Income (Expense), Net' for fiscal year
--------------------------------------------------------------------------------
Helpful Answer: The anticipated gross margin percentage range for the first quarter of 2019 is between 3
--------------------------------------------------------------------------------
