In [None]:
import os
import pandas as pd
import fitz
from pptx import Presentation
import requests
import io


In [None]:

def download_file_from_google_drive(url, timeout=30):
    """Download ``url`` and return its body as an in-memory binary stream.

    Args:
        url: Direct-download URL of the file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook cell on a stalled connection.

    Returns:
        io.BytesIO: The full response body, positioned at offset 0.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly instead of returning an error page
    return io.BytesIO(response.content)

def extract_text_from_pdf(pdf_content):
    """Extract the text of every page of a PDF.

    Args:
        pdf_content: The PDF data, passed to ``fitz.open`` as ``stream``.

    Returns:
        list[dict]: One record per page, in page order, with the keys
        ``file_type`` (always ``"PDF"``), ``page_or_slide`` (1-based page
        number) and ``content`` (the page text from ``page.get_text()``).
    """
    pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
    try:
        return [
            {
                "file_type": "PDF",
                "page_or_slide": page_num + 1,
                "content": pdf_doc[page_num].get_text(),
            }
            for page_num in range(pdf_doc.page_count)
        ]
    finally:
        # Close the document even when a page fails to extract.
        pdf_doc.close()

def extract_text_from_pptx(pptx_content):
    """Collect the text of each slide in a PowerPoint presentation.

    Args:
        pptx_content: The presentation source handed to ``Presentation``.

    Returns:
        list[dict]: One record per slide, in slide order, with the keys
        ``file_type`` (always ``"PPTX"``), ``page_or_slide`` (1-based slide
        number) and ``content`` (the newline-joined ``text`` of every shape
        on the slide that exposes a ``text`` attribute).
    """
    deck = Presentation(pptx_content)
    records = []
    for slide_number, slide in enumerate(deck.slides, start=1):
        fragments = []
        for shape in slide.shapes:
            # Shapes without a ``text`` attribute contribute nothing.
            if hasattr(shape, "text"):
                fragments.append(shape.text)
        record = {
            "file_type": "PPTX",
            "page_or_slide": slide_number,
            "content": "\n".join(fragments),
        }
        records.append(record)
    return records

def _detect_file_type(url, file_obj):
    """Return ``"pdf"``, ``"pptx"`` or ``None`` for a downloaded file.

    The content is inspected first; the URL suffix is only a fallback.
    ``file_obj`` is rewound to offset 0 before returning.
    """
    import zipfile

    file_obj.seek(0)
    header = file_obj.read(1024)
    file_obj.seek(0)

    detected = None
    if b"%PDF-" in header:
        detected = "pdf"
    elif zipfile.is_zipfile(file_obj):
        file_obj.seek(0)
        with zipfile.ZipFile(file_obj) as archive:
            # Heuristic: PowerPoint packages keep their parts under "ppt/".
            if any(name.startswith("ppt/") for name in archive.namelist()):
                detected = "pptx"
    file_obj.seek(0)

    if detected is None:
        # Fall back to the URL suffix, as the original dispatch did.
        lowered = url.lower()
        if lowered.endswith(".pdf"):
            detected = "pdf"
        elif lowered.endswith(".pptx"):
            detected = "pptx"
    return detected


def process_files(file_urls):
    """Download each URL and extract its text page by page / slide by slide.

    The file type is detected from the downloaded content, so links that carry
    no file extension (such as Google Drive ``uc?export=download&id=...``
    links) are handled. URLs ending in ``.pdf`` / ``.pptx`` keep working via a
    suffix fallback.

    Args:
        file_urls: Iterable of download URLs.

    Returns:
        list[dict]: The concatenated records produced by
        ``extract_text_from_pdf`` and ``extract_text_from_pptx``, in URL order.
        Files that are neither PDF nor PPTX are skipped with a warning.
    """
    import warnings

    extracted_data = []
    for url in file_urls:
        file_content = download_file_from_google_drive(url)
        file_type = _detect_file_type(url, file_content)
        if file_type == "pdf":
            extracted_data.extend(extract_text_from_pdf(file_content))
        elif file_type == "pptx":
            extracted_data.extend(extract_text_from_pptx(file_content))
        else:
            # Typically an HTML page (e.g. a permission or confirmation
            # screen) rather than the document itself.
            warnings.warn(f"Skipping unsupported or unrecognized file: {url}")
    return extracted_data



In [None]:
# Google Drive direct-download links of the documents to ingest
file_urls = [
    "https://drive.google.com/uc?export=download&id=1umzTCsbBmuFx4xz9DSMI82oq21tHhbKL",
    "https://drive.google.com/uc?export=download&id=13oqVt9LYdESPS8XNYFLhSLZMkZk52JXG",
    "https://drive.google.com/uc?export=download&id=1Wib-VAY4TU-jwVykIu_Oi-6hpv0G-w7-"
]

# Create directory if it doesn't exist
output_dir = "artifacts/data_ingestion"
os.makedirs(output_dir, exist_ok=True)

# Process files and save to CSV in the specified directory
extracted_data = process_files(file_urls)
# Pin the columns so the CSV always has a header row, even when nothing was
# extracted (an empty list would otherwise produce a column-less file).
df = pd.DataFrame(extracted_data, columns=["file_type", "page_or_slide", "content"])
output_path = os.path.join(output_dir, "extracted_data.csv")
df.to_csv(output_path, index=False)

# Make an empty result visible instead of silently reporting success.
if df.empty:
    print("Warning: no text was extracted from any of the provided URLs.")
print(f"Extracted {len(df)} page/slide records.")
print(f"Data extraction complete. Saved to {output_path}")