In [50]:
import fitz  # PyMuPDF
from datetime import datetime
import re
import ipywidgets as widgets
from IPython.display import display

# Function to extract text sorted visually
def extract_text_sorted(pdf_path):
    """Return the text of a PDF with each page's blocks in visual order.

    Blocks on every page are sorted by their second coordinate (y,
    top-to-bottom) and then by their first (x), and each block's text is
    emitted followed by a newline.

    Args:
        pdf_path: Path of the PDF to open. An in-memory PDF is accepted too:
            raw ``bytes``/``bytearray``/``memoryview`` content, or a dict that
            holds such content under the ``"content"`` key.
            NOTE(review): the dict form is meant to match an entry of
            ``ipywidgets.FileUpload.value`` — confirm against the installed
            ipywidgets version.

    Returns:
        The concatenated block texts as a single string.
    """
    source = pdf_path
    if isinstance(source, dict) and "content" in source:
        source = source["content"]

    if isinstance(source, (bytes, bytearray, memoryview)):
        # In-memory data has no file name, so the type must be given explicitly.
        document = fitz.open(stream=bytes(source), filetype="pdf")
    else:
        document = fitz.open(source)

    chunks = []
    # The context manager closes the document even if extraction fails.
    with document as doc:
        for page in doc:
            blocks = page.get_text("blocks")  # Extract text blocks
            # Sort by y (top-to-bottom), then x.
            for block in sorted(blocks, key=lambda b: (b[1], b[0])):
                chunks.append(block[4] + "\n")
    return "".join(chunks)

# Normalize text to handle URLs spanning multiple lines
def normalize_text(text):
    """Join URLs that were broken across several lines back into one line.

    A line whose stripped form starts with ``http://`` or ``https://`` opens a
    URL. Every following non-blank line is appended to it (stripped, with no
    separator) until a blank line, another URL start, or the end of the text.
    Lines outside a URL are kept exactly as they were.

    Args:
        text: The raw text to normalize.

    Returns:
        The normalized text, lines joined with ``"\\n"``.
    """
    url_start = re.compile(r"https?://")
    result = []
    url_parts = []  # Fragments of the URL being assembled; empty when none.

    for raw_line in text.splitlines():
        stripped = raw_line.strip()

        if url_start.match(stripped):
            # A new URL begins: flush the previous one first.
            if url_parts:
                result.append("".join(url_parts))
            url_parts = [stripped]
            continue

        if url_parts and stripped:
            # Continuation of the URL currently being assembled.
            url_parts.append(stripped)
            continue

        # Blank line, or ordinary text while no URL is open.
        if url_parts:
            result.append("".join(url_parts))
            url_parts = []
        result.append(raw_line)

    if url_parts:
        result.append("".join(url_parts))
    return "\n".join(result)

# Function to get the current year and week
def get_year_and_week():
    """Return the current year and week number as strings.

    Returns:
        A ``(year, week_of_year)`` tuple built from the local current time
        using the ``%Y`` and ``%U`` format codes.
    """
    current = datetime.now()
    return f"{current:%Y}", f"{current:%U}"

# Function to process uploaded PDF and generate text output
def process_pdf(uploaded_file):
    """Extract campaign statistics from a report PDF and save a text summary.

    The PDF text is read with ``extract_text_sorted`` and cleaned with
    ``normalize_text``; the report fields are then located with regular
    expressions. The summary is written, UTF-8 encoded, to
    ``<year>-<week>-Marketing-Zoho_Campaign-<campaign name>.txt`` (a relative
    path), and a confirmation line is printed.

    Args:
        uploaded_file: The PDF to process; passed unchanged to
            ``extract_text_sorted``.

    Raises:
        ValueError: If a required field cannot be found in the extracted text.
    """
    input_text = normalize_text(extract_text_sorted(uploaded_file))

    def require(pattern, label):
        """Return the stripped first group of *pattern*, or raise ValueError."""
        match = re.search(pattern, input_text)
        if match is None:
            raise ValueError(f"Field '{label}' was not found in the PDF text")
        return match.group(1).strip()

    # Extract required information using regular expressions
    campaign_name = require(r"Campaign:(.*)", "Campaign")
    sent_on_date = require(r"Sent on :\s*(.*)", "Sent on")
    emails_sent = require(r"(\d+)\s*emails sent", "emails sent")
    delivered_percentage = require(r"Delivered (\d+\.\d+)%", "Delivered")
    unique_opens_percentage = require(r"Unique Opens (\d+\.\d+)%", "Unique Opens")
    unique_clicks_percentage = require(r"Unique Clicks (\d+\.\d+)%", "Unique Clicks")
    unopened_percentage = require(r"Unopened (\d+\.\d+)%", "Unopened")
    click_open_rate = require(r"Clicks / Open Rate (\d+\.\d+)%", "Clicks / Open Rate")
    total_reach = require(r"Total Reach\s*(\d+)", "Total Reach")

    # All "<N> Contacts" figures, in order of appearance. The summary uses the
    # 1st, 3rd, 4th and 5th occurrences; the 2nd is not reported.
    contact_counts = re.findall(r"(\d+) Contacts", input_text)
    if len(contact_counts) < 5:
        raise ValueError(
            "Expected at least 5 'N Contacts' figures in the PDF text, "
            f"found {len(contact_counts)}"
        )
    delivered_contacts = contact_counts[0]
    unique_opens_contacts = contact_counts[2]
    unique_clicks_contacts = contact_counts[3]
    unopened_contacts = contact_counts[4]

    # Logic for "Unique Clicks" and "Total Clicks": values are read a fixed
    # number of non-blank lines below their label.
    report_lines = [line.strip() for line in input_text.splitlines() if line.strip()]
    try:
        section_end = report_lines.index("TOP CLICKED LINKS")
    except ValueError:
        # Heading missing: do not restrict the lookup to a section.
        section_end = len(report_lines)

    def extract_by_offset(keyword, line_offset):
        """Return the line *line_offset* below a line starting with *keyword*.

        Only targets located before the "TOP CLICKED LINKS" heading qualify;
        "N/A" is returned when no label yields such a target.
        """
        for index, line in enumerate(report_lines):
            if line.startswith(keyword):
                target = index + line_offset
                if 0 <= target < section_end:
                    return report_lines[target]
        return "N/A"

    total_clicks_contacts = extract_by_offset("Total Clicks", 6)
    total_clicks = extract_by_offset("Total Clicks", 3)
    opened_clicked_ratio = extract_by_offset("Opened / Clicked Ratio", 3)

    # Extracting TOP CLICKED LINKS
    top_clicked_links = re.findall(
        r"(\d+ Contacts\s*\d+ Clicks\s*https?://[\w./?=+-]+(?:\S*))",
        input_text,
    )

    # Prepare output content
    output_content = [
        f"Campaign Name: {campaign_name}",
        f"Sent on Date: {sent_on_date}",
        "\nREPORT SUMMARY",
        f"Number of Emails Sent: {emails_sent}",
        f"Percentage of Delivered: {delivered_percentage}% ({delivered_contacts} Contacts)",
        f"Percentage of Unique Opens: {unique_opens_percentage}% ({unique_opens_contacts} Contacts)",
        # " Contacts" suffix added so this line matches its siblings.
        f"Percentage of Unique Clicks: {unique_clicks_percentage}% ({unique_clicks_contacts} Contacts)",
        f"Percentage of Unopened: {unopened_percentage}% ({unopened_contacts} Contacts)",
        f"Click / Open Rate: {click_open_rate}%",
        f"Number of Total Reach: {total_reach}",
        "\nLINKS CLICKED REPORT",
        f"Number of Total Clicks: {total_clicks} (Contacts: {total_clicks_contacts})",
        f"Opened / Clicked Ratio: {opened_clicked_ratio}",
        "\nTOP CLICKED LINKS",
        *top_clicked_links,
    ]

    # Write the output to a text file. Characters that are path separators or
    # otherwise invalid in file names are replaced so the campaign name cannot
    # break (or redirect) the output path.
    year, week = get_year_and_week()
    safe_campaign_name = re.sub(r'[\\/:*?"<>|]+', "_", campaign_name)
    output_file = f"{year}-{week}-Marketing-Zoho_Campaign-{safe_campaign_name}.txt"
    with open(output_file, "w", encoding="utf-8") as file:
        file.write("\n".join(output_content))

    print(f"Summary successfully saved to {output_file}")
# UI for file upload: a multi-file picker limited to PDFs and a trigger button
upload_widget = widgets.FileUpload(
    accept='.pdf',
    multiple=True,
)
generate_button = widgets.Button(
    description="Generate Files",
    button_style="success",
)

# Files collected from the upload widget, consumed when the button is clicked
uploaded_files = []

# Handle file uploads
def on_upload_change(change):
    """Append the upload widget's current files to ``uploaded_files``.

    Args:
        change: The change notification passed by the widget; it is not
            inspected, the files are read from ``upload_widget.value``.
    """
    # The module-level list is mutated in place, so no ``global`` is required.
    uploaded_files.extend(upload_widget.value)

# Handle file generation
def on_generate_click(button):
    """Run ``process_pdf`` on every collected upload and report the outcome.

    A file that fails is reported and skipped so the remaining files are
    still processed.

    Args:
        button: The clicked button, supplied by the click callback; unused.
    """
    failures = 0
    for uploaded_file in uploaded_files:
        try:
            process_pdf(uploaded_file)
        except Exception as exc:  # UI boundary: report the file and carry on.
            failures += 1
            # Prefer a file name when the entry is a dict carrying one.
            if isinstance(uploaded_file, dict) and "name" in uploaded_file:
                label = uploaded_file["name"]
            else:
                label = uploaded_file
            print(f"Failed to process {label}: {exc}")

    if failures:
        print(f"Finished: {failures} of {len(uploaded_files)} file(s) could not be processed.")
    else:
        print("All files processed.")

# Wire the callbacks: uploads are tracked via the widget's 'value' trait,
# generation runs on button click
generate_button.on_click(on_generate_click)
upload_widget.observe(on_upload_change, names='value')

# Render the label, upload control and button stacked vertically
display(
    widgets.VBox(
        children=(
            widgets.Label("Upload your PDF files:"),
            upload_widget,
            generate_button,
        )
    )
)


VBox(children=(Label(value='Upload your PDF files:'), FileUpload(value=(), accept='.pdf', description='Upload'…