# Generate HTML document

In [11]:
import html
import os

import pandas as pd

# Sample Excel file path
excel_path = "/home/fantoni/patent-sentence-classification/results/claim_simplification/WO-2020058819-A1_B6078-1706_gpt-4o.xlsx"
#excel_path = "/home/fantoni/patent-sentence-classification/results/claim_simplification/WO-2019021161-A1_F16D65-12_gpt-4o.xlsx"
#excel_path = "/home/fantoni/patent-sentence-classification/results/claim_simplification/WO-2019243958-A1_F16D55-288_gpt-4o.xlsx"
#excel_path = "/home/fantoni/patent-sentence-classification/results/claim_simplification/IT-201900008253-A1_B65G1-023_gpt-4o.xlsx"
#excel_path = "/home/fantoni/patent-sentence-classification/results/claim_simplification/US8695121B2_A42B3_gpt-4o.xlsx"

# Base name of the Excel file without directory or extension; it is reused
# below for the page title and the output file name.
filename = os.path.splitext(os.path.basename(excel_path))[0]

# Get original text.
# The text file name is the Excel base name with its last "_"-separated
# segment removed (e.g. the trailing "_gpt-4o").
# The stem is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
claim_stem = "_".join(filename.split("_")[:-1])
txt_path = f"/home/fantoni/patent-sentence-classification/data/claim_simplification/{claim_stem}.txt"
# Read with an explicit encoding so the result does not depend on the
# machine's locale; the generated page is written as UTF-8 as well.
with open(txt_path, "r", encoding="utf-8") as claim_file:
    original_claim = claim_file.read()

# Load the spreadsheet, forcing the "index" column to be read as text
index_as_text = {"index": str}
df = pd.read_excel(excel_path, dtype=index_as_text)

# Indentation level of a row = number of dot-separated parts in its index,
# i.e. one more than the number of dots it contains
df["level"] = df["index"].apply(lambda idx: str(idx).count(".") + 1)

# Define background colors for classes
class_colors = {
    'FUN': '#ffcccc',   # reddish
    'STR': '#cce5ff',   # blueish
    'MIX': '#ccffcc',   # greenish
    'OTH': '#e0e0e0'    # greyish
}

# Create color legend HTML.
# The entries are generated from class_colors so the legend can never drift
# from the colors actually applied to the sentences.
legend_items = "\n".join(
    f'            <li style="background-color:{color}; padding:4px; margin:2px 0; '
    f'display: inline-block; width: 100px;">{label}</li>'
    for label, color in class_colors.items()
)

# <summary> is only valid as a child of <details>, so the legend is wrapped
# in <details open> like the other sections of the page.
legend_html = f"""
<div class="section">
    <details open>
        <summary>Legend</summary>
        <ul style="list-style-type: none; padding-left: 0;">
{legend_items}
        </ul>
    </details>
</div>
"""

# Create html for original text.
# The claim is plain text read from a file, so it is HTML-escaped first:
# a literal "<", ">" or "&" in it would otherwise be parsed as markup and
# corrupt the page.
original_claim_html = f"<ul><li>{html.escape(original_claim)}</li></ul>"

# Create html for simplified text: one collapsible <details> section per
# indentation level, shifted right by 40px per level. Rows without a
# sentence are skipped.
grouped = df.dropna(subset=["sentence"]).groupby("level")

# Fragments are collected in a list and joined once at the end instead of
# growing a string with repeated "+=".
html_parts = []
for level, group in grouped:
    indent = int(level) * 40
    html_parts.append(f"<details style='margin-left:{indent}px'>")
    html_parts.append("<summary></summary><ul style='list-style-type:none;'>")

    # Track the last seen index at this level
    last_index = None
    for current_index, sentence, pred_class in zip(
        group["index"], group["sentence"], group["pred_class"]
    ):
        # Draw a separator line whenever the index changes between rows
        if last_index is not None and current_index != last_index:
            html_parts.append(
                "<hr style='border: none; border-top: 1px solid #888; margin: 10px 0;'>"
            )
        last_index = current_index

        # Classes missing from class_colors fall back to a white background
        bg_color = class_colors.get(pred_class, '#ffffff')
        # Escape the sentence so "<", ">" and "&" in it are shown literally
        # rather than being interpreted as HTML.
        html_parts.append(
            f"<li style='background-color:{bg_color}; padding:4px; margin:2px 0;'>"
            f"{html.escape(str(sentence))}</li>"
        )

    html_parts.append("</ul></details>")

sentence_html = "".join(html_parts)

# Combine everything into final HTML.
# The page declares its charset explicitly: the file is written as UTF-8,
# and without this declaration a browser opening it from disk may guess a
# different encoding and garble non-ASCII characters.
# Doubled braces ({{ }}) are literal CSS braces inside the f-string.
html_content = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Claim Visualization {filename}</title>
    <style>
        .section {{
            margin-bottom: 20px;
        }}
        summary {{
            font-weight: bold;
            cursor: pointer;
        }}
        body {{
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 20px;
        }}
    </style>
</head>
<body>
    <div class="section">
        <details open>
            <summary>Original Text</summary>
            {original_claim_html}
        </details>
    </div>
    {legend_html}
    <div class="section">
        <details open>
            <summary>Simplified Text</summary>
            {sentence_html}
        </details>
    </div>
</body>
</html>
"""

# Write the finished page to disk as UTF-8, named after the Excel file
html_file_path = f"/home/fantoni/patent-sentence-classification/results/claim_simplification/{filename}.html"
with open(html_file_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(html_content)

# Report where the page was written
print("HTML saved to", html_file_path)

HTML saved to /home/fantoni/patent-sentence-classification/results/claim_simplification/WO-2020058819-A1_B6078-1706_gpt-4o.html
