In [1]:
import getpass
import os

# SECURITY: never store an API key in the notebook itself -- anything committed
# with the notebook is readable by everyone who can see the repository.
# Any key that was previously committed here must be treated as leaked and
# revoked in the OpenAI dashboard.
# The key is taken from the environment when already set; otherwise it is
# requested interactively so that it never ends up in the saved notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

## Product Classification Experiment 1 with prompt filter_prompt_v1.txt:

#### This notebook reads its input data from LAB-COMPETITIVE-ANALYSIS/data/01_company_crawled_data

#### Outputs: Classified Insurance Product HTML files in poster_presentation/02_binary_products_v1 
#### Outputs: Report csv file LAB-COMPETITIVE-ANALYSIS/notebooks/poster_presentation/report_product_pages_v1.csv

In [4]:
"""
Filter_Products_Basic.ipynb

This notebook uses a simpler prompt (filter_prompt_v1.txt) to determine if an HTML page is a product page.
It creates a single CSV that includes all companies, and adds a 'url' column.
"""

import os
import shutil
import csv

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

def load_prompt_from_file(prompt_file):
    """Read `prompt_file` as UTF-8 text and return its whole content as one string."""
    with open(prompt_file, mode='r', encoding='utf-8') as prompt_handle:
        prompt_text = prompt_handle.read()
    return prompt_text

def filter_product_pages(
    input_folder,
    output_folder,
    chat_model,
    prompt_file,
    csv_file,
    base_domain="example.com"
):
    """
    Classify the HTML files of every company as product / non-product pages.

    Every immediate subfolder of `input_folder` is treated as one company.
    For each `*.html` file in it, the prompt template is formatted with the
    file's *name* (only the `filename` variable is supplied; the HTML content
    itself is not read here) and sent to `chat_model`. Files whose answer
    contains the word "yes" are copied to `output_folder/<company>/`, and one
    row per file is written to `csv_file`.

    :param input_folder: Main folder containing subfolders for each company
    :param output_folder: Folder to store copied product pages
    :param chat_model: A LangChain ChatOpenAI model; it is called with the
        formatted message list and must return an object with a `.content` string
    :param prompt_file: Path to the basic prompt file (must accept `filename`)
    :param csv_file: Path to the consolidated CSV report (overwritten)
    :param base_domain: Domain used to build a mock URL (e.g., "example.com")
    """
    import re  # local import: only needed for the whole-word "yes" check below

    # The prompt is identical for every company, so load and compile it once.
    # Doing this before opening the CSV also means a missing prompt file does
    # not truncate an existing report.
    prompt_template = load_prompt_from_file(prompt_file)
    chat_prompt = ChatPromptTemplate.from_messages([
        HumanMessagePromptTemplate.from_template(prompt_template)
    ])

    # Match "yes" only as a whole word so answers that merely contain the
    # letters (e.g. "eyes", "yesterday") are not counted as positives.
    yes_pattern = re.compile(r"\byes\b")

    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        # Add a 'url' column
        csv_writer.writerow(['company_name', 'filename', 'url', 'product'])

        # os.listdir returns entries in arbitrary order; sorting makes the
        # report row order reproducible between runs and machines.
        for company_folder in sorted(os.listdir(input_folder)):
            company_path = os.path.join(input_folder, company_folder)
            if not os.path.isdir(company_path):
                continue  # stray files next to the company folders are ignored

            print(f"Processing company folder: {company_folder}")
            company_output_folder = os.path.join(output_folder, company_folder)
            os.makedirs(company_output_folder, exist_ok=True)

            for filename in sorted(os.listdir(company_path)):
                if not filename.endswith(".html"):
                    continue
                file_path = os.path.join(company_path, filename)

                # Build a mock URL (customize to your real domain if known)
                url = f"https://{company_folder}.{base_domain}/{filename}"

                # Ask the model about this file (identified by name only)
                formatted_prompt = chat_prompt.format_messages(filename=filename)
                response = chat_model(formatted_prompt).content.strip()

                is_product = yes_pattern.search(response.lower()) is not None

                csv_writer.writerow([
                    company_folder,
                    filename,
                    url,
                    "yes" if is_product else "no"
                ])

                # Copy file if it's a product
                if is_product:
                    shutil.copy(file_path, os.path.join(company_output_folder, filename))
                    print(f"Copied product page to: {company_output_folder}/{filename}")

# -----------------------------------------------------------------------------
# MAIN EXECUTION
# -----------------------------------------------------------------------------

# Paths
# The crawled-data location is machine specific. It can be overridden through
# the COMPANY_CRAWLED_DATA_DIR environment variable; the original absolute path
# is kept as the default so existing setups keep working unchanged.
INPUT_FOLDER = os.environ.get(
    "COMPANY_CRAWLED_DATA_DIR",
    "/Users/umutekingezer/Desktop/NLP_lab/LAB-COMPETITIVE-ANALYSIS/data/01_company_crawled_data",
)
OUTPUT_FOLDER = "02_binary_products_v1"
PROMPT_FILE = "filter_prompt_v1.txt"
CSV_FILE = "report_product_pages_v1.csv"

# Fail before any model is created or any report file is overwritten.
if not os.path.isdir(INPUT_FOLDER):
    raise FileNotFoundError(
        f"Input folder not found: {INPUT_FOLDER} "
        "(set COMPANY_CRAWLED_DATA_DIR to the crawled-data directory)"
    )

# Initialize the Chat Model (set your OpenAI API Key appropriately)
# temperature=0 keeps the classification as repeatable as the API allows.
chat_model = ChatOpenAI(model="gpt-4o", temperature=0)

filter_product_pages(
    input_folder=INPUT_FOLDER,
    output_folder=OUTPUT_FOLDER,
    chat_model=chat_model,
    prompt_file=PROMPT_FILE,
    csv_file=CSV_FILE,
    base_domain="insurance-example.com"  # change to a real domain if you have one
)

print("Filtering with Basic Prompt complete!")
print(f"Report generated: {CSV_FILE}")


Processing company folder: generali
Copied product page to: 02_binary_products_v1/generali/privatkunden_gesundheit-freizeit_krankenhaustagegeld.html
Copied product page to: 02_binary_products_v1/generali/privatkunden_recht-haftung_tierhalterhaftpflichtversicherung.html
Copied product page to: 02_binary_products_v1/generali/privatkunden_fahrzeug-zuhause_hausratversicherung.html
Copied product page to: 02_binary_products_v1/generali/geschaeftskunden_gesundheit-betriebliche-vorsorge_direktversicherung.html
Copied product page to: 02_binary_products_v1/generali/privatkunden_vorsorge-finanzen_grundfaehigkeitsversicherung.html
Copied product page to: 02_binary_products_v1/generali/privatkunden_gesundheit-freizeit_krankentagegeld-zusatzversicherte.html
Copied product page to: 02_binary_products_v1/generali/geschaeftskunden_gesundheit-betriebliche-vorsorge_auslands-gruppenversicherung.html
Copied product page to: 02_binary_products_v1/generali/geschaeftskunden_gesundheit-betriebliche-vorsorge_

## Product Classification Experiment 2 with prompt filter_prompt_v2.txt: 

#### This notebook reads its input data from LAB-COMPETITIVE-ANALYSIS/data/01_company_crawled_data

#### Outputs: Classified Insurance Product HTML files in poster_presentation/02_binary_products_v2 
#### Outputs: Report csv file LAB-COMPETITIVE-ANALYSIS/notebooks/poster_presentation/report_product_pages_v2.csv

In [5]:
"""
Filter_Products_Refined.ipynb

This notebook uses a more refined prompt (filter_prompt_v2.txt) that includes:
 - A system message providing context and instructions
 - A human message specifying the filename and task.

It outputs a single CSV for all companies (with 'url') and copies product pages to separate subfolders.
"""

import os
import shutil
import csv

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

def load_prompt_from_file(prompt_file):
    """Return the complete prompt text (system + user parts) stored in `prompt_file`, decoded as UTF-8."""
    with open(prompt_file, mode='r', encoding='utf-8') as source:
        raw_prompt = source.read()
    return raw_prompt

def parse_prompt_text(full_prompt_text):
    """
    Split the raw text of a refined prompt file into its system and human parts.

    The file is structured with tokens:
        [System]
        ...system instructions...
        [Human]
        ...human instructions...

    Anything before the first "[System]" marker is discarded. The system part
    is the text between the first "[System]" and the next "[Human]"; the human
    part is everything after that "[Human]" marker.

    :param full_prompt_text: Complete content of the prompt file
    :return: Tuple `(system_part, human_part)`, both stripped of surrounding whitespace
    :raises ValueError: If either marker is missing, or if no "[Human]" marker
        follows the "[System]" marker (sections in the wrong order)
    """
    system_marker = "[System]"
    human_marker = "[Human]"

    if system_marker not in full_prompt_text or human_marker not in full_prompt_text:
        raise ValueError("Prompt file must contain '[System]' and '[Human]' sections.")

    # Keep only what follows the first "[System]" marker.
    _, _, after_system = full_prompt_text.partition(system_marker)
    system_part, found_human, human_part = after_system.partition(human_marker)

    # Both markers exist, but "[Human]" only occurs before "[System]". Without
    # this check the caller would get an opaque tuple-unpacking error.
    if not found_human:
        raise ValueError(
            "Prompt file must place the '[System]' section before the '[Human]' section."
        )

    return system_part.strip(), human_part.strip()

def filter_product_pages_refined(
    input_folder,
    output_folder,
    chat_model,
    prompt_file,
    csv_file,
    base_domain="example.com"
):
    """
    Classify the HTML files of every company using a system + human prompt.

    Every immediate subfolder of `input_folder` is treated as one company.
    For each `*.html` file in it, the prompt is formatted with the file's
    *name* (only the `filename` variable is supplied; the HTML content itself
    is not read here) and sent to `chat_model`. Files whose answer contains
    the word "yes" are copied to `output_folder/<company>/`.
    Writes a single CSV covering all companies with columns:
    company_name, filename, url, product.

    :param input_folder: Main folder containing subfolders for each company
    :param output_folder: Folder to store copied product pages
    :param chat_model: Chat model; it is called with the formatted message list
        and must return an object with a `.content` string
    :param prompt_file: Path to the prompt file with '[System]' and '[Human]' sections
    :param csv_file: Path to the consolidated CSV report (overwritten)
    :param base_domain: Domain used to build a mock URL (e.g., "example.com")
    :raises ValueError: Propagated from `parse_prompt_text` for a malformed prompt file
    """
    import re  # local import: only needed for the whole-word "yes" check below

    # Load and parse the refined prompt text
    full_prompt_text = load_prompt_from_file(prompt_file)
    system_prompt_text, human_prompt_text = parse_prompt_text(full_prompt_text)

    # The prompt does not depend on the company, so build it once up front.
    chat_prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(system_prompt_text),
        HumanMessagePromptTemplate.from_template(human_prompt_text)
    ])

    # Match "yes" only as a whole word so answers that merely contain the
    # letters (e.g. "eyes", "yesterday") are not counted as positives.
    yes_pattern = re.compile(r"\byes\b")

    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['company_name', 'filename', 'url', 'product'])

        # os.listdir returns entries in arbitrary order; sorting makes the
        # report row order reproducible between runs and machines.
        for company_folder in sorted(os.listdir(input_folder)):
            company_path = os.path.join(input_folder, company_folder)
            if not os.path.isdir(company_path):
                continue  # stray files next to the company folders are ignored

            print(f"Processing company folder: {company_folder}")
            company_output_folder = os.path.join(output_folder, company_folder)
            os.makedirs(company_output_folder, exist_ok=True)

            for filename in sorted(os.listdir(company_path)):
                if not filename.endswith(".html"):
                    continue
                file_path = os.path.join(company_path, filename)

                # Construct a hypothetical URL
                url = f"https://{company_folder}.{base_domain}/{filename}"

                # Ask the model about this file (identified by name only)
                formatted_prompt = chat_prompt.format_messages(filename=filename)
                response = chat_model(formatted_prompt).content.strip()

                is_product = yes_pattern.search(response.lower()) is not None

                csv_writer.writerow([
                    company_folder,
                    filename,
                    url,
                    "yes" if is_product else "no"
                ])

                # Copy product file if is_product
                if is_product:
                    shutil.copy(file_path, os.path.join(company_output_folder, filename))
                    print(f"Copied product page to: {company_output_folder}/{filename}")

# -----------------------------------------------------------------------------
# MAIN EXECUTION
# -----------------------------------------------------------------------------

# Paths
# The crawled-data location is machine specific. It can be overridden through
# the COMPANY_CRAWLED_DATA_DIR environment variable; the original absolute path
# is kept as the default so existing setups keep working unchanged.
INPUT_FOLDER = os.environ.get(
    "COMPANY_CRAWLED_DATA_DIR",
    "/Users/umutekingezer/Desktop/NLP_lab/LAB-COMPETITIVE-ANALYSIS/data/01_company_crawled_data",
)
OUTPUT_FOLDER = "02_binary_products_v2"
PROMPT_FILE = "filter_prompt_v2.txt"
CSV_FILE = "report_product_pages_v2.csv"

# Fail before any model is created or any report file is overwritten.
if not os.path.isdir(INPUT_FOLDER):
    raise FileNotFoundError(
        f"Input folder not found: {INPUT_FOLDER} "
        "(set COMPANY_CRAWLED_DATA_DIR to the crawled-data directory)"
    )

# Initialize the Chat Model
# temperature=0 keeps the classification as repeatable as the API allows.
chat_model = ChatOpenAI(model="gpt-4o", temperature=0)

filter_product_pages_refined(
    input_folder=INPUT_FOLDER,
    output_folder=OUTPUT_FOLDER,
    chat_model=chat_model,
    prompt_file=PROMPT_FILE,
    csv_file=CSV_FILE,
    base_domain="insurance-example.com"  # Adjust as needed
)

print("Filtering with Refined Prompt complete!")
print(f"Report generated: {CSV_FILE}")


Processing company folder: generali
Copied product page to: 02_binary_products_v2/generali/privatkunden_gesundheit-freizeit_krankenhaustagegeld.html
Copied product page to: 02_binary_products_v2/generali/privatkunden_recht-haftung_tierhalterhaftpflichtversicherung.html
Copied product page to: 02_binary_products_v2/generali/privatkunden_fahrzeug-zuhause_hausratversicherung.html
Copied product page to: 02_binary_products_v2/generali/geschaeftskunden_gesundheit-betriebliche-vorsorge_direktversicherung.html
Copied product page to: 02_binary_products_v2/generali/privatkunden_vorsorge-finanzen_grundfaehigkeitsversicherung.html
Copied product page to: 02_binary_products_v2/generali/privatkunden_gesundheit-freizeit_gesundheitsservices_kooperationen_mister-spex.html
Copied product page to: 02_binary_products_v2/generali/service-kontakt_schutzbrief-service.html
Copied product page to: 02_binary_products_v2/generali/privatkunden_gesundheit-freizeit_krankentagegeld-zusatzversicherte.html
Copied pr

### More refined:

In [None]:
# Cell 1: Filter_Products_Basic

import os
import shutil
import csv

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

def load_prompt_from_file(prompt_file):
    """Open `prompt_file` with UTF-8 decoding and hand back everything it contains."""
    with open(prompt_file, mode='r', encoding='utf-8') as reader:
        contents = reader.read()
    return contents

def filter_product_pages(
    input_folder,
    output_folder,
    chat_model,
    prompt_file,
    csv_file
):
    """
    Classify the HTML files of every company as product / non-product pages.

    Every immediate subfolder of `input_folder` is treated as one company.
    For each `*.html` file in it, the prompt template is formatted with the
    file's *name* (only the `filename` variable is supplied; the HTML content
    itself is not read here) and sent to `chat_model`. Files whose answer
    contains the word "yes" are copied to `output_folder/<company>/`, and one
    row per file (company_name, filename, product) is written to `csv_file`.

    :param input_folder: Main folder containing subfolders for each company
    :param output_folder: Folder to store copied product pages
    :param chat_model: Chat model; it is called with the formatted message list
        and must return an object with a `.content` string
    :param prompt_file: Path to the prompt file (must accept `filename`)
    :param csv_file: Path to the consolidated CSV report (overwritten)
    """
    import re  # local import: only needed for the whole-word "yes" check below

    # The prompt is identical for every company, so load and compile it once.
    # Doing this before opening the CSV also means a missing prompt file does
    # not truncate an existing report.
    prompt_text = load_prompt_from_file(prompt_file)
    chat_prompt = ChatPromptTemplate.from_messages([
        HumanMessagePromptTemplate.from_template(prompt_text)
    ])

    # Match "yes" only as a whole word so answers that merely contain the
    # letters (e.g. "eyes", "yesterday") are not counted as positives.
    yes_pattern = re.compile(r"\byes\b")

    # Create or overwrite the CSV file
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        # No 'url' column, as requested
        csv_writer.writerow(['company_name', 'filename', 'product'])

        # os.listdir returns entries in arbitrary order; sorting makes the
        # report row order reproducible between runs and machines.
        for company_folder in sorted(os.listdir(input_folder)):
            company_path = os.path.join(input_folder, company_folder)
            if not os.path.isdir(company_path):
                continue  # stray files next to the company folders are ignored

            print(f"Processing company folder: {company_folder}")
            company_output_folder = os.path.join(output_folder, company_folder)
            os.makedirs(company_output_folder, exist_ok=True)

            for filename in sorted(os.listdir(company_path)):
                if not filename.endswith(".html"):
                    continue
                file_path = os.path.join(company_path, filename)

                # Ask the model about this file (identified by name only)
                formatted_prompt = chat_prompt.format_messages(filename=filename)
                response = chat_model(formatted_prompt).content.strip()

                is_product = yes_pattern.search(response.lower()) is not None

                csv_writer.writerow([
                    company_folder,
                    filename,
                    "yes" if is_product else "no"
                ])

                # Copy file if it's a product
                if is_product:
                    shutil.copy(file_path, os.path.join(company_output_folder, filename))
                    print(f"Copied product page to: {company_output_folder}/{filename}")

# ---------------------------------------------------------------------------
# MAIN EXECUTION
# ---------------------------------------------------------------------------
# The crawled-data location is machine specific. It can be overridden through
# the COMPANY_CRAWLED_DATA_DIR environment variable; the original absolute path
# is kept as the default so existing setups keep working unchanged.
INPUT_FOLDER = os.environ.get(
    "COMPANY_CRAWLED_DATA_DIR",
    "/Users/umutekingezer/Desktop/NLP_lab/LAB-COMPETITIVE-ANALYSIS/data/01_company_crawled_data",
)
# NOTE: the report is opened in write mode, so running this cell replaces any
# existing report_product_pages_v1.csv in the working directory.
OUTPUT_FOLDER = "02_binary_products_v1"
PROMPT_FILE = "filter_prompt_v1.txt"
CSV_FILE = "report_product_pages_v1.csv"

# Fail before any model is created or any report file is overwritten.
if not os.path.isdir(INPUT_FOLDER):
    raise FileNotFoundError(
        f"Input folder not found: {INPUT_FOLDER} "
        "(set COMPANY_CRAWLED_DATA_DIR to the crawled-data directory)"
    )

# Initialize the Chat Model with model="gpt-4o" as requested
# temperature=0 keeps the classification as repeatable as the API allows.
chat_model = ChatOpenAI(model="gpt-4o", temperature=0)

filter_product_pages(
    input_folder=INPUT_FOLDER,
    output_folder=OUTPUT_FOLDER,
    chat_model=chat_model,
    prompt_file=PROMPT_FILE,
    csv_file=CSV_FILE
)

print("Filtering with Basic Prompt complete!")
print(f"All results saved in {CSV_FILE}")


In [None]:
# Cell 2: Filter_Products_Refined

import os
import shutil
import csv

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

def load_prompt_from_file(prompt_file):
    """Fetch the full prompt text (system + user sections) from `prompt_file`, read as UTF-8."""
    with open(prompt_file, mode='r', encoding='utf-8') as prompt_stream:
        whole_text = prompt_stream.read()
    return whole_text

def parse_prompt_text(full_prompt_text):
    """
    Given the raw text from 'filter_prompt_v2.txt', split into [System] and [Human] sections.
    Must contain both markers: '[System]' and '[Human]', with '[System]' first.

    Anything before the first "[System]" marker is discarded. The system part
    is the text between the first "[System]" and the next "[Human]"; the human
    part is everything after that "[Human]" marker.

    :param full_prompt_text: Complete content of the prompt file
    :return: Tuple `(system_part, human_part)`, both stripped of surrounding whitespace
    :raises ValueError: If either marker is missing, or if no "[Human]" marker
        follows the "[System]" marker (sections in the wrong order)
    """
    system_marker = "[System]"
    human_marker = "[Human]"

    if system_marker not in full_prompt_text or human_marker not in full_prompt_text:
        raise ValueError("Prompt file must contain '[System]' and '[Human]' sections.")

    # Keep only what follows the first "[System]" marker.
    _, _, after_system = full_prompt_text.partition(system_marker)
    system_part, found_human, human_part = after_system.partition(human_marker)

    # Both markers exist, but "[Human]" only occurs before "[System]". Without
    # this check the caller would get an opaque tuple-unpacking error.
    if not found_human:
        raise ValueError(
            "Prompt file must place the '[System]' section before the '[Human]' section."
        )

    return system_part.strip(), human_part.strip()

def filter_product_pages_refined(
    input_folder,
    output_folder,
    chat_model,
    prompt_file,
    csv_file
):
    """
    Classify the HTML files of every company using a system + user prompt.

    Every immediate subfolder of `input_folder` is treated as one company.
    For each `*.html` file in it, the prompt is formatted with the file's
    *name* (only the `filename` variable is supplied; the HTML content itself
    is not read here) and sent to `chat_model`. Files whose answer contains
    the word "yes" are copied to `output_folder/<company>/`.
    Writes a single CSV with columns: [company_name, filename, product].

    :param input_folder: Main folder containing subfolders for each company
    :param output_folder: Folder to store copied product pages
    :param chat_model: Chat model; it is called with the formatted message list
        and must return an object with a `.content` string
    :param prompt_file: Path to the prompt file with '[System]' and '[Human]' sections
    :param csv_file: Path to the consolidated CSV report (overwritten)
    :raises ValueError: Propagated from `parse_prompt_text` for a malformed prompt file
    """
    import re  # local import: only needed for the whole-word "yes" check below

    # Load and parse the refined prompt text
    full_prompt_text = load_prompt_from_file(prompt_file)
    system_prompt_text, human_prompt_text = parse_prompt_text(full_prompt_text)

    # The prompt does not depend on the company, so build it once up front.
    chat_prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(system_prompt_text),
        HumanMessagePromptTemplate.from_template(human_prompt_text)
    ])

    # Match "yes" only as a whole word so answers that merely contain the
    # letters (e.g. "eyes", "yesterday") are not counted as positives.
    yes_pattern = re.compile(r"\byes\b")

    # Create or overwrite the CSV file
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['company_name', 'filename', 'product'])

        # os.listdir returns entries in arbitrary order; sorting makes the
        # report row order reproducible between runs and machines.
        for company_folder in sorted(os.listdir(input_folder)):
            company_path = os.path.join(input_folder, company_folder)
            if not os.path.isdir(company_path):
                continue  # stray files next to the company folders are ignored

            print(f"Processing company folder: {company_folder}")
            company_output_folder = os.path.join(output_folder, company_folder)
            os.makedirs(company_output_folder, exist_ok=True)

            for filename in sorted(os.listdir(company_path)):
                if not filename.endswith(".html"):
                    continue
                file_path = os.path.join(company_path, filename)

                # Ask the model about this file (identified by name only)
                formatted_prompt = chat_prompt.format_messages(filename=filename)
                response = chat_model(formatted_prompt).content.strip()

                is_product = yes_pattern.search(response.lower()) is not None

                csv_writer.writerow([company_folder, filename, "yes" if is_product else "no"])

                # Copy if product
                if is_product:
                    shutil.copy(file_path, os.path.join(company_output_folder, filename))
                    print(f"Copied product page to: {company_output_folder}/{filename}")

# ---------------------------------------------------------------------------
# MAIN EXECUTION
# ---------------------------------------------------------------------------
# The crawled-data location is machine specific. It can be overridden through
# the COMPANY_CRAWLED_DATA_DIR environment variable; the original absolute path
# is kept as the default so existing setups keep working unchanged.
INPUT_FOLDER = os.environ.get(
    "COMPANY_CRAWLED_DATA_DIR",
    "/Users/umutekingezer/Desktop/NLP_lab/LAB-COMPETITIVE-ANALYSIS/data/01_company_crawled_data",
)
# NOTE: the report is opened in write mode, so running this cell replaces any
# existing report_product_pages_v2.csv in the working directory.
OUTPUT_FOLDER = "02_binary_products_v2"
PROMPT_FILE = "filter_prompt_v2.txt"
CSV_FILE = "report_product_pages_v2.csv"

# Fail before any model is created or any report file is overwritten.
if not os.path.isdir(INPUT_FOLDER):
    raise FileNotFoundError(
        f"Input folder not found: {INPUT_FOLDER} "
        "(set COMPANY_CRAWLED_DATA_DIR to the crawled-data directory)"
    )

# Initialize Chat Model
# temperature=0 keeps the classification as repeatable as the API allows.
chat_model = ChatOpenAI(model="gpt-4o", temperature=0)

filter_product_pages_refined(
    input_folder=INPUT_FOLDER,
    output_folder=OUTPUT_FOLDER,
    chat_model=chat_model,
    prompt_file=PROMPT_FILE,
    csv_file=CSV_FILE
)

print("Filtering with Refined Prompt complete!")
print(f"All results saved in {CSV_FILE}")
