In [9]:
import pandas as pd
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

# Step 1: Load the CSV
csv_file_path = "ix_nonnumeric_output.csv"  # Replace with your file path
data = pd.read_csv(csv_file_path)

# Create a dictionary of text-to-XBRL tags
xbrl_dict = dict(zip(data['Text'], data['Name']))

# Step 2: Load the HTML content
html_content = """
<html>
    <body>
        <h1>Total Revenue</h1>
        <h2>Net Profit</h2>
        <h3>Company Assets</h3>
        <h4>Shareholders' Equity</h4>
    </body>
</html>
"""

soup = BeautifulSoup(html_content, "html.parser")

# Step 3: Define a function to fuzzy match and wrap header tags
def wrap_with_ix_nonnumeric(soup, xbrl_dict, threshold=70):
    # Find all header tags (h1 to h6)
    header_tags = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
    print(header_tags)
    for tag in header_tags:
        header_text = tag.get_text().strip()
        for csv_text, xbrl_tag in xbrl_dict.items():
            similarity = fuzz.ratio(header_text.lower(), csv_text.lower())
            if similarity >= threshold:
                # Wrap the tag with <ix:nonnumeric>
                new_tag = soup.new_tag("ix:nonnumeric", name=xbrl_tag)
                new_tag.string = tag.string
                tag.replace_with(new_tag)
                break  # Stop after the first match
    
    return soup

# Step 4: Apply the function
updated_soup = wrap_with_ix_nonnumeric(soup, xbrl_dict)

# Step 5: Output the modified HTML
print(updated_soup.prettify())


[<h1>Total Revenue</h1>, <h2>Net Profit</h2>, <h3>Company Assets</h3>, <h4>Shareholders' Equity</h4>]


TypeError: BeautifulSoup.new_tag() got multiple values for argument 'name'

In [10]:
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import pandas as pd

# Sample CSV data
data = pd.DataFrame({
    'text': ["Total Revenue", "Net Profit", "Assets", "Liabilities"],
    'xbrl_tag': ["data_tata", "tagagh", "Assets", "Liabilities"]
})

# Create a dictionary of text-to-XBRL tags
xbrl_dict = dict(zip(data['text'], data['xbrl_tag']))

# Sample HTML content
html_content = """
<html>
    <body>
        <h1>Total Revenue</h1>
        <h2>Net Profit</h2>
        <h3>Company Assets</h3>
        <h4>Shareholders' Equity</h4>
    </body>
</html>
"""

soup = BeautifulSoup(html_content, "html.parser")

# Define a function to fuzzy match and wrap header tags
def wrap_with_ix_nonnumeric(soup, xbrl_dict, threshold=70):
    # Find all header tags (h1 to h6)
    header_tags = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
    
    for tag in header_tags:
        header_text = tag.get_text().strip()
        for csv_text, xbrl_tag in xbrl_dict.items():
            similarity = fuzz.ratio(header_text.lower(), csv_text.lower())
            if similarity >= threshold:
                # Create a new <ix:nonnumeric> tag
                new_tag = soup.new_tag("ix:nonnumeric", attrs={"name": xbrl_tag})
                
                # Insert the new tag before the current tag
                tag.insert_before(new_tag)
                
                # Move the original tag into the new tag
                new_tag.append(tag)
                
                break  # Stop after the first match
    
    return soup

# Apply the function
updated_soup = wrap_with_ix_nonnumeric(soup, xbrl_dict)

# Output the modified HTML
print(updated_soup.prettify())


<html>
 <body>
  <ix:nonnumeric name="data_tata">
   <h1>
    Total Revenue
   </h1>
  </ix:nonnumeric>
  <ix:nonnumeric name="tagagh">
   <h2>
    Net Profit
   </h2>
  </ix:nonnumeric>
  <h3>
   Company Assets
  </h3>
  <h4>
   Shareholders' Equity
  </h4>
 </body>
</html>



In [11]:
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import pandas as pd

# Sample CSV data
data = pd.DataFrame({
    'text': ["Total Revenue", "Net Profit", "Assets", "Liabilities"],
    'xbrl_tag': ["data_tata", "tagagh", "Assets", "Liabilities"]
})

# Create a dictionary of text-to-XBRL tags
xbrl_dict = dict(zip(data['text'], data['xbrl_tag']))

# Sample HTML content
html_content = """
<html>
    <body>
        <h1>Total Revenue</h1>
        <h2>Net Profit</h2>
        <h3>Company Assets</h3>
        <h4>Shareholders' Equity</h4>
        <strong>Net Income</strong>
        <b>Gross Profit</b>
        <div style="font-weight: bold;">Company Overview</div>
        <span style="font-weight: bold;">Revenue Streams</span>
    </body>
</html>
"""

soup = BeautifulSoup(html_content, "html.parser")

# Define a function to fuzzy match and wrap header tags
def wrap_with_ix_nonnumeric(soup, xbrl_dict, threshold=70):
    # Tags to check: <h1> to <h6>, <b>, <strong>, and bold-styled tags
    tags_to_check = ["h1", "h2", "h3", "h4", "h5", "h6", "b", "strong", "span", "div"]

    # Find all specified tags
    all_possible_headers = soup.find_all(tags_to_check)

    # Filter out header-like tags (i.e., <h1> to <h6>, <b>, <strong>, and bold-styled tags)
    header_like_tags = []
    for tag in all_possible_headers:
        # Include <b> and <strong> directly
        if tag.name in ["b", "strong"]:
            header_like_tags.append(tag)
        # Include tags styled as bold using CSS
        elif tag.has_attr("style") and "font-weight: bold" in tag["style"]:
            header_like_tags.append(tag)
        # Include all <h1> to <h6>
        elif tag.name.startswith("h"):
            header_like_tags.append(tag)
    
    # For each header-like tag, perform fuzzy matching and wrap it with an <ix:nonnumeric> tag
    for tag in header_like_tags:
        header_text = tag.get_text().strip()
        
        # Track the highest similarity score and corresponding XBRL tag
        best_match = None
        highest_similarity = 0
        
        for csv_text, xbrl_tag in xbrl_dict.items():
            similarity = fuzz.ratio(header_text.lower(), csv_text.lower())
            if similarity >= threshold and similarity > highest_similarity:
                highest_similarity = similarity
                best_match = xbrl_tag
        
        # If a best match is found, wrap the tag with <ix:nonnumeric>
        if best_match:
            # Create a new <ix:nonnumeric> tag
            new_tag = soup.new_tag("ix:nonnumeric", attrs={"name": best_match})
            
            # Insert the new tag before the current tag
            tag.insert_before(new_tag)
            
            # Move the original tag into the new tag
            new_tag.append(tag)
    
    return soup

# Apply the function
updated_soup = wrap_with_ix_nonnumeric(soup, xbrl_dict)

# Output the modified HTML
print(updated_soup.prettify())




<html>
 <body>
  <ix:nonnumeric name="data_tata">
   <h1>
    Total Revenue
   </h1>
  </ix:nonnumeric>
  <ix:nonnumeric name="tagagh">
   <h2>
    Net Profit
   </h2>
  </ix:nonnumeric>
  <h3>
   Company Assets
  </h3>
  <h4>
   Shareholders' Equity
  </h4>
  <strong>
   Net Income
  </strong>
  <b>
   Gross Profit
  </b>
  <div style="font-weight: bold;">
   Company Overview
  </div>
  <span style="font-weight: bold;">
   Revenue Streams
  </span>
 </body>
</html>

