In [1]:
import html
import re

import nltk
import torch
from google.colab import drive
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForTokenClassification, AutoTokenizer

In [2]:
# Mount Google Drive inside the Colab VM so notebook files persist across sessions.
drive.mount("/content/drive")
# IPython magic: change the working directory so that relative paths used later
# in the notebook (e.g. the generated HTML file) resolve inside this Drive folder.
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/HTML_files'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Math_Graph/HTML_files


In [3]:
# Fetch the Punkt sentence-tokenizer data; sent_tokenize is used below to split
# the input text into sentences.
nltk.download('punkt_tab') # Download the required data for Punkt sentence tokenizer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Highlight colour (CSS hex string) for each entity type name.
# Keys are the entity names with spaces between words; gets_tag_colours() looks
# names up here and falls back to the "O" entry for any name that is missing.
colours = {
    "ALGEBRA": "#FFD700",
    "APPLIED MATHEMATICS": "#FFF7E6",
    "CALCULUS AND ANALYSIS": "#E6FFE6",
    "DETERMINANTS": "#F7E6FF",
    "DISCRETE MATHEMATICS": "#FFFFE6",
    "FOUNDATIONS OF MATHEMATICS": "#E6E6FF",
    "GEOMETRY": "#FFE6F7",
    "LIE ALGEBRA": "#E6FFFF",
    "LINEAR ALGEBRA": "#FFD9B3",
    "LINEAR INDEPENDENCE": "#D9FFD9",
    "LINEAR SYSTEMS OF EQUATIONS": "#D9E6FF",
    "LOC": "#FFFFCC",
    "MATRICES": "#CCFFFF",
    "NUMBER THEORY": "#FFCCCC",
    "ORG": "#E0CCFF",
    "PERMANENTS": "#FFEBCC",
    "PROBABILITY AND STATISTICS": "#CCFFEB",
    "RECREATIONAL MATHEMATICS": "#FFEBEB",
    "TOPOLOGY": "#D9D9D9",
    "O": "#FFFFFF"  # Default (white) for "no entity" and for unknown names
}

In [5]:
# Checkpoint identifier passed to from_pretrained(). The commented-out line is an
# alternative checkpoint kept for reference.
# model_name = "HeathStar/distilbert-NER-Math-finetuned"
model_name = "Heather-Driver/distilbert-NER-LinearAlg-finetuned"
# Load the token-classification model; the helper functions below read this
# notebook-global `model` directly.
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Load the tokenizer from the same checkpoint so token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [6]:
# Sample passage to annotate; replace this string to highlight different text.
text_piece = "A linear equation of two real variables geometrically forms a line in the plane. Likewise, a linear equation of three real variables geometrically forms a plane in 3-space. Although much harder to visualize, a linear equation in four variables geometrically forms a hyperplane in 4-space. Higher dimensional analogues likewise exist."

In [7]:
# Split the passage into sentences; each sentence is passed to the model separately.
sentences = sent_tokenize(text_piece)

In [8]:
def gets_tokens_tags(sentence):
  """Run the token-classification model on one sentence.

  Uses the notebook-global `tokenizer` and `model`.

  Args:
    sentence: Text to tokenize and classify.

  Returns:
    A `(tokens, tags)` pair of equal-length lists: the tokenizer's token
    strings for the encoded input (whatever `convert_ids_to_tokens` yields,
    including any special tokens the tokenizer inserted) and the predicted
    label name for each of those tokens, taken from `model.config.id2label`.
  """
  inputs = tokenizer(sentence, return_tensors="pt")
  # Inference only: skip autograd bookkeeping so no computation graph is built
  # and kept alive for every sentence.
  with torch.no_grad():
    outputs = model(**inputs)
  # Highest-scoring label index per token (argmax along dim 2 of the logits).
  predicted_indices = torch.argmax(outputs.logits, dim=2)
  # Index 0 selects the single sequence in the batch; tolist() gives plain ints.
  tags = [model.config.id2label[idx] for idx in predicted_indices[0].tolist()]
  tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
  return tokens, tags

In [9]:
def gets_tag_main_names(tags_object):
  """Convert prefixed label strings into plain entity names.

  Each label is split on '-', the first piece is dropped, and the remaining
  pieces are joined with single spaces (e.g. 'B-LINEAR-ALGEBRA' becomes
  'LINEAR ALGEBRA'). The label 'O' is passed through unchanged. A label that
  contains no '-' (other than 'O') therefore maps to an empty string.

  Args:
    tags_object: Iterable of label strings.

  Returns:
    List of entity names, one per input label, in the same order.
  """
  return [
      'O' if label == 'O' else ' '.join(label.split('-')[1:])
      for label in tags_object
  ]

In [10]:
def gets_tag_colours(tags_object):
  """Look up the highlight colour for each entity name.

  Reads the notebook-global `colours` mapping; any name not present in it
  gets the colour stored under the "O" key.

  Args:
    tags_object: Iterable of entity names.

  Returns:
    List of colour strings, one per input name, in the same order.
  """
  return [colours.get(name, colours["O"]) for name in tags_object]

In [11]:
def formatted_word_producer(tokens, main_tags, tag_colours):
  """Merge word-piece tokens back into whole words and drop special tokens.

  Uses the notebook-global `tokenizer` to identify special tokens. A token
  starting with "##" is treated as a continuation: its text (without the
  "##") is appended to the previous word, and its own tag and colour are
  discarded, so each word keeps the tag and colour of its first piece.

  Args:
    tokens: Token strings.
    main_tags: Entity name for each token.
    tag_colours: Colour string for each token.

  Returns:
    A `(token_output, tag_output, colour_output)` tuple of equal-length lists
    describing the merged words.
  """
  # Build the lookup once; a set gives constant-time membership tests and
  # avoids re-evaluating the tokenizer attribute on every iteration.
  special_tokens = set(tokenizer.all_special_tokens)
  token_output, tag_output, colour_output = [], [], []
  for token, tag, colour in zip(tokens, main_tags, tag_colours):
    if token in special_tokens:
      continue
    if token.startswith("##"):
      piece = token[2:]
      if token_output:
        token_output[-1] += piece
        continue
      # A continuation piece with no preceding word: start a new word from it
      # instead of indexing into an empty list.
      token = piece
    token_output.append(token)
    tag_output.append(tag)
    colour_output.append(colour)
  assert len(token_output) == len(tag_output) == len(colour_output), 'Error'
  return token_output, tag_output, colour_output

In [12]:
def html_content_producer(token_output, tag_output, colour_output):
  """Render one sentence's words as an HTML fragment.

  Words whose tag is not 'O' are wrapped in a `<span>` with the tag as the
  tooltip (`title`) and the colour as the background; other words are emitted
  as plain text. Word text, tags and colours are HTML-escaped so characters
  such as `<`, `>` and `&` cannot break the markup.

  Args:
    token_output: Words to render.
    tag_output: Entity name for each word ('O' means no entity).
    colour_output: CSS colour for each word.

  Returns:
    The HTML fragment followed by a single trailing space, so fragments for
    consecutive sentences can be concatenated directly.
  """
  pieces = []
  for token, tag, colour in zip(token_output, tag_output, colour_output):
    # quote=False: word text sits in element content, so only &, < and > need escaping.
    safe_token = html.escape(token, quote=False)
    if tag != 'O':
      pieces.append(
          f'<span style="background-color: {html.escape(colour)}" '
          f'title="{html.escape(tag)}">{safe_token}</span>'
      )
    else:
      pieces.append(safe_token)

  html_string = " ".join(pieces)

  # Regex cleanup
  # 1. Remove extra spaces before punctuation
  html_string = re.sub(r"\s+([.,;:?!])", r"\1", html_string)
  # 2. Ensure a single space after punctuation. The lookbehinds skip the ';'
  #    that terminates an escaped entity, so escaped characters are spaced
  #    exactly as their raw forms would have been.
  html_string = re.sub(
      r"([.,;:?!])(?<!&amp;)(?<!&lt;)(?<!&gt;)(\S)", r"\1 \2", html_string
  )

  return html_string + ' '

In [13]:
# Opening HTML boilerplate with inline styles. The document is left open inside
# <p>; later cells append the highlighted sentences and then the closing tags.
# The charset declaration tells browsers how to decode non-ASCII characters
# in the annotated text.
html_content = """
<html>
<head>
    <meta charset="utf-8">
    <style>
        body { font-family: Arial, sans-serif; line-height: 1.6; }
        span { padding: 2px 4px; margin: 2px; border-radius: 3px; }
    </style>
</head>
<body>
<p>
"""

In [14]:
# Annotate each sentence and append its HTML fragment to the open document.
# NOTE: this cell extends html_content in place, so running it more than once
# without re-running the cell that initialises html_content duplicates the text.
for sentence in sentences:
  # Model prediction: token strings and raw label names.
  tokens, tags = gets_tokens_tags(sentence)
  # Strip the label prefix to get entity names, then map names to colours.
  main_tag_names = gets_tag_main_names(tags)
  tag_colours = gets_tag_colours(main_tag_names)
  # Merge "##" word pieces and drop special tokens before rendering.
  token_output, tag_output, colour_output = formatted_word_producer(tokens, main_tag_names, tag_colours)
  html_content += html_content_producer(token_output, tag_output, colour_output)

In [15]:
# Close the paragraph, body and document opened by the initial html_content string.
# NOTE: this appends in place; re-running the cell adds the closing tags again.
html_content += """
</p>
</body>
</html>
"""

In [16]:
# Write the HTML content to a file in the current working directory.
# The encoding is given explicitly so the output does not depend on the
# platform's default text encoding when the text contains non-ASCII characters.
with open("output.html", "w", encoding="utf-8") as f:
    f.write(html_content)

print("HTML file created: output.html")

HTML file created: output.html
