In [1]:
# Imports and setup
! pip install intervaltree



In [2]:
# Imports and setup
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from google.colab import drive, userdata
import pickle
import random
import re
from collections import defaultdict
from intervaltree import Interval, IntervalTree
from collections import Counter
import json

In [3]:
# Mount Google Drive so the notebook can reach the files stored there
drive.mount("/content/drive")
# Make the pickle folder the working directory; the pickle reads and the final
# pickle write in this notebook all use bare file names.
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [4]:
# Utility Functions

# Define nested_dict function
def nested_dict():
  """Return a defaultdict whose missing keys are created as further nested_dict instances."""
  factory = nested_dict
  return defaultdict(factory)

# Define file read function
def read_pickle(dict_file):
  """Open `dict_file` in binary mode and return the object unpickled from it.

  NOTE(review): unpickling can execute arbitrary code, so this should only be
  pointed at files from a trusted source.
  """
  with open(dict_file, mode='rb') as handle:
    payload = pickle.load(handle)
  return payload

# Split CamelCase text into space-separated words
def split_camel_case(text):
    """Insert a space wherever a lowercase ASCII letter is directly followed by an uppercase ASCII letter."""
    # Zero-width match: lowercase letter behind, uppercase letter ahead
    boundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
    return boundary.sub(' ', text)

In [5]:
# Tokenization and NER Setup
# Existing tags from DistilBERT: tokenizer and token-classification model are
# both loaded from the same checkpoint name.
tag_checkpoint = "dslim/distilbert-NER"
tag_tokenizer = AutoTokenizer.from_pretrained(tag_checkpoint, do_lower_case=False)
tag_model = AutoModelForTokenClassification.from_pretrained(tag_checkpoint)

# Ready-made "ner" pipeline built from the model and tokenizer loaded above
tags = pipeline("ner", model=tag_model, tokenizer=tag_tokenizer)

Device set to use cpu


In [6]:
# Show whether a fast tokenizer was loaded.
# NOTE(review): presumably checked because the `.tokens()` call on the encoding
# (used when generating the NER tags) needs a fast tokenizer - confirm.
tag_tokenizer.is_fast

True

In [7]:
# Helper methods built from DistilBERT NER

def idx2string(text):
    """Split `text` into words and single punctuation characters, keyed by position.

    Returns a dict mapping the 0-based position of each piece to the piece
    itself, in order of appearance.
    """
    # A piece is either a run of word characters or one non-space, non-word character
    pieces = re.findall(r'\b\w+\b|[^\s\w]', text)
    return dict(enumerate(pieces))

def tokenidx2words(ner_results):
  """Map each NER result's token index ('index') to its token text ('word').

  When two results share an index, the later one wins.
  """
  pairs = ((entry['word'], entry['index']) for entry in ner_results)
  return {token_idx: token_text for token_text, token_idx in pairs}

def tokenidx2entity(ner_results):
  """Map each NER result's token index ('index') to its entity label ('entity').

  When two results share an index, the later one wins.
  """
  pairs = ((entry['entity'], entry['index']) for entry in ner_results)
  return {token_idx: label for label, token_idx in pairs}

def idx2wordpos(text, idx2string):
  """Locate every word of `idx2string` inside `text`.

  Words are searched for left to right, each search starting where the
  previous word ended. Returns a dict mapping word index -> (start, end)
  character offsets, `end` being exclusive. `str.index` raises ValueError
  when a word cannot be found.
  """
  spans = {}
  cursor = 0
  for word_idx, word in idx2string.items():
    begin = text.index(word, cursor)
    cursor = begin + len(word)
    spans[word_idx] = (begin, cursor)
  return spans

def tokenidx2entityword(example2wordidx, ner_results):
  """Attach every tagged token to a word whose character span it overlaps.

  `example2wordidx` maps word index -> (start, end) character offsets;
  `ner_results` entries provide 'start', 'end', 'entity' and 'index'.
  Returns a dict mapping token index -> {'entity': ..., 'word_idx': ...}.
  When a token overlaps several word spans, the last one iterated is kept.
  """
  # Interval tree over the word spans, each interval carrying its word index as data
  word_spans = IntervalTree(
      Interval(span_start, span_end, word_idx)
      for word_idx, (span_start, span_end) in example2wordidx.items()
  )

  token_to_word = dict()
  for tagged in ner_results:
    tok_start, tok_end = tagged['start'], tagged['end']
    label = tagged['entity']
    tok_idx = tagged['index']

    # Word intervals that overlap the recognised entity span
    hits = word_spans[tok_start : tok_end]
    for hit in hits:
      token_to_word[tok_idx] = {'entity' : label, 'word_idx': hit.data}

  return token_to_word

In [8]:
def idx2token_and_id(idx2string_dict, token_list, input_ids_list):
  """Pair every word with the first word-piece token that spells it, and that token's input id.

  The token list is walked once, left to right. A token without the '##'
  prefix starts a fresh candidate word; '##' tokens are appended to the
  current candidate with the prefix stripped. When the candidate equals the
  word being looked for, (first token, its input id) is stored under the
  word's index and the walk continues with the next word. A word that is
  never matched consumes the rest of the tokens, so it and all later words
  are absent from the result.
  """
  aligned = dict()
  total = len(token_list)
  cursor = 0  # position of the next token to inspect, shared by all words
  for word_idx, word in idx2string_dict.items():
    assembled, head_token, head_id = "", None, None
    matched = False
    while cursor < total and not matched:
      piece = token_list[cursor]
      if piece.startswith("##"):
        # Continuation piece: extend the candidate without the "##" marker
        assembled = assembled + piece[2:]
      else:
        # A new word starts here; remember its first token and id
        assembled, head_token, head_id = piece, piece, input_ids_list[cursor]
      cursor += 1
      if assembled == word:
        aligned[word_idx] = (head_token, head_id)
        matched = True
  return aligned

In [9]:
# Get naming convention from scraped data: load the topics dictionary from its pickle file
topics = read_pickle("topics.pkl")

print(f"topic_dict dictionary length: {len(topics)}")
# Three random (key, value) pairs for a quick preview. The sample is unseeded,
# so it differs between runs; later preview cells reuse `random_sample`.
random_sample = random.sample(list(topics.items()), 3)

for key, value in random_sample:
  print(f"{key}:\n{value}")

topic_dict dictionary length: 10
Topology:
['Algebraic Topology', 'Bundles', 'Cohomology', 'General Topology', 'Knot Theory', 'Low-Dimensional Topology', 'Manifolds', 'Point-Set Topology', 'Spaces', 'Topological Invariants', 'Topological Operations', 'Topological Structures']
Recreational Mathematics:
['Cryptograms', 'Dissection', 'Folding', 'Games', 'Illusions', 'Magic Figures', 'Mathematical Art', 'Mathematical Humor', 'Mathematical Records', 'Mathematics in the Arts', 'Number Guessing', 'Numerology', 'Puzzles', 'Sports']
Foundations of Mathematics:
['A New Kind of Science', 'Axioms', 'Category Theory', 'Logic', 'Mathematical Problems', 'Point-Set Topology', 'Set Theory', 'Theorem Proving']


In [10]:
# Get naming convention from scraped data: load the sub-topic lists
sub_topics = read_pickle("sub_topics.pkl")

# Re-key the dictionary with every CamelCase key split into separate words
my_dict = {split_camel_case(name): members for name, members in sub_topics.items()}
sub_topics = my_dict

print(f"sub_topics dictionary length: {len(sub_topics)}")

# Preview the sub-topics of the same keys that were sampled from `topics`
for key, value in random_sample:
  print(f"{key}:\n{sub_topics[key]}")

sub_topics dictionary length: 10
Topology:
['Absolute Retract', 'Abstract Simplicial Complex', 'Acyclic Chain Complex', "Adams' Theorem", 'Algebraic Gadget', 'Algebraic K-Theory', 'Algebraic Topology', 'Analytic Torsion', 'Band', 'Bordism Group', 'Cellular Approximation Theorem', 'Cellular Map', 'Chain Contraction', 'Characteristic Class', 'Chern Class', 'Chern Number', 'Closed Star', 'Combinatorial Topology', 'Commutative Diagram', 'Contractibility', 'Contractible', 'Cup Product', 'CW-Approximation Theorem', 'CW-Complex', 'Deck Transformation', 'Deformation Retract', 'Euler Number', 'Extension Problem', 'Fiber Space', 'Five Lemma', 'Four Lemma', 'Free', 'Fundamental Class', 'Fundamental Group', 'Gadget', 'Geometric Realization', 'Gerbe', 'Graded Module', 'H-Space', 'Hereditarily Unicoherent Continuum', 'Homeomorphism Group', 'Homologous', 'Homology Class', 'Homotopic', 'Homotopy', 'Homotopy Axiom', 'Homotopy Class', 'Homotopy Equivalence', 'Homotopy Group', 'Homotopy Sphere', 'Homotop

In [11]:
# Merge Topics and Subtopics to one dictionary
for key, value in topics.items():
  extra = sub_topics.get(key)
  if extra is None:
    # No sub-topic list for this topic (key absent or stored as None)
    continue
  value.extend(extra)
  # De-duplicate through a set, then store the entries in sorted order
  topics[key] = sorted(set(value))

In [12]:
# Size of the merged dictionary, then the merged lists for the previously sampled keys
print(f"topics dictionary length: {len(topics)}")

for key, value in random_sample:
  print(f"{key}:\n{topics[key]}")

topics dictionary length: 10
Topology:
['0-Connected', '1-Connected', 'Absolute Retract', 'Abstract Manifold', 'Abstract Simplicial Complex', 'Abstract Topological Space', 'Abstract Vector Space', 'Acyclic Chain Complex', "Adams' Theorem", 'Affine Hull', 'Ahlfors-Bers Theorem', 'Aleksandrov-Čech Cohomology', "Alexander's Horned Sphere", 'Alexander-Spanier Cohomology', 'Algebraic Gadget', 'Algebraic K-Theory', 'Algebraic Manifold', 'Algebraic Topology', 'Almost Everywhere Convergence', 'Alternating Knots', 'Ambient Isotopy', 'Amphichiral Knots', 'Analytic Torsion', 'Anchor', 'Anosov Flow', 'Antipodal Map', "Antoine's Horned Sphere", "Antoine's Necklace", 'Approximation Problem', 'Approximation Property', 'Arc Component', 'Arcwise-Connected', 'Associated Fiber Bundle', 'Associated Vector Bundle', 'Atiyah-Singer Index Theorem', 'Atlas', 'Axiom A Diffeomorphism', 'Axiom A Flow', 'Baire Category Theorem', 'Baire Space', 'Banach Space', 'Band', 'Base Manifold', 'Base Space', 'Basepoint', 'Ba

In [13]:
# Merge Topics and Subtopics to one dictionary - continued
# Load the two algebra dictionaries from their pickle files
alg_2 = read_pickle("alg_2.pkl")
alg_3 = read_pickle("alg_3.pkl")

# Combine them into one dictionary; on a shared key the alg_3 entry wins
alg_dict = {**alg_2, **alg_3}

print(f"alg_dict dictionary length: {len(alg_dict)}")

# Preview three randomly chosen entries
random_sample = random.sample(list(alg_dict.items()), 3)
for name, members in random_sample:
  print(f"{name}:\n{members}")

alg_dict dictionary length: 18
Matrix Groups:
['General Linear Group', 'Heisenberg Group', 'Lie-Type Group', 'Linear Algebraic Group', "Maschke's Theorem", 'Matrix Group', 'Orthogonal Group', 'Rotation Group', 'Special Linear Group', 'Special Orthogonal Group', 'Special Unitary Group', 'Symplectic Group']
Determinants:
['Casoratian', "Cauchy's Determinant Theorem", 'Cayley-Menger Determinant', 'Chió Pivotal Condensation', 'Circulant Determinant', 'Cofactor', 'Condensation', 'Determinant', 'Determinant Expansion by Minors', 'Determinant Identities', 'Determinant Theorem', 'Gram Determinant', "Gram's Inequality", "Hadamard's Maximum Determinant Problem", "Hadamard's Theorem", 'Hafner-Sarnak-McCurley Constant', 'Hessian', 'Hill Determinant', 'Hyperdeterminant', 'Inversion Number', "Jacobi's Determinant Identity", "Jacobi's Theorem", 'Jacobian', 'Mills-Robbins-Rumsey Determinant Formula', 'Minor', 'PfaffianPivotal Condensation', "Schweins's Theorem", 'Stäckel Determinant', "Sylvester's Det

In [14]:
# Merged to all_topics dict: algebra entries first, then the general topics
# (on a shared key the entry from `topics` wins)
all_topics = {**alg_dict, **topics}

# File the 'General Linear Algebra' entries under the key 'Linear Algebra'
all_topics['Linear Algebra'] = all_topics.pop('General Linear Algebra')

print(f"all_topics dictionary length: {len(all_topics)}")

# Preview three randomly chosen entries
random_sample = random.sample(list(all_topics.items()), 3)
for name, members in random_sample:
  print(f"{name}:\n{members}")

all_topics dictionary length: 28
Linear Algebra:
['Alternating Multilinear Form', 'Bilinear Basis', 'Complex Vector Space', "Fredholm's Theorem", 'Fundamental Matrix Subspaces', 'Fundamental Theorem of Linear Algebra', 'Haar Condition', 'Hermitian Inner Product', 'Invertible Linear Map', 'Kernel', 'Linear Algebra', 'Linear Combination', 'Linear Function', 'Linear Transformation Kernel', 'Linearly Independent', 'Lorentzian Inner Product', 'Null Space', 'Nullity', 'Nullspace', 'Orthogonal Complement', 'Orthogonal Set', 'Orthogonal Sum', 'Orthogonal Transformation', 'Orthogonality Condition', 'Orthonormal Basis', 'Orthonormal Set', 'Piecewise Linear Function', 'Quotient Vector Space', 'Rank-Nullity Theorem', 'Real Vector Space', 'Reduced Whitehead Group', 'Vector Space Flag', 'Vector Space Orientation', 'Vector Space Projection', 'Whitehead Group']
Linear Systems of Equations:
['Basis Vector', 'Change of Basis', 'Change of Coordinates Matrix', "Cramer's Rule", 'Gauss-Seidel Method', 'Jaco

In [15]:
# Check 1:1 mapping of terms to topics
# Reverse index: term -> list of the topics whose list contains it
my_dict = dict()
for topic, terms in all_topics.items():
  for term in terms:
    my_dict.setdefault(term, []).append(topic)

# A term filed under exactly two topics, one of them 'Recreational Mathematics',
# is mapped to the other topic
map_to = dict()
for term, topic_list in my_dict.items():
  if len(topic_list) == 2 and 'Recreational Mathematics' in topic_list:
    topic_list.remove('Recreational Mathematics')
    map_to[term] = topic_list[0]

# Collect the resolved terms first so the index is not resized while being iterated
keys_to_delete = list(map_to)

for term in keys_to_delete:
  # Drop the term from the index and from the 'Recreational Mathematics' list
  del my_dict[term]
  all_topics['Recreational Mathematics'].remove(term)

In [16]:
# Choose linear algebra topic classification over other types:
# a term filed under exactly two topics, one of them 'Determinants', stays in
# 'Determinants' and is removed from the other topic.
#
# The removals are collected as other-topic -> list of terms. Storing a single
# term per topic would keep only the last term whenever several terms share the
# same second topic, leaving the earlier ones duplicated in `all_topics`.
all_topics_keys_to_remove = dict()
keys_to_delete = list()

for key, value_list in my_dict.items():
  if len(value_list) == 2 and 'Determinants' in value_list:
    value_list.remove('Determinants')
    all_topics_keys_to_remove.setdefault(value_list[0], []).append(key)
    keys_to_delete.append(key)

# Delete the resolved terms from the reverse index
for key in keys_to_delete:
  del my_dict[key]

# Delete every collected term from its non-'Determinants' topic in all_topics
for topic, terms in all_topics_keys_to_remove.items():
  for term in terms:
    all_topics[topic].remove(term)

In [17]:
# Additional cleaning
# Rebuild the reverse index: term -> list of the topics whose list contains it
my_dict = dict()
for topic, terms in all_topics.items():
  for term in terms:
    my_dict.setdefault(term, []).append(topic)

In [18]:
# Additional cleaning
# Term -> topics from which the term is to be removed
to_remove = {
    "Fundamental Matrix Subspaces": ['Matrix Decomposition', 'Matrix Operations'],
    "Matrix Diagonalization": ['Matrix Decomposition', 'Matrix Eigenvalues'],
    "Stochastic Matrix": ['Algebra', 'Probability and Statistics'],
    "Hadamard Matrix": ['Algebra', 'Foundations of Mathematics', 'Recreational Mathematics', 'Discrete Mathematics'],
    "Polynomial Matrix": ['Algebra', 'Calculus and Analysis'],
    "Sylvester Matrix": ['Algebra', 'Calculus and Analysis'],
    "Matrix Polynomial": ['Algebra', 'Calculus and Analysis'],
}

# list.remove raises ValueError when the term is not in the topic's list
for term, topic_names in to_remove.items():
  for topic_name in topic_names:
    all_topics[topic_name].remove(term)

# Rebuild the reverse index (term -> topics containing it) after the removals
my_dict = dict()
for topic, terms in all_topics.items():
  for term in terms:
    my_dict.setdefault(term, []).append(topic)

In [19]:
# Additional cleaning - duplicate definitions
# Hand-maintained table: term -> topics from which the term is to be removed.
to_remove = {
    "Lie Algebra": ['Lie Theory', 'Lie Algebra']
    ,"Fredholm's Theorem": ['Linear Independence', 'Matrix Properties']
    ,"Jacobi Identities": ['Lie Algebra', 'Lie Groups']
    ,"Flag Manifold": ['Lie Groups', 'Calculus and Analysis']
    ,"Lie-Type Group": ['Lie Groups', 'Discrete Mathematics']
    ,"Special Linear Group": ['Lie Groups', 'Discrete Mathematics']
    ,"Fundamental Theorem of Linear Algebra": ['Matrix Decomposition', 'Matrix Operations']
    ,"Schur Decomposition": ['Matrix Eigenvalues', 'Applied Mathematics']
    ,"Conjugate Transpose": ['Matrix Types', 'Calculus and Analysis']
    ,"Analytic Torsion": ['Calculus and Analysis', 'Topology']
    ,"Byzantine Generals Problem": ['Algebra', 'Applied Mathematics']
    ,"Code": ['Algebra', 'Foundations of Mathematics']
    ,"Commutative Diagram": ['Algebra', 'Foundations of Mathematics']
    ,"Contractible": ['Foundations of Mathematics', 'Topology']
    ,"Cyclotomic Polynomial": ['Foundations of Mathematics', 'Discrete Mathematics']
    ,"Elliptic Curves": ['Algebra', 'Geometry']
    ,"Encoding": ['Algebra', 'Foundations of Mathematics']
    ,"Endomorphism": ['Algebra', 'Foundations of Mathematics', 'Applied Mathematics', 'Geometry']
    ,"Ergodic Theory": ['Algebra', 'Applied Mathematics', 'Geometry', 'Calculus and Analysis']
    ,"Gerbe": [ 'Foundations of Mathematics', 'Topology']
    ,"Gröbner Basis": ['Algebra', 'Discrete Mathematics']
    ,"Icosahedral Equation": ['Algebra', 'Calculus and Analysis']
    ,"Kac Formula": ['Algebra', 'Probability and Statistics']
    ,"Minimax Polynomial": ['Probability and Statistics', 'Calculus and Analysis']
    ,"Modularity Theorem": ['Algebra', 'Number Theory']
    ,"Noise": ['Algebra', 'Applied Mathematics']
    ,"Normal Polynomial": ['Calculus and Analysis', 'Number Theory']
    ,"Octahedral Equation": ['Geometry', 'Calculus and Analysis']
    ,"Polynomial Sequence": ['Algebra', 'Calculus and Analysis']
    ,"Power Polynomial": ['Calculus and Analysis', 'Discrete Mathematics']
    ,"Real Vector Space": ['Algebra', 'Topology']
    ,"Reidemeister Torsion": ['Calculus and Analysis', 'Topology']
    ,"Taniyama-Shimura Conjecture": ['Algebra', 'Topology']
    ,"Tetrahedral Equation": ['Algebra', 'Geometry']
    ,"Vector Field": ['Applied Mathematics', 'Topology']
    ,"Whitehead Torsion": ['Algebra', 'Calculus and Analysis']
    ,"Zero Map": ['Algebra', 'Foundations of Mathematics', 'Calculus and Analysis']
    ,"Borel Hierarchy": ['Foundations of Mathematics', 'Topology']
    ,"Busy Beaver": ['Foundations of Mathematics', 'Recreational Mathematics']
    ,"Circle Packing": ['Foundations of Mathematics', 'Recreational Mathematics', 'Discrete Mathematics']
    ,"Closed Map": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Collatz Problem": ['Foundations of Mathematics', 'Recreational Mathematics']
    ,"Complete Metric Space": ['Foundations of Mathematics', 'Topology']
    ,"Contingent Cone": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Dense": ['Foundations of Mathematics', 'Topology']
    ,"Domain": ['Foundations of Mathematics', 'Calculus and Analysis', 'Topology']
    ,"Endpoint": ['Foundations of Mathematics', 'Geometry']
    ,"F_sigma Set": ['Foundations of Mathematics', 'Topology']
    ,"First Category": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"G_delta Set": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Generalized Reeb Component": ['Foundations of Mathematics', 'Topology']
    ,"Helly's Theorem": ['Foundations of Mathematics', 'Geometry']
    ,"Hofstadter's Q-Sequence": ['Foundations of Mathematics', 'Discrete Mathematics']
    ,"Hofstadter-Conway $10,000 Sequence": ['Foundations of Mathematics', 'Discrete Mathematics']
    ,"Injection": ['Foundations of Mathematics', 'Topology']
    ,"Lebesgue Covering Dimension": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Logistic Map": ['Foundations of Mathematics', 'Applied Mathematics', 'Discrete Mathematics']
    ,"Logistic Map--r=2": ['Foundations of Mathematics', 'Applied Mathematics']
    ,"Logistic Map--r=4": ['Foundations of Mathematics', 'Applied Mathematics']
    ,"Metric Topology": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Metrizable Topology": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Net": ['Foundations of Mathematics', 'Probability and Statistics']
    ,"Nice Vector Field": ['Foundations of Mathematics', 'Topology']
    ,"Pseudoconcave Function": ['Foundations of Mathematics', 'Topology']
    ,"Pseudoconvex Function": ['Foundations of Mathematics', 'Topology']
    ,"Quasi-Concave Function": ['Foundations of Mathematics', 'Topology']
    ,"Quasi-Convex Function": ['Foundations of Mathematics', 'Topology']
    ,"Reeb Component": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Reebless": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Stochastic": ['Foundations of Mathematics', 'Applied Mathematics', 'Geometry', 'Number Theory']
    ,"Surjection": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Sutured Manifold": ['Foundations of Mathematics', 'Topology']
    ,"Taut Foliation": ['Foundations of Mathematics', 'Topology']
    ,"Transversely Orientable Foliation": ['Foundations of Mathematics', 'Topology']
    ,"Uniform Convexity": ['Foundations of Mathematics', 'Topology']
    ,"Zero Set": ['Foundations of Mathematics', 'Topology']
    ,"Lévy Flight": ['Probability and Statistics', 'Applied Mathematics', 'Geometry']
    ,"Lévy Walk": ['Probability and Statistics', 'Applied Mathematics', 'Geometry']
    ,"Quantum Stochastic Calculus": ['Probability and Statistics', 'Geometry', 'Number Theory']
    ,"Sampling": ['Applied Mathematics', 'Discrete Mathematics']
    ,"Seed": ['Probability and Statistics', 'Calculus and Analysis']
    ,"2x mod 1 Map": ['Calculus and Analysis', 'Discrete Mathematics']
    ,"Anosov Flow": ['Calculus and Analysis', 'Topology']
    ,"Axiom A Flow": ['Calculus and Analysis', 'Topology']
    ,"Cake Cutting": ['Geometry', 'Discrete Mathematics']
    ,"Dye's Theorem": ['Applied Mathematics', 'Geometry']
    ,"Ergodic Measure": ['Applied Mathematics', 'Geometry']
    ,"Geodesic Flow": ['Calculus and Analysis', 'Topology']
    ,"KMS Condition": ['Applied Mathematics', 'Geometry', 'Calculus and Analysis']
    ,"Kronecker-Weyl Theorem": ['Applied Mathematics', 'Geometry']
    ,"Kubo-Martin-Schwinger Condition": ['Geometry', 'Calculus and Analysis', 'Number Theory']
    ,"Lévy Process": ['Geometry', 'Number Theory']
    ,"Number Partitioning Problem": ['Applied Mathematics', 'Geometry']
    ,"Ornstein's Theorem": ['Geometry', 'Number Theory']
    ,"Pesin Theory": ['Geometry', 'Number Theory']
    ,"Phase Space": ['Applied Mathematics', 'Topology']
    ,"Poisson Process": ['Geometry', 'Number Theory']
    ,"Smale Horseshoe Map": ['Applied Mathematics', 'Calculus and Analysis']
    ,"Connex": ['Geometry', 'Calculus and Analysis']
    ,"Dehn Invariant": ['Geometry', 'Recreational Mathematics']
    ,"Integer Triangle": ['Geometry', 'Recreational Mathematics']
    ,"Kobon Triangle": ['Geometry', 'Recreational Mathematics']
    ,"Pizza Theorem": ['Geometry', 'Recreational Mathematics']
    ,"Subset Sum Problem": ['Geometry', 'Discrete Mathematics']
    ,"Banach Space": ['Recreational Mathematics', 'Topology']
    ,"Hilbert Space": ['Recreational Mathematics', 'Topology']
    ,"Perrin Sequence": ['Recreational Mathematics', 'Discrete Mathematics']
    ,"Pi": ['Recreational Mathematics', 'Discrete Mathematics']
}

# Remove each term from every topic listed for it. list.remove raises ValueError
# when the term is not in the topic's list, so a stale table entry (or running
# this cell a second time on the same all_topics) stops here with an error.
for key, value_list in to_remove.items():
  for val in value_list:
    all_topics[val].remove(key)

# Rebuild the reverse index (term -> topics whose list contains it) after the removals above
my_dict = dict()
for key, value_list in all_topics.items():
  for value in value_list:
    my_dict.setdefault(value, []).append(key)

# Final 1:1 check: print every term that is still filed under more than one topic.
# The index built above must not be reset before this loop, otherwise the loop
# has nothing to iterate over and the check can never report anything.
for key, value_list in my_dict.items():
  if len(value_list) != 1:
    print(f"{key}: {value_list}")

In [20]:
# Final Processing of NER Tags
def capitalise_key(key):
  """Return `key` in sentence case: first word capitalised, remaining words lower-cased.

  Words are split on whitespace and re-joined with single spaces, so runs of
  whitespace collapse. An empty or whitespace-only `key` yields an empty
  string instead of raising IndexError.
  """
  words = key.split()
  if not words:
    return ""
  first = words[0].capitalize()
  rest = ' '.join(words[1:]).lower()
  # Single-word keys have no "rest"; avoid a trailing space
  return f"{first} {rest}" if rest else first

def apostrophe_s_replacement(match):
  """Regex substitution callback that normalises the case of a possessive suffix.

  `match.group(1)` is the word before the apostrophe. A fully upper-case word
  keeps an upper-case suffix ("'S"); any other word gets "'s". This repairs
  results such as "segner'S recurrence formula" produced by case changes.
  """
  stem = match.group(1)
  suffix = "'S" if stem.isupper() else "'s"
  return stem + suffix
In [21]:
# Regex pattern to match words with special characters: it matches any single
# character that is not a word character, whitespace, an apostrophe or a hyphen.
special_char_pattern = re.compile(r"[^\w\s'-]")

In [22]:
def creates_bioes_labels(ner_mapper_list, key):
  """Generate one BIOES-style label per item of `ner_mapper_list`.

  Only the number of items matters; each label is suffixed with `key`.

  Returns:
    []                      for an empty list (one label per item, so no labels),
    ['S-key']               for a single item,
    ['B-key', 'I-key'..., 'E-key'] otherwise (one 'I-' per middle item).
  """
  n = len(ner_mapper_list)
  if n == 0:
    # Without this guard the general branch would emit 'B-'/'E-' for zero items
    return []
  if n == 1:
    return [f'S-{key}']
  return [f'B-{key}'] + [f'I-{key}'] * (n - 2) + [f'E-{key}']

In [23]:
def generate_ner_tags(classification_dict):
  """Build tokenizer-aligned BIOES tags for the case variants of every term.

  Args:
    classification_dict: mapping of topic name -> list of term strings.

  Returns:
    dict keyed by the cleaned, case-transformed term. Each value holds
      'text_index': word position -> word (from idx2string),
      'tokens' / 'input_ids': output of `tag_tokenizer` for the term,
      'ner_mapper': input id of the first token of every aligned word,
      'ner_tags': one BIOES label per 'ner_mapper' entry, suffixed with the topic name.
    A term that appears again later (another topic or an identical variant)
    overwrites the earlier entry.
  """
  # Case variants generated for every term, in this order
  transformations = {
        "lower": str.lower,
        "upper": str.upper,
        "title": str.title,
        "capitalise": capitalise_key,
  }
  # Word followed by 's / 'S; the suffix case is repaired after each case transform
  possessive_pattern = re.compile(r"(\b[a-zA-Z]+)'[sS]")
  # Upper-case variants that are skipped outright.
  # NOTE(review): presumably the tokenizer cannot align these - confirm.
  skipped_variants = {
      "SIERPIŃSKI SIEVE", "ERDŐS-ANNING THEOREM", "BÔCHER'S THEOREM", "DOBIŃSKI'S FORMULA",
      "MIKUSIŃSKI'S PROBLEM", "ERDŐS-IVIĆ CONJECTURE", "ERDŐS-MOSER EQUATION",
      "ERDŐS-STRAUS CONJECTURE", "ERDŐS-SZEKERES THEOREM", "ERDŐS-TURÁN CONJECTURE",
      "ERDŐS-TURÁN DISCREPANCY BOUND",
  }

  my_dict = dict()
  for topic, terms in classification_dict.items():
    for term in terms:
      # Terms containing anything but word characters, whitespace, ' or - are not tagged
      if special_char_pattern.search(term):
        continue

      # dict.fromkeys drops identical variants (keeping order) so each string is tokenized once
      variants = dict.fromkeys(
          possessive_pattern.sub(apostrophe_s_replacement, transform_fn(term))
          for transform_fn in transformations.values()
      )

      for variant in variants:
        if variant in skipped_variants:
          continue
        variant = variant.replace("\n", " ") #Removing new line in key
        variant = variant.replace("--", "-") #Removing double-dashes
        variant = variant.replace("_", " ") #Removing underscores

        text_index = idx2string(variant)
        # One tokenizer call provides both the token strings and their ids
        encoding = tag_tokenizer(variant)
        token_list = encoding.tokens()
        input_id_list = encoding["input_ids"]
        token_id_index = idx2token_and_id(text_index, token_list, input_id_list)

        # Input id of the first token of every word that could be aligned
        ner_mapper = [first_id for _first_token, first_id in token_id_index.values()]
        my_dict[variant] = {
          "text_index": text_index,
          "tokens" : token_list,
          "input_ids" : input_id_list,
          "ner_mapper": ner_mapper,
          "ner_tags": creates_bioes_labels(ner_mapper, topic),
          }

        # Every word must have found its first token; otherwise the labels are misaligned
        if len(ner_mapper) != len(text_index):
          print(f"Error: {variant}")
  return my_dict

In [24]:
# Final NER Tag Generation: build the per-term tokenization and BIOES tags for
# every topic in all_topics (intermediate result, expanded to per-token tags later)
_all_classifications = generate_ner_tags(all_topics)

In [25]:
# Number of generated entries, then three randomly chosen entries as a preview.
# `random_sample` now holds keys of _all_classifications and is reused by a later preview cell.
print(f"topic_dict_ner dictionary length: {len(_all_classifications)}")
random_sample = random.sample(list(_all_classifications.keys()), 3)

for key in random_sample:
  print(f"{key}:\n{_all_classifications[key]}")

topic_dict_ner dictionary length: 21506
Bivariate polynomial:
{'text_index': {0: 'Bivariate', 1: 'polynomial'}, 'tokens': ['[CLS]', 'B', '##iva', '##ria', '##te', 'polynomial', '[SEP]'], 'input_ids': [101, 139, 12416, 3464, 1566, 19068, 102], 'ner_mapper': [139, 19068], 'ner_tags': ['B-Calculus and Analysis', 'E-Calculus and Analysis']}
CUMULATIVE SUM:
{'text_index': {0: 'CUMULATIVE', 1: 'SUM'}, 'tokens': ['[CLS]', 'C', '##UM', '##U', '##LA', '##TI', '##VE', 'S', '##UM', '[SEP]'], 'input_ids': [101, 140, 25810, 2591, 10783, 21669, 17145, 156, 25810, 102], 'ner_mapper': [140, 156], 'ner_tags': ['B-Algebra', 'E-Algebra']}
field characteristic exponent:
{'text_index': {0: 'field', 1: 'characteristic', 2: 'exponent'}, 'tokens': ['[CLS]', 'field', 'characteristic', 'ex', '##po', '##nent', '[SEP]'], 'input_ids': [101, 1768, 7987, 4252, 5674, 21222, 102], 'ner_mapper': [1768, 7987, 4252], 'ner_tags': ['B-Algebra', 'I-Algebra', 'E-Algebra']}


In [26]:
# Final cleaning: expand the per-word BIOES tags to one tag per input id.
# The token whose id matches the next expected entry of 'ner_mapper' receives
# that word's tag; every other token (special tokens, word-piece continuations)
# receives 'IGN'.
all_classifications = dict()

for key, sub_dict in _all_classifications.items():
  ner_mapper = sub_dict['ner_mapper']
  ner_tags = sub_dict['ner_tags']
  # An entry without any aligned word cannot be labelled; fail naming the entry
  # rather than with a bare IndexError.
  assert ner_mapper, f"Error: {key}"

  full_ner_tags = []
  next_word = 0  # position in ner_mapper / ner_tags of the next word to place
  for input_id in sub_dict['input_ids']:
    if next_word < len(ner_mapper) and input_id == ner_mapper[next_word]:
      full_ner_tags.append(ner_tags[next_word])
      next_word += 1
    else:
      full_ner_tags.append('IGN')

  assert len(full_ner_tags) == len(sub_dict['input_ids']), f"Error: {key}"

  all_classifications[key] = {
        'text_index': sub_dict['text_index'],
        'tokens': sub_dict['tokens'],
        'input_ids': sub_dict['input_ids'],
        'ner_mapper': sub_dict['ner_mapper'],
        'ner_tags': full_ner_tags,
    }

In [27]:
# Size of the final dictionary, then the same sampled keys as in the previous
# preview, now showing one tag per token
print(f"topic_dict_ner dictionary length: {len(all_classifications)}")

for key in random_sample:
  print(f"{key}:\n{all_classifications[key]}")

topic_dict_ner dictionary length: 21506
Bivariate polynomial:
{'text_index': {0: 'Bivariate', 1: 'polynomial'}, 'tokens': ['[CLS]', 'B', '##iva', '##ria', '##te', 'polynomial', '[SEP]'], 'input_ids': [101, 139, 12416, 3464, 1566, 19068, 102], 'ner_mapper': [139, 19068], 'ner_tags': ['IGN', 'B-Calculus and Analysis', 'IGN', 'IGN', 'IGN', 'E-Calculus and Analysis', 'IGN']}
CUMULATIVE SUM:
{'text_index': {0: 'CUMULATIVE', 1: 'SUM'}, 'tokens': ['[CLS]', 'C', '##UM', '##U', '##LA', '##TI', '##VE', 'S', '##UM', '[SEP]'], 'input_ids': [101, 140, 25810, 2591, 10783, 21669, 17145, 156, 25810, 102], 'ner_mapper': [140, 156], 'ner_tags': ['IGN', 'B-Algebra', 'IGN', 'IGN', 'IGN', 'IGN', 'IGN', 'E-Algebra', 'IGN', 'IGN']}
field characteristic exponent:
{'text_index': {0: 'field', 1: 'characteristic', 2: 'exponent'}, 'tokens': ['[CLS]', 'field', 'characteristic', 'ex', '##po', '##nent', '[SEP]'], 'input_ids': [101, 1768, 7987, 4252, 5674, 21222, 102], 'ner_mapper': [1768, 7987, 4252], 'ner_tags': ['

In [28]:
# Final results saved as a pickle file of classified topics and NER tags
# (relative file name, so it is written to the current working directory)
file_name = 'all_classifications.pkl'

with open(file_name, 'wb') as file:
  pickle.dump(all_classifications, file)