<a href="https://colab.research.google.com/github/HeatherDriver/MathGraph/blob/main/03_NER_Case_Tag_Dictionary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
! pip install intervaltree



In [49]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from google.colab import drive, userdata
import pickle
import random
import re
from collections import defaultdict
from intervaltree import Interval, IntervalTree
from collections import Counter
import json

In [50]:
# Mount Google Drive, then make the pickle folder the working directory so the
# relative file names used by read_pickle(...) and the final pickle.dump resolve there.
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [51]:
# Define nested_dict function
def nested_dict():
  """Return a defaultdict whose missing keys are filled by calling nested_dict again.

  Looking up an unknown key therefore creates another nested defaultdict,
  to any depth.
  """
  factory = nested_dict
  return defaultdict(factory)

# Define file read function
def read_pickle(dict_file):
  """Open `dict_file` in binary mode and return the object unpickled from it.

  NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
  """
  with open(dict_file, mode='rb') as handle:
    payload = pickle.load(handle)
  return payload

# Split CamelCase into space-separated words (e.g. "CamelCase" -> "Camel Case")
def split_camel_case(text):
    """Return `text` with a space inserted at every lowercase-to-uppercase boundary.

    Only ASCII a-z followed by A-Z counts as a boundary, so runs of capitals
    (e.g. "ABTest") are left untouched.
    """
    # Zero-width match: position preceded by [a-z] and followed by [A-Z]
    boundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
    return boundary.sub(' ', text)

In [52]:
# Existing tags from DistilBERT: load the pretrained NER checkpoint's tokenizer and model
tag_checkpoint = "dslim/distilbert-NER"
# do_lower_case=False is passed through to the tokenizer — presumably to keep the input's casing; TODO confirm
tag_tokenizer = AutoTokenizer.from_pretrained(tag_checkpoint, do_lower_case=False)
tag_model = AutoModelForTokenClassification.from_pretrained(tag_checkpoint)

# Token-classification ("ner") pipeline built from the model and tokenizer loaded above
tags = pipeline("ner", model=tag_model, tokenizer=tag_tokenizer)

Device set to use cpu


In [53]:
# Display whether the loaded tokenizer reports itself as "fast".
# NOTE(review): generate_ner_tags calls .tokens() on the tokenizer output — presumably that needs a fast tokenizer; confirm.
tag_tokenizer.is_fast

True

In [54]:
# Methods built from DistilBERT NER

def idx2string(text):
    """Split `text` into words and punctuation marks and number them from 0.

    A piece is either a run of word characters or a single character that is
    neither whitespace nor a word character. Returns {position: piece}.
    """
    pieces = re.findall(r'\b\w+\b|[^\s\w]', text)
    return dict(enumerate(pieces))

def tokenidx2words(ner_results):
  """Map each result's 'index' to its 'word'; a later duplicate index overwrites an earlier one."""
  lookup = {}
  for hit in ner_results:
    token_text = hit['word']
    lookup[hit['index']] = token_text
  return lookup

def tokenidx2entity(ner_results):
  """Map each result's 'index' to its 'entity'; a later duplicate index overwrites an earlier one."""
  lookup = {}
  for hit in ner_results:
    label = hit['entity']
    lookup[hit['index']] = label
  return lookup

def idx2wordpos(text, idx2string):
  """Locate every word of `idx2string` inside `text`, scanning left to right.

  `idx2string` maps a word position to the word itself. Each word is searched
  for starting where the previous one ended, so repeated words resolve to
  successive occurrences. Returns {position: (start, end)} with `end`
  exclusive. str.index raises ValueError when a word cannot be found.
  """
  spans = {}
  cursor = 0
  for position, token in idx2string.items():
    begin = text.index(token, cursor)
    cursor = begin + len(token)
    spans[position] = (begin, cursor)
  return spans

def tokenidx2entityword(example2wordidx, ner_results):
  """Attach each NER result to a word whose character span it overlaps.

  `example2wordidx` maps a word index to a (start, end) span; each entry of
  `ner_results` is read for its 'start', 'end', 'entity' and 'index' keys.
  Returns {token index: {'entity': ..., 'word_idx': ...}}. Results that
  overlap no word span get no entry.
  """
  # Create an IntervalTree with one interval per word, carrying the word index as its data
  tree = IntervalTree(Interval(start, end, key) for key, (start, end) in example2wordidx.items())

  my_dict = dict()
  for sub_dict in ner_results:
    tag_start, tag_end = sub_dict['start'], sub_dict['end']
    tag_entity = sub_dict['entity']
    token_idx = sub_dict['index']

    overlapping_intervals = tree[tag_start : tag_end] # Word intervals that overlap the recognised entity span
    # If several intervals overlap, each one overwrites the entry, so the last one iterated wins
    for interval in overlapping_intervals:
      my_dict[token_idx] = {'entity' : tag_entity, 'word_idx': interval.data}

  return my_dict

In [55]:
# Get naming convention from scraped data: load the topic -> entries mapping
topics = read_pickle("topics.pkl")

print(f"topic_dict dictionary length: {len(topics)}")
# Unseeded sample, so the three topics shown differ between runs.
# The next cells reuse random_sample's keys to look up sub_topics and the merged topics.
random_sample = random.sample(list(topics.items()), 3)

for key, value in random_sample:
  print(f"{key}:\n{value}")

topic_dict dictionary length: 10
Foundations of Mathematics:
['A New Kind of Science', 'Axioms', 'Category Theory', 'Logic', 'Mathematical Problems', 'Point-Set Topology', 'Set Theory', 'Theorem Proving']
Number Theory:
['Algebraic Number Theory', 'Arithmetic', 'Automorphic Forms', 'Binary Sequences', 'Class Numbers', 'Congruences', 'Constants', 'Continued Fractions', 'Diophantine Equations', 'Divisors', 'Elliptic Curves', 'Ergodic Theory', 'General Number Theory', 'Generating Functions', 'Integer Relations', 'Integers', 'Irrational Numbers', 'Normal Numbers', 'Numbers', 'Number Theoretic Functions', 'Parity', 'Prime Numbers', 'p-adic Numbers', 'Rational Approximation', 'Rational Numbers', 'Real Numbers', 'Reciprocity Theorems', 'Rounding', 'Sequences', 'Special Numbers', 'Transcendental Numbers']
Calculus and Analysis:
['Calculus', 'Calculus of Variations', 'Catastrophe Theory', 'Complex Analysis', 'Differential Equations', 'Differential Forms', 'Differential Geometry', 'Dynamical Sys

In [56]:
# Load the scraped sub-topics and rewrite their CamelCase keys as spaced words
sub_topics = read_pickle("sub_topics.pkl")

# my_dict stays bound to the re-keyed mapping as well; if two keys collapse
# to the same spaced form, the later one wins
my_dict = {split_camel_case(key): value for key, value in sub_topics.items()}
sub_topics = my_dict

print(f"sub_topics dictionary length: {len(sub_topics)}")

# Show the sub-topic lists for the same topics sampled in the previous cell
for key, value in random_sample:
  print(f"{key}:\n{sub_topics[key]}")

sub_topics dictionary length: 10
Foundations of Mathematics:
['Ackermann Function', 'Additive Category', 'Additive Cellular Automaton', 'AND', 'Apollonian Gasket', 'Associative Algebra', 'Binary', 'Boolean Function', 'Bootstrap Percolation', 'Busy Beaver', 'Causal Graph', 'Causal Invariance', 'Causal Network', 'Cayley Graph', 'Cellular Automaton', 'Champernowne Constant', 'Church-Rosser Property', 'Church-Turing Thesis', 'Circle Packing', 'Closed-Form Solution', 'Code', 'Code 177', 'Code 912', 'Code 2040', 'Collatz Problem', 'Combinator', 'Commutative Monoid', 'Complex Number', 'Complexity', 'Computable Number', 'Computational Irreducibility', 'Computational Paradigm', 'Computational Reducibility', 'Confluent', 'Consecutive Number Sequences', "Conway's Constant", 'Conway Sequence', 'Cosmological Theorem', 'Critical Pair', 'Cubic Symmetric Graph', 'Cyclic Tag System', 'Cyclotomic Polynomial', 'Decision Problem', 'Digit Count', 'Elementary Cellular Automaton', 'Encoding', 'Equational Log

In [57]:
# Merge Topics and Subtopics into one dictionary
for key, value in topics.items():
  # Topics without a (non-None) sub-topic list are left as they are
  if sub_topics.get(key) is None:
    continue
  # In-place extend, so any other reference to this list also sees the additions
  value.extend(sub_topics[key])
  # De-duplicate through a set, then store the entries back in sorted order
  topics[key] = sorted(set(value))

In [58]:
# Report how many topics exist after the merge
print(f"topics dictionary length: {len(topics)}")

# Show the merged entry lists for the topics sampled earlier (only the keys of random_sample are used)
for key, value in random_sample:
  print(f"{key}:\n{topics[key]}")

topics dictionary length: 10
Foundations of Mathematics:
['0-Connected', '1-Connected', 'A New Kind of Science', 'AND', 'Abelian Category', 'Absorption Identities', 'Absorption Identity', 'Absorption Law', 'Ackermann Function', 'Additive Category', 'Additive Cellular Automaton', 'Additive Functor', 'Affine Hull', 'Algebra of Random Variables', 'Algebraic Geometry Stack', 'Allegory', 'Almost Everywhere Convergence', 'Apollonian Gasket', 'Arc Component', 'Arcwise-Connected', 'Associative Algebra', 'Axiom', 'Axiom Schema', 'Axiom of Choice', 'Axiom of Extensionality', 'Axiom of Foundation', 'Axiom of Infinity', 'Axiom of Replacement', 'Axiom of Subsets', 'Axiom of the Empty Set', 'Axiom of the Power Set', 'Axiom of the Sum Set', 'Axiom of the Unordered Pair', 'Axiomatic Set Theory', 'Axiomatic System', 'Axioms', 'Axioms of Subsets', 'Basepoint', 'Basis Element', 'Bicollared', 'Binary', 'Boolean Function', 'Bootstrap Percolation', 'Borel Hierarchy', 'Borel Measure', 'Borel Set', 'Borel Sig

In [59]:
alg_2 = read_pickle("alg_2.pkl")
alg_3 = read_pickle("alg_3.pkl")

# Merge alg_2 and alg_3 into one dictionary; on a shared key the alg_3 value wins
alg_dict = dict()
alg_dict.update(alg_2.items())
alg_dict.update(alg_3.items())

print(f"alg_dict dictionary length: {len(alg_dict)}")
random_sample = random.sample(list(alg_dict.items()), k=3)

# Print the three sampled categories with their entries
for key, value in random_sample:
  print(f"{key}:\n{value}")

alg_dict dictionary length: 18
Determinants:
['Casoratian', "Cauchy's Determinant Theorem", 'Cayley-Menger Determinant', 'Chió Pivotal Condensation', 'Circulant Determinant', 'Cofactor', 'Condensation', 'Determinant', 'Determinant Expansion by Minors', 'Determinant Identities', 'Determinant Theorem', 'Gram Determinant', "Gram's Inequality", "Hadamard's Maximum Determinant Problem", "Hadamard's Theorem", 'Hafner-Sarnak-McCurley Constant', 'Hessian', 'Hill Determinant', 'Hyperdeterminant', 'Inversion Number', "Jacobi's Determinant Identity", "Jacobi's Theorem", 'Jacobian', 'Mills-Robbins-Rumsey Determinant Formula', 'Minor', 'PfaffianPivotal Condensation', "Schweins's Theorem", 'Stäckel Determinant', "Sylvester's Determinant Identity", 'Vandermonde Determinant']
Matrix Operations:
['Antihermitian Part', 'Antisymmetric Part', 'Conjugate Transpose', 'Conjugate Transpose Matrix', 'Echelon Form', 'Elementary Row and Column Operations', 'Fundamental Matrix Subspaces', 'Fundamental Theorem of 

In [60]:
# Combine both sources; on a shared key the value from topics replaces the alg_dict one
all_topics = dict()
all_topics.update(alg_dict.items())
all_topics.update(topics.items())

# Rename the 'General Linear Algebra' category to 'Linear Algebra'
all_topics['Linear Algebra'] = all_topics.pop('General Linear Algebra')

print(f"all_topics dictionary length: {len(all_topics)}")
random_sample = random.sample(list(all_topics.items()), k=3)

# Print the three sampled categories with their entries
for key, value in random_sample:
  print(f"{key}:\n{value}")

all_topics dictionary length: 28
Determinants:
['Casoratian', "Cauchy's Determinant Theorem", 'Cayley-Menger Determinant', 'Chió Pivotal Condensation', 'Circulant Determinant', 'Cofactor', 'Condensation', 'Determinant', 'Determinant Expansion by Minors', 'Determinant Identities', 'Determinant Theorem', 'Gram Determinant', "Gram's Inequality", "Hadamard's Maximum Determinant Problem", "Hadamard's Theorem", 'Hafner-Sarnak-McCurley Constant', 'Hessian', 'Hill Determinant', 'Hyperdeterminant', 'Inversion Number', "Jacobi's Determinant Identity", "Jacobi's Theorem", 'Jacobian', 'Mills-Robbins-Rumsey Determinant Formula', 'Minor', 'PfaffianPivotal Condensation', "Schweins's Theorem", 'Stäckel Determinant', "Sylvester's Determinant Identity", 'Vandermonde Determinant']
Matrix Properties:
['Augmented Matrix', 'Bandwidth', 'Bourque-Ligh Conjecture', 'Cayley-Hamilton Theorem', 'Characteristic Equation', 'Combinatorial Matrix Theory', 'Condition Number', 'Diagonal', 'Diagonalizable Matrix', "Fred

In [61]:
# Check for a 1:1 mapping: invert all_topics into entry -> [categories that list it]
my_dict = dict()
for key, value_list in all_topics.items():
  for value in value_list:
    my_dict.setdefault(value, []).append(key)

# Entries listed in exactly two categories, one of them 'Recreational Mathematics',
# are mapped to the other category
map_to = dict()
for key, value_list in my_dict.items():
  if len(value_list) != 2 or 'Recreational Mathematics' not in value_list:
    continue
  value_list.remove('Recreational Mathematics')
  map_to[key] = value_list[0]

# Collected up front so my_dict is not resized while it is being iterated;
# map_to was filled while walking my_dict, so its keys are already in my_dict order
keys_to_delete = list(map_to)

# Drop those entries from the index and from the 'Recreational Mathematics' list
for key in keys_to_delete:
  del my_dict[key]
  all_topics['Recreational Mathematics'].remove(key)

In [62]:
# Choose linear algebra topic classification over other types:
# an entry listed under 'Determinants' and exactly one other category is kept
# under 'Determinants' and removed from the other category.
#
# all_topics_keys_to_remove maps that other category to a LIST of entries.
# Storing a single entry per category would let two duplicated entries that
# share a category overwrite each other, leaving all but the last one in place.
all_topics_keys_to_remove = dict()
keys_to_delete = list()

for key, value_list in my_dict.items():
  if len(value_list) == 2 and 'Determinants' in value_list:
    value_list.remove('Determinants')
    all_topics_keys_to_remove.setdefault(value_list[0], []).append(key)
    keys_to_delete.append(key)

# Delete the resolved entries from the index (done after the loop so the
# dictionary is not resized while it is being iterated)
for key in keys_to_delete:
  del my_dict[key]

# Delete every collected entry from its non-'Determinants' category in all_topics
for key, value_list in all_topics_keys_to_remove.items():
  for value in value_list:
    all_topics[key].remove(value)

In [63]:
# Rebuild the inverted index (entry -> [categories that list it]) after the removals above
my_dict = dict()
for key, value_list in all_topics.items():
  for value in value_list:
    my_dict.setdefault(value, []).append(key)

In [64]:
# Hand-picked removals: entry name -> categories the entry is deleted from
to_remove = dict([
    ("Fundamental Matrix Subspaces", ['Matrix Decomposition', 'Matrix Operations']),
    ("Matrix Diagonalization", ['Matrix Decomposition', 'Matrix Eigenvalues']),
    ("Stochastic Matrix", ['Algebra', 'Probability and Statistics']),
    ("Hadamard Matrix", ['Algebra', 'Foundations of Mathematics', 'Recreational Mathematics', 'Discrete Mathematics']),
    ("Polynomial Matrix", ['Algebra', 'Calculus and Analysis']),
    ("Sylvester Matrix", ['Algebra', 'Calculus and Analysis']),
    ("Matrix Polynomial", ['Algebra', 'Calculus and Analysis']),
])

# list.remove raises ValueError if the entry is not (or no longer) in that category
for key, value_list in to_remove.items():
  for val in value_list:
    all_topics[val].remove(key)

# Rebuild the inverted index (entry -> [categories that list it])
my_dict = dict()
for key, value_list in all_topics.items():
  for value in value_list:
    my_dict.setdefault(value, []).append(key)

In [65]:
# Hand-picked removals: each key is an entry name, each value lists the
# categories (keys of all_topics) from which that entry is deleted below.
to_remove = {
    "Lie Algebra": ['Lie Theory', 'Lie Algebra']
    ,"Fredholm's Theorem": ['Linear Independence', 'Matrix Properties']
    ,"Jacobi Identities": ['Lie Algebra', 'Lie Groups']
    ,"Flag Manifold": ['Lie Groups', 'Calculus and Analysis']
    ,"Lie-Type Group": ['Lie Groups', 'Discrete Mathematics']
    ,"Special Linear Group": ['Lie Groups', 'Discrete Mathematics']
    ,"Fundamental Theorem of Linear Algebra": ['Matrix Decomposition', 'Matrix Operations']
    ,"Schur Decomposition": ['Matrix Eigenvalues', 'Applied Mathematics']
    ,"Conjugate Transpose": ['Matrix Types', 'Calculus and Analysis']
    ,"Analytic Torsion": ['Calculus and Analysis', 'Topology']
    ,"Byzantine Generals Problem": ['Algebra', 'Applied Mathematics']
    ,"Code": ['Algebra', 'Foundations of Mathematics']
    ,"Commutative Diagram": ['Algebra', 'Foundations of Mathematics']
    ,"Contractible": ['Foundations of Mathematics', 'Topology']
    ,"Cyclotomic Polynomial": ['Foundations of Mathematics', 'Discrete Mathematics']
    ,"Elliptic Curves": ['Algebra', 'Geometry']
    ,"Encoding": ['Algebra', 'Foundations of Mathematics']
    ,"Endomorphism": ['Algebra', 'Foundations of Mathematics', 'Applied Mathematics', 'Geometry']
    ,"Ergodic Theory": ['Algebra', 'Applied Mathematics', 'Geometry', 'Calculus and Analysis']
    ,"Gerbe": [ 'Foundations of Mathematics', 'Topology']
    ,"Gröbner Basis": ['Algebra', 'Discrete Mathematics']
    ,"Icosahedral Equation": ['Algebra', 'Calculus and Analysis']
    ,"Kac Formula": ['Algebra', 'Probability and Statistics']
    ,"Minimax Polynomial": ['Probability and Statistics', 'Calculus and Analysis']
    ,"Modularity Theorem": ['Algebra', 'Number Theory']
    ,"Noise": ['Algebra', 'Applied Mathematics']
    ,"Normal Polynomial": ['Calculus and Analysis', 'Number Theory']
    ,"Octahedral Equation": ['Geometry', 'Calculus and Analysis']
    ,"Polynomial Sequence": ['Algebra', 'Calculus and Analysis']
    ,"Power Polynomial": ['Calculus and Analysis', 'Discrete Mathematics']
    ,"Real Vector Space": ['Algebra', 'Topology']
    ,"Reidemeister Torsion": ['Calculus and Analysis', 'Topology']
    ,"Taniyama-Shimura Conjecture": ['Algebra', 'Topology']
    ,"Tetrahedral Equation": ['Algebra', 'Geometry']
    ,"Vector Field": ['Applied Mathematics', 'Topology']
    ,"Whitehead Torsion": ['Algebra', 'Calculus and Analysis']
    ,"Zero Map": ['Algebra', 'Foundations of Mathematics', 'Calculus and Analysis']
    ,"Borel Hierarchy": ['Foundations of Mathematics', 'Topology']
    ,"Busy Beaver": ['Foundations of Mathematics', 'Recreational Mathematics']
    ,"Circle Packing": ['Foundations of Mathematics', 'Recreational Mathematics', 'Discrete Mathematics']
    ,"Closed Map": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Collatz Problem": ['Foundations of Mathematics', 'Recreational Mathematics']
    ,"Complete Metric Space": ['Foundations of Mathematics', 'Topology']
    ,"Contingent Cone": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Dense": ['Foundations of Mathematics', 'Topology']
    ,"Domain": ['Foundations of Mathematics', 'Calculus and Analysis', 'Topology']
    ,"Endpoint": ['Foundations of Mathematics', 'Geometry']
    ,"F_sigma Set": ['Foundations of Mathematics', 'Topology']
    ,"First Category": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"G_delta Set": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Generalized Reeb Component": ['Foundations of Mathematics', 'Topology']
    ,"Helly's Theorem": ['Foundations of Mathematics', 'Geometry']
    ,"Hofstadter's Q-Sequence": ['Foundations of Mathematics', 'Discrete Mathematics']
    ,"Hofstadter-Conway $10,000 Sequence": ['Foundations of Mathematics', 'Discrete Mathematics']
    ,"Injection": ['Foundations of Mathematics', 'Topology']
    ,"Lebesgue Covering Dimension": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Logistic Map": ['Foundations of Mathematics', 'Applied Mathematics', 'Discrete Mathematics']
    ,"Logistic Map--r=2": ['Foundations of Mathematics', 'Applied Mathematics']
    ,"Logistic Map--r=4": ['Foundations of Mathematics', 'Applied Mathematics']
    ,"Metric Topology": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Metrizable Topology": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Net": ['Foundations of Mathematics', 'Probability and Statistics']
    ,"Nice Vector Field": ['Foundations of Mathematics', 'Topology']
    ,"Pseudoconcave Function": ['Foundations of Mathematics', 'Topology']
    ,"Pseudoconvex Function": ['Foundations of Mathematics', 'Topology']
    ,"Quasi-Concave Function": ['Foundations of Mathematics', 'Topology']
    ,"Quasi-Convex Function": ['Foundations of Mathematics', 'Topology']
    ,"Reeb Component": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Reebless": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Stochastic": ['Foundations of Mathematics', 'Applied Mathematics', 'Geometry', 'Number Theory']
    ,"Surjection": ['Foundations of Mathematics', 'Calculus and Analysis']
    ,"Sutured Manifold": ['Foundations of Mathematics', 'Topology']
    ,"Taut Foliation": ['Foundations of Mathematics', 'Topology']
    ,"Transversely Orientable Foliation": ['Foundations of Mathematics', 'Topology']
    ,"Uniform Convexity": ['Foundations of Mathematics', 'Topology']
    ,"Zero Set": ['Foundations of Mathematics', 'Topology']
    ,"Lévy Flight": ['Probability and Statistics', 'Applied Mathematics', 'Geometry']
    ,"Lévy Walk": ['Probability and Statistics', 'Applied Mathematics', 'Geometry']
    ,"Quantum Stochastic Calculus": ['Probability and Statistics', 'Geometry', 'Number Theory']
    ,"Sampling": ['Applied Mathematics', 'Discrete Mathematics']
    ,"Seed": ['Probability and Statistics', 'Calculus and Analysis']
    ,"2x mod 1 Map": ['Calculus and Analysis', 'Discrete Mathematics']
    ,"Anosov Flow": ['Calculus and Analysis', 'Topology']
    ,"Axiom A Flow": ['Calculus and Analysis', 'Topology']
    ,"Cake Cutting": ['Geometry', 'Discrete Mathematics']
    ,"Dye's Theorem": ['Applied Mathematics', 'Geometry']
    ,"Ergodic Measure": ['Applied Mathematics', 'Geometry']
    ,"Geodesic Flow": ['Calculus and Analysis', 'Topology']
    ,"KMS Condition": ['Applied Mathematics', 'Geometry', 'Calculus and Analysis']
    ,"Kronecker-Weyl Theorem": ['Applied Mathematics', 'Geometry']
    ,"Kubo-Martin-Schwinger Condition": ['Geometry', 'Calculus and Analysis', 'Number Theory']
    ,"Lévy Process": ['Geometry', 'Number Theory']
    ,"Number Partitioning Problem": ['Applied Mathematics', 'Geometry']
    ,"Ornstein's Theorem": ['Geometry', 'Number Theory']
    ,"Pesin Theory": ['Geometry', 'Number Theory']
    ,"Phase Space": ['Applied Mathematics', 'Topology']
    ,"Poisson Process": ['Geometry', 'Number Theory']
    ,"Smale Horseshoe Map": ['Applied Mathematics', 'Calculus and Analysis']
    ,"Connex": ['Geometry', 'Calculus and Analysis']
    ,"Dehn Invariant": ['Geometry', 'Recreational Mathematics']
    ,"Integer Triangle": ['Geometry', 'Recreational Mathematics']
    ,"Kobon Triangle": ['Geometry', 'Recreational Mathematics']
    ,"Pizza Theorem": ['Geometry', 'Recreational Mathematics']
    ,"Subset Sum Problem": ['Geometry', 'Discrete Mathematics']
    ,"Banach Space": ['Recreational Mathematics', 'Topology']
    ,"Hilbert Space": ['Recreational Mathematics', 'Topology']
    ,"Perrin Sequence": ['Recreational Mathematics', 'Discrete Mathematics']
    ,"Pi": ['Recreational Mathematics', 'Discrete Mathematics']
}

# Delete every listed entry from each of its listed categories.
# list.remove raises ValueError when the entry is not in that category's list,
# so this loop fails if it is run a second time against the same all_topics.
for key, value_list in to_remove.items():
  for val in value_list:
    all_topics[val].remove(key)

# Rebuild the inverted index: entry -> [categories that still list it]
my_dict = dict()
for key, value_list in all_topics.items():
  for value in value_list:
    my_dict.setdefault(value, []).append(key)

# Final 1:1 check: print every entry that is not listed in exactly one category.
# The index built above must NOT be reset before this loop; an empty index
# would make the check pass silently whatever all_topics contains.
for key, value_list in my_dict.items():
  if len(value_list) != 1:
    print(f"{key}: {value_list}")

In [66]:
def capitalise_key(key):
  """Return `key` in sentence case: first word capitalised, all other words lower-cased.

  Words are split on whitespace and re-joined with single spaces, so leading,
  trailing and repeated whitespace is collapsed. An empty or whitespace-only
  `key` returns '' (it used to raise IndexError).
  """
  words = key.split()
  if not words:
    return ''
  first, rest = words[0].capitalize(), words[1:]
  if not rest:
    return first
  return f"{first} {' '.join(rest).lower()}"

In [67]:
def creates_bioes_labels(ner_mapper_list, key):
  """Generate BIOES-style labels, one per element of `ner_mapper_list`.

  Only the length of `ner_mapper_list` is used. Returns:
    - []                                  for an empty input
    - ['S-<key>']                         for a single element
    - ['B-<key>', 'I-<key>'..., 'E-<key>'] for two or more elements
  """
  n = len(ner_mapper_list)
  if n == 0:
    # Nothing to label; the B/E pair below would invent two labels for zero tokens
    return []
  if n == 1:
    return [f'S-{key}']
  # Begin label, n - 2 inside labels, end label
  return [f'B-{key}'] + [f'I-{key}'] * (n - 2) + [f'E-{key}']

In [68]:
# def generate_ner_tags(classification_dict):
#   # Generates hierarchical tags based on the key
#   transformations = {
#         "lower": str.lower,
#         "upper": str.upper,
#         "title": str.title,
#         "capitalise": capitalise_key,
#   }
#   my_dict = dict()
#   for key, value_list in classification_dict.items():
#     key = key.replace(" ", "_").upper()
#     for value in value_list:
#       transformed_values = {}
#       for name, transform_fn in transformations.items():
#         transformed_values[name] = transform_fn(value)

#       for value in transformed_values.values():
#         value = value.replace("\n", " ") #Removing new line in key
#         value = value.replace("--", "-") #Removing double-dashes

#         text_index = idx2string(value)
#         my_dict[value] = {
#           "text_index": text_index,
#           "tokens" : tag_tokenizer(value).tokens(),
#           "input_ids" : tag_tokenizer.encode(value),
#           }
#         my_list = []
#         for k, v in my_dict[value]['text_index'].items():
#           for token, input_id in zip(my_dict[value]['tokens'], my_dict[value]['input_ids']):
#             if v.startswith(token):
#               my_list.append(input_id)
#           my_dict[value]['ner_mapper'] = my_list
#           my_dict[value]['ner_tags'] = creates_bioes_labels(my_list, key)
#   return my_dict

In [106]:
def generate_ner_tags(classification_dict):
  """Build tokenizer output and word-level BIOES tags for every entry and case variant.

  `classification_dict` maps a category name to a list of entry strings. The
  category name (spaces -> underscores, upper-cased) becomes the tag suffix.
  Each entry is rendered in four case variants (lower, upper, title,
  capitalise_key); newlines become spaces and "--" becomes "-".

  Returns {variant text: {'text_index', 'tokens', 'input_ids', 'ner_mapper',
  'ner_tags'}} where 'ner_mapper' holds, per word of the variant, the input id
  of the first token that is a prefix of that word, and 'ner_tags' holds one
  BIOES label per mapped word. A variant produced more than once (within or
  across categories) keeps the data of the last occurrence.
  """
  transformations = {
        "lower": str.lower,
        "upper": str.upper,
        "title": str.title,
        "capitalise": capitalise_key,
  }
  my_dict = dict()
  for key, value_list in classification_dict.items():
    label = key.replace(" ", "_").upper()
    for value in value_list:
      for transform_fn in transformations.values():
        variant = transform_fn(value)
        variant = variant.replace("\n", " ") # Removing new line in key
        variant = variant.replace("--", "-") # Removing double-dashes

        text_index = idx2string(variant)
        # Tokenize once and read both the token strings and the ids from the same encoding
        encoding = tag_tokenizer(variant)
        tokens = encoding.tokens()
        input_ids = encoding["input_ids"]

        ner_mapper = []
        cursor = 0  # tokens before this position already belong to earlier words
        for word in text_index.values():
          for position in range(cursor, len(tokens)):
            if word.startswith(tokens[position]):
              ner_mapper.append(input_ids[position])
              cursor = position + 1
              # One token per word: stop at the first match instead of
              # collecting every later token that is also a prefix of the word
              break

        # Assigned once per variant, so the keys exist even when no word was found
        my_dict[variant] = {
          "text_index": text_index,
          "tokens" : tokens,
          "input_ids" : input_ids,
          "ner_mapper": ner_mapper,
          "ner_tags": creates_bioes_labels(ner_mapper, label) if ner_mapper else [],
          }
  return my_dict

In [107]:
# Generate the NER tag data for every entry (and case variant) in all_topics.
# This is the intermediate result; a later cell expands it into per-token tags.
_all_classifications = generate_ner_tags(all_topics)

In [108]:
# Report the size of the intermediate result and print three randomly chosen entries
print(f"topic_dict_ner dictionary length: {len(_all_classifications)}")
# Unseeded sample; a later cell reuses these keys to show the final per-token tags
random_sample = random.sample(list(_all_classifications.items()), 3)

for key, value in random_sample:
  print(f"{key}:\n{value}")

topic_dict_ner dictionary length: 21696
Actuarial Polynomial:
{'text_index': {0: 'Actuarial', 1: 'Polynomial'}, 'tokens': ['[CLS]', 'Act', '##ua', '##rial', 'Pol', '##yn', '##omi', '##al', '[SEP]'], 'input_ids': [101, 2173, 6718, 13119, 17129, 5730, 18882, 1348, 102], 'ner_mapper': [2173, 17129], 'ner_tags': ['B-DISCRETE_MATHEMATICS', 'E-DISCRETE_MATHEMATICS']}
event space:
{'text_index': {0: 'event', 1: 'space'}, 'tokens': ['[CLS]', 'event', 'space', '[SEP]'], 'input_ids': [101, 1856, 2000, 102], 'ner_mapper': [1856, 2000], 'ner_tags': ['B-PROBABILITY_AND_STATISTICS', 'E-PROBABILITY_AND_STATISTICS']}
Probability Axioms:
{'text_index': {0: 'Probability', 1: 'Axioms'}, 'tokens': ['[CLS]', 'Pro', '##ba', '##bility', 'A', '##xi', '##oms', '[SEP]'], 'input_ids': [101, 5096, 2822, 5474, 138, 8745, 17112, 102], 'ner_mapper': [5096, 138], 'ner_tags': ['B-PROBABILITY_AND_STATISTICS', 'E-PROBABILITY_AND_STATISTICS']}


In [114]:
# Expand the word-level tags into one tag per token: 'O' everywhere except at
# the positions of the mapped token ids, with '_' in tag names replaced by '-'.
all_classifications = dict()

for key, sub_dict in _all_classifications.items():
  # Entries without any (id, tag) pair carry no labels and are left out, as before
  if not sub_dict['ner_mapper'] or not sub_dict['ner_tags']:
    continue

  input_ids = sub_dict['input_ids']
  ner = ['O'] * len(input_ids)

  search_from = 0
  for match_input_id, ner_tag in zip(sub_dict['ner_mapper'], sub_dict['ner_tags']):
    # Search only to the right of the previous match. list.index without a
    # start always returns the FIRST occurrence, so words sharing a token id
    # would all overwrite one position and leave the later ones as 'O'.
    try:
      idx = input_ids.index(match_input_id, search_from)
    except ValueError:
      # No further occurrence of this id: there is no token left to label
      continue
    ner[idx] = ner_tag.replace('_', '-')
    search_from = idx + 1

  all_classifications[key] = {
      'text_index': sub_dict['text_index'],
      'tokens': sub_dict['tokens'],
      'input_ids': input_ids,
      'ner_mapper': sub_dict['ner_mapper'],
      'ner_tags': ner,
  }

In [115]:
# Report the size of the final result
print(f"topic_dict_ner dictionary length: {len(all_classifications)}")

# Show the per-token tags for the entries sampled earlier (only the keys of random_sample are used)
for key, value in random_sample:
  print(f"{key}:\n{all_classifications[key]}")

topic_dict_ner dictionary length: 21696
Actuarial Polynomial:
{'text_index': {0: 'Actuarial', 1: 'Polynomial'}, 'tokens': ['[CLS]', 'Act', '##ua', '##rial', 'Pol', '##yn', '##omi', '##al', '[SEP]'], 'input_ids': [101, 2173, 6718, 13119, 17129, 5730, 18882, 1348, 102], 'ner_mapper': [2173, 17129], 'ner_tags': ['O', 'B-DISCRETE-MATHEMATICS', 'O', 'O', 'E-DISCRETE-MATHEMATICS', 'O', 'O', 'O', 'O']}
event space:
{'text_index': {0: 'event', 1: 'space'}, 'tokens': ['[CLS]', 'event', 'space', '[SEP]'], 'input_ids': [101, 1856, 2000, 102], 'ner_mapper': [1856, 2000], 'ner_tags': ['O', 'B-PROBABILITY-AND-STATISTICS', 'E-PROBABILITY-AND-STATISTICS', 'O']}
Probability Axioms:
{'text_index': {0: 'Probability', 1: 'Axioms'}, 'tokens': ['[CLS]', 'Pro', '##ba', '##bility', 'A', '##xi', '##oms', '[SEP]'], 'input_ids': [101, 5096, 2822, 5474, 138, 8745, 17112, 102], 'ner_mapper': [5096, 138], 'ner_tags': ['O', 'B-PROBABILITY-AND-STATISTICS', 'O', 'O', 'E-PROBABILITY-AND-STATISTICS', 'O', 'O', 'O']}


In [116]:
# Output file name; relative, so it is written to the current working directory
file_name = 'all_classifications.pkl'

# Serialise the final per-token tag dictionary with pickle (binary mode)
with open(file_name, 'wb') as file:
  pickle.dump(all_classifications, file)