In [None]:
# Setup
! pip install fuzzywuzzy
! pip install intervaltree

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting intervaltree
  Downloading intervaltree-3.1.0.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sortedcontainers<3.0,>=2.0 (from intervaltree)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Building wheels for collected packages: intervaltree
  Building wheel for intervaltree (setup.py) ... [?25l[?25hdone
  Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26097 sha256=72a9c584b02358b7b2fc60bc5e3504129ecc6841a0676ed867abd7bec5167880
  Stored in directory: /root/.cache/pip/wheels/31/d7/d9/eec6891f78cac19a693bd40ecb8365d2f4613318c145ec9816
Successfully built intervaltree
Installing collecte

In [None]:
# Library imports
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
from google.colab import drive, userdata
import pickle
import random
import re
import time
from collections import defaultdict
from fuzzywuzzy import fuzz
from intervaltree import Interval, IntervalTree
from collections import Counter
import json
from openai import OpenAI
import pandas as pd
from tqdm import tqdm



In [None]:
# Mount drive
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [None]:
# See all rows, columns in pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Utility functions
# Define nested_dict function
def nested_dict():
  """Return a defaultdict whose missing keys are themselves nested defaultdicts.

  This allows assignment at arbitrary depth (``d['a']['b']['c'] = 1``) without
  creating the intermediate levels first.
  """
  factory = nested_dict  # each missing key recursively builds another level
  return defaultdict(factory)

# Define file read function
def read_pickle(dict_file):
  """Load and return the object stored in the pickle file at ``dict_file``.

  NOTE: unpickling can execute arbitrary code, so only use this on files
  produced by this project.
  """
  with open(dict_file, 'rb') as handle:
    payload = pickle.load(handle)
  return payload

# Split CamelCase text into space-separated words, e.g. "CamelCase" -> "Camel Case"
def split_camel_case(text):
    """Insert a space at every lowercase-to-uppercase boundary in ``text``.

    ``"CamelCase"`` becomes ``"Camel Case"``. A run of capitals such as
    ``"HTTPServer"`` is left untouched because no lowercase letter precedes it.
    """
    # Zero-width match: a position preceded by [a-z] and followed by [A-Z].
    boundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
    return boundary.sub(' ', text)

In [None]:
# Read in dictionary with definitions
definitions = read_pickle("short_answer_dict.pkl")
all_definitions = read_pickle("final_combined_dict.pkl")

## Checking existing NER capability

In [None]:
# Existing tags from DistilBERT
tag_checkpoint = "dslim/distilbert-NER"
tag_tokenizer = AutoTokenizer.from_pretrained(tag_checkpoint, do_lower_case=False)
tag_model = AutoModelForTokenClassification.from_pretrained(tag_checkpoint)

tags = pipeline("ner", model=tag_model, tokenizer=tag_tokenizer)

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
tag_tokenizer.is_fast

True

In [None]:
# Utility functions built from DistilBERT NER

def idx2string(text):
    """Split ``text`` into word and punctuation pieces keyed by position.

    Every run of word characters is one piece and every other non-whitespace
    character is a piece of its own; whitespace is dropped.

    Returns:
        dict mapping the 0-based piece index to the piece string, in order of
        appearance.
    """
    pieces = re.findall(r'\b\w+\b|[^\s\w]', text)
    return dict(enumerate(pieces))

def tokenidx2words(ner_results):
  """Map each tagged token's ``'index'`` to its ``'word'``.

  ``ner_results`` is an iterable of dicts that each carry ``'word'`` and
  ``'index'`` keys. If an index occurs more than once, the last entry wins.
  """
  pairs = ((hit['word'], hit['index']) for hit in ner_results)
  return {token_idx: word for word, token_idx in pairs}

def tokenidx2entity(ner_results):
  """Map each tagged token's ``'index'`` to its ``'entity'`` label.

  ``ner_results`` is an iterable of dicts that each carry ``'entity'`` and
  ``'index'`` keys. If an index occurs more than once, the last entry wins.
  """
  pairs = ((hit['entity'], hit['index']) for hit in ner_results)
  return {token_idx: label for label, token_idx in pairs}

def idx2wordpos(text, idx2string):
  """Locate every indexed piece inside ``text`` and return its character span.

  Args:
    text: the original string.
    idx2string: mapping of piece index -> piece string, iterated in order.
      (Inside this function the parameter shadows the module-level
      ``idx2string`` function.)

  Returns:
    dict mapping piece index -> ``(start, end)`` with ``end`` exclusive.

  Raises:
    ValueError: if a piece cannot be found at or after the end of the
      previous piece.
  """
  spans = {}
  cursor = 0  # search resumes here so repeated words map to successive hits
  for piece_idx, piece in idx2string.items():
    begin = text.index(piece, cursor)
    cursor = begin + len(piece)
    spans[piece_idx] = (begin, cursor)
  return spans

def tokenidx2entityword(example2wordidx, ner_results):
  """Link each tagged token to a word whose character span it overlaps.

  Args:
    example2wordidx: mapping of word index -> ``(start, end)`` character span.
    ner_results: iterable of dicts with ``'start'``, ``'end'``, ``'entity'``
      and ``'index'`` keys.

  Returns:
    dict mapping token index -> ``{'entity': ..., 'word_idx': ...}``. Tokens
    for which the tree query returns nothing are absent. If the query returns
    several word spans, whichever is iterated last is kept.
  """
  # Interval tree over the word spans; each interval's data is the word index.
  word_spans = IntervalTree(
      Interval(begin, stop, word_idx) for word_idx, (begin, stop) in example2wordidx.items()
  )

  token2word = dict()
  for hit in ner_results:
    span_start, span_end = hit['start'], hit['end']
    label = hit['entity']
    token_idx = hit['index']

    # Word spans returned by the tree for the recognised entity's character range
    for word_span in word_spans[span_start : span_end]:
      token2word[token_idx] = {'entity' : label, 'word_idx': word_span.data}

  return token2word

def baseline_tags_list(input_ids, baseline_tags):
  """Build one tag per token: ``'O'`` unless ``baseline_tags`` names the token.

  Args:
    input_ids: token id sequence; only its length is used.
    baseline_tags: iterable of dicts with ``'index'`` (position in
      ``input_ids``) and ``'entity'`` (tag string).

  Returns:
    list of tag strings with the same length as ``input_ids``.

  Raises:
    IndexError: if an ``'index'`` lies outside ``input_ids``.
  """
  tags_per_token = ['O'] * len(input_ids)
  for hit in baseline_tags:
    tags_per_token[hit['index']] = hit['entity']
  # Item assignment cannot change the length; kept as a sanity check.
  assert len(tags_per_token) == len(input_ids)
  return tags_per_token

In [None]:
# Example run 1

example = "Benford's Law states that in many naturally occurring datasets, the leading digit is more likely to be small, with the number 1 appearing as the first digit more frequently than larger digits, following a predictable logarithmic distribution."
print(f"{example}\n")
example2idx = idx2string(example)
print(f"word index: {example2idx}")

example2wordidx = idx2wordpos(example, example2idx)
print(f"word start-end index: {example2wordidx}")

ner_results = tags(example)
print(f"tags: {ner_results}")

tokenized = tag_tokenizer(example, padding=True, truncation=True)

input_ids = tokenized['input_ids']
print(f"input_ids: {len(input_ids)}, {input_ids}")

mapper_dict = tokenidx2entityword(example2wordidx, ner_results)
print(f"mapper_dict: {mapper_dict}")

baseline_tags = baseline_tags_list(input_ids, ner_results)
print(f"baseline_tags: {len(baseline_tags)}, {baseline_tags}")

attention_mask = tokenized['attention_mask']
print(f"attention_mask: {len(attention_mask)}, {attention_mask}")

tokens = tokenized.tokens()
print(f"tokens: {len(tokens)}, {tokens}")

Benford's Law states that in many naturally occurring datasets, the leading digit is more likely to be small, with the number 1 appearing as the first digit more frequently than larger digits, following a predictable logarithmic distribution.

word index: {0: 'Benford', 1: "'", 2: 's', 3: 'Law', 4: 'states', 5: 'that', 6: 'in', 7: 'many', 8: 'naturally', 9: 'occurring', 10: 'datasets', 11: ',', 12: 'the', 13: 'leading', 14: 'digit', 15: 'is', 16: 'more', 17: 'likely', 18: 'to', 19: 'be', 20: 'small', 21: ',', 22: 'with', 23: 'the', 24: 'number', 25: '1', 26: 'appearing', 27: 'as', 28: 'the', 29: 'first', 30: 'digit', 31: 'more', 32: 'frequently', 33: 'than', 34: 'larger', 35: 'digits', 36: ',', 37: 'following', 38: 'a', 39: 'predictable', 40: 'logarithmic', 41: 'distribution', 42: '.'}
word start-end index: {0: (0, 7), 1: (7, 8), 2: (8, 9), 3: (10, 13), 4: (14, 20), 5: (21, 25), 6: (26, 28), 7: (29, 33), 8: (34, 43), 9: (44, 53), 10: (54, 62), 11: (62, 63), 12: (64, 67), 13: (68, 75), 

In [None]:
# Example run 2

example = "A Hankel matrix is a type of structured matrix where each ascending skew-diagonal from left to right is constant. It is characterized by its entries being functions of a single variable, forming a pattern where each element depends only on the sum of its indices. Hankel matrices are often used in signal processing and control theory for polynomial and sequence analysis."
print(f"{example}\n")
example2idx = idx2string(example)
print(f"word index: {example2idx}")

example2wordidx = idx2wordpos(example, example2idx)
print(f"word start-end index: {example2wordidx}")

ner_results = tags(example)
print(f"tags: {ner_results}")

tokenized = tag_tokenizer(example, padding=True, truncation=True)

input_ids = tokenized['input_ids']
print(f"input_ids: {len(input_ids)}, {input_ids}")

mapper_dict = tokenidx2entityword(example2wordidx, ner_results)
print(f"mapper_dict: {mapper_dict}")

baseline_tags = baseline_tags_list(input_ids, ner_results)
print(f"baseline_tags: {len(baseline_tags)}, {baseline_tags}")

attention_mask = tokenized['attention_mask']
print(f"attention_mask: {len(attention_mask)}, {attention_mask}")

tokens = tokenized.tokens()
print(f"tokens: {len(tokens)}, {tokens}")

A Hankel matrix is a type of structured matrix where each ascending skew-diagonal from left to right is constant. It is characterized by its entries being functions of a single variable, forming a pattern where each element depends only on the sum of its indices. Hankel matrices are often used in signal processing and control theory for polynomial and sequence analysis.

word index: {0: 'A', 1: 'Hankel', 2: 'matrix', 3: 'is', 4: 'a', 5: 'type', 6: 'of', 7: 'structured', 8: 'matrix', 9: 'where', 10: 'each', 11: 'ascending', 12: 'skew', 13: '-', 14: 'diagonal', 15: 'from', 16: 'left', 17: 'to', 18: 'right', 19: 'is', 20: 'constant', 21: '.', 22: 'It', 23: 'is', 24: 'characterized', 25: 'by', 26: 'its', 27: 'entries', 28: 'being', 29: 'functions', 30: 'of', 31: 'a', 32: 'single', 33: 'variable', 34: ',', 35: 'forming', 36: 'a', 37: 'pattern', 38: 'where', 39: 'each', 40: 'element', 41: 'depends', 42: 'only', 43: 'on', 44: 'the', 45: 'sum', 46: 'of', 47: 'its', 48: 'indices', 49: '.', 50: 

In [None]:
# Model requires entities to be cased to recognise them

## Encoding the definitions

In [None]:
# Baseline (pre-trained model) encoding of every definition, cached on disk.
# These baseline tags are later replaced by the custom tags.
dict_file_name = 'preliminary_ner_results.pkl'

try:
  with open(dict_file_name, 'rb') as file:
    preliminary_ner_results = pickle.load(file)

except FileNotFoundError:
  print(f"{dict_file_name} not found, creating {dict_file_name}...")

  preliminary_ner_results = {}

  for key, value in definitions.items():
    # Tokenise once and run the NER pipeline once per definition text
    tokenized = tag_tokenizer(value, padding=True, truncation=True)
    text_index = idx2string(value)
    ner_results = tags(value)
    input_ids = tokenized['input_ids']

    preliminary_ner_results[key] = dict(
        text=value,
        text_index=text_index,
        text_start_end_index=idx2wordpos(value, text_index),
        tokens=tokenized.tokens(),
        baseline_tags=baseline_tags_list(input_ids, ner_results),
        input_ids=input_ids,
        attention_mask=tokenized['attention_mask'],
    )

  # Persist so later runs take the fast path above
  with open(dict_file_name, 'wb') as file:
    pickle.dump(preliminary_ner_results, file)

In [None]:
# Checks
for key in preliminary_ner_results.keys():
  if len(preliminary_ner_results[key]["input_ids"]) != len(preliminary_ner_results[key]["baseline_tags"]):
    print(key)

In [None]:
print(f"dictionary length: {len(preliminary_ner_results)}")

print(f"Pythagorean theorem:")
for k, v in preliminary_ner_results["Pythagorean theorem"].items():
  print(f"{k}:\n   {v}")

dictionary length: 5388
Pythagorean theorem:
text:
   The Pythagorean Theorem describes the relationship between the sides of a right triangle, stating that the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides. This principle is fundamental in geometry and is used to determine distances and measurements in various applications. It highlights the intrinsic connection between the angles and sides of a right triangle.
text_index:
   {0: 'The', 1: 'Pythagorean', 2: 'Theorem', 3: 'describes', 4: 'the', 5: 'relationship', 6: 'between', 7: 'the', 8: 'sides', 9: 'of', 10: 'a', 11: 'right', 12: 'triangle', 13: ',', 14: 'stating', 15: 'that', 16: 'the', 17: 'square', 18: 'of', 19: 'the', 20: 'length', 21: 'of', 22: 'the', 23: 'hypotenuse', 24: 'is', 25: 'equal', 26: 'to', 27: 'the', 28: 'sum', 29: 'of', 30: 'the', 31: 'squares', 32: 'of', 33: 'the', 34: 'lengths', 35: 'of', 36: 'the', 37: 'other', 38: 'two', 39: 'sides', 40: '.', 41:

In [None]:
# List of conjunctions and prepositions that must never receive an entity tag
exceptions = [
    'and', 'or', 'but', 'nor', 'for', 'so', 'yet', 'because', 'although', 'since',
    'while', 'if', 'either', 'neither', 'both', 'not', 'only', 'also', 'about', 'above',
    'across', 'after', 'against', 'along', 'among', 'around', 'at', 'before', 'behind', 'below',
    'beneath', 'beside', 'between', 'beyond', 'by', 'despite', 'down', 'during', 'except', 'for',
    'from', 'in', 'inside', 'into', 'like', 'means', 'near', 'of', 'off', 'on',
    'onto', 'out', 'outside', 'over', 'past', 'since', 'through', 'throughout', 'till', 'to',
    'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', 'without',
]

# Position 0 of an encoding is the start id (101), so [1] is the id of the
# word's first piece.
# NOTE(review): a word that the tokenizer splits into several pieces is
# represented by its first piece only - confirm that is intended.
exception_input_ids = [tag_tokenizer.encode(word)[1] for word in exceptions]

## Encoding Named Entity Recognition Dictionary

In [None]:
# Read in mapped dictionary with NER
all_classifications = read_pickle("all_classifications.pkl")

In [None]:
print(f"all_classifications dictionary length: {len(all_classifications)}")
random_sample = random.sample(list(all_classifications.items()), 3)

for key, value in random_sample:
  print(f"{key}:\n {value}")

all_classifications dictionary length: 21506
combinator:
 {'text_index': {0: 'combinator'}, 'tokens': ['[CLS]', 'comb', '##inator', '[SEP]'], 'input_ids': [101, 27481, 24682, 102], 'ner_mapper': [27481], 'ner_tags': ['IGN', 'S-Foundations of Mathematics', 'IGN', 'IGN']}
Tubular Neighborhood:
 {'text_index': {0: 'Tubular', 1: 'Neighborhood'}, 'tokens': ['[CLS]', 'Tu', '##bula', '##r', 'N', '##ei', '##gh', '##bor', '##hood', '[SEP]'], 'input_ids': [101, 17037, 23601, 1197, 151, 6851, 5084, 12207, 5914, 102], 'ner_mapper': [17037, 151], 'ner_tags': ['IGN', 'B-Topology', 'IGN', 'IGN', 'E-Topology', 'IGN', 'IGN', 'IGN', 'IGN', 'IGN']}
liouville's constant:
 {'text_index': {0: 'liouville', 1: "'", 2: 's', 3: 'constant'}, 'tokens': ['[CLS]', 'l', '##io', '##u', '##ville', "'", 's', 'constant', '[SEP]'], 'input_ids': [101, 181, 2660, 1358, 2138, 112, 188, 4836, 102], 'ner_mapper': [181, 112, 188, 4836], 'ner_tags': ['IGN', 'B-Number Theory', 'IGN', 'IGN', 'IGN', 'I-Number Theory', 'I-Number Th

In [None]:
# Force the tag of every exception token (preposition / conjunction) to 'O'.
# This edits the tag lists stored in all_classifications in place.
for key, sub_dict in all_classifications.items():
  input_ids = sub_dict['input_ids']
  ner_tags = sub_dict['ner_tags']
  for i, token_id in enumerate(input_ids):
    if token_id not in exception_input_ids:
      continue
    ner_tags[i] = 'O'
  sub_dict['ner_tags'] = ner_tags

In [None]:
# Lookup table: tuple of a term's input ids -> list of its tags.
# The first and last positions (ids 101 and 102, the start and stop ids) are
# dropped from both. Terms with identical ids share one entry; the last wins.
input_id_mapping = {}

for sub_dict in all_classifications.values():
  key = tuple(sub_dict['input_ids'][1:-1])
  value = sub_dict['ner_tags'][1:-1]
  assert len(key) == len(value), 'Error'
  input_id_mapping[key] = value

In [None]:
# Near-duplicate keys across the two dictionaries can be matched as well,
# eg "pythagoras's theorem" and "Pythagorean theorem". Result is cached on disk.
dict_file_name = 'fuzzy_key_match_results.pkl'

try:
  with open(dict_file_name, 'rb') as file:
    fuzz_dict = pickle.load(file)

except FileNotFoundError:
  print(f"{dict_file_name} not found, creating {dict_file_name}...")

  fuzz_dict = {}
  all_classifications_keys = list(all_classifications.keys())

  for key in tqdm(preliminary_ner_results.keys(), desc="Fuzzy Match key", unit="key"):
    # Score this key against every classification key
    scored = ((candidate, fuzz.ratio(key, candidate)) for candidate in all_classifications_keys)
    # Keep close matches only: above 80, and below 100 so exact matches are excluded
    fuzz_dict[key] = {candidate: score for candidate, score in scored if 80 < score < 100}

  with open(dict_file_name, 'wb') as file:
    pickle.dump(fuzz_dict, file)

In [None]:
print(f"fuzzy_key_match_results dictionary length: {len(fuzz_dict)}")
random_sample = random.sample(list(fuzz_dict.items()), 3)

for key, value in random_sample:
  print(f"{key}:\n {value}")

fuzzy_key_match_results dictionary length: 5388
Absolute Moment:
 {'absolute moment': 87, 'Absolute moment': 93}
Apéry's Constant Continued Fraction:
 {"apéry's constant continued fraction": 89, "Apéry'S Constant Continued Fraction": 97, "Apéry's constant continued fraction": 91, "Catalan's Constant Continued Fraction": 83, 'Copeland-Erdős Constant Continued Fraction': 81, "Khinchin's Constant Continued Fraction": 82, "Soldner's Constant Continued Fraction": 86}
Minkowski Inner Product:
 {'minkowski inner product': 87, 'Minkowski inner product': 91}


In [None]:
# To increase the training dictionary size, get definitions for these fuzzy matches
# and append them to the existing preliminary_ner_results dictionary.
try:
  with open('fuzzy_gpt_responses.pkl', 'rb') as file:
    fuzzy_gpt_responses = pickle.load(file)
except (FileNotFoundError, EOFError):
  # No cache yet (first run) or an empty file left by an interrupted write:
  # start from scratch so every term is requested below instead of crashing.
  fuzzy_gpt_responses = dict()

# Every key of fuzz_dict plus every fuzzy match recorded for it
candidate_terms = set()
for key, value_dict in fuzz_dict.items():
  candidate_terms.add(key)
  candidate_terms.update(value_dict)

# Terms that still lack a stored response; sorted so the request order is
# the same on every run.
get_definition = sorted(term for term in candidate_terms if term not in fuzzy_gpt_responses)

In [None]:
# Get the API key and set the model name
model = "gpt-4o-mini"
client = OpenAI(api_key=userdata.get('ChatGPT'))

# Function to return answers based on prompt
def gpt_answers(topic):
  """Ask the chat model for a short summary of ``topic`` and return its text.

  Uses the module-level ``client`` and ``model``. The system message fixes the
  output format (2 or 3 sentences, no mathematical notation or history) and
  includes one worked example; the user message carries the topic.

  Returns:
    The message content of the first choice in the completion.
  """
  # System message that provides context to the model
  system_prompt = """I will ask you to provide a short summary of a topic, 2 or 3 sentences long. Output must not include mathematical notation or history.
                  Example:

                  Question:
                  Markov chain

                  Answer:
                  A Markov Chain is a process where the next step depends only on the current state, not the path taken to get there. It consists of a set of states and the probabilities of transitioning from one state to another. This makes it a simple way to model systems that evolve step by step with memory only of the present.

                  End of example. """
  # Topic query
  user_prompt = f"""Provide a short summary of: {topic}"""

  conversation = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
  ]
  completion = client.chat.completions.create(model=model, messages=conversation)
  return completion.choices[0].message.content

In [None]:
# try:
#   # Load existing responses if the file exists
#   with open('fuzzy_gpt_responses.pkl', 'rb') as file:
#     fuzzy_gpt_responses = pickle.load(file)
# except (FileNotFoundError, EOFError):
#   fuzzy_gpt_responses = dict()

# Request each missing definition; the whole dictionary is written back to
# disk right after every response so an interruption loses at most one call.
for definition in tqdm(get_definition, desc="Getting definitions", unit="definition"):
  if definition in fuzzy_gpt_responses:
    continue  # already answered
  fuzzy_gpt_responses[definition] = gpt_answers(definition)
  with open('fuzzy_gpt_responses.pkl', 'wb') as file:
    pickle.dump(fuzzy_gpt_responses, file)

Getting definitions: 0definition [00:00, ?definition/s]


In [None]:
print(f"fuzzy_gpt_responses dictionary length: {len(fuzzy_gpt_responses)}")
random_sample = random.sample(list(fuzzy_gpt_responses.items()), 3)

for key, value in random_sample:
  print(f"{key}:\n {value}")

fuzzy_gpt_responses dictionary length: 15659
Self-adjoint element:
 A self-adjoint element is an operator or matrix equal to its own adjoint or conjugate transpose. In quantum mechanics and functional analysis, self-adjoint elements are significant because they represent observable quantities that have real eigenvalues. This property ensures that measurements yield real-number results, which are essential for physical interpretations.
Palindromic Prime:
 A palindromic prime is a type of prime number that remains the same when its digits are reversed, making it a palindrome. In addition to being prime, these numbers exhibit symmetry, which makes them interesting in number theory. Examples include 2, 3, 5, 7, 11, and 101.
Survivorship Curve:
 A survivorship curve is a graphical representation that shows the number of individuals surviving at each age for a given species or population. It typically illustrates the probability of survival over time and can be categorized into three main ty

In [None]:
# Encode new definitions (tokens, baseline tags, ...) for every GPT response.
# Results are cached on disk and checkpointed while the loop runs.
dict_file_name = 'ner_fuzzy_encoded.pkl'

try:
  with open(dict_file_name, 'rb') as file:
    ner_fuzzy_encoded = pickle.load(file)

except FileNotFoundError:
  print(f"{dict_file_name} not found, creating {dict_file_name}...")
  ner_fuzzy_encoded = dict()

# The file is written mid-run, so a cached copy can be partial (interrupted
# run) or can predate newly fetched definitions. Encode whatever is missing
# instead of trusting the file to be complete.
pending_keys = [key for key in fuzzy_gpt_responses if key not in ner_fuzzy_encoded]

if pending_keys:
  start_time = time.time()

  for i, key in enumerate(pending_keys):
    value = fuzzy_gpt_responses[key]
    text_index = idx2string(value)
    tokenized = tag_tokenizer(value, padding=True, truncation=True)
    input_ids = tokenized['input_ids']
    ner_results = tags(value)
    ner_fuzzy_encoded[key] = {
        "text" : value,
        "text_index": text_index,
        "text_start_end_index": idx2wordpos(value, text_index),
        "tokens" : tokenized.tokens(),
        "baseline_tags" : baseline_tags_list(input_ids, ner_results),
        "input_ids" : input_ids,
        "attention_mask" : tokenized['attention_mask']
        }

    # Checkpoint every 1000 definitions and once more after the last one
    if (i + 1) % 1000 == 0 or i == len(pending_keys) - 1:
      print(f"Processed {i+1} definitions, saving to file...")
      end_time = time.time()
      elapsed_time = (end_time - start_time) / 60
      print(f"***Execution time: {elapsed_time:.2f} minutes")
      with open(dict_file_name, 'wb') as file:
        pickle.dump(ner_fuzzy_encoded, file)

In [None]:
print(f"ner_fuzzy_encoded dictionary length: {len(ner_fuzzy_encoded)}")
random_sample = random.sample(list(ner_fuzzy_encoded.items()), 1)

for key, value in random_sample:
  print(f"{key}:\n {value}")

ner_fuzzy_encoded dictionary length: 15659
Von neumann neighborhood:
 {'text': 'A Von Neumann neighborhood refers to a specific way of considering the adjacent positions surrounding a cell in a two-dimensional grid, focusing on the four orthogonal directions: up, down, left, and right. In this neighborhood model, only the neighboring cells that share a side with the target cell are included, making it useful in cellular automata and other grid-based systems. This concept helps in analyzing interactions and evolution patterns in discrete space structures.', 'text_index': {0: 'A', 1: 'Von', 2: 'Neumann', 3: 'neighborhood', 4: 'refers', 5: 'to', 6: 'a', 7: 'specific', 8: 'way', 9: 'of', 10: 'considering', 11: 'the', 12: 'adjacent', 13: 'positions', 14: 'surrounding', 15: 'a', 16: 'cell', 17: 'in', 18: 'a', 19: 'two', 20: '-', 21: 'dimensional', 22: 'grid', 23: ',', 24: 'focusing', 25: 'on', 26: 'the', 27: 'four', 28: 'orthogonal', 29: 'directions', 30: ':', 31: 'up', 32: ',', 33: 'down', 

In [None]:
# Merge both encoded dictionaries into one. Entries from ner_fuzzy_encoded take
# precedence; preliminary_ner_results only contributes keys that are missing.
all_data = dict(ner_fuzzy_encoded)

for key_2, value_2 in preliminary_ner_results.items():
  all_data.setdefault(key_2, value_2)

In [None]:
print(f"all_data dictionary length: {len(all_data)}")
random_sample = random.sample(list(all_data.items()), 1)

for key, value in random_sample:
  print(f"{key}:\n {value}")

all_data dictionary length: 15659
mex sequence:
 {'text': 'The mex sequence, or minimum excluded value sequence, is a mathematical concept where the next term in the sequence is the smallest non-negative integer that is not present in the current set of terms. This sequence is useful in combinatorial games and theoretical computer science for analyzing possible outcomes and strategies. It highlights the importance of tracking which values have been "used" in a given context.', 'text_index': {0: 'The', 1: 'mex', 2: 'sequence', 3: ',', 4: 'or', 5: 'minimum', 6: 'excluded', 7: 'value', 8: 'sequence', 9: ',', 10: 'is', 11: 'a', 12: 'mathematical', 13: 'concept', 14: 'where', 15: 'the', 16: 'next', 17: 'term', 18: 'in', 19: 'the', 20: 'sequence', 21: 'is', 22: 'the', 23: 'smallest', 24: 'non', 25: '-', 26: 'negative', 27: 'integer', 28: 'that', 29: 'is', 30: 'not', 31: 'present', 32: 'in', 33: 'the', 34: 'current', 35: 'set', 36: 'of', 37: 'terms', 38: '.', 39: 'This', 40: 'sequence', 41: '

In [None]:
def find_sublist_indices(test_list, sub_list):
  """Find every position where ``sub_list`` occurs contiguously in ``test_list``.

  Returns:
    A list with one entry per occurrence, each entry being the list of
    ``test_list`` indices covered by that occurrence (overlapping occurrences
    are all reported), or ``None`` when there is no occurrence.
  """
  width = len(sub_list)
  last_start = len(test_list) - width
  hits = [
      list(range(start, start + width))
      for start in range(last_start + 1)
      if test_list[start:start + width] == sub_list
  ]
  return hits or None  # None rather than an empty list when nothing matched

In [None]:
# Order input_id_mapping by key length in ascending order (shortest term first).
# The tagging loop walks the mapping in this order and overwrites tags as it
# goes, so a longer term that covers the same tokens is applied last.
sorted_data = {ids: input_id_mapping[ids] for ids in sorted(input_id_mapping, key=len)}
input_id_mapping = sorted_data

In [None]:
# Replace the baseline tags of every text with the custom tags of all
# dictionary terms found in it. Results are cached on disk.
# NOTE: cache files written before this cell was fixed tag only the first
# occurrence of each term and list nearly every key in non_matches; delete
# both files to regenerate them.
dict_file_name = 'all_data_matches.pkl'
list_file_name = 'non_matches.pkl'

try:
  with open(dict_file_name, 'rb') as file:
    all_data = pickle.load(file)
  cache_loaded = True

except FileNotFoundError:
  print(f"{dict_file_name} not found, creating {dict_file_name}...")
  cache_loaded = False

if cache_loaded:
  # The file is checkpointed mid-run, so it may stem from an interrupted run:
  # only entries that already carry 'ner_tags' are finished.
  pending_keys = [key for key, sub_dict in all_data.items() if 'ner_tags' not in sub_dict]
  try:
    with open(list_file_name, 'rb') as file:
      non_matches = set(pickle.load(file))
  except FileNotFoundError:
    non_matches = set()
else:
  pending_keys = list(all_data)
  non_matches = set()

if pending_keys:
  start_time = time.time()

  for i, key in enumerate(tqdm(pending_keys, desc="Processing keys", unit="key")):
    sub_dict = all_data[key]
    all_data_input_ids = sub_dict['input_ids']

    # Only keep existing B, I-LOC and B, I-ORG tags
    updated_tags = [tag if tag in ['B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'] else 'O'
                    for tag in sub_dict['baseline_tags']]
    ner_indices = []

    for classification_tuple, classification_ner in input_id_mapping.items():
      idx_result = find_sublist_indices(all_data_input_ids, list(classification_tuple))
      if not idx_result:
        continue

      # Tag every occurrence of the term. Zipping the flattened indices of all
      # occurrences against one tag list would tag the first occurrence only.
      for match_indices in idx_result:
        ner_indices.extend(match_indices)
        for idx, classification in zip(match_indices, classification_ner):
          updated_tags[idx] = classification

    # A text is a non-match only when no term at all was found in it
    if not ner_indices:
      non_matches.add(key)

    sub_dict['ner_indices'] = ner_indices

    # Code 101 and 102 to 'IGN'
    for start_end_idx, token_id in enumerate(all_data_input_ids):
      if token_id in [101, 102]:
        updated_tags[start_end_idx] = 'IGN'

    # Final tag formatting: upper case, spaces replaced by hyphens
    sub_dict['ner_tags'] = [text.upper().replace(" ", "-") for text in updated_tags]

    # Checkpoint every 1000 texts and once more after the last one
    if (i + 1) % 1000 == 0 or i == len(pending_keys) - 1:
      print(f"\nProcessed {i+1} of {len(pending_keys)}, saving to file...")
      end_time = time.time()
      elapsed_time = (end_time - start_time)/60
      print(f" ***Execution time: {elapsed_time:.2f} minutes")
      with open(dict_file_name, 'wb') as file:
        pickle.dump(all_data, file)
      with open(list_file_name, 'wb') as file:
        pickle.dump(non_matches, file)

In [None]:
print(f"all_data dictionary length: {len(all_data)}")
random_sample = random.sample(list(all_data.items()), 1)

for key, value in random_sample:
  print(f"{key}:\n {value}")

all_data dictionary length: 15659
Automorphic forms:
 {'text': 'Automorphic forms are sophisticated mathematical functions defined on homogeneous spaces, which exhibit invariance under the action of a group, often related to number theory and geometry. They generalize classical modular forms and play a crucial role in various areas of mathematics, including representation theory and algebraic geometry. These forms are instrumental in understanding the relationships between different mathematical structures and have applications in arithmetic and the theory of automorphic representations.', 'text_index': {0: 'Automorphic', 1: 'forms', 2: 'are', 3: 'sophisticated', 4: 'mathematical', 5: 'functions', 6: 'defined', 7: 'on', 8: 'homogeneous', 9: 'spaces', 10: ',', 11: 'which', 12: 'exhibit', 13: 'invariance', 14: 'under', 15: 'the', 16: 'action', 17: 'of', 18: 'a', 19: 'group', 20: ',', 21: 'often', 22: 'related', 23: 'to', 24: 'number', 25: 'theory', 26: 'and', 27: 'geometry', 28: '.', 29:

In [None]:
# Side-by-side view of the per-token fields of the sampled entry:
# one column per field, transposed so each field reads as a row.
for_df = ['tokens', 'input_ids', 'baseline_tags', 'ner_tags', 'attention_mask']
my_list = [
    pd.DataFrame(all_data[random_sample[0][0]][column], columns=[column])
    for column in for_df
]

df = pd.concat(my_list, axis=1)
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83
tokens,[CLS],Auto,##morphic,forms,are,sophisticated,mathematical,functions,defined,on,ho,##mogeneous,spaces,",",which,exhibit,in,##var,##iance,under,the,action,of,a,group,",",often,related,to,number,theory,and,geometry,.,They,general,##ize,classical,modular,forms,and,play,a,crucial,role,in,various,areas,of,mathematics,",",including,representation,theory,and,algebraic,geometry,.,These,forms,are,instrumental,in,understanding,the,relationships,between,different,mathematical,structures,and,have,applications,in,arithmetic,and,the,theory,of,auto,##morphic,representations,.,[SEP]
input_ids,101,12983,24285,2769,1132,12580,9988,4226,3393,1113,16358,28008,6966,117,1134,8245,1107,8997,19425,1223,1103,2168,1104,170,1372,117,1510,2272,1106,1295,2749,1105,12053,119,1220,1704,3708,4521,24407,2769,1105,1505,170,10268,1648,1107,1672,1877,1104,6686,117,1259,6368,2749,1105,19669,12053,119,1636,2769,1132,6338,1107,4287,1103,6085,1206,1472,9988,4413,1105,1138,4683,1107,24205,1105,1103,2749,1104,12365,24285,16539,119,102
baseline_tags,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O
ner_tags,IGN,B-NUMBER-THEORY,IGN,E-NUMBER-THEORY,O,O,O,S-CALCULUS-AND-ANALYSIS,O,O,O,O,S-TOPOLOGY,O,O,O,O,O,O,O,O,S-CALCULUS-AND-ANALYSIS,O,O,O,O,O,O,O,B-NUMBER-THEORY,E-NUMBER-THEORY,O,S-GEOMETRY,O,O,O,O,O,B-NUMBER-THEORY,E-NUMBER-THEORY,O,O,O,O,O,O,O,O,O,S-FOUNDATIONS-OF-MATHEMATICS,O,O,B-ALGEBRA,E-ALGEBRA,O,B-GEOMETRY,E-GEOMETRY,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,S-NUMBER-THEORY,O,O,O,O,O,O,O,O,IGN
attention_mask,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
