<a href="https://colab.research.google.com/github/HeatherDriver/MathGraph/blob/main/04_Named_Entity_Recognition_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install fuzzywuzzy
! pip install intervaltree

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting intervaltree
  Downloading intervaltree-3.1.0.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sortedcontainers<3.0,>=2.0 (from intervaltree)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Building wheels for collected packages: intervaltree
  Building wheel for intervaltree (setup.py) ... [?25l[?25hdone
  Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26097 sha256=cbab7ebd36ba1e7596be6f2e515325545311c56fcdfc5b04e3193397a0ca7074
  Stored in directory: /root/.cache/pip/wheels/fa/80/8c/43488a924a046b733b64de3fac99252674c892a4c3801c0a61
Successfully built intervaltree
Installing collecte

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoConfig, DistilBertForTokenClassification, DistilBertModel, DistilBertConfig, DistilBertPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
from google.colab import drive, userdata
import pickle
import random
import re
import time
from collections import defaultdict
from fuzzywuzzy import fuzz
from intervaltree import Interval, IntervalTree
from collections import Counter
import json
from openai import OpenAI
import pandas as pd



In [3]:
# Mount Google Drive and switch the working directory to the pickle folder,
# so the relative pickle file names used in later cells resolve there.
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [4]:
# Remove pandas' column/row display limits so frames are rendered in full
for display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(display_option, None)

In [5]:
# Auto-vivifying dictionary factory
def nested_dict():
  """Return a ``defaultdict`` whose missing keys are created as further ``nested_dict()`` instances."""
  tree = defaultdict(nested_dict)
  return tree

# Load a pickled object from disk
def read_pickle(dict_file):
  """Open ``dict_file`` in binary mode and return the object unpickled from it.

  NOTE: unpickling can execute arbitrary code; only use this on files you produced yourself.
  """
  with open(dict_file, 'rb') as handle:
    loaded = pickle.load(handle)
  return loaded

# Insert spaces at camelCase boundaries, e.g. "CamelCase" -> "Camel Case"
def split_camel_case(text):
    """Return ``text`` with a space inserted wherever a lowercase letter is directly followed by an uppercase one."""
    # Zero-width match: lookbehind for [a-z], lookahead for [A-Z]
    boundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
    return boundary.sub(' ', text)

In [6]:
# Read in dictionary with definitions
# `definitions` is iterated as (key, text) pairs when the definitions are encoded further down.
definitions = read_pickle("short_answer_dict.pkl")
# NOTE(review): `all_definitions` is not referenced in the cells visible here - confirm it is still needed.
all_definitions = read_pickle("final_combined_dict.pkl")

## Checking existing NER capability

In [7]:
# Existing tags from DistilBERT
# Load the pretrained NER checkpoint (tokenizer + token-classification model) from the Hugging Face hub.
tag_checkpoint = "dslim/distilbert-NER"
# do_lower_case=False: presumably keeps the input casing intact for this cased model - TODO confirm
tag_tokenizer = AutoTokenizer.from_pretrained(tag_checkpoint, do_lower_case=False)
tag_model = AutoModelForTokenClassification.from_pretrained(tag_checkpoint)

# `tags(text)` returns a list of dicts; the helpers below read their
# 'word', 'entity', 'index', 'start' and 'end' fields.
tags = pipeline("ner", model=tag_model, tokenizer=tag_tokenizer)

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Device set to use cpu


In [8]:
tag_tokenizer.is_fast

True

In [13]:
# Methods built from DistilBERT NER

def idx2string(text):
    """Return ``{position: piece}`` for ``text``.

    A piece is either a run of word characters or a single character that is
    neither whitespace nor a word character (i.e. punctuation).
    """
    pieces = re.findall(r'\b\w+\b|[^\s\w]', text)
    return dict(enumerate(pieces))

def tokenidx2words(ner_results):
  """Return ``{index: word}`` built from the ``'index'`` and ``'word'`` fields of each entry in ``ner_results``.

  If an index occurs more than once, the last entry wins.
  """
  return {entry['index']: entry['word'] for entry in ner_results}

def tokenidx2entity(ner_results):
  """Return ``{index: entity}`` built from the ``'index'`` and ``'entity'`` fields of each entry in ``ner_results``.

  If an index occurs more than once, the last entry wins.
  """
  return {entry['index']: entry['entity'] for entry in ner_results}

def idx2wordpos(text, idx2string):
  """Locate every word of ``idx2string`` (word index -> word) inside ``text``.

  Words are searched in mapping order, each search starting where the previous
  word ended, so repeated words resolve to successive occurrences.

  Returns ``{word index: (start, end)}`` with ``end`` exclusive.
  Raises ``ValueError`` (from ``str.index``) if a word is not found from the current position.
  """
  spans = {}
  cursor = 0
  for word_idx, token in idx2string.items():
    begin = text.index(token, cursor)
    cursor = begin + len(token)
    spans[word_idx] = (begin, cursor)
  return spans

def tokenidx2entityword(example2wordidx, ner_results):
  """Link each tagged token to a word of the text via character-span overlap.

  ``example2wordidx`` maps word index -> (start, end). Each entry of ``ner_results``
  supplies ``'start'``, ``'end'``, ``'entity'`` and ``'index'``.

  Returns ``{token index: {'entity': ..., 'word_idx': ...}}``. When a token span
  overlaps several word spans, whichever interval is visited last is kept.
  """
  # Index the word spans for overlap queries; the word index is stored as the interval's data
  word_spans = IntervalTree(
      Interval(begin, stop, word_idx) for word_idx, (begin, stop) in example2wordidx.items()
  )

  token_map = {}
  for tagged in ner_results:
    span_start, span_end = tagged['start'], tagged['end']
    entity_label = tagged['entity']
    token_position = tagged['index']

    # Word intervals that overlap the character span of the recognised entity
    for hit in word_spans[span_start : span_end]:
      token_map[token_position] = {'entity' : entity_label, 'word_idx': hit.data}

  return token_map

def baseline_tags_list(input_ids, baseline_tags):
  """Return one tag per input id.

  Every position starts as ``'O'``; each entry of ``baseline_tags`` then writes
  its ``'entity'`` at the position given by its ``'index'``.
  """
  tags_per_token = ['O'] * len(input_ids)
  for result in baseline_tags:
    tags_per_token[result['index']] = result['entity']
  # Item assignment never changes the length; kept as a sanity check
  assert len(tags_per_token) == len(input_ids)
  return tags_per_token

In [14]:
# Worked example: run one definition through every helper above and print each intermediate result.
example = "A Hankel matrix is a type of structured matrix where each ascending skew-diagonal from left to right is constant. It is characterized by its entries being functions of a single variable, forming a pattern where each element depends only on the sum of its indices. Hankel matrices are often used in signal processing and control theory for polynomial and sequence analysis."
print(f"{example}\n")
# Word index -> word / punctuation mark
example2idx = idx2string(example)
print(f"word index: {example2idx}")

# Word index -> (start, end) character offsets in the text
example2wordidx = idx2wordpos(example, example2idx)
print(f"word start-end index: {example2wordidx}")

# Raw output of the pretrained NER pipeline
ner_results = tags(example)
print(f"tags: {ner_results}")

# Tokenize separately to obtain input ids, attention mask and tokens
tokenized = tag_tokenizer(example, padding=True, truncation=True)

input_ids = tokenized['input_ids']
print(f"input_ids: {len(input_ids)}, {input_ids}")

# Token index -> entity and the word index its character span overlaps
mapper_dict = tokenidx2entityword(example2wordidx, ner_results)
print(f"mapper_dict: {mapper_dict}")

# One tag per input id ('O' wherever the pipeline reported nothing)
baseline_tags = baseline_tags_list(input_ids, ner_results)
print(f"baseline_tags: {len(baseline_tags)}, {baseline_tags}")

attention_mask = tokenized['attention_mask']
print(f"attention_mask: {len(attention_mask)}, {attention_mask}")

tokens = tokenized.tokens()
print(f"tokens: {len(tokens)}, {tokens}")

A Hankel matrix is a type of structured matrix where each ascending skew-diagonal from left to right is constant. It is characterized by its entries being functions of a single variable, forming a pattern where each element depends only on the sum of its indices. Hankel matrices are often used in signal processing and control theory for polynomial and sequence analysis.

word index: {0: 'A', 1: 'Hankel', 2: 'matrix', 3: 'is', 4: 'a', 5: 'type', 6: 'of', 7: 'structured', 8: 'matrix', 9: 'where', 10: 'each', 11: 'ascending', 12: 'skew', 13: '-', 14: 'diagonal', 15: 'from', 16: 'left', 17: 'to', 18: 'right', 19: 'is', 20: 'constant', 21: '.', 22: 'It', 23: 'is', 24: 'characterized', 25: 'by', 26: 'its', 27: 'entries', 28: 'being', 29: 'functions', 30: 'of', 31: 'a', 32: 'single', 33: 'variable', 34: ',', 35: 'forming', 36: 'a', 37: 'pattern', 38: 'where', 39: 'each', 40: 'element', 41: 'depends', 42: 'only', 43: 'on', 44: 'the', 45: 'sum', 46: 'of', 47: 'its', 48: 'indices', 49: '.', 50: 

In [None]:
# Model requires entities to be cased to recognise them

## Encoding the definitions

In [15]:
# Baseline encoding of every definition (existing DistilBERT tags) - these tags are later replaced by the custom ones
dict_file_name = 'preliminary_ner_results.pkl'

try:
  # Reuse the cached result when it exists
  with open(dict_file_name, 'rb') as file:
    preliminary_ner_results = pickle.load(file)

except FileNotFoundError:
  print(f"{dict_file_name} not found, creating {dict_file_name}...")

  preliminary_ner_results = {}

  for term, definition_text in definitions.items():
    word_lookup = idx2string(definition_text)
    encoding = tag_tokenizer(definition_text, padding=True, truncation=True)
    token_ids = encoding['input_ids']
    pipeline_tags = tags(definition_text)

    # Assemble the record field by field; key order is what downstream cells display
    record = {"text" : definition_text, "text_index": word_lookup}
    record["text_start_end_index"] = idx2wordpos(definition_text, word_lookup)
    record["tokens"] = encoding.tokens()
    record["baseline_tags"] = baseline_tags_list(token_ids, pipeline_tags)
    record["input_ids"] = token_ids
    record["attention_mask"] = encoding['attention_mask']
    preliminary_ner_results[term] = record

  # Cache so the encoding is not repeated on the next run
  with open(dict_file_name, 'wb') as file:
    pickle.dump(preliminary_ner_results, file)

preliminary_ner_results.pkl not found, creating preliminary_ner_results.pkl...


In [16]:
# Checks
for key in preliminary_ner_results.keys():
  if len(preliminary_ner_results[key]["input_ids"]) != len(preliminary_ner_results[key]["baseline_tags"]):
    print(key)

In [17]:
print(f"dictionary length: {len(preliminary_ner_results)}")

print(f"Pythagorean theorem:")
for k, v in preliminary_ner_results["Pythagorean theorem"].items():
  print(f"{k}:\n   {v}")

dictionary length: 5388
Pythagorean theorem:
text:
   The Pythagorean Theorem describes the relationship between the sides of a right triangle, stating that the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides. This principle is fundamental in geometry and is used to determine distances and measurements in various applications. It highlights the intrinsic connection between the angles and sides of a right triangle.
text_index:
   {0: 'The', 1: 'Pythagorean', 2: 'Theorem', 3: 'describes', 4: 'the', 5: 'relationship', 6: 'between', 7: 'the', 8: 'sides', 9: 'of', 10: 'a', 11: 'right', 12: 'triangle', 13: ',', 14: 'stating', 15: 'that', 16: 'the', 17: 'square', 18: 'of', 19: 'the', 20: 'length', 21: 'of', 22: 'the', 23: 'hypotenuse', 24: 'is', 25: 'equal', 26: 'to', 27: 'the', 28: 'sum', 29: 'of', 30: 'the', 31: 'squares', 32: 'of', 33: 'the', 34: 'lengths', 35: 'of', 36: 'the', 37: 'other', 38: 'two', 39: 'sides', 40: '.', 41:

In [18]:
# List of prepositions and conjunctions (all lower-case) whose tokens should be bypassed in NER.
# Note: 'for' and 'since' each appear twice in the list below.
exceptions = ['and', 'or', 'but', 'nor', 'for', 'so', 'yet', 'because', 'although', 'since', 'while', 'if', 'either', 'neither', 'both', 'not', 'only', 'also', 'about', 'above', 'across', 'after',
              'against', 'along', 'among', 'around', 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', 'by', 'despite', 'down', 'during', 'except', 'for', 'from', 'in',
              'inside', 'into', 'like', 'means', 'near', 'of', 'off', 'on', 'onto', 'out', 'outside', 'over', 'past', 'since', 'through', 'throughout', 'till', 'to', 'toward', 'under', 'underneath', 'until',
              'up', 'upon', 'with', 'within', 'without']

# One input id per exception word: element [1] of the encoded sequence.
# NOTE(review): position 0 is presumably the start id (101), so [1] is the word's first word-piece;
# a word split into several pieces would be represented by its first piece only - TODO confirm.
exception_input_ids = [tag_tokenizer.encode(exception)[1] for exception in exceptions]

## Encoding Named Entity Recognition Dictionary

In [19]:
# Read in mapped dictionary with NER
all_classifications = read_pickle("all_classifications.pkl")

In [20]:
print(f"all_classifications dictionary length: {len(all_classifications)}")
random_sample = random.sample(list(all_classifications.items()), 3)

for key, value in random_sample:
  print(f"{key}:\n {value}")

all_classifications dictionary length: 21696
Rabbit sequence:
 {'text_index': {0: 'Rabbit', 1: 'sequence'}, 'tokens': ['[CLS]', 'Rabbit', 'sequence', '[SEP]'], 'input_ids': [101, 17435, 4954, 102], 'ner_mapper': [17435, 4954], 'ner_tags': ['O', 'B-NUMBER-THEORY', 'I-NUMBER-THEORY', 'O']}
Inner product space:
 {'text_index': {0: 'Inner', 1: 'product', 2: 'space'}, 'tokens': ['[CLS]', 'Inner', 'product', 'space', '[SEP]'], 'input_ids': [101, 13310, 3317, 2000, 102], 'ner_mapper': [13310, 3317, 2000], 'ner_tags': ['O', 'B-CALCULUS-AND-ANALYSIS', 'I-CALCULUS-AND-ANALYSIS', 'I-CALCULUS-AND-ANALYSIS', 'O']}
Hereditarily decomposable continuum:
 {'text_index': {0: 'Hereditarily', 1: 'decomposable', 2: 'continuum'}, 'tokens': ['[CLS]', 'Here', '##dit', '##arily', 'de', '##com', '##po', '##sable', 'con', '##tinuum', '[SEP]'], 'input_ids': [101, 3446, 17903, 18206, 1260, 8178, 5674, 17272, 14255, 25379, 102], 'ner_mapper': [3446, 1260, 14255], 'ner_tags': ['O', 'B-TOPOLOGY', 'O', 'O', 'I-TOPOLOG

In [21]:
# Reset the tag to 'O' for every token whose input id belongs to an exception word (preposition / conjunction)
for entry in all_classifications.values():
  entry_tags = entry['ner_tags']
  for position, token_id in enumerate(entry['input_ids']):
    if token_id in exception_input_ids:
      # The tag list is edited in place, so the entry sees the change directly
      entry_tags[position] = 'O'

In [22]:
# Map each term's inner input ids (start id 101 and stop id 102 stripped) to its inner tags
input_id_mapping = {}

for entry in all_classifications.values():
  inner_ids = tuple(entry['input_ids'][1:-1])
  inner_tags = entry['ner_tags'][1:-1]
  assert len(inner_ids) == len(inner_tags), 'Error'
  # Terms sharing the same id sequence overwrite each other; the last one wins
  input_id_mapping[inner_ids] = inner_tags

In [23]:
# Near-duplicate keys can be paired by fuzzy matching, eg "pythagoras's theorem" and "Pythagorean theorem"
dict_file_name = 'fuzzy_key_match_results.pkl'

try:
  # Reuse the cached result when it exists
  with open(dict_file_name, 'rb') as file:
    fuzz_dict = pickle.load(file)

except FileNotFoundError:
  print(f"{dict_file_name} not found, creating {dict_file_name}...")

  all_classifications_keys = list(all_classifications.keys())
  fuzz_dict = {}

  for definition_key in preliminary_ner_results:
    near_matches = {}
    for candidate in all_classifications_keys:
      score = fuzz.ratio(definition_key, candidate)
      # Keep close-but-not-identical names only (a ratio of 100 is excluded)
      if 80 < score < 100:
        near_matches[candidate] = score
    fuzz_dict[definition_key] = near_matches

  with open(dict_file_name, 'wb') as file:
    pickle.dump(fuzz_dict, file)

In [24]:
print(f"fuzzy_key_match_results dictionary length: {len(fuzz_dict)}")
random_sample = random.sample(list(fuzz_dict.items()), 3)

for key, value in random_sample:
  print(f"{key}:\n {value}")

fuzzy_key_match_results dictionary length: 5388
Intrinsic Tangent Space:
 {'intrinsic tangent space': 87}
Brachistochrone Problem:
 {'Tautochrone Problem': 81, 'brachistochrone problem': 91}
Klarner's Theorem:
 {"Bauer's Theorem": 81, "klarner's theorem": 88, "Klarner'S Theorem": 94}


In [25]:
# To grow the training dictionary, collect every fuzzy-matched name (de-duplicated) so a definition can be fetched for each
get_definition = list({
    matched_name
    for matches in fuzz_dict.values()
    for matched_name in matches
})

In [26]:
# Get the API key and set the model name
model = "gpt-4o-mini"
client = OpenAI(api_key=userdata.get('ChatGPT'))

# Ask the chat model for a short summary of a topic
def gpt_answers(topic):
  """Request a short summary of ``topic`` from the chat model and return the reply text.

  Uses the module-level ``client`` and ``model``. The system message fixes the expected
  format (2 or 3 sentences, no mathematical notation or history) and gives one worked
  example; the user message carries the topic. The content of the first choice is returned.
  """
  # System message that provides context to the model
  system_message = {"role": "system", "content": f"""I will ask you to provide a short summary of a topic, 2 or 3 sentences long. Output must not include mathematical notation or history.
                  Example:

                  Question:
                  Markov chain

                  Answer:
                  A Markov Chain is a process where the next step depends only on the current state, not the path taken to get there. It consists of a set of states and the probabilities of transitioning from one state to another. This makes it a simple way to model systems that evolve step by step with memory only of the present.

                  End of example. """}
  # Topic query
  user_message = {"role": "user", "content": f"""Provide a short summary of: {topic}"""}

  completion = client.chat.completions.create(
    model=model,
    messages=[system_message, user_message]
  )
  return completion.choices[0].message.content

In [27]:
try:
  # Resume from previously saved responses when the file exists and is readable
  with open('fuzzy_gpt_responses.pkl', 'rb') as file:
    fuzzy_gpt_responses = pickle.load(file)
except (FileNotFoundError, EOFError):
  fuzzy_gpt_responses = {}

# Fetch a summary for each name not answered yet, saving after every response
for topic in get_definition:
  if topic in fuzzy_gpt_responses:
    continue
  fuzzy_gpt_responses[topic] = gpt_answers(topic)
  # Rewrite the whole file each time so an interruption loses at most one response
  with open('fuzzy_gpt_responses.pkl', 'wb') as file:
    pickle.dump(fuzzy_gpt_responses, file)

In [28]:
print(f"fuzzy_gpt_responses dictionary length: {len(fuzzy_gpt_responses)}")
random_sample = random.sample(list(fuzzy_gpt_responses.items()), 3)

for key, value in random_sample:
  print(f"{key}:\n {value}")

fuzzy_gpt_responses dictionary length: 7512
ramanujan's sum identity:
 Ramanujan's sum identity is a mathematical formula that relates a specific type of sum over the divisors of a number to its divisors' properties. It provides deep insights into number theory and connects to various areas, including modular forms and partitions. This identity showcases the richness of patterns in number theory that Ramanujan explored extensively.
General Differential Geometry:
 General Differential Geometry is a branch of mathematics that studies the properties and structures of differentiable manifolds using the tools of calculus. It explores concepts like curvature, geodesics, and torsion, providing a framework for understanding the geometric properties of spaces that may be curved or non-Euclidean. This field is essential for applications in physics, particularly in general relativity and the study of spacetime.
criss-cross method:
 The criss-cross method is a technique used to determine the formu

In [30]:
# Encode new definitions
dict_file_name = 'ner_fuzzy_encoded.pkl'

# Start from whatever was checkpointed earlier. The loop below writes the file every 1000 items,
# so an interrupted run leaves a partial file; it must be resumed rather than treated as finished.
try:
  with open(dict_file_name, 'rb') as file:
    ner_fuzzy_encoded = pickle.load(file)

except FileNotFoundError:
  print(f"{dict_file_name} not found, creating {dict_file_name}...")
  ner_fuzzy_encoded = dict()

# Only the responses that have not been encoded yet (empty when the cache is complete)
pending = [(key, value) for key, value in fuzzy_gpt_responses.items() if key not in ner_fuzzy_encoded]
start_time = time.time()

for i, (key, value) in enumerate(pending):
  text_index = idx2string(value)
  tokenized = tag_tokenizer(value, padding=True, truncation=True)
  input_ids = tokenized['input_ids']
  ner_results = tags(value)
  ner_fuzzy_encoded[key] = {
      "text" : value,
      "text_index": text_index,
      "text_start_end_index": idx2wordpos(value, text_index),
      "tokens" : tokenized.tokens(),
      "baseline_tags" : baseline_tags_list(input_ids, ner_results),
      "input_ids" : input_ids,
      "attention_mask" : tokenized['attention_mask']
      }

  # Checkpoint every 1000 items and once more after the last one
  if i % 1000 == 0 or i == len(pending) - 1:
    print(f"Processed {i+1} definitions, saving to file...")
    end_time = time.time()
    elapsed_time = (end_time - start_time) / 60
    print(f"***Execution time: {elapsed_time:.2f} minutes")
    with open(dict_file_name, 'wb') as file:
      pickle.dump(ner_fuzzy_encoded, file)

ner_fuzzy_encoded.pkl not found, creating ner_fuzzy_encoded.pkl...
Processed 1 definitions, saving to file...
***Execution time: 0.00 minutes
Processed 1001 definitions, saving to file...
***Execution time: 3.45 minutes
Processed 2001 definitions, saving to file...
***Execution time: 6.66 minutes
Processed 3001 definitions, saving to file...
***Execution time: 9.81 minutes
Processed 4001 definitions, saving to file...
***Execution time: 12.95 minutes
Processed 5001 definitions, saving to file...
***Execution time: 16.12 minutes
Processed 6001 definitions, saving to file...
***Execution time: 19.29 minutes
Processed 7001 definitions, saving to file...
***Execution time: 22.44 minutes
Processed 7512 definitions, saving to file...
***Execution time: 24.07 minutes


In [31]:
print(f"ner_fuzzy_encoded dictionary length: {len(ner_fuzzy_encoded)}")
random_sample = random.sample(list(ner_fuzzy_encoded.items()), 1)

for key, value in random_sample:
  print(f"{key}:\n {value}")

ner_fuzzy_encoded dictionary length: 7512
geometric span:
 {'text': 'Geometric span refers to the smallest geometric figure (such as a polygon or polyhedron) that can encompass a given set of points in space. It represents the extent of the points in terms of their geometric arrangement, providing a way to analyze their distribution and spatial relationships. This concept is often used in fields like computer graphics, optimization, and spatial data analysis.', 'text_index': {0: 'Geometric', 1: 'span', 2: 'refers', 3: 'to', 4: 'the', 5: 'smallest', 6: 'geometric', 7: 'figure', 8: '(', 9: 'such', 10: 'as', 11: 'a', 12: 'polygon', 13: 'or', 14: 'polyhedron', 15: ')', 16: 'that', 17: 'can', 18: 'encompass', 19: 'a', 20: 'given', 21: 'set', 22: 'of', 23: 'points', 24: 'in', 25: 'space', 26: '.', 27: 'It', 28: 'represents', 29: 'the', 30: 'extent', 31: 'of', 32: 'the', 33: 'points', 34: 'in', 35: 'terms', 36: 'of', 37: 'their', 38: 'geometric', 39: 'arrangement', 40: ',', 41: 'providing', 4

In [32]:
# Merge all data into a single dictionary: fuzzy-encoded entries first, then any
# preliminary entries whose key is not already present (fuzzy entries win on clashes)
all_data = dict(ner_fuzzy_encoded)

for prelim_key, prelim_value in preliminary_ner_results.items():
  all_data.setdefault(prelim_key, prelim_value)

In [33]:
print(f"all_data dictionary length: {len(all_data)}")
random_sample = random.sample(list(all_data.items()), 1)

for key, value in random_sample:
  print(f"{key}:\n {value}")

all_data dictionary length: 11130
Reebless:
 {'text': 'A foliation which has no Reeb components is said to be Reebless', 'text_index': {0: 'A', 1: 'foliation', 2: 'which', 3: 'has', 4: 'no', 5: 'Reeb', 6: 'components', 7: 'is', 8: 'said', 9: 'to', 10: 'be', 11: 'Reebless'}, 'text_start_end_index': {0: (0, 1), 1: (2, 11), 2: (12, 17), 3: (18, 21), 4: (22, 24), 5: (25, 29), 6: (30, 40), 7: (41, 43), 8: (44, 48), 9: (49, 51), 10: (52, 54), 11: (55, 63)}, 'tokens': ['[CLS]', 'A', 'f', '##olia', '##tion', 'which', 'has', 'no', 'Re', '##eb', 'components', 'is', 'said', 'to', 'be', 'Re', '##eb', '##less', '[SEP]'], 'baseline_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'B-PER', 'B-MISC', 'O'], 'input_ids': [101, 138, 175, 26578, 2116, 1134, 1144, 1185, 11336, 15581, 5644, 1110, 1163, 1106, 1129, 11336, 15581, 2008, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
algebraic coding theory:
 {'text': 'Algebraic 

In [34]:
def find_sublist_indices(test_list, sub_list):
  """Find every position at which ``sub_list`` occurs as a contiguous run inside ``test_list``.

  Returns a list with one entry per occurrence (overlapping ones included), each entry
  being the list of indices the occurrence covers, or ``None`` when there is no occurrence.
  """
  window = len(sub_list)
  occurrences = [
      list(range(start, start + window))
      for start in range(len(test_list) - window + 1)
      if test_list[start:start + window] == sub_list
  ]
  return occurrences or None  # None signals "no match"

In [35]:
dict_file_name = 'all_data_matches.pkl'
list_file_name = 'non_matches.pkl'

# NOTE: an existing 'all_data_matches.pkl' is loaded as-is. Delete it (and 'non_matches.pkl') to
# re-run the matching below, e.g. after a change to the tagging logic.
# On the cache-hit path `non_matches` is not defined by this cell.
try:
  with open(dict_file_name, 'rb') as file:
    all_data = pickle.load(file)

except FileNotFoundError:
  print(f"{dict_file_name} not found, creating {dict_file_name}...")
  start_time = time.time()

  # Keys of the definitions in which not a single classification term was found
  non_matches = set()

  for i, (key, sub_dict) in enumerate(all_data.items()):
    all_data_input_ids = sub_dict['input_ids']
    # Only keep existing B, I-LOC and B, I-ORG tags from the baseline model; everything else starts as 'O'
    updated_tags = [tag if tag in ('B-ORG', 'I-ORG', 'B-LOC', 'I-LOC') else 'O' for tag in sub_dict['baseline_tags']]
    # Flat list of every token position covered by a matched term
    ner_indices = []

    for classification_tuple, classification_ner in input_id_mapping.items():
      idx_result = find_sublist_indices(all_data_input_ids, list(classification_tuple))
      if not idx_result:
        continue

      # Tag every occurrence of the term, not only the first one. Each occurrence is exactly as long
      # as the term's tag list; later terms overwrite the tags of earlier ones where they overlap.
      for occurrence in idx_result:
        ner_indices.extend(occurrence)
        for idx, classification in zip(occurrence, classification_ner):
          updated_tags[idx] = classification

    # Record the definition as a non-match only when no term matched at all
    if not ner_indices:
      non_matches.add(key)

    all_data[key]['ner_tags'] = updated_tags
    all_data[key]['ner_indices'] = ner_indices

    # Checkpoint every 1000 items and once more after the last one
    if i % 1000 == 0 or i == len(all_data) - 1:
      print(f"Processed {i+1} of {len(all_data)}, saving to file...")
      end_time = time.time()
      elapsed_time = (end_time - start_time)/60
      print(f" ***Execution time: {elapsed_time:.2f} minutes")
      with open(dict_file_name, 'wb') as file:
        pickle.dump(all_data, file)
      with open(list_file_name, 'wb') as file:
        pickle.dump(non_matches, file)

all_data_matches.pkl not found, creating all_data_matches.pkl...
Processed 1 of 11130, saving to file...
 ***Execution time: 0.01 minutes
Processed 1001 of 11130, saving to file...
 ***Execution time: 5.80 minutes
Processed 2001 of 11130, saving to file...
 ***Execution time: 11.60 minutes
Processed 3001 of 11130, saving to file...
 ***Execution time: 17.53 minutes
Processed 4001 of 11130, saving to file...
 ***Execution time: 23.55 minutes
Processed 5001 of 11130, saving to file...
 ***Execution time: 29.30 minutes
Processed 6001 of 11130, saving to file...
 ***Execution time: 35.07 minutes
Processed 7001 of 11130, saving to file...
 ***Execution time: 40.79 minutes
Processed 8001 of 11130, saving to file...
 ***Execution time: 46.86 minutes
Processed 9001 of 11130, saving to file...
 ***Execution time: 53.12 minutes
Processed 10001 of 11130, saving to file...
 ***Execution time: 59.46 minutes
Processed 11001 of 11130, saving to file...
 ***Execution time: 66.21 minutes
Processed 1113

In [37]:
print(f"all_data dictionary length: {len(all_data)}")
random_sample = random.sample(list(all_data.items()), 1)

for key, value in random_sample:
  print(f"{key}:\n {value}")

all_data dictionary length: 11130
parabola:
 {'text': 'A parabola is a symmetrical, U-shaped curve that is defined as the set of points equidistant from a fixed point called the focus and a fixed line known as the directrix. It is a conic section that can open upwards, downwards, left, or right, and is often described by a quadratic equation. Parabolas have applications in various fields, including physics, engineering, and computer graphics, particularly in projectile motion and the design of reflective surfaces.', 'text_index': {0: 'A', 1: 'parabola', 2: 'is', 3: 'a', 4: 'symmetrical', 5: ',', 6: 'U', 7: '-', 8: 'shaped', 9: 'curve', 10: 'that', 11: 'is', 12: 'defined', 13: 'as', 14: 'the', 15: 'set', 16: 'of', 17: 'points', 18: 'equidistant', 19: 'from', 20: 'a', 21: 'fixed', 22: 'point', 23: 'called', 24: 'the', 25: 'focus', 26: 'and', 27: 'a', 28: 'fixed', 29: 'line', 30: 'known', 31: 'as', 32: 'the', 33: 'directrix', 34: '.', 35: 'It', 36: 'is', 37: 'a', 38: 'conic', 39: 'section

In [41]:
# Line up the per-token fields of the sampled entry as columns, then transpose for display
for_df = ['tokens', 'input_ids', 'baseline_tags', 'ner_tags', 'attention_mask']
sampled_entry = all_data[random_sample[0][0]]
my_list = [pd.DataFrame(sampled_entry[field], columns=[field]) for field in for_df]

df = pd.concat(my_list, axis=1)
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102
tokens,[CLS],A,para,##bol,##a,is,a,symmetrical,",",U,-,shaped,curve,that,is,defined,as,the,set,of,points,e,##quid,##istan,##t,from,a,fixed,point,called,the,focus,and,a,fixed,line,known,as,the,direct,##rix,.,It,is,a,con,##ic,section,that,can,open,upwards,",",downward,##s,",",left,",",or,right,",",and,is,often,described,by,a,q,##uad,##ratic,equation,.,Para,##bol,##as,have,applications,in,various,fields,",",including,physics,",",engineering,",",and,computer,graphics,",",particularly,in,project,##ile,motion,and,the,design,of,reflective,surfaces,.,[SEP]
input_ids,101,138,18311,15792,1161,1110,170,26795,117,158,118,4283,7660,1115,1110,3393,1112,1103,1383,1104,1827,174,24235,20300,1204,1121,170,4275,1553,1270,1103,2817,1105,170,4275,1413,1227,1112,1103,2904,14799,119,1135,1110,170,14255,1596,2237,1115,1169,1501,15726,117,14833,1116,117,1286,117,1137,1268,117,1105,1110,1510,1758,1118,170,186,18413,21961,8381,119,23994,15792,2225,1138,4683,1107,1672,3872,117,1259,7094,117,3752,117,1105,2775,9043,117,2521,1107,1933,4759,4018,1105,1103,1902,1104,24449,9902,119,102
baseline_tags,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O
ner_tags,O,O,B-ALGEBRA,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-GEOMETRY,B-NUMBER-THEORY,O,O,O,O,O,B-CALCULUS-AND-ANALYSIS,I-CALCULUS-AND-ANALYSIS,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-TOPOLOGY,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-ALGEBRA,O,O,I-ALGEBRA,O,O,O,O,O,O,O,O,O,O,O,O,O,B-APPLIED-MATHEMATICS,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-GEOMETRY,O,O
attention_mask,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
