<a href="https://colab.research.google.com/github/HeatherDriver/MathGraph/blob/main/05_NER_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import copy
import pickle
import random
import re
import time

import pandas as pd
import torch
import torch.nn as nn
from google.colab import drive, userdata
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoConfig, DistilBertForTokenClassification, DistilBertModel, DistilBertConfig, DistilBertPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput

In [2]:
# Mount Google Drive into the Colab runtime, then make the pickle directory the
# working directory so the relative .pkl file names used below resolve against it.
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [3]:
# Lift pandas' display limits so the wide token/tag frames shown below are not truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(display_option, None)

In [4]:
# Helper for loading a pickled object from disk
def read_pickle(dict_file):
  """Load and return the object stored in the pickle file ``dict_file``.

  The file is opened in binary mode and is closed again before returning.
  Unpickling can execute arbitrary code, so only use this on trusted files.
  """
  with open(dict_file, mode='rb') as handle:
    loaded = pickle.load(handle)
  return loaded

In [56]:
# Read in dictionary
all_data = read_pickle('all_data_matches.pkl')
# Deep copy rather than dict.copy(): the values are nested dicts holding lists
# (e.g. 'ner_tags'), and a shallow copy would share them, so correcting a tag
# through all_data_amended would silently change all_data as well.
all_data_amended = copy.deepcopy(all_data)

In [57]:
# Distinct NER tags in first-seen order. dict.fromkeys keeps insertion order and
# de-duplicates in one pass instead of scanning tag_list for every tag.
tag_list = list(dict.fromkeys(
    tag
    for sub_dict in all_data.values()
    for tag in sub_dict['ner_tags']
))

# Positional id <-> tag lookups.
index2tag = dict(enumerate(tag_list))
tag2index = {tag: idx for idx, tag in index2tag.items()}

# 'O' is moved to id -100. Remove the positional id that 'O' actually received
# instead of assuming it is 0, so no other tag loses its mapping when 'O' is not
# the first tag seen (and nothing is deleted when 'O' is absent or data is empty).
# NOTE(review): -100 is the default ignore_index of nn.CrossEntropyLoss, so tokens
# labelled 'O' would be skipped by that loss - confirm this is intended.
if 'O' in tag2index:
  del index2tag[tag2index['O']]
index2tag[-100] = 'O'

tag2index['O'] = -100

In [58]:
# Reduce every tag to its category name: 'O' is kept as-is, any other tag loses
# everything up to and including its first '-'. Categories are listed once each.
my_list = []
for tag in tag_list:
  category = tag if tag == 'O' else tag.partition('-')[2]
  if tag == 'O' or category not in my_list:
    my_list.append(category)

# Alphabetical, numbered printout of the categories
my_list.sort()
for i, entry in enumerate(my_list):
  print(f"{i}: {entry}")

0: ALGEBRA
1: APPLIED-MATHEMATICS
2: CALCULUS-AND-ANALYSIS
3: DETERMINANTS
4: DISCRETE-MATHEMATICS
5: FOUNDATIONS-OF-MATHEMATICS
6: GEOMETRY
7: INTEGER-MATRICES
8: LIE-ALGEBRA
9: LIE-GROUPS
10: LIE-THEORY
11: LINEAR-ALGEBRA
12: LINEAR-INDEPENDENCE
13: LINEAR-SYSTEMS-OF-EQUATIONS
14: LOC
15: MATRICES
16: MATRIX-DECOMPOSITION
17: MATRIX-EIGENVALUES
18: MATRIX-GROUPS
19: MATRIX-INVERSION
20: MATRIX-NORMS
21: MATRIX-OPERATIONS
22: MATRIX-PROPERTIES
23: MATRIX-TYPES
24: NUMBER-THEORY
25: O
26: ORG
27: PERMANENTS
28: PROBABILITY-AND-STATISTICS
29: RECREATIONAL-MATHEMATICS
30: TOPOLOGY


In [None]:
# 30 categories (excluding 'O'), with finer granularity for linear algebra topics, e.g. matrices and matrix operations

In [75]:
# Draw three random (key, entry) pairs and preview the first one
random_sample = random.sample(list(all_data_amended.items()), 3)
key, sub_dict = random_sample[0]

# One row per per-token field, in the same order as the index labels below
row_fields = ('tokens', 'baseline_tags', 'input_ids', 'ner_tags')
my_list = [sub_dict[field] for field in row_fields]

print(f"Key: {key}")
print(f"Text: {sub_dict['text']}")

pd.DataFrame(my_list, index=["Tokens", "Baseline_Tags", "Input_ids", "NER_Tags"])

Key: point-quadratic distance
Text: Point-quadratic distance refers to a measure of how far a point is from a given quadratic curve or surface. It is calculated by determining the shortest distance between the point and the curve, often involving finding the minimum of a quadratic function that represents the distance. This concept is useful in fields such as optimization, computer graphics, and machine learning for tasks like shape recognition and fitting models to data.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85
Tokens,[CLS],Point,-,q,##uad,##ratic,distance,refers,to,a,measure,of,how,far,a,point,is,from,a,given,q,##uad,##ratic,curve,or,surface,.,It,is,calculated,by,determining,the,shortest,distance,between,the,point,and,the,curve,",",often,involving,finding,the,minimum,of,a,q,##uad,##ratic,function,that,represents,the,distance,.,This,concept,is,useful,in,fields,such,as,optimization,",",computer,graphics,",",and,machine,learning,for,tasks,like,shape,recognition,and,fitting,models,to,data,.,[SEP]
Baseline_Tags,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O
Input_ids,101,4221,118,186,18413,21961,2462,4431,1106,170,4929,1104,1293,1677,170,1553,1110,1121,170,1549,186,18413,21961,7660,1137,2473,119,1135,1110,10056,1118,13170,1103,22710,2462,1206,1103,1553,1105,1103,7660,117,1510,5336,4006,1103,5867,1104,170,186,18413,21961,3053,1115,5149,1103,2462,119,1188,3400,1110,5616,1107,3872,1216,1112,25161,117,2775,9043,117,1105,3395,3776,1111,8249,1176,3571,4453,1105,11732,3584,1106,2233,119,102
NER_Tags,O,B-GEOMETRY,I-GEOMETRY,I-GEOMETRY,O,O,I-GEOMETRY,O,O,O,B-CALCULUS-AND-ANALYSIS,O,O,O,O,B-GEOMETRY,O,O,O,O,B-ALGEBRA,O,O,I-ALGEBRA,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-CALCULUS-AND-ANALYSIS,O,O,O,O,O,O,B-DISCRETE-MATHEMATICS,O,O,O,O,O,O,B-APPLIED-MATHEMATICS,O,O,O,O,O,B-DISCRETE-MATHEMATICS,O,O,O,O,O,O,O,O,O,O,O,O,O


In [73]:
# Finetune definitions
# Load the stored definitions; the relative name resolves against the current
# working directory.
dict_file_name = 'train_definitions.pkl'
train_definitions = read_pickle(dict_file_name)

# Re-bind key / sub_dict to the first entry of the sample drawn earlier.
key, sub_dict = random_sample[0]
# Manual-labelling scratch lines, currently disabled: the first overwrites a
# single tag in all_data_amended, the second adds the sampled entry to
# train_definitions.
# all_data_amended["Skolem-Mahler-Lech Theorem"]['ner_tags'][31] = 'B-DISCRETE-MATHEMATICS'

# train_definitions.update({key: all_data_amended[key]})

# Write train_definitions back to the same file. With the update line above
# disabled, this re-saves the object that was just loaded.
dict_file_name = 'train_definitions.pkl'
with open(dict_file_name, 'wb') as file:
  pickle.dump(train_definitions, file)

# Train definitions has the hand-labelled definitions
# Reload from disk and show which definitions are stored.
train_definitions = read_pickle(dict_file_name)
train_definitions.keys()

In [90]:
# Tally how often each category occurs in a list of NER tags
def returns_count_per_class(ner_tag_list):
  """Count tag occurrences per category.

  'O' is counted under 'O'. Every other tag is counted under the text after
  its first '-', so 'B-X' and 'I-X' both count towards 'X'.

  Returns a dict of category -> count, ordered from most to least frequent;
  categories with equal counts keep the order in which they first appeared.
  """
  counts = {}
  for raw_tag in ner_tag_list:
    label = raw_tag if raw_tag == 'O' else raw_tag.partition('-')[2]
    counts[label] = counts.get(label, 0) + 1
  # Ascending sort on the negated count == stable descending sort on the count
  ranked = sorted(counts.items(), key=lambda pair: -pair[1])
  return dict(ranked)

In [91]:
# Print the per-category tag counts of every entry in train_definitions
for key in train_definitions:
  sub_dict = train_definitions[key]
  count = returns_count_per_class(sub_dict['ner_tags'])
  print(f"{key}: {count}")

Simpson's Paradox: {'O': 73, 'PROBABILITY-AND-STATISTICS': 8, 'FOUNDATIONS-OF-MATHEMATICS': 6, 'ALGEBRA': 1}
Gram-Schmidt Process: {'O': 90, 'CALCULUS-AND-ANALYSIS': 16, 'LINEAR-ALGEBRA': 5, 'TOPOLOGY': 2, 'RECREATIONAL-MATHEMATICS': 1, 'NUMBER-THEORY': 1}
Cylindrical Parts: {'O': 242, 'ALGEBRA': 7, 'NUMBER-THEORY': 6, 'CALCULUS-AND-ANALYSIS': 2, 'DISCRETE-MATHEMATICS': 1}
voter model: {'O': 60, 'APPLIED-MATHEMATICS': 2, 'PROBABILITY-AND-STATISTICS': 1}
Skolem-Mahler-Lech Theorem: {'O': 70, 'NUMBER-THEORY': 8, 'DISCRETE-MATHEMATICS': 7, 'FOUNDATIONS-OF-MATHEMATICS': 3, 'CALCULUS-AND-ANALYSIS': 1, 'APPLIED-MATHEMATICS': 1, 'RECREATIONAL-MATHEMATICS': 1}


In [36]:
# Fine-tuning DistilBERT
# Checkpoint id used for both the tokenizer and the token-classification model.
tag_checkpoint = "dslim/distilbert-NER"
# do_lower_case=False asks the tokenizer not to lowercase the input text.
tag_tokenizer = AutoTokenizer.from_pretrained(tag_checkpoint, do_lower_case=False)
tag_model = AutoModelForTokenClassification.from_pretrained(tag_checkpoint)

# "ner" pipeline built from the checkpoint's model and tokenizer loaded above.
tags = pipeline("ner", model=tag_model, tokenizer=tag_tokenizer)

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [None]:
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
  """DistilBERT encoder with dropout and a linear per-token classification head.

  The head maps each token's hidden state to ``config.num_labels`` logits.
  Note: this definition shadows the class of the same name imported from
  ``transformers`` at the top of the notebook.
  """

  def __init__(self, config):
    """Build the encoder and the classification head from ``config``.

    Reads ``config.num_labels``, ``config.dropout`` and ``config.hidden_size``.
    """
    super().__init__(config)
    self.num_labels = config.num_labels

    # Model body
    self.distilbert = DistilBertModel(config)

    # Classification head
    self.dropout = nn.Dropout(config.dropout)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # Initialise weights
    self.init_weights()

  def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
    """Run the encoder and the head; compute the loss when ``labels`` is given.

    Defined as ``forward`` so that calling the model directly
    (``model(**inputs)``), which is how ``nn.Module`` dispatches, works.

    Args:
      input_ids: token ids passed to the encoder.
      attention_mask: attention mask passed to the encoder.
      labels: optional per-token class ids. When given, the cross-entropy loss
        over all tokens is computed (``nn.CrossEntropyLoss`` skips targets equal
        to its default ``ignore_index`` of -100).
      **kwargs: forwarded unchanged to the encoder.

    Returns:
      TokenClassifierOutput with ``loss`` (None without labels), ``logits`` and
      the encoder's ``hidden_states`` / ``attentions``.
    """
    encoded_outputs = self.distilbert(input_ids, attention_mask=attention_mask, **kwargs)
    # First element of the encoder output is the per-token hidden state
    sequence_output = self.dropout(encoded_outputs[0])
    logits = self.classifier(sequence_output)

    # Loss calc: flatten the leading dims of logits and labels to one token axis
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    return TokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=encoded_outputs.hidden_states,
        attentions=encoded_outputs.attentions,
    )

  def forward_pass(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
    """Backward-compatible alias for :meth:`forward` (same arguments and result)."""
    return self.forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)

In [None]:
# Start from the checkpoint's config and swap in the custom label set
label_count = len(index2tag)
config = AutoConfig.from_pretrained(
    tag_checkpoint,
    num_labels=label_count,
    label2id=tag2index,
    id2label=index2tag,
)
# Re-apply the label settings on the loaded config object, in the same order as before
for attr_name, attr_value in (
    ("label2id", tag2index),
    ("id2label", index2tag),
    ("num_labels", label_count),
):
  setattr(config, attr_name, attr_value)

# Build the custom DistilBERT token classifier from the adjusted config
tag_model_custom = DistilBertForTokenClassification(config)

In [None]:
# Load the pretrained state_dict
pretrained_model = AutoModelForTokenClassification.from_pretrained(tag_checkpoint)
load_result = tag_model_custom.distilbert.load_state_dict(pretrained_model.distilbert.state_dict(), strict=False)
# strict=False does not raise on key mismatches, so report them instead of
# silently continuing with a partially loaded encoder.
if load_result.missing_keys or load_result.unexpected_keys:
  print(f"Encoder weights not fully loaded - missing: {load_result.missing_keys}, unexpected: {load_result.unexpected_keys}")

# Fresh classification head sized for the custom label set. Only this new layer
# is initialised: calling init_weights() on the whole model here can
# re-randomise the encoder weights that were just loaded (depending on the
# transformers version), discarding the pretrained checkpoint.
tag_model_custom.classifier = nn.Linear(config.hidden_size, config.num_labels)
tag_model_custom._init_weights(tag_model_custom.classifier)

tag_model_custom.to(device)

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
   

In [None]:
inputs = tag_tokenizer("A Banach space is a complete normed vector space, meaning it is a vector space equipped with a norm that allows for the measurement of vector lengths and satisfies certain conditions for completeness.", return_tensors="pt")

# Inference only: eval() switches dropout off so the output is deterministic
# (the model stays in eval mode afterwards), and no_grad() skips autograd tracking.
tag_model_custom.eval()
with torch.no_grad():
  # The tokenizer returns CPU tensors; copy them to wherever the model lives so
  # this also runs when the model has been moved to the GPU. `inputs` itself is
  # left on the CPU.
  model_inputs = {name: tensor.to(tag_model_custom.device) for name, tensor in inputs.items()}
  # Logits are brought back to the CPU so they can be converted to numpy later.
  outputs = tag_model_custom.forward_pass(**model_inputs).logits.cpu()
predictions = torch.argmax(outputs, dim=-1)

print(f"Shape of outputs: {outputs.shape}")

Shape of outputs: torch.Size([1, 42, 426])


In [None]:
# Move to CPU once, consistently, before converting to numpy (no-op on CPU tensors)
input_id_array = inputs['input_ids'][0].cpu().numpy()
prediction_ids = predictions[0].cpu().numpy()

# index2tag has no entry for the class id that was vacated when 'O' was remapped
# to -100, but argmax can still return that id; fall back to 'O' instead of
# raising KeyError. NOTE(review): assumes the vacated id is the 'O' class.
preds = [index2tag.get(int(p), 'O') for p in prediction_ids]
tokens = tag_tokenizer.convert_ids_to_tokens(input_id_array)
pd.DataFrame([input_id_array, tokens, prediction_ids, preds], index=["Input_ids", "Tokens", "Baseline_ID","Baseline_Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41
Input_ids,101,138,18393,7291,2000,1110,170,2335,18570,1174,9479,2000,117,2764,1122,1110,170,9479,2000,5440,1114,170,18570,1115,3643,1111,1103,11842,1104,9479,10707,1105,2068,1548,16847,2218,2975,1111,2335,1757,119,102
Tokens,[CLS],A,Ban,##ach,space,is,a,complete,norm,##ed,vector,space,",",meaning,it,is,a,vector,space,equipped,with,a,norm,that,allows,for,the,measurement,of,vector,lengths,and,sat,##is,##fies,certain,conditions,for,complete,##ness,.,[SEP]
Baseline_ID,8,8,328,274,336,336,336,8,373,336,178,336,336,175,336,336,336,336,336,398,336,336,336,336,165,336,336,215,336,336,336,336,336,336,336,336,336,336,336,178,336,336
Baseline_Tags,B_ROUNDING,B_ROUNDING,B_MATRIX_DECOMPOSITION,B_TRANSCENDENTAL_NUMBERS,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,B_ROUNDING,B_RATE_PROBLEMS,I_BINARY_SEQUENCES,I_COMPLEX_ANALYSIS,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,B_ALGEBRAIC_EQUATIONS,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,B_PROJECTIVE_GEOMETRY,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,B_TOPOLOGICAL_OPERATIONS,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,B_REGRESSION,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES,I_COMPLEX_ANALYSIS,I_BINARY_SEQUENCES,I_BINARY_SEQUENCES


In [None]:
# %cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph'
# tag_model_custom.save_pretrained("math_ner_model")
# tag_tokenizer.save_pretrained("math_ner_model")