In [None]:
# Setup
! pip install seqeval evaluate
! pip install kaleido

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess 

In [None]:
# Library imports
import copy
import pickle
import pprint
import random
import re
import time

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoConfig, DistilBertForTokenClassification, DistilBertModel, DistilBertConfig, DistilBertPreTrainedModel
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.tokenization_utils_base import BatchEncoding
from datasets import Dataset, DatasetDict
import torch
import torch.nn as nn
from google.colab import drive, userdata
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.express as px
from seqeval.metrics import classification_report
import evaluate
import kaleido

In [None]:
# Mount drive
# Mounts Google Drive at /content/drive, then makes the pickle folder the working
# directory so that the bare file names passed to read_pickle resolve against it.
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [None]:
# Lift the pandas display caps so that every column and every row is rendered
for display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(display_option, None)

In [None]:
# Load the seqeval metric through the `evaluate` library (the load downloads its builder script)
metric = evaluate.load('seqeval')

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
# Define file read function
def read_pickle(dict_file):
  """Load and return the object pickled in the file at `dict_file`.

  The file is opened in binary mode and is closed again before the object is returned.
  NOTE(review): unpickling can execute arbitrary code, so only use this on trusted files.
  """
  with open(dict_file, 'rb') as handle:
    loaded = pickle.load(handle)
  return loaded

In [None]:
# Read in dictionary
all_data = read_pickle('all_data_matches.pkl')
# Deep copy on purpose: later cells rewrite the nested 'ner_tags' lists in place, and a
# shallow dict.copy() would share those sub-dicts and lists with the pristine `all_data`.
all_data_amended = copy.deepcopy(all_data)

In [None]:
# Draw three random entries and show the first one as a table with one row per field
random_sample = random.sample(list(all_data_amended.items()), 3)
key, sub_dict = random_sample[0]

# Row label -> field name inside the entry's sub-dictionary (order = row order of the table)
row_fields = {
  "Tokens": 'tokens',
  "Baseline_Tags": 'baseline_tags',
  "Input_ids": 'input_ids',
  "NER_Tags": 'ner_tags',
}
my_list = [sub_dict[field_name] for field_name in row_fields.values()]

print(f"Key: {key}")
print(f"Text: {sub_dict['text']}")

pd.DataFrame(my_list, index=list(row_fields.keys()))

Key: Postnikov System
Text: The Postnikov System is a concept in algebraic topology that organizes topological spaces into a series of stages based on their homotopy type. It decomposes a space into simplicial complexes, allowing for an analysis of its higher homotopy groups in a systematic way. This method facilitates the study of complex topological properties by breaking them down into more manageable components.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80
Tokens,[CLS],The,Post,##nikov,System,is,a,concept,in,algebraic,topology,that,organizes,top,##ological,spaces,into,a,series,of,stages,based,on,their,ho,##moto,##py,type,.,It,de,##com,##pose,##s,a,space,into,si,##mp,##lic,##ial,complexes,",",allowing,for,an,analysis,of,its,higher,ho,##moto,##py,groups,in,a,systematic,way,.,This,method,facilitate,##s,the,study,of,complex,top,##ological,properties,by,breaking,them,down,into,more,manage,##able,components,.,[SEP]
Baseline_Tags,O,O,B-MISC,B-MISC,I-MISC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O
Input_ids,101,1109,3799,28036,3910,1110,170,3400,1107,19669,22411,1115,24532,1499,7542,6966,1154,170,1326,1104,5251,1359,1113,1147,16358,12610,5005,2076,119,1135,1260,8178,14811,1116,170,2000,1154,27466,8223,8031,2916,16575,117,3525,1111,1126,3622,1104,1157,2299,16358,12610,5005,2114,1107,170,12818,1236,119,1188,3442,11000,1116,1103,2025,1104,2703,1499,7542,4625,1118,4440,1172,1205,1154,1167,5494,1895,5644,119,102
NER_Tags,IGN,O,B-TOPOLOGY,IGN,E-TOPOLOGY,O,O,S-DISCRETE-MATHEMATICS,O,B-TOPOLOGY,E-TOPOLOGY,O,O,O,O,S-TOPOLOGY,O,O,S-CALCULUS-AND-ANALYSIS,O,O,O,O,O,B-TOPOLOGY,IGN,IGN,E-TOPOLOGY,O,O,O,O,O,O,O,S-TOPOLOGY,O,O,O,O,O,O,O,O,O,O,S-CALCULUS-AND-ANALYSIS,O,O,O,O,O,O,S-ALGEBRA,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,IGN


In [None]:
# Finetune definitions
# dict_file_name = 'train_definitions.pkl'
# train_definitions = read_pickle(dict_file_name)

# key, sub_dict = random_sample[0]
# # all_data_amended["Kobon Triangle"]['ner_tags'][68] = 'O'
# # all_data_amended["Kobon Triangle"]['ner_tags'][69] = 'O'

# # train_definitions.update({key: all_data_amended[key]})

# dict_file_name = 'train_definitions.pkl'
# with open(dict_file_name, 'wb') as file:
#   pickle.dump(train_definitions, file)

# # Train definitions has the hand-labelled definitions
# train_definitions = read_pickle(dict_file_name)
# train_definitions.keys()

## Class distribution by main tag

In [None]:
# Function to count the tag definition per tag-list
def returns_count_per_class(ner_tag_list):
  """Count how often each class occurs in a list of NER tags.

  'O' and 'IGN' are counted under their own name. Every other tag is counted under
  the text that follows its first hyphen (e.g. 'B-TOPOLOGY' -> 'TOPOLOGY').

  Returns a dict of class -> count ordered from the most to the least frequent class;
  classes with equal counts keep the order in which they first appeared.
  """
  passthrough = ('O', 'IGN')
  tally = {}
  for raw_tag in ner_tag_list:
    # partition('-')[2] is everything after the first hyphen ('' when there is none)
    label = raw_tag if raw_tag in passthrough else raw_tag.partition('-')[2]
    tally[label] = tally.get(label, 0) + 1
  ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
  return dict(ranked)

In [None]:
# Function to summarise main tags
def returns_main_tags(all_data_amended):
  """Determine the main concept of every entry and summarise how often each one occurs.

  For each entry the classes in its 'ner_tags' are counted with
  `returns_count_per_class`; the most frequent class other than 'O'/'IGN' is taken
  as the entry's main concept.

  Args:
    all_data_amended: dict mapping an entry key to a sub-dictionary that has a
      'ner_tags' list.

  Returns:
    summary: DataFrame with a 'main_concept' and a 'count' column, sorted by
      'count' in descending order.
    df: DataFrame with the columns 'name' (entry key) and 'main_concept', in the
      iteration order of `all_data_amended`.
    no_tag_list: keys of the entries whose tags are all 'O'/'IGN'; these entries
      are left out of `df`.
  """
  name, count_list, no_tag_list = [], [], []

  for key, sub_dict in all_data_amended.items():
    count = returns_count_per_class(sub_dict['ner_tags'])
    # `count` is ordered most frequent first, so the first real class is the main one
    concepts = [k for k in count if k not in ['O', 'IGN']]
    if not concepts:
      # Explicit check instead of a catch-all `except` around an IndexError
      no_tag_list.append(key)
      continue
    count_list.append(concepts[0])
    name.append(key)

  for_df = {'name': name, 'main_concept': count_list}
  df = pd.DataFrame(for_df)
  summary = df[['main_concept']].groupby('main_concept').value_counts().reset_index(drop=False)
  summary = summary.sort_values(by="count", ascending=False).reset_index(drop=True)
  return summary, df, no_tag_list

In [None]:
summary, label_df, no_tag_list = returns_main_tags(all_data_amended)

# Drop the entries that have no real tag. pop() with a default is a no-op for a key that
# is already gone, so no catch-all `except` is needed to keep the loop going.
for tag in no_tag_list:
  all_data_amended.pop(tag, None)

assert len(all_data_amended) == label_df.shape[0], 'Error'

In [None]:
# Bar chart of the number of entries per primary label
fig = px.bar(
  summary,
  x='main_concept',
  y='count',
  hover_data=['main_concept', 'count'],
  text='count',
  color='count',
  height=500,
  color_continuous_scale='Viridis',
  title="Distribution by primary classification label",
)
fig.update_coloraxes(showscale=False)
fig.show()

### Merging labels together

In [None]:
# Condense the fine-grained matrix classifications into "MATRICES" and the
# Lie classifications into "LIE-ALGEBRA" (the B-/I-/E-/S- prefix of each tag is kept)

b_tags = ["B-MATRIX-DECOMPOSITION", "B-MATRIX-EIGENVALUES", "B-MATRIX-GROUPS", "B-MATRIX-INVERSION", "B-MATRIX-NORMS", "B-MATRIX-OPERATIONS", "B-MATRIX-PROPERTIES", "B-MATRIX-TYPES", "B-INTEGER-MATRICES"]
i_tags = ["I-MATRIX-DECOMPOSITION","I-MATRIX-EIGENVALUES","I-MATRIX-GROUPS","I-MATRIX-INVERSION","I-MATRIX-NORMS","I-MATRIX-OPERATIONS","I-MATRIX-PROPERTIES","I-MATRIX-TYPES", "I-INTEGER-MATRICES"]
e_tags = ["E-MATRIX-DECOMPOSITION","E-MATRIX-EIGENVALUES","E-MATRIX-GROUPS","E-MATRIX-INVERSION","E-MATRIX-NORMS","E-MATRIX-OPERATIONS","E-MATRIX-PROPERTIES","E-MATRIX-TYPES", "E-INTEGER-MATRICES"]
s_tags = ["S-MATRIX-DECOMPOSITION","S-MATRIX-EIGENVALUES","S-MATRIX-GROUPS","S-MATRIX-INVERSION","S-MATRIX-NORMS","S-MATRIX-OPERATIONS","S-MATRIX-PROPERTIES","S-MATRIX-TYPES", "S-INTEGER-MATRICES"]
lie_b_tags = ["B-LIE-GROUPS", "B-LIE-THEORY"]
lie_i_tags = ["I-LIE-GROUPS", "I-LIE-THEORY"]
lie_e_tags = ["E-LIE-GROUPS", "E-LIE-THEORY"]
lie_s_tags = ["S-LIE-GROUPS", "S-LIE-THEORY"]

# One old-tag -> merged-tag lookup built from the eight lists above
tag_merge_map = {
  old_tag: merged_tag
  for old_tags, merged_tag in [
    (b_tags, "B-MATRICES"),
    (i_tags, "I-MATRICES"),
    (e_tags, "E-MATRICES"),
    (s_tags, "S-MATRICES"),
    (lie_b_tags, "B-LIE-ALGEBRA"),
    (lie_i_tags, "I-LIE-ALGEBRA"),
    (lie_e_tags, "E-LIE-ALGEBRA"),
    (lie_s_tags, "S-LIE-ALGEBRA"),
  ]
  for old_tag in old_tags
}

for key, sub_dict in all_data_amended.items():
  ner_tags = sub_dict['ner_tags']
  # Rewrite every position directly instead of searching for the tag with list.index(),
  # which rescans the list for each hit. Slice assignment keeps the same list object,
  # so the update stays in place.
  ner_tags[:] = [tag_merge_map.get(ner_tag, ner_tag) for ner_tag in ner_tags]

# Recompute the main-concept summary and keep each row's position as an 'index' column
summary, label_df, no_tag_list = returns_main_tags(all_data_amended)
label_df = label_df.reset_index(drop=False)

fig = px.bar(
  summary,
  x='main_concept',
  y='count',
  hover_data=['main_concept', 'count'],
  text='count',
  color='count',
  height=500,
  color_continuous_scale='Viridis',
  title="Distribution by primary classification label",
)
fig.update_coloraxes(showscale=False)
fig.update_layout(
  autosize=False,
  width=1600,
  height=700,
  margin=dict(l=50, r=50, b=100, t=100, pad=4),
  font=dict(size=12),
)
# Save a static copy of the chart to Drive before displaying it
fig.write_image("/content/drive/MyDrive/Colab Notebooks/Math_Graph/Images/fig1.png")
fig.show()

In [None]:
# Check order of label_df same as all_data_amended (NB for indices for train-test split)
# Comparing the two sequences as a whole also fails on a length mismatch, which a
# per-key loop over all_data_amended would not notice when label_df has extra rows.
assert label_df['name'].tolist() == list(all_data_amended.keys()), 'Error'

label_df.head()

Unnamed: 0,index,name,main_concept
0,0,ludwig's inversion formula,CALCULUS-AND-ANALYSIS
1,1,quotient,NUMBER-THEORY
2,2,survivorship curve,APPLIED-MATHEMATICS
3,3,inadmissible,DISCRETE-MATHEMATICS
4,4,Natural Logarithm of 2,DISCRETE-MATHEMATICS


## Train-test stratified split

In [None]:
# First split: hold out 30% of the entries as the test set, stratified on the main concept
X_train_indices, X_test_indices, y_train_indices, y_test_indices = train_test_split(
  label_df['index'].to_numpy(),
  label_df['main_concept'].to_numpy(),
  test_size=0.3,
  random_state=42,
  stratify=label_df['main_concept'].to_numpy(),
)

# Second split: carve a validation set (30%) out of the remaining train entries
X_train_indices, X_valid_indices, y_train_indices, y_valid_indices = train_test_split(
  X_train_indices,
  y_train_indices,
  test_size=0.3,
  random_state=42,
  stratify=y_train_indices,
)

In [None]:
# Check the resulting distributions: print the relative class frequencies of each set
distributions = [
  ("Class distribution in original dataset:", label_df['main_concept']),
  ("\nClass distribution in train dataset:", pd.Series(y_train_indices)),
  ("\nClass distribution in validation dataset:", pd.Series(y_valid_indices)),
]
for heading, labels in distributions:
  print(heading)
  print(labels.value_counts(normalize=True))

Class distribution in original dataset:
main_concept
CALCULUS-AND-ANALYSIS          0.215153
NUMBER-THEORY                  0.145139
TOPOLOGY                       0.108790
ALGEBRA                        0.105596
GEOMETRY                       0.091159
DISCRETE-MATHEMATICS           0.087070
PROBABILITY-AND-STATISTICS     0.078510
APPLIED-MATHEMATICS            0.041204
MATRICES                       0.040437
RECREATIONAL-MATHEMATICS       0.035901
FOUNDATIONS-OF-MATHEMATICS     0.032196
LINEAR-ALGEBRA                 0.004855
LIE-ALGEBRA                    0.004088
ORG                            0.003322
DETERMINANTS                   0.003066
LOC                            0.002236
LINEAR-SYSTEMS-OF-EQUATIONS    0.001278
Name: proportion, dtype: float64

Class distribution in train dataset:
CALCULUS-AND-ANALYSIS          0.215152
NUMBER-THEORY                  0.145130
TOPOLOGY                       0.108750
ALGEBRA                        0.105620
GEOMETRY                       0.091