In [None]:
# Library imports
from google.colab import drive, userdata
import pickle
import random
import re
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.express as px
import evaluate
import pprint

In [None]:
# Mount Google Drive and make the pickle folder the working directory.
# Later cells pass bare file names to read_pickle / write_pickle, so those
# names resolve against the directory selected by %cd below.
# NOTE(review): this path is tied to one particular Drive layout — adjust it
# when running from another account.
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [None]:
# Remove pandas' column and row display limits so frames render in full
for display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(display_option, None)

In [None]:
# Define file read function
def read_pickle(dict_file):
  """
  Load and return the object stored in a pickle file.

  dict_file: path of the pickle file to open (binary read mode).

  Note: unpickling can execute arbitrary code, so only use this on files
  from a trusted source.
  """
  with open(dict_file, 'rb') as handle:
    loaded = pickle.load(handle)
  return loaded

# Define file write function
def write_pickle(dict_file, dict_file_name):
  """
  Pickle an object to '<dict_file_name>.pkl'.

  dict_file: the object to serialise.
  dict_file_name: target path WITHOUT extension; '.pkl' is appended here.
  """
  target_path = dict_file_name + '.pkl'
  with open(target_path, 'wb') as handle:
    pickle.dump(dict_file, handle)

In [None]:
# Read in dictionary
# The bare file name resolves against the current working directory.
# NOTE: read_pickle calls pickle.load, which can execute arbitrary code —
# only load pickle files from a trusted source.
all_data_amended = read_pickle('all_data_amended.pkl')

In [None]:
# Topic names whose presence (as a substring of any NER tag) selects an entry
lin_alg_topics = ['DETERMINANTS', 'LINEAR-ALGEBRA', 'LINEAR-INDEPENDENCE', 'LINEAR-SYSTEMS-OF-EQUATIONS', 'MATRICES', 'PERMANENTS']

# Keep every entry that carries at least one tag mentioning one of the topics;
# entries keep the order they have in all_data_amended.
lin_alg = {
  key: sub_dict
  for key, sub_dict in all_data_amended.items()
  if any(topic in tag for topic in lin_alg_topics for tag in sub_dict['ner_tags'])
}

In [None]:
# Number of entries selected for the linear-algebra subset
len(lin_alg)

1583

In [None]:
# Inspect a single entry: tokens, baseline tags, input ids and NER tags as rows
key = "Perron's theorem"
sub_dict = lin_alg[key]
my_list = [sub_dict[field] for field in ('tokens', 'baseline_tags', 'input_ids', 'ner_tags')]

print(f"Key: {key}")
print(f"Text: {sub_dict['text']}")

pd.DataFrame(my_list, index=["Tokens", "Baseline_Tags", "Input_ids", "NER_Tags"])

Key: Perron's theorem
Text: Perron's theorem pertains to the eigenvalues of non-negative matrices and states that under certain conditions, the largest eigenvalue (Perron-Frobenius eigenvalue) is real and positive. It also guarantees the existence of a corresponding positive eigenvector, which reflects the behavior of the system modeled by the matrix. This theorem is particularly useful in various fields, including economics and population dynamics, where growth processes are represented.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
Tokens,[CLS],Per,##ron,',s,theorem,per,##tains,to,the,e,##ige,##n,##val,##ues,of,non,-,negative,matrices,and,states,that,under,certain,conditions,",",the,largest,e,##ige,##n,##val,##ue,(,Per,##ron,-,Fr,##obe,##nius,e,##ige,##n,##val,##ue,),is,real,and,positive,.,It,also,guarantees,the,existence,of,a,corresponding,positive,e,##ige,##n,##ve,##ctor,",",which,reflects,the,behavior,of,the,system,modeled,by,the,matrix,.,This,theorem,is,particularly,useful,in,various,fields,",",including,economics,and,population,dynamics,",",where,growth,processes,are,represented,.,[SEP]
Baseline_Tags,O,B-PER,B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-MISC,B-MISC,B-MISC,B-MISC,B-MISC,B-MISC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O
Input_ids,101,14286,3484,112,188,10384,1679,22748,1106,1103,174,13417,1179,7501,10589,1104,1664,118,4366,24350,1105,2231,1115,1223,2218,2975,117,1103,2026,174,13417,1179,7501,4175,113,14286,3484,118,13359,21367,19206,174,13417,1179,7501,4175,114,1110,1842,1105,3112,119,1135,1145,24512,1103,3796,1104,170,7671,3112,174,13417,1179,2707,9363,117,1134,11363,1103,4658,1104,1103,1449,15320,1118,1103,8952,119,1188,10384,1110,2521,5616,1107,1672,3872,117,1259,8142,1105,1416,14189,117,1187,3213,5669,1132,2533,119,102
NER_Tags,IGN,B-MATRICES,IGN,I-MATRICES,I-MATRICES,E-MATRICES,O,O,O,O,S-NUMBER-THEORY,O,O,O,O,O,O,O,O,S-ALGEBRA,O,O,O,O,O,O,O,O,O,S-MATRICES,IGN,IGN,IGN,IGN,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,S-MATRICES,IGN,IGN,IGN,IGN,O,O,O,O,O,O,O,O,O,O,O,S-MATRICES,O,O,O,O,O,O,O,O,O,O,O,S-APPLIED-MATHEMATICS,O,B-APPLIED-MATHEMATICS,E-APPLIED-MATHEMATICS,O,O,S-APPLIED-MATHEMATICS,O,O,O,O,IGN


In [None]:
# Hand-picked corrections: (token, current tag) -> replacement tag.
# Each rule fires only when both the token and its current tag match exactly.
tag_corrections = {
  ('sums', 'S-ALGEBRA'): 'O',
  ('zero', 'S-NUMBER-THEORY'): 'O',
  ('least', 'I-RECREATIONAL-MATHEMATICS'): 'O',
  ('one', 'E-RECREATIONAL-MATHEMATICS'): 'O',
  ('concept', 'S-DISCRETE-MATHEMATICS'): 'O',
  ('forms', 'S-ALGEBRA'): 'O',
  ('points', 'S-GEOMETRY'): 'O',
  ('physics', 'O'): 'S-APPLIED-MATHEMATICS',
  ('minor', 'S-DETERMINANTS'): 'O',
}

for key, sub_dict in lin_alg.items():
  ner_tag_list = sub_dict['ner_tags']
  # The tag list is edited in place, position by position
  for i, token_and_tag in enumerate(zip(sub_dict['tokens'], ner_tag_list)):
    if token_and_tag in tag_corrections:
      ner_tag_list[i] = tag_corrections[token_and_tag]

In [None]:
def replaces_in_lists(input_id_list, tag_list, sub_dict):
  """
    Overwrites entries in sub_dict['ner_tags'] wherever input_id_list appears as a
    contiguous subsequence of sub_dict['input_ids']. Every non-overlapping occurrence
    is replaced, scanning left to right.

    input_id_list: token ids to search for.
    tag_list: replacement tags, one per id in input_id_list.
    sub_dict: dict with 'input_ids' and 'ner_tags' lists; 'ner_tags' is modified in place.

    Returns the same sub_dict object.
    Raises ValueError if tag_list and input_id_list differ in length, because the
    slice assignment below would then change the length of 'ner_tags' and shift
    every later tag out of alignment with its input id.
  """
  input_ids = sub_dict['input_ids']
  ner_tags = sub_dict['ner_tags']
  input_len = len(input_id_list)

  if len(tag_list) != input_len:
    raise ValueError(
      f"tag_list has {len(tag_list)} entries but input_id_list has {input_len}; "
      "they must be the same length"
    )
  if input_len == 0:
    # An empty pattern matches at every position without advancing the cursor,
    # which would loop forever; there is nothing to replace.
    return sub_dict

  i = 0
  while i <= len(input_ids) - input_len:
    if input_ids[i:i+input_len] == input_id_list:
      # Found a perfect match, replace corresponding ner_tags
      ner_tags[i:i+input_len] = tag_list
      i += input_len  # Move past this match to avoid overlapping replacements
    else:
      i += 1  # Continue searching

  return sub_dict  # Return updated dictionary

In [None]:
# Token-id patterns and the NER tags that should overwrite them.
# Each pair is (ids to search for, tags to write), position by position.

eigenvector_input, eigenvector_tags = (
  [174, 13417, 1179, 2707, 22711],
  ['S-MATRICES', 'IGN', 'IGN', 'IGN', 'IGN'],
)
# e ##ige ##n ##val ##ues
eigenvalue_input, eigenvalue_tags = (
  [174, 13417, 1179, 7501, 10589],
  ['S-MATRICES', 'IGN', 'IGN', 'IGN', 'IGN'],
)
hermitian_input, hermitian_tags = (
  [1430, 9084, 1811, 24350],
  ['B-MATRICES', 'IGN', 'IGN', 'E-MATRICES'],
)
normal_input, normal_tags = (
  [14508, 24350],
  ['B-MATRICES', 'E-MATRICES'],
)
alg_input, alg_tags = (
  [19669, 4226],
  ['B-CALCULUS-AND-ANALYSIS', 'E-CALCULUS-AND-ANALYSIS'],
)
alg_input_i, alg_tags_i = (
  [19669, 11838],
  ['B-ALGEBRA', 'E-ALGEBRA'],
)
scalar_input, scalar_tags = (
  [188, 7867, 1813],
  ['S-ALGEBRA', 'IGN', 'IGN'],
)
# e ##ige ##n ##ve ##ctor
eigenvector_input_i, eigenvector_tags_i = (
  [174, 13417, 1179, 2707, 9363],
  ['S-MATRICES', 'IGN', 'IGN', 'IGN', 'IGN'],
)
linear_trans_input, linear_trans_tags = (
  [7378, 26139],
  ['B-LINEAR-ALGEBRA', 'E-LINEAR-ALGEBRA'],
)
systems_lin_eq_input, systems_lin_eq_tags = (
  [2344, 1104, 7378, 11838],
  ['B-LINEAR-SYSTEMS-OF-EQUATIONS', 'I-LINEAR-SYSTEMS-OF-EQUATIONS', 'I-LINEAR-SYSTEMS-OF-EQUATIONS', 'E-LINEAR-SYSTEMS-OF-EQUATIONS'],
)
theorem_input, theorem_tags = (
  [9988, 10384],
  ['B-FOUNDATIONS-OF-MATHEMATICS', 'E-FOUNDATIONS-OF-MATHEMATICS'],
)
determin_input, determin_tags = (
  [1260, 2083, 14503, 5240],
  ['S-DETERMINANTS', 'IGN', 'IGN', 'IGN'],
)
lin_comb_input, lin_comb_tags = (
  [7378, 16058],
  ['B-LINEAR-ALGEBRA', 'E-LINEAR-ALGEBRA'],
)
vector_input, vector_tags = (
  [9479, 6966],
  ['B-TOPOLOGY', 'E-TOPOLOGY'],
)
multi_d_input, multi_d_tags = (
  [4321, 3309, 2354, 24533, 6966],
  ['B-LINEAR-ALGEBRA', 'IGN', 'IGN', 'IGN', 'E-LINEAR-ALGEBRA'],
)
stat_sig_input, stat_sig_tags = (
  [11435, 1193, 2418],
  ['B-PROBABILITY-AND-STATISTICS', 'IGN', 'E-PROBABILITY-AND-STATISTICS'],
)
# Per ##ron - Fr ##obe ##nius
# NOTE(review): the standalone '-' token (id 118) is tagged IGN here — confirm
# that is intended rather than an I-MATRICES continuation tag.
per_frob_input, per_frob_tags = (
  [14286, 3484, 118, 13359, 21367, 19206],
  ['B-MATRICES', 'IGN', 'IGN', 'E-MATRICES', 'IGN', 'IGN'],
)
# e ##ige ##n ##val ##ue
eigenvalue_input_ii, eigenvalue_tags_ii = (
  [174, 13417, 1179, 7501, 4175],
  ['S-MATRICES', 'IGN', 'IGN', 'IGN', 'IGN'],
)

In [None]:
# (input_id_list, tag_list) pairs, applied in exactly this order
replacement_lists = [
  # eigen-terms and matrix names
  (eigenvector_input, eigenvector_tags), (eigenvalue_input, eigenvalue_tags),
  (hermitian_input, hermitian_tags), (normal_input, normal_tags),
  # algebra / scalar terms
  (alg_input, alg_tags), (alg_input_i, alg_tags_i),
  (scalar_input, scalar_tags), (eigenvector_input_i, eigenvector_tags_i),
  # multi-token phrases
  (linear_trans_input, linear_trans_tags), (systems_lin_eq_input, systems_lin_eq_tags),
  (theorem_input, theorem_tags), (determin_input, determin_tags),
  (lin_comb_input, lin_comb_tags), (vector_input, vector_tags),
  (multi_d_input, multi_d_tags), (stat_sig_input, stat_sig_tags),
  (per_frob_input, per_frob_tags), (eigenvalue_input_ii, eigenvalue_tags_ii),
]

In [None]:
# Run every (token-id pattern, tags) replacement over each lin_alg entry,
# in the order given by replacement_lists
for key in lin_alg:
  sub_dict = lin_alg[key]
  for input_id_list, tag_list in replacement_lists:
    sub_dict = replaces_in_lists(input_id_list, tag_list, sub_dict)
  # Store the result back under the same key
  lin_alg[key] = sub_dict

## Train-test split

In [None]:
# Function to count the tag definition per tag-list
def returns_count_per_class(ner_tag_list):
  """
  Count how often each tag class occurs in a tag list.

  'O' and 'IGN' are counted as they are. Any other tag has everything up to
  and including its first '-' removed before counting (e.g. 'B-MATRICES' and
  'E-MATRICES' both count towards 'MATRICES').

  Returns a dict of class -> count, ordered from most to least frequent;
  classes with equal counts keep their first-seen order.
  """
  tally = {}
  for raw_tag in ner_tag_list:
    if raw_tag in ['O', 'IGN']:
      label = raw_tag
    else:
      # Text after the first '-' ('' when the tag has no '-')
      label = raw_tag.partition('-')[2]
    tally[label] = tally.get(label, 0) + 1
  ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
  return dict(ranked)

In [None]:
# Function to summarise main tags
def returns_main_tags(all_data_amended):
  """
  Determine the main concept of every entry and summarise the result.

  For each entry, the tag classes in sub_dict['ner_tags'] are counted with
  returns_count_per_class; the main concept is the first class in that
  ranking that is neither 'O' nor 'IGN'.

  all_data_amended: dict of key -> sub_dict, each sub_dict holding 'ner_tags'.

  Returns a tuple (summary, df, no_tag_list):
    summary: DataFrame with columns 'main_concept' and 'count', sorted by
             'count' in descending order.
    df: DataFrame with columns 'name' and 'main_concept', one row per entry
        that has a main concept, in the iteration order of the input dict.
    no_tag_list: keys of the entries whose tags are all 'O' / 'IGN'.
  """
  name, count_list, no_tag_list = [], [], []

  for key, sub_dict in all_data_amended.items():
    count = returns_count_per_class(sub_dict['ner_tags'])

    # First ranked class that is a real concept; None when there is none
    main_concept = next((k for k in count if k not in ('O', 'IGN')), None)
    if main_concept is None:
      no_tag_list.append(key)
      continue

    name.append(key)
    count_list.append(main_concept)

  df = pd.DataFrame({'name': name, 'main_concept': count_list})

  # size() + reset_index(name='count') names the count column explicitly, so
  # the sort below does not depend on how a given pandas version labels it.
  summary = df.groupby('main_concept').size().reset_index(name='count')
  summary = summary.sort_values(by="count", ascending=False).reset_index(drop=True)
  return summary, df, no_tag_list

In [None]:
summary, label_df, no_tag_list = returns_main_tags(lin_alg)

# Remove entries without any main concept so lin_alg lines up row-for-row
# with label_df. pop(..., None) ignores keys that are already gone (e.g. when
# this cell is re-run) without hiding any other kind of error.
for tag in no_tag_list:
  lin_alg.pop(tag, None)

assert len(lin_alg) == label_df.shape[0], 'Error'

In [None]:
# label_df rows must follow the key order of lin_alg, because the positional
# index created below is what the train-test split hands back.
names_in_frame = label_df['name']
for i, key in enumerate(lin_alg):
  assert names_in_frame.iloc[i] == key, 'Error'

# Expose the row position as an explicit 'index' column
label_df = label_df.reset_index(drop=False)
label_df.head()

Unnamed: 0,index,name,main_concept
0,0,involutive banach algebra,CALCULUS-AND-ANALYSIS
1,1,reduced echelon form,MATRICES
2,2,projective general unitary group,DISCRETE-MATHEMATICS
3,3,Gauss-Jordan Elimination Method,MATRICES
4,4,Nonpositive Matrix,CALCULUS-AND-ANALYSIS


In [None]:
# Discard rows whose main concept is 'ORG' or 'LOC'; remaining rows keep
# their original row labels and 'index' values.
excluded_concepts = ['ORG', 'LOC']
label_df = label_df[~label_df['main_concept'].isin(excluded_concepts)]

In [None]:
# 70/30 split of the entry positions, stratified on the main concept so both
# parts keep the same class proportions; fixed seed for repeatability.
stratify_labels = label_df['main_concept'].to_numpy()
X_train_indices, X_valid_indices, y_train_indices, y_valid_indices = train_test_split(
  label_df['index'].to_numpy(),
  stratify_labels,
  test_size=0.3,
  random_state=42,
  stratify=stratify_labels,
)

In [None]:
# Compare class proportions before and after the split
distribution_views = [
  ("Class distribution in original dataset:", label_df['main_concept']),
  ("\nClass distribution in train dataset:", pd.Series(y_train_indices)),
  ("\nClass distribution in validation dataset:", pd.Series(y_valid_indices)),
]

for heading, labels in distribution_views:
  print(heading)
  print(labels.value_counts(normalize=True))

Class distribution in original dataset:
main_concept
MATRICES                       0.404430
CALCULUS-AND-ANALYSIS          0.153797
ALGEBRA                        0.089873
LINEAR-ALGEBRA                 0.062658
DISCRETE-MATHEMATICS           0.051899
TOPOLOGY                       0.051899
NUMBER-THEORY                  0.034810
DETERMINANTS                   0.034177
LINEAR-SYSTEMS-OF-EQUATIONS    0.028481
GEOMETRY                       0.024051
PROBABILITY-AND-STATISTICS     0.021519
APPLIED-MATHEMATICS            0.021519
FOUNDATIONS-OF-MATHEMATICS     0.009494
RECREATIONAL-MATHEMATICS       0.006962
LIE-ALGEBRA                    0.004430
Name: proportion, dtype: float64

Class distribution in train dataset:
MATRICES                       0.404159
CALCULUS-AND-ANALYSIS          0.153707
ALGEBRA                        0.089512
LINEAR-ALGEBRA                 0.062387
DISCRETE-MATHEMATICS           0.051537
TOPOLOGY                       0.051537
NUMBER-THEORY                  0.035

In [None]:
# Persist the filtered, re-tagged subset; write_pickle appends '.pkl', so the
# file written is 'lin_alg.pkl' in the current working directory.
write_pickle(lin_alg, 'lin_alg')

In [None]:
# Save each split array under its own file name ('.pkl' is added by write_pickle)
split_arrays = {
  'X_train_indices_la': X_train_indices,
  'y_train_indices_la': y_train_indices,
  'X_valid_indices_la': X_valid_indices,
  'y_valid_indices_la': y_valid_indices,
}

for name, idx in split_arrays.items():
  write_pickle(idx, name)