In [44]:
! pip install umap-learn -U kaleido



In [45]:
from google.colab import drive, userdata
import pickle
from openai import OpenAI
import random
from tqdm import tqdm
import time
import numpy as np
import pandas as pd
import re
from umap import UMAP
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px

In [46]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount("/content/drive")
# Switch the working directory to the pickle folder so that the relative
# pickle file names used by later cells resolve against it.
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [47]:
# Define file read function
def read_pickle(dict_file):
  """Load and return the Python object stored in a pickle file.

  Args:
    dict_file: Path of the pickle file to open (opened in binary read mode).

  Returns:
    Whatever object was serialized into the file.

  Raises:
    FileNotFoundError: If ``dict_file`` does not exist.

  Note:
    Unpickling can execute arbitrary code; only use this on trusted files.
  """
  with open(dict_file, mode='rb') as handle:
    payload = pickle.load(handle)
  return payload

In [48]:
# Load the tokenizer and finetuned model
# Both are fetched by name via from_pretrained; the DistilBertModel instance is
# used by the embedding cell below, which reads its last_hidden_state output.
model_name = "Heather-Driver/distilbert-NER-LinearAlg-finetuned"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

# Set model to evaluation mode
# (turns off the Dropout layers listed in the module repr; eval() returns the
# module itself, which is why the architecture is displayed as the cell output)
model.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [49]:
# Cache file for the CLS embeddings produced by the model loaded above.
# The SAME name is used for reading and writing: the previous version read
# 'lin_alg_embeddings.pkl' but wrote 'lin_alg_definitions.pkl', so a cache miss
# never populated the cache and instead overwrote a differently named pickle.
embeddings_cache_file = 'lin_alg_embeddings.pkl'

try:
  # Fast path: reuse embeddings computed in an earlier session.
  lin_alg_df_merg_embeddings = read_pickle(embeddings_cache_file)
except FileNotFoundError:
  # Slow path: embed every sentence with the current tokenizer/model.
  # NOTE(review): `lin_alg_df_merged` is not defined in any cell shown in this
  # notebook; it must already exist in the kernel for this fallback to run.
  # TODO: load it explicitly so Restart & Run All works on a cold cache.
  embedding_list = []

  for sentence in tqdm(lin_alg_df_merged['Sentence'], desc="Processing sentences"):
    # Tokenize one sentence -> input_ids and attention mask as PyTorch tensors.
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only: skip gradient tracking
      outputs = model(**inputs)
    # Hidden state of the first token ([CLS]) serves as the sentence embedding.
    cls_embedding = outputs.last_hidden_state[:, 0]
    embedding_list.append(cls_embedding.cpu().numpy())

  # Build a new frame instead of adding the column to lin_alg_df_merged in
  # place, so re-running this cell (or the second model's cell) cannot silently
  # change a frame that another variable still points to.
  lin_alg_df_merg_embeddings = lin_alg_df_merged.assign(label_embedding=embedding_list)
  with open(embeddings_cache_file, 'wb') as file:
    pickle.dump(lin_alg_df_merg_embeddings, file)

In [50]:
# Preview the first two rows to check that the 'label_embedding' column is present.
lin_alg_df_merg_embeddings.head(2)

Unnamed: 0,Sentence,Subject,Predicate,Object,Label,label_embedding
0,An involutive Banach algebra is primarily char...,involutive banach algebra,characterized by,the existence of an involution operation that ...,defined as,"[[-0.12714861, -0.17931628, 0.2756023, -0.6379..."
1,Involutive Banach algebras are significantly u...,involutive banach algebra,used in,functional analysis,used in,"[[-0.12393479, -0.23532511, 0.25136113, -0.583..."


In [51]:
# Stack the per-sentence embedding arrays into a single 2-D matrix (one row per
# sentence) and min-max scale each feature before fitting the projection.
X_scaled = MinMaxScaler().fit_transform(np.vstack(lin_alg_df_merg_embeddings['label_embedding'].to_numpy()))

# UMAP projection of the embeddings onto a 2-D manifold.
# UMAP is stochastic; a fixed random_state makes the layout (and therefore the
# saved plot/CSV) reproducible across kernel restarts.
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [52]:
# Switch to the 03-Predicate folder; from here on, relative output paths
# (PNG/CSV files written below) resolve against this directory.
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/03-Predicate'

/content/drive/MyDrive/Colab Notebooks/Math_Graph/03-Predicate


In [53]:
# Combine the 2-D UMAP coordinates with each sentence's metadata.
# Row k of mapper.embedding_ corresponds to the k-th embedded row, so the
# metadata is attached positionally (.to_numpy()) rather than by index label;
# index-aligned assignment would produce NaN/misaligned rows if the source
# frame's index were not exactly 0..n-1.
my_df = (
  pd.DataFrame(mapper.embedding_, columns=['umap_x', 'umap_y'])
  .assign(
    label=lin_alg_df_merg_embeddings['Label'].to_numpy(),
    sentence=lin_alg_df_merg_embeddings['Sentence'].to_numpy(),
    subject=lin_alg_df_merg_embeddings['Subject'].to_numpy(),
  )
  .sort_values(by=['label'])  # sorted so facets/legend appear in label order
)

# One facet per label (3 per row), coloured by label, with the sentence on hover.
fig = px.scatter(
  my_df,
  x="umap_x",
  y="umap_y",
  color="label",
  hover_data=['sentence'],
  facet_col="label",
  facet_col_wrap=3,
  color_discrete_sequence=px.colors.qualitative.Vivid,
  width=1200,
  height=1300,
)
# Static export into the current working directory, then interactive display.
fig.write_image("UMAP_Pretraining.png")
fig.show()

In [54]:
# Save the label-sorted UMAP coordinates and metadata to the current working
# directory. index=False omits the row index, so the CSV does not record each
# row's position in the original (unsorted) frame.
my_df.to_csv('UMAP_Pretraining.csv', index=False)

## Comparing performance of span-trained classifier

In [55]:
# Load the tokenizer and finetuned model
# This rebinds the global `model_name`, `tokenizer` and `model` to the
# span-width-5 checkpoint, so every cell below embeds with this model instead
# of the one loaded earlier in the notebook.
model_name = "Heather-Driver/distilbert-classn-LinearAlg-finetuned-pred-span-width-5"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

# Set model to evaluation mode
# (turns off the Dropout layers listed in the module repr; eval() returns the
# module itself, which is why the architecture is displayed as the cell output)
model.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [56]:
# Cache file for the CLS embeddings produced by the span-trained model.
# One absolute path is used for BOTH reading and writing: the previous version
# read this absolute path but wrote to the relative name
# 'lin_alg_embeddings_trained.pkl', which after the earlier `%cd` into
# 03-Predicate landed in a different folder, so the cache was never found.
trained_embeddings_cache_file = '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files/lin_alg_embeddings_trained.pkl'

try:
  # Fast path: reuse embeddings computed in an earlier session.
  lin_alg_df_merg_embeddings = read_pickle(trained_embeddings_cache_file)
except FileNotFoundError:
  # Slow path: embed every sentence with the current tokenizer/model.
  # NOTE(review): `lin_alg_df_merged` is not defined in any cell shown in this
  # notebook; it must already exist in the kernel for this fallback to run.
  # TODO: load it explicitly so Restart & Run All works on a cold cache.
  embedding_list = []

  for sentence in tqdm(lin_alg_df_merged['Sentence'], desc="Processing sentences"):
    # Tokenize one sentence -> input_ids and attention mask as PyTorch tensors.
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only: skip gradient tracking
      outputs = model(**inputs)
    # Hidden state of the first token ([CLS]) serves as the sentence embedding.
    cls_embedding = outputs.last_hidden_state[:, 0]
    embedding_list.append(cls_embedding.cpu().numpy())

  # Build a new frame instead of adding the column to lin_alg_df_merged in
  # place, so this cell cannot silently overwrite a 'label_embedding' column
  # that another variable still points to.
  lin_alg_df_merg_embeddings = lin_alg_df_merged.assign(label_embedding=embedding_list)
  with open(trained_embeddings_cache_file, 'wb') as file:
    pickle.dump(lin_alg_df_merg_embeddings, file)

In [57]:
# Stack the per-sentence embedding arrays into a single 2-D matrix (one row per
# sentence) and min-max scale each feature before fitting the projection.
X_scaled = MinMaxScaler().fit_transform(np.vstack(lin_alg_df_merg_embeddings['label_embedding'].to_numpy()))

# UMAP projection of the embeddings onto a 2-D manifold.
# UMAP is stochastic; a fixed random_state makes the layout (and therefore the
# saved plot/CSV) reproducible across kernel restarts.
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [58]:
# Combine the 2-D UMAP coordinates with each sentence's metadata.
# Row k of mapper.embedding_ corresponds to the k-th embedded row, so the
# metadata is attached positionally (.to_numpy()) rather than by index label;
# index-aligned assignment would produce NaN/misaligned rows if the source
# frame's index were not exactly 0..n-1.
my_df = (
  pd.DataFrame(mapper.embedding_, columns=['umap_x', 'umap_y'])
  .assign(
    label=lin_alg_df_merg_embeddings['Label'].to_numpy(),
    sentence=lin_alg_df_merg_embeddings['Sentence'].to_numpy(),
    subject=lin_alg_df_merg_embeddings['Subject'].to_numpy(),
  )
  .sort_values(by=['label'])  # sorted so facets/legend appear in label order
)

# One facet per label (3 per row), coloured by label, with the sentence on hover.
fig = px.scatter(
  my_df,
  x="umap_x",
  y="umap_y",
  color="label",
  hover_data=['sentence'],
  facet_col="label",
  facet_col_wrap=3,
  color_discrete_sequence=px.colors.qualitative.Vivid,
  width=1200,
  height=1300,
)
# Static export into the current working directory, then interactive display.
fig.write_image("UMAP_Posttraining.png")
fig.show()

In [59]:
# Save the label-sorted UMAP coordinates and metadata to the current working
# directory. index=False omits the row index, so the CSV does not record each
# row's position in the original (unsorted) frame.
my_df.to_csv('UMAP_Posttraining.csv', index=False)

In [60]:
# Preview the plotting frame; rows are ordered by label after the sort above,
# so the index values shown are no longer consecutive.
my_df.head()

Unnamed: 0,umap_x,umap_y,label,sentence,subject
854,16.318182,4.786982,component,The invertible matrix theorem has components t...,invertible matrix theorem
1025,18.079285,5.788503,component,An orthonormal basis is a component of a Hilbe...,Orthonormal basis
255,12.888448,5.251451,component,The refined alternating sign matrix conjecture...,refined alternating sign matrix conjecture
709,14.212613,6.945241,component,The invariant factor is a critical component i...,invariant factor
262,17.513166,6.568578,component,The concept of dual vector space has component...,Dual Vector Space


In [66]:
# Summary of direct references
# A sentence is a "direct reference" (1) when it contains its label verbatim,
# or, for the labels below, the stem listed for that label; otherwise it is 0.
# Matching is case-sensitive substring matching.
label_stems = {
  'generalisation': 'generalize',
  'specialization': 'specialize',
  'verification': 'verif',
  'implementation': 'implement',
}


def is_direct_reference(label, sentence):
  """Return 1 if ``sentence`` contains ``label`` or the stem registered for it, else 0."""
  if label in sentence:
    return 1
  stem = label_stems.get(label)
  return int(stem is not None and stem in sentence)


# Single pass over the two columns; every row receives a 0/1 flag, so no
# index-keyed dictionary or NaN back-filling is required.
my_df['direct_ref'] = [
  is_direct_reference(label, sentence)
  for label, sentence in zip(my_df['label'], my_df['sentence'])
]

In [67]:
# Count sentences per (label, direct_ref) pair; direct_ref is the 0/1 flag
# computed in the previous cell. The counts are returned as a flat frame with
# a 'Total' column.
my_df.groupby(['label', 'direct_ref']).size().reset_index(name='Total')

Unnamed: 0,label,direct_ref,Total
0,component,0,71
1,component,1,34
2,computation,0,250
3,computation,1,11
4,connected to,0,73
5,connected to,1,16
6,defined as,0,163
7,defined as,1,63
8,generalisation,0,21
9,generalisation,1,31
