In [1]:
! pip install umap-learn -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [2]:
from google.colab import drive, userdata
import pickle
from openai import OpenAI
import random
from tqdm import tqdm
import time
import numpy as np
import pandas as pd
import re
from umap import UMAP
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px

In [3]:
# Mount Google Drive into the Colab filesystem so the project files under MyDrive are reachable
drive.mount("/content/drive")
# Work from the pickle directory: relative pickle filenames used by later cells resolve against it
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [4]:
# Helper: load a pickled object from disk
def read_pickle(dict_file):
  """Return the object stored in the pickle file at ``dict_file``.

  Args:
    dict_file: Path of the pickle file to read.

  Returns:
    Whatever object was pickled into the file.

  Raises:
    FileNotFoundError: If ``dict_file`` does not exist (callers use this to
      detect a missing cache).

  Note:
    Unpickling can execute arbitrary code; only use this on files you trust.
  """
  with open(dict_file, mode='rb') as handle:
    payload = pickle.load(handle)
  return payload

In [5]:
# Fetch the finetuned DistilBERT checkpoint named below, together with its tokenizer
model_name = "Heather-Driver/distilbert-NER-LinearAlg-finetuned"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# eval() puts the network in evaluation mode and returns the same module, so it can be chained
model = DistilBertModel.from_pretrained(model_name).eval()

# Echo the architecture as the cell's output
model

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [6]:
# Per-sentence [CLS] embeddings from the currently loaded model, cached on disk.
# The cache must be read from and written to the SAME file; saving under a different
# name means the lookup never succeeds on later runs and may overwrite an unrelated pickle.
embeddings_cache = 'lin_alg_embeddings.pkl'

try:
  lin_alg_df_merg_embeddings = read_pickle(embeddings_cache)
except FileNotFoundError:
  # NOTE(review): lin_alg_df_merged is not created by any cell visible in this notebook;
  # it must already exist in the kernel (with a 'Sentence' column) for this branch to run.
  embedding_list = []

  # Inference only: disabling gradient tracking for the whole loop saves time and memory
  with torch.no_grad():
    for sentence in tqdm(lin_alg_df_merged['Sentence'], desc="Processing sentences"):
      # Tokenize one sentence -> input_ids and attention_mask tensors
      inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
      outputs = model(**inputs)
      # Hidden state of the first token ([CLS]); one 2-D array (batch of 1) per sentence
      cls_embedding = outputs.last_hidden_state[:, 0]
      embedding_list.append(cls_embedding.cpu().numpy())

  lin_alg_df_merged['label_embedding'] = embedding_list
  lin_alg_df_merg_embeddings = lin_alg_df_merged
  with open(embeddings_cache, 'wb') as file:
    pickle.dump(lin_alg_df_merg_embeddings, file)

In [7]:
# Preview the first two rows of the embeddings frame as a quick sanity check
lin_alg_df_merg_embeddings.head(2)

Unnamed: 0,Sentence,Subject,Predicate,Object,Label,label_embedding
0,An involutive Banach algebra is primarily char...,involutive banach algebra,characterized by,the existence of an involution operation that ...,defined as,"[[-0.12714861, -0.17931628, 0.2756023, -0.6379..."
1,Involutive Banach algebras are significantly u...,involutive banach algebra,used in,functional analysis,used in,"[[-0.12393479, -0.23532511, 0.25136113, -0.583..."


In [8]:
# Stack the stored embeddings into a single 2-D matrix (one row per stored embedding)
# and rescale every feature column with MinMaxScaler's default range.
embedding_matrix = np.vstack(lin_alg_df_merg_embeddings['label_embedding'].to_numpy())
X_scaled = MinMaxScaler().fit_transform(embedding_matrix)

# Project the scaled embeddings onto a 2-D manifold with UMAP (cosine metric).
# UMAP is stochastic: fixing random_state makes the layout, and the PNG/CSV exported from
# it, reproducible across runs (UMAP runs single-threaded when a seed is set).
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)



In [9]:
# Switch to the 03-Predicate folder; files written with relative paths from here on land in it
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/03-Predicate'

/content/drive/MyDrive/Colab Notebooks/Math_Graph/03-Predicate


In [10]:
# Assemble the plotting frame: 2-D UMAP coordinates plus the metadata of each point.
# The coordinates are positional (row i of the projection belongs to row i of the
# embeddings frame the mapper was fit on), so the metadata is attached positionally via
# .to_numpy(). Index-label alignment would silently misalign or yield NaN if that frame
# does not carry a default 0..n-1 index.
my_df = (
    pd.DataFrame(mapper.embedding_, columns=['umap_x', 'umap_y'])
    .assign(
        label=lin_alg_df_merg_embeddings['Label'].to_numpy(),
        sentence=lin_alg_df_merg_embeddings['Sentence'].to_numpy(),
        subject=lin_alg_df_merg_embeddings['Subject'].to_numpy(),
    )
    .sort_values(by=['label'])
)

# One facet per label (3 per row), coloured by label, with the sentence shown on hover
fig = px.scatter(
    my_df,
    x="umap_x",
    y="umap_y",
    color="label",
    hover_data=['sentence'],
    facet_col="label",
    facet_col_wrap=3,
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=1200,
    height=1300,
)
# Static export to the current working directory, then interactive display
fig.write_image("UMAP_Pretraining.png")
fig.show()

In [11]:
# Persist the plotting frame (UMAP coordinates + metadata) to the current working directory, without the index column
my_df.to_csv('UMAP_Pretraining.csv', index=False)

## Comparing performance of span-trained classifier

In [12]:
# Fetch the span-trained classifier checkpoint named below, together with its tokenizer
model_name = "Heather-Driver/distilbert-classn-LinearAlg-finetuned-pred-span-width-5"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# eval() puts the network in evaluation mode and returns the same module, so it can be chained
model = DistilBertModel.from_pretrained(model_name).eval()

# Echo the architecture as the cell's output
model

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/262M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [13]:
# Per-sentence [CLS] embeddings from the span-trained model, cached on disk.
# A single absolute path is used for both lookup and save: the working directory is no
# longer the pickle folder at this point, so saving under a relative filename would put
# the cache where the lookup never finds it.
trained_embeddings_cache = '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files/lin_alg_embeddings_trained.pkl'

try:
  lin_alg_df_merg_embeddings = read_pickle(trained_embeddings_cache)
except FileNotFoundError:
  embedding_list = []

  # Re-embed the sentences of the frame that is already in memory.
  # lin_alg_df_merg_embeddings always exists here (loaded or computed by the earlier
  # embedding cell), whereas lin_alg_df_merged is absent whenever that cell hit its cache.
  with torch.no_grad():  # inference only: no gradient tracking needed
    for sentence in tqdm(lin_alg_df_merg_embeddings['Sentence'], desc="Processing sentences"):
      # Tokenize one sentence -> input_ids and attention_mask tensors
      inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
      outputs = model(**inputs)
      # Hidden state of the first token ([CLS]); one 2-D array (batch of 1) per sentence
      cls_embedding = outputs.last_hidden_state[:, 0]
      embedding_list.append(cls_embedding.cpu().numpy())

  # Build a new frame with the replaced embedding column instead of mutating in place
  lin_alg_df_merg_embeddings = lin_alg_df_merg_embeddings.assign(label_embedding=embedding_list)
  with open(trained_embeddings_cache, 'wb') as file:
    pickle.dump(lin_alg_df_merg_embeddings, file)

In [14]:
# Stack the stored embeddings into a single 2-D matrix (one row per stored embedding)
# and rescale every feature column with MinMaxScaler's default range.
embedding_matrix = np.vstack(lin_alg_df_merg_embeddings['label_embedding'].to_numpy())
X_scaled = MinMaxScaler().fit_transform(embedding_matrix)

# Project the scaled embeddings onto a 2-D manifold with UMAP (cosine metric).
# UMAP is stochastic: fixing random_state makes the layout, and the PNG/CSV exported from
# it, reproducible across runs (UMAP runs single-threaded when a seed is set).
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [15]:
# Assemble the plotting frame: 2-D UMAP coordinates plus the metadata of each point.
# The coordinates are positional (row i of the projection belongs to row i of the
# embeddings frame the mapper was fit on), so the metadata is attached positionally via
# .to_numpy(). Index-label alignment would silently misalign or yield NaN if that frame
# does not carry a default 0..n-1 index.
my_df = (
    pd.DataFrame(mapper.embedding_, columns=['umap_x', 'umap_y'])
    .assign(
        label=lin_alg_df_merg_embeddings['Label'].to_numpy(),
        sentence=lin_alg_df_merg_embeddings['Sentence'].to_numpy(),
        subject=lin_alg_df_merg_embeddings['Subject'].to_numpy(),
    )
    .sort_values(by=['label'])
)

# One facet per label (3 per row), coloured by label, with the sentence shown on hover
fig = px.scatter(
    my_df,
    x="umap_x",
    y="umap_y",
    color="label",
    hover_data=['sentence'],
    facet_col="label",
    facet_col_wrap=3,
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=1200,
    height=1300,
)
# Static export to the current working directory, then interactive display
fig.write_image("UMAP_Posttraining.png")
fig.show()

In [16]:
# Persist the plotting frame (UMAP coordinates + metadata) to the current working directory, without the index column
my_df.to_csv('UMAP_Posttraining.csv', index=False)

In [34]:
# Preview the first rows of the plotting frame.
# NOTE(review): the stored output shows a direct_ref column, which is only created by the
# following cell; the output therefore depends on that cell having been run earlier, and
# on a fresh top-to-bottom run the column will be absent here.
my_df.head()

Unnamed: 0,umap_x,umap_y,label,sentence,subject,direct_ref
854,16.885777,4.942344,component,The invertible matrix theorem has components t...,invertible matrix theorem,1.0
1025,18.317816,5.993346,component,An orthonormal basis is a component of a Hilbe...,Orthonormal basis,1.0
255,13.204476,5.343868,component,The refined alternating sign matrix conjecture...,refined alternating sign matrix conjecture,0.0
709,14.122861,7.183143,component,The invariant factor is a critical component i...,invariant factor,1.0
262,17.729881,6.851869,component,The concept of dual vector space has component...,Dual Vector Space,1.0


In [36]:
# Summary of direct references:
# direct_ref is 1.0 when the label text occurs verbatim (case-sensitive substring match)
# inside the sentence, else 0.0.
# Computed positionally in a single pass, so the result does not depend on index labels
# being unique and is always a float column, even when nothing matches.
my_df['direct_ref'] = [
    float(label in sentence)
    for label, sentence in zip(my_df['label'], my_df['sentence'])
]

In [43]:
# Count rows per (label, direct_ref) pair; pairs that never occur do not appear in the result
my_df.groupby(['label', 'direct_ref']).size().reset_index(name='Total')

Unnamed: 0,label,direct_ref,Total
0,component,0.0,71
1,component,1.0,34
2,computation,0.0,250
3,computation,1.0,11
4,connected to,0.0,73
5,connected to,1.0,16
6,defined as,0.0,163
7,defined as,1.0,63
8,generalisation,0.0,52
9,implementation,0.0,179
