In [74]:
from google.colab import drive, userdata
import pickle
from openai import OpenAI
import random
from tqdm import tqdm
import time
import numpy as np
import pandas as pd
import re

In [75]:
# Mount Google Drive so the files stored there are reachable from this runtime.
drive.mount("/content/drive")
# Make the pickle folder the working directory; the relative file names used
# by the other cells resolve against it.
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [76]:
# Helper for reading a pickled object back from disk
def read_pickle(dict_file):
  """Load and return the object stored in the pickle file at `dict_file`.

  Unpickling can execute arbitrary code, so only pass paths to files that
  come from a trusted source.
  """
  with open(dict_file, mode='rb') as handle:
    loaded = pickle.load(handle)
  return loaded

In [77]:
# Read in dictionary
# The notebook indexes this object by entity name and uses its keys as the
# list of entities to process.
lin_alg = read_pickle('lin_alg.pkl')

In [78]:
# Display the 'text' field stored for one entry of the loaded dictionary.
lin_alg['Analytic torsion']['text']

'Analytic torsion is a concept in mathematics that connects topology, geometry, and analysis, particularly in the study of differential operators on manifolds. It is defined as a regularized determinant of the Laplace operator associated with a geometric structure, incorporating both the curvature of the manifold and the topology of its underlying space. This concept plays a crucial role in various areas, such as quantum field theory and the study of modular forms.'

In [79]:
# Entity names (the dictionary keys); each one is later passed to the model as a topic.
entities = list(lin_alg.keys())

In [80]:
# Candidate predicate terms; a random handful of these is offered to the model
# for each topic.
predicate_terms = [
    "analyzes",
    "applies",
    "documents",
    "has component",
    "has subclass",
    "implements",
    "instantiates",
    "invents",
    "analyzed in",
    "applied in",
    "component of",
    "documented in",
    "implemented by",
    "instance of",
    "is invented in",
    "related to",
    "solved by",
    "is studied in",
    "is subclass of",
    "is surveyed in",
    "tested by",
    "used in",
    "solves",
    "specialized by",
    "specializes",
    "studies",
    "surveys",
    "tests",
    "uses",
    "presented with",
    "is equivalent to",
    "transformed by",
    "depends on",
    "is derived from",
    "is part of",
    "optimized by",
    "generalized by",
    "is constrained by",
    "is parameterized by",
    "influenced by",
    "simplified as",
    "is described as",
    "is verified by",
    "is computed using",
    "is formalized as",
    "is expressed in terms of",
    "implemented in",
    "represented by",
    "generalized to",
    "is a solution to",
    "part of the foundation for",
    "computes over",
    "evaluated by",
]

In [81]:
# Get the API key and set the model name
# Model identifier sent with every chat-completion request.
model = "gpt-4o-mini"
# The key is looked up through Colab's userdata under the name 'ChatGPT'
# instead of being hard-coded in the notebook.
client = OpenAI(api_key=userdata.get('ChatGPT'))

# Build the chat prompt for one topic and return the model's reply
def gpt_answers(topic, terms_str):
  """Ask the chat model for predicate-tagged sentences about `topic`.

  Args:
    topic: Name of the topic; inserted into the user message.
    terms_str: String listing the candidate predicate terms; inserted into the
      system message as the group the model must choose from.

  Returns:
    The message content of the first choice in the completion.
  """
  # System message: task instructions plus one worked example of the layout.
  system_prompt = f"""I will give you a topic and you must choose 1 of the most relevant terms from the group: {terms_str} and create an explanatory sentence for each of them using this.
       The output must not include mathematical notation or history. Also highlight what the predicate and the object is, from the sentence.
                  Example Topic: Linearly dependent vectors
                  Answer:
                  sentence: Linearly dependent vectors are fundamentally related to the presence of at least one vector that can be expressed as a non-trivial linear combination of the remaining vectors.
                  predicate: 'related to'
                  object: 'a non-trivial linear combination of remaining vectors'
                  End of example. """
  # User message: the actual topic query.
  user_prompt = f"""Provide 2 informative sentences on: {topic}"""

  chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
  ]
  completion = client.chat.completions.create(
    model=model,
    messages=chat_messages,
    temperature=1,
    max_tokens=1024,
    top_p=1,
  )
  return completion.choices[0].message.content

In [82]:
# Load the responses saved so far; the generation loop skips any entity that
# already has an entry here and writes new answers back to this same file.
responses = read_pickle('predicate_responses_lin_alg.pkl')

In [83]:
# Query the model for every entity that has no saved response yet.
for entity in tqdm(entities, desc="Processing definitions"):
  if entity in responses:
    continue  # already answered; nothing to do

  # Offer the model four *distinct* candidate predicates. np.random.choice
  # samples with replacement by default, which could list the same term twice.
  terms = np.random.choice(predicate_terms, size=4, replace=False)
  terms_str = ", ".join(terms)
  time.sleep(0.1)  # brief pause between consecutive API requests

  responses[entity] = gpt_answers(entity, terms_str)
  # Save after every answer so the work done so far is on disk if the loop stops.
  with open('predicate_responses_lin_alg.pkl', 'wb') as file:
    pickle.dump(responses, file)

Processing definitions: 100%|██████████| 1583/1583 [00:00<00:00, 1404905.47it/s]


In [84]:
# Show the raw text stored for one entity to see the layout the parser must handle.
print(responses['Hankel Matrix'])

1. Sentence: A Hankel matrix is characterized by its constant skew-diagonals, meaning that each ascending skew-diagonal has the same value across the matrix.  
   Predicate: 'is characterized by'  
   Object: 'its constant skew-diagonals'

2. Sentence: Hankel matrices are often used in polynomial interpolation and time series analysis to model the relationships between data points.  
   Predicate: 'are used in'  
   Object: 'polynomial interpolation and time series analysis'


In [85]:
# Parse one response into {item number: {"Sentence", "Predicate", "Object"}}.
# The object is captured up to the LAST quote on its line, so an apostrophe
# inside it (e.g. "Cramer's rule") no longer cuts the value short the way a
# lazy '(.*?)' at the end of the pattern does.
matches = re.findall(
    r'(\d+)\.\s*Sentence:\s*(.*?)\s*Predicate:\s*\'(.*?)\'\s*Object:\s*\'([^\n]*)\'',
    responses['Hankel Matrix'],
    re.DOTALL,
)
# Construct the nested dictionary
nested_dict = {
    int(num): {
        "Sentence": sentence.strip(),
        "Predicate": predicate.strip(),
        "Object": obj.strip()
    }
    for num, sentence, predicate, obj in matches
}

In [86]:
# Show the parsed structure for the sample response.
print(nested_dict)

{1: {'Sentence': 'A Hankel matrix is characterized by its constant skew-diagonals, meaning that each ascending skew-diagonal has the same value across the matrix.', 'Predicate': 'is characterized by', 'Object': 'its constant skew-diagonals'}, 2: {'Sentence': 'Hankel matrices are often used in polynomial interpolation and time series analysis to model the relationships between data points.', 'Predicate': 'are used in', 'Object': 'polynomial interpolation and time series analysis'}}


In [87]:
# Parse every saved response into Sentence/Subject/Predicate/Object rows.
# NOTE: this rebinds `entities` from the list of topic names to a dict of
# parsed triples keyed by topic (name kept for compatibility).
#
# The object is captured up to the LAST quote on its line, so an apostrophe
# inside it (e.g. "Cramer's rule") does not truncate the value.
triple_regex = re.compile(
    r'(\d+)\.\s*Sentence:\s*(.*?)\s*Predicate:\s*\'(.*?)\'\s*Object:\s*\'([^\n]*)\'',
    re.DOTALL,
)
triple_columns = ["Sentence", "Subject", "Predicate", "Object"]

entities = dict()
unparsed_entities = []  # topics whose response did not match the expected layout
triple_records = []
for entity, response_text in responses.items():
  # Construct the nested dictionary, keyed by the item number in the response
  nested_dict = {
      int(num): {
          "Sentence": sentence.strip(),
          "Subject": entity,
          "Predicate": predicate.strip(),
          "Object": obj.strip()
          }
      for num, sentence, predicate, obj in triple_regex.findall(response_text)
      }
  entities[entity] = nested_dict
  if nested_dict:
    triple_records.extend(nested_dict.values())
  else:
    # Keep track of what was dropped instead of losing it silently.
    unparsed_entities.append(entity)

# Build the frame once from all records rather than concatenating one small
# frame per topic; the result has a fresh 0..n-1 index.
lin_alg_df = pd.DataFrame(triple_records, columns=triple_columns)

In [88]:
# Preview the first two parsed rows.
lin_alg_df.head(2)

Unnamed: 0,Sentence,Subject,Predicate,Object
0,An involutive Banach algebra is primarily char...,involutive banach algebra,characterized by,the existence of an involution operation that ...
1,Involutive Banach algebras are significantly u...,involutive banach algebra,used in,functional analysis


In [89]:
# Row/column count of the parsed frame, before the tagged rows are appended.
lin_alg_df.shape

(1720, 4)

In [90]:
# Load the tagged responses, relabel their four columns by position, and
# preview the first two rows.
tagged_responses = read_pickle('predicate_tags.pkl')
tagged_responses.columns = ['Subject', 'Sentence', 'Predicate', 'Object']
tagged_responses.head(2)

Unnamed: 0,Subject,Sentence,Predicate,Object
0,ludwig's inversion formula,"Ludwig's inversion formula, a mathematical exp...",is relevant for,determining probability distributions
1,ludwig's inversion formula,This formula is implemented using integration ...,is implemented using,integration techniques


In [92]:
# Distinct tagged subjects in their original spelling, and a lower-cased copy
# at matching positions for case-insensitive lookups.
tagged_responses_list = tagged_responses['Subject'].unique().tolist()
tagged_responses_lower = [
    subject_name.lower()
    for subject_name in tagged_responses_list
]

In [93]:
# For simplicity, only want the linear algebra entities
# NOTE: this cell rebinds `lin_alg_df` to itself plus the tagged rows, so
# running it a second time without rebuilding `lin_alg_df` appends them again.
tagged_columns = ['Sentence', 'Subject', 'Predicate', 'Object']

# Lower-cased tagged subject -> its first original spelling (constant-time
# lookup instead of scanning the list for every subject).
first_spelling = {}
for tagged_name in tagged_responses_list:
  first_spelling.setdefault(tagged_name.lower(), tagged_name)

my_list = []
already_added = set()
for subject in lin_alg_df['Subject'].unique():
  subj_lower = subject.lower()
  # Skip subjects with no tagged counterpart, and subjects that differ only by
  # case from one already handled (their tagged rows would be added twice).
  if subj_lower not in first_spelling or subj_lower in already_added:
    continue
  already_added.add(subj_lower)
  tagged_subject = first_spelling[subj_lower]
  to_merge = tagged_responses.loc[tagged_responses['Subject'] == tagged_subject, tagged_columns]
  my_list.append(to_merge)

if my_list:
  my_df = pd.concat(my_list)
  lin_alg_df = pd.concat([lin_alg_df, my_df], axis=0, ignore_index=True)
else:
  # pd.concat raises on an empty list; with no matches there is nothing to add.
  my_df = tagged_responses.iloc[0:0][tagged_columns]

In [94]:
# Row/column count after appending the matching tagged rows.
lin_alg_df.shape

(2036, 4)

In [95]:
# Export how often each predicate occurs, then read the predicate-to-label
# mapping used for cleaning.
predicate_counts = lin_alg_df['Predicate'].value_counts()
predicate_counts.to_excel('for_cleaning.xlsx')
# NOTE(review): 'cleaned.xlsx' is not written by any cell visible here;
# presumably it is a hand-edited copy of the export above. Confirm.
cleaned = pd.read_excel('cleaned.xlsx')

In [96]:
# Attach the cleaned label to every row. The inner join keeps only rows whose
# Predicate also appears in `cleaned`; the 'cleaned' column becomes 'Label'.
lin_alg_df_merged = (
    lin_alg_df
    .merge(cleaned, on='Predicate', how='inner')
    .rename(columns={'cleaned': 'Label'})
)

In [97]:
# Preview the merged frame, including the new Label column.
lin_alg_df_merged.head(2)

Unnamed: 0,Sentence,Subject,Predicate,Object,Label
0,An involutive Banach algebra is primarily char...,involutive banach algebra,characterized by,the existence of an involution operation that ...,defined as
1,Involutive Banach algebras are significantly u...,involutive banach algebra,used in,functional analysis,used in
