<a href="https://colab.research.google.com/github/HeatherDriver/MathGraph/blob/main/02_Combined_Dict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install --upgrade openai



In [None]:
from google.colab import drive, userdata
import pickle
import random
from collections import defaultdict
from openai import OpenAI

In [None]:
# Mount Google Drive so the files stored there can be reached from this runtime
drive.mount("/content/drive")
# Make the pickle folder the working directory; the file names opened below are relative to it
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [None]:
# Deserialize a pickle file from disk
def read_pickle(dict_file):
  """Return the object stored in the pickle file at path ``dict_file``."""
  with open(dict_file, 'rb') as handle:
    loaded = pickle.load(handle)
  return loaded

In [None]:
# Merge the wikidata lookups and the short answers of one source into a single dictionary

def combine_dictionaries(wiki_dict, short_ans_dict):
  """Load two pickled dictionaries and combine them key by key.

  Args:
    wiki_dict: path of the pickle file whose values go under 'wikidata'.
    short_ans_dict: path of the pickle file whose values go under 'short_answer'.

  Returns:
    A dict with one entry per key found in either file, shaped as
    ``{'wikidata': ..., 'short_answer': ...}``; a side that has no value
    for the key is stored as ``None``.
  """
  wiki_entries = read_pickle(wiki_dict)
  answer_entries = read_pickle(short_ans_dict)

  every_key = set(wiki_entries.keys()).union(set(answer_entries.keys()))

  return {
      name: {'wikidata': wiki_entries.get(name),  'short_answer': answer_entries.get(name)}
      for name in every_key
  }

In [None]:
# Build the combined dictionary of each source, then rewrite its keys with
# spaces in place of underscores
combined_mizar_dict = combine_dictionaries("mizar_wikidata_api.pkl", "mizar_short_ans_api.pkl")
combined_mizar_dict = dict(
    (name.replace("_", " "), entry) for name, entry in combined_mizar_dict.items()
)

combined_wolfram_dict = combine_dictionaries("wolfram_wikidata_api.pkl", "wolfram_short_ans_api.pkl")
combined_wolfram_dict = dict(
    (name.replace("_", " "), entry) for name, entry in combined_wolfram_dict.items()
)

In [None]:
# cleaning combined output: merge both sources per key, then keep a single
# spelling for keys that differ only by letter case

all_keys = set(combined_mizar_dict.keys()).union(set(combined_wolfram_dict.keys()))

combined_dict = {
    key: {'mizar': combined_mizar_dict.get(key),  'wolfram': combined_wolfram_dict.get(key)}
    for key in all_keys
}

# Maps lowercase key -> (spelling kept, entry of that spelling)
my_dict = {}

# Walk the keys in sorted order. Iterating the dictionary directly would follow
# set order, which for strings changes between kernel sessions (hash
# randomisation) and would let a different spelling win on each run whenever
# the title-case rule below does not decide.
for key in sorted(combined_dict):
    value = combined_dict[key]

    # keys to lowercase
    lower_key = key.lower()
    kept = my_dict.get(lower_key)

    # The first spelling seen is kept, unless a title-cased spelling replaces
    # one that is not title-cased; the entry of the losing spelling is dropped.
    if kept is None or (key.istitle() and not kept[0].istitle()):
        my_dict[lower_key] = (key, value)

final_combined_dict = dict(my_dict.values())

In [None]:
# Still need last few short definitions - keep the Wolfram short answers that
# exist in short_answer and queue every other key in get_definition
get_definition = []
short_answer = dict()

for key, sources in final_combined_dict.items():
  try:
    short_answer[key] = sources["wolfram"]["short_answer"]
  except (TypeError, KeyError):
    # TypeError: the "wolfram" entry is None, so it cannot be indexed;
    # KeyError: one of the two fields is absent from the entry.
    # Anything else (e.g. an interrupt) is no longer silently swallowed.
    get_definition.append(key)

In [None]:
# Also queue keys whose stored short answer is missing: either a real None or the text 'None'
get_definition.extend(
    name for name, answer in short_answer.items()
    if answer is None or answer == 'None'
)

In [None]:
# Queue every key with 'constant' in its name - the Wolfram short answer for these is just the number itself, eg Conway's Constant = 1.30358, which is non-informative
get_definition.extend(
    name for name in final_combined_dict if 'constant' in name.lower()
)

# Queue a few miscellaneous names for which the Wolfram short answer is likewise just the number
numeric_answer_names = (
    'combinatorial identities', 'combinatorial optimization', 'difference plot', 'general combinatorics',
    'gini coefficient', "god's number", 'graph spectrum', 'l^infty-norm', 'lehmer number',
    'milliard', 'natural logarithm of 2', 'out-shuffle', 'pi', 'pi squared', 'pi-prime',
    'prime arrangements', 'zero', 'e',
)
get_definition.extend(
    name for name in final_combined_dict if name.lower() in numeric_answer_names
)

In [None]:
# OpenAI client, authenticated with the value stored under 'ChatGPT' in Colab userdata
client = OpenAI(api_key=userdata.get('ChatGPT'))
# Name of the chat model used for every request
model = "gpt-4o-mini"

# Ask the chat model for a short summary of a topic
def gpt_answers(topic):
  """Return the model's short summary of ``topic``.

  Sends a system message holding the instructions and one worked example,
  followed by a user message naming the topic, and returns the text of the
  first choice in the reply.
  """
  # system message that provides context to the model
  system_message = {"role": "system", "content": """I will ask you to provide a short summary of a topic, 2 or 3 sentences long. Output must not include mathematical notation or history.
                  Example:

                  Question:
                  bertrand's postulate

                  Answer:
                  Bertrand's Postulate states that for any number greater than 1, there is always at least one prime number between that number and its double.
                  This postulate ensures that prime numbers are distributed somewhat evenly across the number line, and there is always a prime within this range.
                  It provides a guarantee about the existence of primes in intervals.

                  End of example. """}
  # topic query
  user_message = {"role": "user", "content": f"""Provide a short summary of: {topic}"""}

  completion = client.chat.completions.create(
    model=model,
    messages=[system_message, user_message],
  )
  return completion.choices[0].message.content

In [None]:
# Load the cached GPT summaries when the cache file exists, then request only
# the topics that are still missing from it.
try:
  with open('gpt_responses_dict.pkl', 'rb') as file:
    gpt_responses = pickle.load(file)
except FileNotFoundError:
  # No cache yet - every topic still has to be requested. Any other failure
  # (e.g. an unreadable cache file) is raised instead of silently triggering
  # a full, paid regeneration.
  gpt_responses = dict()

# dict.fromkeys drops repeated topics while keeping their order: the same key
# can be appended to get_definition by more than one rule, and each repeat
# would otherwise cost an extra API call.
missing_definitions = [
    definition for definition in dict.fromkeys(get_definition)
    if definition not in gpt_responses
]

if missing_definitions:
  try:
    for definition in missing_definitions:
      gpt_responses[definition] = gpt_answers(definition)
  finally:
    # Save whatever has been fetched, even when a request fails part-way,
    # so that a re-run resumes instead of starting over.
    with open('gpt_responses_dict.pkl', 'wb') as file:
      pickle.dump(gpt_responses, file)

In [None]:
# Spot-check one of the GPT summaries
print(gpt_responses["Tietze extension theorem"])

The Tietze Extension Theorem states that any continuous function defined on a closed subset of a normal topological space can be extended to a continuous function on the entire space. This result is significant in topology, as it allows for the preservation of continuity when expanding functions to larger domains. The theorem underscores the importance of normal spaces in the context of function extension.


In [None]:
def deep_update(original, new):
  """Recursively merge ``new`` into ``original``, modifying ``original`` in place.

  For each key of ``new``: when both the new value and the value already
  stored in ``original`` are dicts, the two are merged recursively; in every
  other case the new value replaces (or creates) the entry in ``original``.

  Args:
    original: dictionary that receives the values.
    new: dictionary whose values are merged in.

  Returns:
    None.
  """
  for key, value in new.items():
    # Recurse only when the existing value is itself a dict; an existing
    # non-dict value (e.g. None) cannot be merged into and is overwritten.
    if isinstance(value, dict) and key in original and isinstance(original[key], dict):
      deep_update(original[key], value)
    else:
      original[key] = value

In [None]:
# Wrap each GPT summary as {'gpt': {'gpt': summary}} so it can be merged into the entry of its key
my_dict = {key: {'gpt': {'gpt': values}} for key, values in gpt_responses.items()}

deep_update(final_combined_dict, my_dict)

# Rebuild the dictionary with its keys in sorted order
my_list = sorted(final_combined_dict.keys())
_final_combined_dict = {key: final_combined_dict[key] for key in my_list}
final_combined_dict = _final_combined_dict

# Save the merged result
with open('final_combined_dict.pkl', 'wb') as file:
  pickle.dump(final_combined_dict, file)

In [None]:
# Add the GPT summaries to the short answers; a GPT summary replaces a stored answer under the same key
short_answer.update(gpt_responses)

# Print every short-answer key that has no entry in the combined dictionary
for name in short_answer:
  if name not in final_combined_dict:
    print(name)

In [None]:
# Rebuild the short answers with their keys in sorted order
my_list = sorted(short_answer.keys())
_short_answer = {key: short_answer[key] for key in my_list}
short_answer = _short_answer

# Save the sorted short answers
with open('short_answer_dict.pkl', 'wb') as file:
  pickle.dump(short_answer, file)

In [None]:
# Display one merged entry to check its structure
final_combined_dict["Algebraic Curve"]

{'mizar': None,
 'wolfram': {'wikidata': [{'id': 'Q266237',
    'title': 'Q266237',
    'pageid': 258014,
    'concepturi': 'http://www.wikidata.org/entity/Q266237',
    'repository': 'wikidata',
    'url': '//www.wikidata.org/wiki/Q266237',
    'display': {'label': {'value': 'algebraic curve', 'language': 'en'},
     'description': {'value': 'algebraic variety of dimension one',
      'language': 'en'}},
    'label': 'algebraic curve',
    'description': 'algebraic variety of dimension one',
    'match': {'type': 'label', 'language': 'en', 'text': 'algebraic curve'}},
   {'id': 'Q59256198',
    'title': 'Q59256198',
    'pageid': 59162229,
    'concepturi': 'http://www.wikidata.org/entity/Q59256198',
    'repository': 'wikidata',
    'url': '//www.wikidata.org/wiki/Q59256198',
    'display': {'label': {'value': 'Algebraic Curve for the SO(6) Sector of AdS/CFT',
      'language': 'en'},
     'description': {'value': 'scholarly article by N. Beisert et al published 4 March 2006 in Commu