<a href="https://colab.research.google.com/github/HeatherDriver/MathGraph/blob/main/00_Wolfram_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pickle
from google.colab import drive
import random

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount("/content/drive")
# Switch the working directory to the pickle folder so that the relative
# paths used by later cells ('topics.pkl', 'sub_topics.pkl', 'alg_2.pkl',
# 'alg_3.pkl') resolve inside this Drive folder.
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [3]:
# Scrape main structure from Wolfram Mathworld
# Result: `branches`, a list of heading strings taken from the home page.

url_to_scrape = 'https://mathworld.wolfram.com/'

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of parsing an error page into `branches`.
response = requests.get(url_to_scrape, timeout=30)
response.raise_for_status()

# NOTE(review): no parser is named, so BeautifulSoup picks whichever is
# installed - consider pinning one once the expected parser is confirmed.
soup_object = BeautifulSoup(response.text)

# Keep the text of every <h3> element that carries no href attribute.
branches = [tag.text for tag in soup_object.find_all('h3', href=False)]

In [4]:
# Remove numbers and symbols from scraped text

def clean_text(text):
    """Return ``text`` without parenthesised digit groups or a final '@'.

    Two passes are applied in order:
      1. every ``(<digits>)`` group anywhere in the string is removed;
      2. a single '@' sitting at the very end of the result is removed.
    """
    without_counts = re.sub(r'\(\d+\)', '', text)
    return re.sub(r'@\Z', '', without_counts)

def split_camel_case(text):
    """Insert a space at every lowercase-to-uppercase boundary in ``text``.

    The pattern is zero-width (lookbehind for a-z, lookahead for A-Z), so no
    characters are consumed; only a space is added between the two letters.
    """
    boundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
    return boundary.sub(' ', text)

In [5]:
# Highest level dictionary eg has 'Algebra' = ['Polynomials', 'Wavelets', ...]
#
# Produces three names used further down:
#   topics                 - branch name -> list of topic names on that branch page
#   cleaned_dict           - `topics` with clean_text applied to every entry
#   cleaned_dict_no_spaces - `cleaned_dict` with spaces stripped from every entry
#                            (the form used to build topic page URLs)
# The result is cached in 'topics.pkl'; the site is only scraped when that
# cache is missing or unreadable.
# NOTE: pickle.load can execute arbitrary code - only load caches you wrote yourself.
topics_loaded_from_cache = True
try:
  with open('topics.pkl', 'rb') as f:
    topics = pickle.load(f)

except (FileNotFoundError, EOFError, pickle.UnpicklingError):
  topics_loaded_from_cache = False
  topics = dict()

  for branch in branches:
    if branch in ['New in MathWorld', 'History and Terminology']:
      continue

    branch_to_find = branch.replace(' ', '')
    url_to_scrape = f"https://mathworld.wolfram.com/topics/{branch_to_find}.html"

    response = requests.get(url_to_scrape, timeout=30)
    if response.status_code != 200:
      # Skip this branch; otherwise the previous page's soup would be parsed
      # again and its topics stored under the wrong branch.
      continue
    soup_object = BeautifulSoup(response.text)

    # The topic list uses one of two class strings depending on the page.
    ul_tag = soup_object.find('ul', class_='topics-list three-columns two-columns__900')
    if not ul_tag:
      ul_tag = soup_object.find('ul', class_='topics-list two-columns__900')

    if ul_tag:
      li_tags = ul_tag.find_all('li')
      topics_list = [li.get_text(strip=True) for li in li_tags]
      topics[branch] = [clean_text(item) for item in topics_list]

# Built once, after either path, so the following cell can rely on these names
# whether `topics` came from the cache or from a fresh scrape.
cleaned_dict = {key: [clean_text(item) for item in values] for key, values in topics.items()}
cleaned_dict_no_spaces = {key: [item.replace(' ', '') for item in values] for key, values in cleaned_dict.items()}

# Only write the cache after a scrape that actually found something, so an
# empty result is not served from cache on the next run.
if not topics_loaded_from_cache and cleaned_dict:
  with open('topics.pkl', 'wb') as f:
    pickle.dump(cleaned_dict, f)

In [6]:
# Lower level dictionary eg has 'Ring Theory' = ['Integral Element', 'Dedekind Ring', ...]
#
# The result is cached in 'sub_topics.pkl'; the site is only scraped when that
# cache is missing or unreadable.
# NOTE: pickle.load can execute arbitrary code - only load caches you wrote yourself.
# NOTE(review): the cache stores the branch-keyed dict (cleaned_dict_ii), so on
# a cache hit `sub_topics` is keyed by branch, whereas after a fresh scrape it
# is keyed by topic page name - confirm which shape downstream code expects.

try:
  with open('sub_topics.pkl', 'rb') as f:
    sub_topics = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
  sub_topics = dict()     # topic page name -> entries listed on that page
  sub_topics_ii = dict()  # branch name -> entries pooled over all of its topic pages

  # Derive the page names from `topics` directly, so this cell also works when
  # the previous cell loaded `topics` from its cache.
  topic_pages = {key: [clean_text(item).replace(' ', '') for item in values] for key, values in topics.items()}

  for key, value_list in topic_pages.items():
    for value in value_list:
      url_to_scrape = f"https://mathworld.wolfram.com/topics/{value}.html"
      response = requests.get(url_to_scrape, timeout=30)
      if response.status_code != 200:
        continue
      soup_object = BeautifulSoup(response.text)

      ul_tag = soup_object.find('ul', class_='topics-list three-columns two-columns__900')

      if not ul_tag:
        ul_tag = soup_object.find('ul', class_='topics-list two-columns__900')

      if not ul_tag:
        # Last resort: take the second <ul> on the page.
        first_ul = soup_object.find('ul')
        ul_tag = first_ul.find_next('ul') if first_ul else None

      if ul_tag:
        page_topics = [clean_text(li.get_text(strip=True)) for li in ul_tag.find_all('li')]
        sub_topics[value] = page_topics
        # setdefault + extend copies the entries into the branch list, so the
        # per-page list above is never aliased and a failed first page no
        # longer causes a KeyError for later pages of the same branch.
        sub_topics_ii.setdefault(key, []).extend(page_topics)

  # Built once after the scrape instead of on every page.
  cleaned_dict = {key: [clean_text(item) for item in values] for key, values in sub_topics.items()}
  cleaned_dict_ii = {key: [clean_text(item) for item in values] for key, values in sub_topics_ii.items()}
  # Rebinds cleaned_dict_no_spaces to the sub-topic level, as before.
  cleaned_dict_no_spaces = {key: [item.replace(' ', '') for item in values] for key, values in cleaned_dict.items()}

  # Only write the cache when something was scraped, so an empty result is
  # not served from cache on the next run.
  if cleaned_dict_ii:
    with open('sub_topics.pkl', 'wb') as f:
      pickle.dump(cleaned_dict_ii, f)

In [7]:
# Additional levels for linear algebra topics
# Hand-written mapping: linear algebra topic -> list of entry names.
# Every list item must be followed by a comma: two adjacent string literals
# are silently joined into one string by Python.
alg_2 = dict()

alg_2['Determinants'] = ["Casoratian", "Cauchy's Determinant Theorem", "Cayley-Menger Determinant", "Chió Pivotal Condensation", "Circulant Determinant", "Cofactor",
                         "Condensation", "Determinant", "Determinant Expansion by Minors", "Determinant Identities", "Determinant Theorem", "Gram Determinant", "Gram's Inequality",
                         "Hadamard's Maximum Determinant Problem", "Hadamard's Theorem", "Hafner-Sarnak-McCurley Constant", "Hessian", "Hill Determinant", "Hyperdeterminant", "Inversion Number",
                         "Jacobi's Determinant Identity", "Jacobi's Theorem", "Jacobian", "Mills-Robbins-Rumsey Determinant Formula", "Minor", "Pfaffian", "Pivotal Condensation", "Schweins's Theorem",
                         "Stäckel Determinant", "Sylvester's Determinant Identity", "Vandermonde Determinant"]

alg_2["General Linear Algebra"] = ["Alternating Multilinear Form", "Bilinear Basis", "Complex Vector Space", "Fredholm's Theorem", "Fundamental Matrix Subspaces", "Fundamental Theorem of Linear Algebra",
                                   "Haar Condition", "Hermitian Inner Product", "Invertible Linear Map", "Kernel", "Linear Algebra", "Linear Combination", "Linear Function", "Linear Transformation Kernel",
                                   "Linearly Independent", "Lorentzian Inner Product", "Null Space", "Nullity", "Nullspace", "Orthogonal Complement", "Orthogonal Set", "Orthogonal Sum", "Orthogonal Transformation",
                                   "Orthogonality Condition", "Orthonormal Basis", "Orthonormal Set", "Piecewise Linear Function", "Quotient Vector Space", "Rank-Nullity Theorem", "Real Vector Space", "Reduced Whitehead Group",
                                   "Vector Space Flag", "Vector Space Orientation", "Vector Space Projection", "Whitehead Group"]

# Each entry here is itself a key of the alg_3 dictionary built in the next cell.
alg_2["Lie Theory"] = ["Lie Algebra", "Lie Groups"]

alg_2["Linear Independence"] = ["Fredholm's Theorem", "Linearly Dependent Functions", "Linearly Dependent Vectors", "Linearly Independent", "Orthogonal", "Vector Basis", "Wronskian"]

alg_2["Linear Systems of Equations"] = ["Basis Vector", "Change of Basis", "Change of Coordinates Matrix", "Cramer's Rule", "Gauss-Seidel Method", "Jacobi Method", "Linear Equation", "Linear System of Equations",
                                        "Linear Transformation", "Standard Basis", "Stationary Iterative Method", "Successive Overrelaxation Method", "Symmetric Successive Overrelaxation Method", "Vector Basis"]

# Each entry here is itself a key of the alg_3 dictionary built in the next cell.
alg_2["Matrices"] = ["Integer Matrices", "Matrix Decomposition", "Matrix Eigenvalues", "Matrix Groups", "Matrix Inversion", "Matrix Norms", "Matrix Operations", "Matrix Properties", "Matrix Types"]

alg_2["Permanents"] = ["Frobenius-König Theorem", "Permanent", "Ryser Formula"]

# Persist the mapping next to the other pickle caches.
with open('alg_2.pkl', 'wb') as f:
  pickle.dump(alg_2, f)

In [8]:
# Even lower level for lie algebra and matrices
# Hand-written mapping: sub-topic name -> list of entry names. The keys are the
# entries of alg_2["Lie Theory"] and alg_2["Matrices"].
# Every list item must be followed by a comma: two adjacent string literals
# are silently joined into one string by Python.
alg_3 = dict()

alg_3["Lie Algebra"] = ["Adjoint Representation", "Ado's Theorem", "Borel Algebra", "Cartan Algebra", "Cartan Matrix", "Cartan Subalgebra", "Casimir Operator", "Chevalley-Serre Relations", "Commutator", "Derivation Algebra",
                        "Dynkin Diagram", "Engel's Theorem", "Exceptional Lie Algebra", "Highest Weight Theorem", "Iwasawa's Theorem", "Jacobi Identities", "Killing Form", "Lie Algebra", "Lie Algebra Commutator Series",
                        "Lie Algebra Lower Central Series", "Lie Algebra Representation", "Lie Algebra Root", "Lie Algebra Simple Root", "Lie Algebra Weight", "Lie Algebroid", "Lie Bracket", "Lie Product", "Lie Subalgebra",
                        "Macdonald's Constant-Term Conjecture", "Nilpotent Lie Algebra", "Poincaré-Birkhoff-Witt Theorem", "Reduced Root System", "Root Lattice", "Root System", "Semisimple Lie Algebra", "Serre Relations",
                        "Simple Lie Algebra", "Solvable Lie Algebra", "Structure Constant", "Weyl Group"]

alg_3["Lie Groups"] = ["Anyon", "Associated Vector Bundle", "Chevalley Groups", "Commutator", "Compact Lie Group", "Exponential Map", "Flag Manifold", "Homogeneous Space", "Jacobi Identities", "Lie Group", "Lie Group Quotient Space",
                       "Lie-Type Group", "Nil Geometry", "Nilmanifold", "Nilpotent Lie Group", "Principal Bundle", "Semisimple Lie Group", "Sol Geometry", "Solvable Lie Group", "Special Linear Group", "Special Unitary Group",
                       "Topological Group", "Twisted Chevalley Groups"]

alg_3["Integer Matrices"] = ["Alternating Sign Matrix", "Antimagic Square", "Binary Matrix", "Boolean Matrix", "C-Matrix", "Frobenius-König Theorem", "Gale-Ryser Theorem", "Hadamard's Maximum Determinant Problem",
                             "Hafner-Sarnak-McCurley Constant", "Hard Square Entropy Constant", "Heterosquare", "Identity Matrix", "Incidence Matrix", "Integer Matrix", "Interspersion", "Lam's Problem", "Logical Matrix",
                             "Mortal", "Mortality Problem", "Null Matrix", "Paley Class", "Paley Construction", "Paley's Theorem", "Refined Alternating Sign Matrix Conjecture", "Relation Matrix", "Special Matrix",
                             "Stolarsky Array", "Unit Matrix", "Weisstein's Conjecture", "Zero Matrix"]

alg_3["Matrix Decomposition"] = ["Cholesky Decomposition", "Eigen Decomposition", "Eigen Decomposition Theorem", "Eigenvalue", "Eigenvector", "Fundamental Matrix Subspaces", "Fundamental Theorem of Linear Algebra",
                                 "Hermite Normal Form", "Hessenberg Decomposition", "Jacobson Canonical Form", "Jordan Basis", "Jordan Block", "Jordan Canonical Form", "Jordan Matrix Decomposition", "Le Paige's Theorem",
                                 "LU Decomposition", "Matrix Decomposition", "Matrix Diagonalization", "Orthogonal Decomposition", "QR Decomposition", "Rational Canonical Form", "Schur Decomposition", "Singular Value",
                                 "Singular Value Decomposition", "Smith Normal Form"]

alg_3["Matrix Eigenvalues"] = ["Ballieu's Theorem", "Characteristic Polynomial", "Characteristic Root", "Characteristic Value", "Determined by Spectrum", "Eigen Decomposition", "Eigenspace", "Eigenvalue", "Eigenvector",
                               "Frobenius Theorem", "Generalized Eigenvector", "Gerschgorin Circle Theorem", "Gersgorin Circle Theorem", "Gershgorin Circle Theorem", "Girko's Circular Law", "Graph Spectrum", "Haemers Conjecture",
                               "Integral Graph", "Lanczos Algorithm", "Left Eigenvector", "Lyapunov's First Theorem", "Lyapunov's Second Theorem", "Majorization", "Matrix Diagonalization", "Matrix Spectrum", "McCoy's Theorem",
                               "Ostrowski's Theorem", "Parodi's Theorem", "Perron-Frobenius Theorem", "Perron's Theorem", "Positive Eigenvalued Matrix", "Right Eigenvector", "Routh-Hurwitz Theorem", "Schur Decomposition",
                               "Schur's Inequalities", "Singular Value", "Spectral Norm", "Spectral Radius", "Spectrally Unique", "Wielandt's Theorem", "Wigner's Semicircle Law"]

alg_3["Matrix Groups"] = ["General Linear Group", "Heisenberg Group", "Lie-Type Group", "Linear Algebraic Group", "Maschke's Theorem", "Matrix Group", "Orthogonal Group", "Rotation Group", "Special Linear Group", "Special Orthogonal Group",
                          "Special Unitary Group", "Symplectic Group"]

alg_3["Matrix Inversion"] = ["Drazin Inverse", "Inverse Matrix", "Matrix 1-Inverse", "Matrix Inverse", "Matrix Inversion", "Moore-Penrose Matrix Inverse", "Pseudoinverse"]

alg_3["Matrix Norms"] = ["Compatible", "Euclidean Norm", "Frobenius Norm", "Hilbert-Schmidt Norm", "Matrix Norm", "Matrix p-Norm", "Maximum Absolute Column Sum Norm", "Maximum Absolute Row Sum Norm", "Natural Norm",
                         "Spectral Norm"]

alg_3["Matrix Operations"] = ["Antihermitian Part", "Antisymmetric Part", "Conjugate Transpose", "Conjugate Transpose Matrix", "Echelon Form", "Elementary Row and Column Operations", "Fundamental Matrix Subspaces",
                              "Fundamental Theorem of Linear Algebra", "Gauss-Jordan Algorithm", "Gauss-Jordan Elimination", "Gauss-Jordan Elimination Method", "Gauss-Jordan Method", "Gaussian Elimination", "Hermitian Part",
                              "Hermitian Transpose", "Infinitesimal Matrix Change", "Invertible Matrix Theorem", "Jacobi Transformation", "Kronecker Product", "Kronecker Sum", "Matrix Addition", "Matrix Diagonalization",
                              "Matrix Direct Product", "Matrix Direct Sum", "Matrix Equality", "Matrix Exponential", "Matrix Fraction", "Matrix Multiplication", "Matrix Power", "Matrix Product", "Moore-Penrose Pseudoinverse",
                              "Normal Equation", "Pivoting", "Reduced Echelon Form", "Reduced Row Echelon Form", "Row Canonical Form", "Row Echelon Form", "Row-Reduced Echelon Form", "Sherman-Morrison Formula",
                              "Skew Hermitian Part", "Square Root Method", "Strassen Formulas", "Symmetric Part", "Transpose", "Woodbury Formula"]

alg_3["Matrix Properties"] = ["Augmented Matrix", "Bandwidth", "Bourque-Ligh Conjecture", "Cayley-Hamilton Theorem", "Characteristic Equation", "Combinatorial Matrix Theory", "Condition Number", "Diagonal", "Diagonalizable Matrix",
                              "Fredholm's Theorem", "Grothendieck's Constant", "Horn's Theorem", "Ill-Conditioned Matrix", "Immanant", "Invariant Factor", "Matrix", "Matrix Equality", "Matrix Minimal Polynomial", "Matrix Rank",
                              "Matrix Signature", "Matrix Trace", "Roth's Removal Rule", "Segre Characteristic", "Singular System", "Skew Diagonal", "Sturmian Separation Theorem", "Subdiagonal", "Superdiagonal",
                              "Sylvester's Criterion", "Totally Positive Matrix", "Transition Matrix"]

# NOTE(review): this list contains repeated entries (e.g. 'Positive Matrix',
# 'Nonnegative Matrix', 'Rotation Matrix', 'Square Matrix', 'Unitary Matrix')
# - confirm whether the duplicates are intended before de-duplicating.
alg_3["Matrix Types"] = ['Array', 'Column Space', 'Commuting Matrices', 'Congruent Matrices', 'Conjugate Transpose', 'Adjacency Matrix', 'DiagonalMatrixQ', 'Adjoint Matrix', 'Adjugate Matrix', 'Alternant Matrix', 'Antihermitian Matrix',
                         'Antimetric Matrix', 'Antisymmetric Matrix', 'Asymmetric Matrix', 'Bisymmetric Matrix', 'Block Diagonal Matrix', 'Block Matrix', 'Bohr Matrix', 'Centrosymmetric Matrix', 'Circulant Matrix', 'Companion Matrix',
                         'Positive Matrix', 'Complex Matrix', 'Conjugate Matrix', 'Copositive Matrix', 'Covariance Matrix', 'Coxeter Matrix', 'Fibonacci Q-Matrix', 'Defective Matrix', 'Diagonal Matrix', 'Diagonally Dominant Matrix',
                         'Gamma Matrix', 'Nonnegative Matrix', 'Stochastic Matrix', 'Elementary Matrix', 'Equivalent Matrix', 'Fourier Matrix', 'Generalized Gell-Mann Matrix', 'Dirac Matrices', 'Generalized Vandermonde Matrix', 'Gram Matrix',
                         'Hadamard Matrix', 'Hamiltonian Matrix', 'Hankel Matrix', 'Hermitian Matrix', 'Hessenberg Matrix', 'Hilbert Matrix', 'Householder Matrix', 'Idempotent Matrix', 'Identity Matrix', 'Completely Positive Matrix',
                         'Conditioned Matrix', 'Indefinite Matrix', 'Involutory Matrix', 'Irreducible Matrix', 'Rotation Matrix', 'Kac Matrix', 'Multiple Matrix', 'Lower Triangular Matrix', 'Square Matrix', 'Equation Matrix', 'Minimal Matrix',
                         'Rational Canonical Form', 'Monotonic Matrix', 'Definite Matrix', 'Negative Matrix', 'Semidefinite Matrix', 'Nilpotent Matrix', 'Nonnegative Matrix', 'Nonpositive Matrix', 'Nonsingular Matrix', 'Normal Matrix',
                         'Special Orthogonal Matrix', 'Pascal Matrix', 'Payoff Matrix', 'Periodic Matrix', 'Permutation Matrix', 'Polynomial Matrix', 'Definite Matrix', 'Positive Eigenvalued Matrix', 'Positive Matrix',
                         'Positive Semidefinite Matrix', 'Projection Matrix', 'Random Matrix', 'Real Matrix', 'Rectangular Matrix', 'Redheffer Matrix', 'Reducible Matrix', 'Rotation Matrix', 'Scalar Matrix', 'Schur Matrix', 'Seifert Matrix',
                         'Self-Adjoint Matrix', 'Shear Matrix', 'Singular Matrix', 'Skew Hermitian Matrix', 'Pauli Matrices', 'Skew Symmetric Matrix', 'Sparse Matrix', 'Orthogonal Matrix', 'Unitary Matrix', 'Square Matrix', 'Stability Matrix',
                         'Stochastic Matrix', 'Strictly Upper Triangular Matrix', 'Upper Triangular Matrix', 'Sylvester Matrix', 'Symmetric Matrix', 'Magic Square', 'Toeplitz Matrix', 'Tournament Matrix', 'Strictly Lower Triangular Matrix',
                         'Tridiagonal Matrix', 'Unimodular Matrix', 'Unitary Matrix', 'Triangular Matrix', 'Vandermonde Matrix', 'Wavelet Matrix', 'Matrix Polynomial', 'Hypermatrix', 'Jordan Block']

# Persist the mapping next to the other pickle caches.
with open('alg_3.pkl', 'wb') as f:
  pickle.dump(alg_3, f)