In [1]:
import requests
import re
from bs4 import BeautifulSoup
import wikipedia
import os

## Fetch text data

In [20]:
# Wikipedia list page that get_sculptors() is called with further down in this cell.
BASE_URL = "https://en.wikipedia.org/wiki/List_of_sculptors"

def get_sculptors(url, limit=100):
    """Scrape sculptor names and article URLs from a Wikipedia list page.

    Only the first ``<div class="div-col">`` element of the page is inspected.
    For every ``<li>`` inside it, the first ``<a>`` supplies the name (its
    stripped text) and the article path (its ``href``).

    Parameters
    ----------
    url : str
        Address of the list page to download.
    limit : int, optional
        Maximum number of ``<li>`` items to inspect. Items without a usable
        link still count towards the limit, so fewer than ``limit`` entries
        may be returned.

    Returns
    -------
    dict
        Maps sculptor name to ``https://en.wikipedia.org`` + href. Empty when
        the page has no ``div-col`` container.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    sculptors = {}
    list_elements = soup.find('div', class_='div-col')
    if not list_elements:
        return sculptors

    for idx, item in enumerate(list_elements.find_all('li')):
        if idx >= limit:
            break
        link = item.find('a')
        if not link:
            continue
        name = link.text.strip()
        href = link.get('href')
        # Skip anchors without a target or visible text; they would otherwise
        # yield a bogus "...orgNone" URL or an empty-string key.
        if not name or not href:
            continue
        sculptors[name] = f"https://en.wikipedia.org{href}"

    return sculptors


# Scrape the sculptor names and article URLs once; `sculptors` is a dict of name -> URL.
sculptors = get_sculptors(BASE_URL)

# Save the names to a file for later use.
# The encoding is set explicitly because names may contain non-ASCII characters
# that a platform default encoding (e.g. cp1252) cannot represent.
with open("names.txt", "w", encoding="utf-8") as file:
    for name in sculptors:
        file.write(str(name) + "\n")

In [6]:
def get_summaries(title):
    """Fetch the plain-text extract of one Wikipedia page through the MediaWiki API.

    Parameters
    ----------
    title : str
        Page title, sent as the ``titles`` query parameter.

    Returns
    -------
    str or None
        The ``extract`` text of the first page in the response. An empty
        string is returned when that page entry carries no extract. ``None``
        is returned when the response lists no pages, or when the request or
        the JSON decoding failed (a message is printed in that case).
    """
    # Define the base URL for the Wikipedia API
    base_url = "https://en.wikipedia.org/w/api.php"

    # Define the parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": title,
        "explaintext": True,  # Return plain text instead of HTML
    }

    # The network call and JSON decoding are the steps that can actually fail,
    # so they are the ones guarded here. The timeout keeps a stalled
    # connection from blocking the whole scraping loop.
    try:
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode errors on older requests versions.
        print(f"Request failed for {title}: {e}")
        return None

    pages = data.get("query", {}).get("pages", {})
    if not pages:
        return None

    # A single title was requested, so only the first page entry is used.
    first_page = next(iter(pages.values()))
    return first_page.get("extract", "")

In [4]:
# The per-sculptor text files are written into the directory created here.
# Creating 'data/Sculptors' also creates 'data', where the CSV is written later.
sculptors_dir = 'data/Sculptors'
os.makedirs(sculptors_dir, exist_ok=True)

data_sculp = []
for name in sculptors:
    content = get_summaries(name)
    if not content:
        # No usable text (empty extract, no page, or failed request): skip.
        continue
    # Append data to list
    data_sculp.append({'Category': 'Sculptors', 'Name': name, 'Content': content})
    # A path separator inside a name would otherwise point into a
    # sub-directory that does not exist.
    safe_name = name.replace('/', '_').replace('\\', '_')
    filename = os.path.join(sculptors_dir, f'{safe_name}_sculptor.txt')
    # Save the content to a text file; explicit UTF-8 because the article
    # text can contain characters outside the platform default encoding.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

In [None]:
import pandas as pd
# Collect the per-sculptor records (Category / Name / Content) into one table.
biography = pd.DataFrame(data_sculp)
# index=False leaves the DataFrame's row index out of the CSV file.
biography.to_csv('data/biography_sculptor.csv', index=False)

In [7]:
def get_computer_scientists(url, limit=100):
    """Scrape computer-scientist names and article URLs from a Wikipedia list page.

    Only the first ``<div class="mw-body-content">`` element of the page is
    inspected. For every ``<li>`` inside it, the first ``<a>`` supplies the
    name (its stripped text) and the article path (its ``href``).

    Parameters
    ----------
    url : str
        Address of the list page to download.
    limit : int, optional
        Maximum number of ``<li>`` items to inspect. Items without a usable
        link still count towards the limit, so fewer than ``limit`` entries
        may be returned.

    Returns
    -------
    dict
        Maps name to ``https://en.wikipedia.org`` + href. Empty when the page
        has no ``mw-body-content`` container.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    computer_scientists = {}
    list_elements = soup.find('div', class_='mw-body-content')
    if not list_elements:
        return computer_scientists

    for idx, item in enumerate(list_elements.find_all('li')):
        if idx >= limit:
            break
        link = item.find('a')
        if not link:
            continue
        name = link.text.strip()
        href = link.get('href')
        # Skip anchors without a target or visible text; they would otherwise
        # yield a bogus "...orgNone" URL or an empty-string key.
        if not name or not href:
            continue
        computer_scientists[name] = f"https://en.wikipedia.org{href}"

    return computer_scientists

# Scrape the computer-scientist list page into a dict of name -> article URL,
# using the function's default limit.
computers = get_computer_scientists('https://en.wikipedia.org/wiki/List_of_computer_scientists')
# Disabled code: it would re-scrape the sculptors, append the computer
# scientists' names to names.txt, and print the scraped dict.
# sculptors = get_sculptors(BASE_URL)
# with open("names.txt", "a") as file:
#   for i in computers:
#     file.write(str(i) + "\n")
# print(computers)

In [5]:
# The per-person text files are written into the directory created here.
# Creating 'data/Computer_Scientists' also creates 'data', where the CSV is written later.
cs_dir = 'data/Computer_Scientists'
os.makedirs(cs_dir, exist_ok=True)

data_cs = []
for name in computers:
    content = get_summaries(name)
    if not content:
        # No usable text (empty extract, no page, or failed request): skip.
        continue
    # Append data to list
    data_cs.append({'Category': 'Computer_Scientists', 'Name': name, 'Content': content})
    # A path separator inside a name would otherwise point into a
    # sub-directory that does not exist.
    safe_name = name.replace('/', '_').replace('\\', '_')
    filename = os.path.join(cs_dir, f'{safe_name}_computerscientist.txt')
    # Save the content to a text file; explicit UTF-8 because the article
    # text can contain characters outside the platform default encoding.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

In [11]:
# Collect the per-computer-scientist records (Category / Name / Content) into one table.
# `pd` is not imported in this cell; it must already be in the kernel namespace.
biography_cs = pd.DataFrame(data_cs)
# index=False leaves the DataFrame's row index out of the CSV file.
biography_cs.to_csv('data/biography_cs.csv', index=False)

## Fetch RDF triples

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON


In [None]:
def get_rdf_triples(name):
    """Query the DBpedia SPARQL endpoint for triples related to a named resource.

    The resource IRI is ``http://dbpedia.org/resource/`` followed by ``name``
    with spaces replaced by underscores. The query is the union of two
    patterns, capped at 100 distinct rows:

    * ``?subject ?predicate <resource>`` joined with ``?subject ?predicate ?object``
    * ``?object ?predicate <resource>`` joined with ``?subject ?predicate ?object``

    Parameters
    ----------
    name : str
        Human-readable resource name, e.g. a Wikipedia article title.

    Returns
    -------
    list of dict
        One dict per result row with the keys ``'subject'``, ``'predicate'``
        and ``'object'``. ``'subject'`` is always the queried resource IRI.
    """
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    encoded_name = name.replace(' ', '_')

    # Address the resource by its full IRI instead of the prefixed form
    # ``dbr:Name``. Prefixed names cannot contain characters such as
    # parentheses, commas or apostrophes unescaped, so a title like
    # "John Smith (sculptor)" would make the query fail to parse.
    # The few characters that are illegal inside <...> are percent-encoded.
    iri_forbidden = '<>"{}|^`\\'
    resource_iri = "http://dbpedia.org/resource/" + "".join(
        f"%{ord(ch):02X}" if ch in iri_forbidden or ord(ch) <= 0x20 else ch
        for ch in encoded_name
    )
    resource = f"<{resource_iri}>"

    query = f"""
    SELECT DISTINCT ?subject ?predicate ?object
    WHERE {{
      {{
        ?subject ?predicate {resource} .
        ?subject ?predicate ?object .
      }}
      UNION
      {{
        ?object ?predicate {resource} .
        ?subject ?predicate ?object .
      }}
    }}
    LIMIT 100
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # NOTE(review): the ?subject binding returned by the endpoint is discarded
    # and every triple is reported with the queried resource as its subject.
    # Confirm this is intended; the matched triples may have other subjects.
    triples = []
    for result in results["results"]["bindings"]:
        triples.append({
            'subject': resource_iri,
            'predicate': result["predicate"]["value"],
            'object': result["object"]["value"],
        })
    return triples

In [None]:
import json

In [None]:
def save_rdf_triples(names, output_dir):
    """Fetch the RDF triples for each name and save them as one JSON file per name.

    Parameters
    ----------
    names : iterable of str
        Names to look up. A dict works too; only its keys are used.
    output_dir : str
        Directory that receives the ``<name>.json`` files. It is created if
        it does not exist yet.

    Notes
    -----
    Processing is best-effort: an error for one name is printed and the loop
    moves on to the next name. Names without any triples are reported and
    produce no file.
    """
    os.makedirs(output_dir, exist_ok=True)

    for name in names:
        try:
            triples = get_rdf_triples(name)
            if not triples:
                print(f"No RDF triples found for {name}")
                continue

            # A path separator inside a name would otherwise point into a
            # sub-directory that does not exist.
            safe_name = name.replace('/', '_').replace('\\', '_')
            output_file = os.path.join(output_dir, f"{safe_name}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(triples, f, indent=4)

        except Exception as e:
            # Deliberately broad: one failing lookup must not abort the batch.
            print(f"Error processing {name}: {e}")
In [None]:
# Write one JSON file of triples per sculptor name into ./sculptors_rdf.
save_rdf_triples(sculptors, 'sculptors_rdf')

In [None]:
# Write one JSON file of triples per computer-scientist name into ./computers_rdf.
save_rdf_triples(computers, 'computers_rdf')