In [1]:
import requests
import re
from bs4 import BeautifulSoup
import wikipedia
import wikipediaapi
import os

In [2]:
BASE_URL = "https://en.wikipedia.org/wiki/List_of_sculptors"

def get_sculptors(url):
    response = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'html.parser')
    sculptors = {}
    list_elements = soup.find('div', class_='div-col')
    if list_elements:
        list_sculptors = list_elements.find_all('li')
        for sculptor in list_sculptors:
            link = sculptor.find('a')
            if link:
                name = link.text.strip()
                sculptor = link.get('href')
                full_url = f"https://en.wikipedia.org{sculptor}"
                sculptors[name] = full_url

    return sculptors


sculptors = get_sculptors(BASE_URL)
print(sculptors)

{'Wäinö Aaltonen': 'https://en.wikipedia.org/wiki/W%C3%A4in%C3%B6_Aaltonen', 'Johannes Josephus Aarts': 'https://en.wikipedia.org/wiki/Johannes_Josephus_Aarts', 'Magdalena Abakanowicz': 'https://en.wikipedia.org/wiki/Magdalena_Abakanowicz', 'Elfriede Abbe': 'https://en.wikipedia.org/wiki/Elfriede_Abbe', 'Louise Abbéma': 'https://en.wikipedia.org/wiki/Louise_Abb%C3%A9ma', 'Abed Abdi': 'https://en.wikipedia.org/wiki/Abed_Abdi', 'Pablita Abeyta': 'https://en.wikipedia.org/wiki/Pablita_Abeyta', 'Antonio Abondio': 'https://en.wikipedia.org/wiki/Antonio_Abondio', 'Per Abramsen': 'https://en.wikipedia.org/wiki/Per_Abramsen', 'Julio Abril': 'https://en.wikipedia.org/wiki/Julio_Abril', 'Jane Ackroyd': 'https://en.wikipedia.org/wiki/Jane_Ackroyd', 'József Ács': 'https://en.wikipedia.org/wiki/J%C3%B3zsef_%C3%81cs_(sculptor)', 'Lambert-Sigisbert Adam': 'https://en.wikipedia.org/wiki/Lambert-Sigisbert_Adam', 'Nicolas-Sébastien Adam': 'https://en.wikipedia.org/wiki/Nicolas-S%C3%A9bastien_Adam', 'Ali

In [3]:
def get_summaries(title):
    # Define the base URL for the Wikipedia API
    base_url = "https://en.wikipedia.org/w/api.php"

    # Define the parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": title,
        "explaintext": True,  # Return plain text instead of HTML
    }

    # Send a GET request to the Wikipedia API
    response = requests.get(base_url, params=params)
    data = response.json()

    # Extract the page content from the API response
    try:
        pages = data.get("query", {}).get("pages", {})
        if pages:
            # Retrieve the content of the first page (there should be only one page)
            page_id = list(pages.keys())[0]
            content = pages[page_id].get("extract", "")
            return content
        else:
            return None
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"DisambiguationError for : {e}")

In [4]:
if not os.path.exists('Sculptors'):
    os.makedirs('Sculptors')
data_sculp = []
for name, link in sculptors.items():
    content  = get_summaries(name)
    if content:
        # Append data to list
        data_sculp.append({'Category': 'Sculptors', 'Name': name, 'Content': content})
        filename = f'Sculptors/{name}_sculptor.txt'
        # Save the content to a text file
        with open(filename, 'w') as file:
            file.write(content)

In [5]:
import pandas as pd
biography = pd.DataFrame(data_sculp)
biography.to_csv('biography_sculptor.csv', index=False)

In [30]:
def get_computer_scientists(url):
    response = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'html.parser')

    computer_scientists = {}
    list_elements = soup.find('div', class_='mw-body-content')
    if list_elements:
        list_computer_scientists = list_elements.find_all('li')
        for computer_scientist in list_computer_scientists:
            link = computer_scientist.find('a')
            if link:
                name = link.text.strip()
                computer_scientist = link.get('href')
                full_url = f"https://en.wikipedia.org{computer_scientist}"
                computer_scientists[name] = full_url

    return computer_scientists

computers = get_computer_scientists('https://en.wikipedia.org/wiki/List_of_computer_scientists')
# print(computers)  

In [8]:
if not os.path.exists('Computer_Scientists'):
    os.makedirs('Computer_Scientists')
data_cs = []
for name, link in computers.items():
    content  = get_summaries(name)
    if content:
        # Append data to list
        data_cs.append({'Category': 'Computer_Scientists', 'Name': name, 'Content': content})
        filename = f'Computer_Scientists/{name}_computerscientist.txt'
        # Save the content to a text file
        with open(filename, 'w') as file:
            file.write(content)

In [11]:
biography_cs = pd.DataFrame(data_cs)
biography.to_csv('biography_cs.csv', index=False)

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
sparql = SPARQLWrapper("https://en.wikipedia.org")



In [48]:
import json
from SPARQLWrapper import SPARQLWrapper, JSON
from urllib.parse import quote

def get_wikidata_qid(person):
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    encode = quote(person)
    query = """
  SELECT DISTINCT ?person WHERE {
  {
    ?person rdfs:label ?label .
    FILTER(CONTAINS(LCASE(?label), LCASE("%s"))) .
  } UNION {
    ?person skos:altLabel ?altLabel .
    FILTER(CONTAINS(LCASE(?altLabel), LCASE("%s"))) .
  }
  ?person wdt:P31 wd:Q5 .  # instance of human
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
LIMIT 1 
""" % (encode, encode)
    # Use person twice for both label and altLabel filters

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    
    # Check if results exist before returning
    if results["results"]["bindings"]:  
        return results["results"]["bindings"][0]["person"]["value"]
    else:
        return None

In [49]:
def get_rdf_triplets(person):
    qid = get_wikidata_qid(person)
    if not qid:
        print(f"Person '{person}' not found in Wikidata.")
        return []

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = """
    SELECT ?predicate ?object WHERE {
      <""" + qid + """> ?predicate ?object .
    }
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    
    triples = []
    for result in results["results"]["bindings"]:
        subj = "<" + qid + ">"
        pred = result["predicate"]["value"]
        obj = result["object"]["value"]
        if result["object"]["type"] == "uri":
            obj = "<" + obj + ">"
        else:
            obj = '"' + obj + '"'
        triples.append({'subject': subj, 'predicate': pred, 'object': obj})
    
    return triples

In [50]:
def save_rdf_triples(category, names, output_dir):
    category_dir = os.path.join(output_dir)
    os.makedirs(category_dir, exist_ok=True)

    for name in names:
        try:
            triplets = get_rdf_triplets(name)

            if not triplets:
                output_file = os.path.join(category_dir, f"{name}.json")
                with open(output_file, 'w') as f:
                    json.dump(triplets, f, indent=4)
            else:
                print(f"No RDF triples found for {name}_{category}")

        except Exception as e:
            print(f"Error processing {name}: {e}")

In [51]:
save_rdf_triples('Sculptors', sculptors, 'sculptors_rdf')

Error processing Wäinö Aaltonen: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=\n  SELECT DISTINCT ?person WHERE {\n  {\n    ?person rdfs:label ?label .\n    FILTER(CONTAINS(LCASE(?label), LCASE("W%C3%A4in%C3%B6%20Aaltonen"))) .\n  } UNION {\n    ?person skos:altLabel ?altLabel .\n    FILTER(CONTAINS(LCASE(?altLabel), LCASE("W%C3%A4in%C3%B6%20Aaltonen"))) .\n  }\n  ?person wdt:P31 wd:Q5 .  # instance of human\n  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }\n}\nLIMIT 1 \n\njava.util.concurrent.TimeoutException\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:205)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata

KeyboardInterrupt: 

In [31]:
save_rdf_triples('Computer_Scientits', computers, 'Computer_sc_rdf')


Person 'Atta ur Rehman Khan' not found in Wikidata.
Person 'Wil van der Aalst' not found in Wikidata.
Person 'Scott Aaronson' not found in Wikidata.
Person 'Rediet Abebe' not found in Wikidata.
Person 'Hal Abelson' not found in Wikidata.
Person 'Serge Abiteboul' not found in Wikidata.
Person 'Samson Abramsky' not found in Wikidata.
Person 'Leonard Adleman' not found in Wikidata.
Person 'Manindra Agrawal' not found in Wikidata.
Person 'Luis von Ahn' not found in Wikidata.
Person 'Alfred Aho' not found in Wikidata.
Person 'Frances E. Allen' not found in Wikidata.
Person 'Gene Amdahl' not found in Wikidata.
Person 'David P. Anderson' not found in Wikidata.
Person 'Lisa Anthony' not found in Wikidata.
Person 'Andrew Appel' not found in Wikidata.
Person 'Cecilia R. Aragon' not found in Wikidata.
Person 'Bruce Arden' not found in Wikidata.


KeyboardInterrupt: 