In [11]:
import os
import re
from urllib.parse import unquote

import requests
import wikipedia
import wikipediaapi
from bs4 import BeautifulSoup

In [12]:
BASE_URL = "https://en.wikipedia.org/wiki/List_of_sculptors"


def get_sculptors(url, timeout=30):
    """Scrape a Wikipedia list page and map each listed name to its article URL.

    Args:
        url: Address of the list page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict mapping the stripped link text of the first ``<a>`` in every
        ``<li>`` of the first ``div.div-col`` element to
        ``https://en.wikipedia.org`` + the link's ``href``. Empty when the page
        has no such ``div``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(
        url=url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout
    )
    # Fail loudly instead of silently parsing an error page into an empty dict.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    sculptors = {}
    # NOTE(review): soup.find() returns only the FIRST div.div-col; if the page
    # has one such div per section, later sections are not scanned - confirm.
    list_elements = soup.find('div', class_='div-col')
    if not list_elements:
        return sculptors

    for item in list_elements.find_all('li'):
        link = item.find('a')
        if not link:
            continue
        href = link.get('href')
        if not href:
            # An anchor without href would otherwise yield "...orgNone".
            continue
        sculptors[link.text.strip()] = f"https://en.wikipedia.org{href}"

    return sculptors


# Scrape the list page once; later cells iterate over this name -> URL mapping.
sculptors = get_sculptors(url=BASE_URL)
print(sculptors)

{'Wäinö Aaltonen': 'https://en.wikipedia.org/wiki/W%C3%A4in%C3%B6_Aaltonen', 'Johannes Josephus Aarts': 'https://en.wikipedia.org/wiki/Johannes_Josephus_Aarts', 'Magdalena Abakanowicz': 'https://en.wikipedia.org/wiki/Magdalena_Abakanowicz', 'Elfriede Abbe': 'https://en.wikipedia.org/wiki/Elfriede_Abbe', 'Louise Abbéma': 'https://en.wikipedia.org/wiki/Louise_Abb%C3%A9ma', 'Abed Abdi': 'https://en.wikipedia.org/wiki/Abed_Abdi', 'Pablita Abeyta': 'https://en.wikipedia.org/wiki/Pablita_Abeyta', 'Antonio Abondio': 'https://en.wikipedia.org/wiki/Antonio_Abondio', 'Per Abramsen': 'https://en.wikipedia.org/wiki/Per_Abramsen', 'Julio Abril': 'https://en.wikipedia.org/wiki/Julio_Abril', 'Jane Ackroyd': 'https://en.wikipedia.org/wiki/Jane_Ackroyd', 'József Ács': 'https://en.wikipedia.org/wiki/J%C3%B3zsef_%C3%81cs_(sculptor)', 'Lambert-Sigisbert Adam': 'https://en.wikipedia.org/wiki/Lambert-Sigisbert_Adam', 'Nicolas-Sébastien Adam': 'https://en.wikipedia.org/wiki/Nicolas-S%C3%A9bastien_Adam', 'Ali

In [5]:
def get_summaries(title):
    """Fetch the plain-text extract of one Wikipedia page via the MediaWiki API.

    Args:
        title: Page title passed as the ``titles`` query parameter.

    Returns:
        The ``extract`` text of the first page in the response, ``""`` when that
        page carries no extract (e.g. it does not exist), or ``None`` when the
        response contains no pages or the request/decoding fails.
    """
    base_url = "https://en.wikipedia.org/w/api.php"

    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": title,
        "explaintext": True,  # Return plain text instead of HTML
        "redirects": 1,  # Resolve redirect titles to the target article
    }

    # The only failures this code can hit are HTTP/JSON ones, so handle those
    # (the previous DisambiguationError handler could never be triggered here).
    try:
        response = requests.get(
            base_url,
            params=params,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: report and let the caller skip this title.
        print(f"Request failed for {title}: {e}")
        return None

    pages = data.get("query", {}).get("pages", {})
    if not pages:
        return None

    # A single title was requested, so only the first page entry is relevant.
    first_page = next(iter(pages.values()))
    return first_page.get("extract", "")

In [12]:
# Download one text extract per sculptor; keep it in memory and on disk.
os.makedirs('Sculptors', exist_ok=True)
data_sculp = []
for name, link in sculptors.items():
    # Query by the article title taken from the link rather than the display
    # text: the two differ for disambiguated pages such as "... (sculptor)".
    _, has_wiki_path, slug = link.partition('/wiki/')
    title = unquote(slug.split('#', 1)[0]).replace('_', ' ') if has_wiki_path else name
    content = get_summaries(title)
    if content:
        # Append data to list
        data_sculp.append({'Category': 'Sculptors', 'Name': name, 'Content': content})
        # Characters that are path separators or invalid in filenames would
        # make open() fail, so replace them before building the path.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        filename = f'Sculptors/{safe_name}_sculptor.txt'
        # Explicit UTF-8: extracts contain non-ASCII text and the platform
        # default encoding is not guaranteed to handle it.
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(content)

In [13]:
import pandas as pd
# Tabulate the collected sculptor records and persist them without the index column.
biography = pd.DataFrame(data=data_sculp)
biography.to_csv(path_or_buf='biography_sculptor.csv', index=False)

In [13]:
def get_computer_scientists(url, timeout=30):
    """Scrape a Wikipedia list page and map each listed name to its article URL.

    Args:
        url: Address of the list page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict mapping the stripped link text of the first ``<a>`` in every
        ``<li>`` under the first ``div.mw-body-content`` element to
        ``https://en.wikipedia.org`` + the link's ``href``. Entries with the
        same link text overwrite each other. Empty when no such ``div`` exists.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(
        url=url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout
    )
    # Fail loudly instead of silently parsing an error page into an empty dict.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    computer_scientists = {}
    # NOTE(review): every <li> in the content area is scanned, so list items
    # that are not people (navigation, references) may be included - confirm.
    list_elements = soup.find('div', class_='mw-body-content')
    if not list_elements:
        return computer_scientists

    for item in list_elements.find_all('li'):
        link = item.find('a')
        if not link:
            continue
        href = link.get('href')
        if not href:
            # An anchor without href would otherwise yield "...orgNone".
            continue
        computer_scientists[link.text.strip()] = f"https://en.wikipedia.org{href}"

    return computer_scientists

# Scrape the computer-scientist list; later cells iterate over this name -> URL mapping.
computers = get_computer_scientists(
    url='https://en.wikipedia.org/wiki/List_of_computer_scientists'
)
print(computers)

{'Atta ur Rehman Khan': 'https://en.wikipedia.org/wiki/Atta_ur_Rehman_Khan', 'Wil van der Aalst': 'https://en.wikipedia.org/wiki/Wil_van_der_Aalst', 'Scott Aaronson': 'https://en.wikipedia.org/wiki/Scott_Aaronson', 'Rediet Abebe': 'https://en.wikipedia.org/wiki/Rediet_Abebe', 'Hal Abelson': 'https://en.wikipedia.org/wiki/Hal_Abelson', 'Serge Abiteboul': 'https://en.wikipedia.org/wiki/Serge_Abiteboul', 'Samson Abramsky': 'https://en.wikipedia.org/wiki/Samson_Abramsky', 'Leonard Adleman': 'https://en.wikipedia.org/wiki/Leonard_Adleman', 'Manindra Agrawal': 'https://en.wikipedia.org/wiki/Manindra_Agrawal', 'Luis von Ahn': 'https://en.wikipedia.org/wiki/Luis_von_Ahn', 'Alfred Aho': 'https://en.wikipedia.org/wiki/Alfred_Aho', 'Frances E. Allen': 'https://en.wikipedia.org/wiki/Frances_E._Allen', 'Gene Amdahl': 'https://en.wikipedia.org/wiki/Gene_Amdahl', 'David P. Anderson': 'https://en.wikipedia.org/wiki/David_P._Anderson', 'Lisa Anthony': 'https://en.wikipedia.org/wiki/Lisa_Anthony', 'Andr

In [8]:
# Download one text extract per computer scientist; keep it in memory and on disk.
os.makedirs('Computer_Scientists', exist_ok=True)
data_cs = []
for name, link in computers.items():
    # Query by the article title taken from the link rather than the display
    # text, which can differ from the actual page title.
    _, has_wiki_path, slug = link.partition('/wiki/')
    title = unquote(slug.split('#', 1)[0]).replace('_', ' ') if has_wiki_path else name
    content = get_summaries(title)
    if content:
        # Append data to list
        data_cs.append({'Category': 'Computer_Scientists', 'Name': name, 'Content': content})
        # Characters that are path separators or invalid in filenames would
        # make open() fail, so replace them before building the path.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        filename = f'Computer_Scientists/{safe_name}_computerscientist.txt'
        # Explicit UTF-8: extracts contain non-ASCII text and the platform
        # default encoding is not guaranteed to handle it.
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(content)

In [11]:
# Persist the computer-scientist records. The frame written must be
# biography_cs; writing `biography` would export the sculptor data instead.
biography_cs = pd.DataFrame(data_cs)
biography_cs.to_csv('biography_cs.csv', index=False)

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
# en.wikipedia.org does not expose a SPARQL endpoint; use the DBpedia endpoint
# that the query helpers in this notebook also target.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")



In [4]:
import json
from SPARQLWrapper import SPARQLWrapper, JSON

def extract_category_facts(category_name, output_file, limit=100):
    """Query DBpedia for triples whose subject URI matches a name and save them as JSON.

    Args:
        category_name: Text used as a case-insensitive regex against the
            subject URI of each triple.
        output_file: Path of the JSON file to write; it receives a list of
            ``{"subject", "predicate", "object"}`` dicts.
        limit: Maximum number of triples requested from the endpoint.

    Returns:
        None. The result is written to ``output_file``.
    """
    # Initialize SPARQL endpoint
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")

    # Escape backslashes and double quotes so the name cannot terminate or
    # corrupt the SPARQL string literal it is embedded in.
    pattern = category_name.replace("\\", "\\\\").replace('"', '\\"')

    # Construct the SPARQL query to retrieve RDF triples related to the category
    query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX dcterms: <http://purl.org/dc/terms/>
    SELECT ?subject ?predicate ?object
    WHERE {
        ?subject rdf:type ?category .
        ?subject ?predicate ?object .
        FILTER regex(str(?subject), "http://dbpedia.org/resource/") .
        FILTER regex(str(?subject), "%s", "i") .
    }
    LIMIT %d
    """ % (pattern, int(limit))

    # Set the SPARQL query and return format
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    # Execute the SPARQL query
    results = sparql.query().convert()

    # Flatten each binding to its plain values
    triples = [
        {
            "subject": binding["subject"]["value"],
            "predicate": binding["predicate"]["value"],
            "object": binding["object"]["value"],
        }
        for binding in results["results"]["bindings"]
    ]

    # Save RDF triples to a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(triples, f, indent=4)

# Example usage: dump the matching triples for one category into a JSON file.
category_name, output_file = "Category:Programming_languages", "category_facts.json"
extract_category_facts(category_name=category_name, output_file=output_file)


In [27]:
from urllib.parse import quote

In [31]:
def get_rdf_triplets(person):
    """Fetch every (predicate, object) pair DBpedia holds for one resource.

    Args:
        person: Name of the resource; spaces are turned into underscores and
            the result is percent-encoded to form the DBpedia resource URI.

    Returns:
        list of ``{'subject', 'predicate', 'object'}`` dicts, where subject is
        the resource URI wrapped in angle brackets. Empty when the endpoint
        returns no bindings.
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    # Replace spaces (not the empty string, which would insert "_" between
    # every character) to build the resource name.
    # NOTE(review): quote() also percent-encodes characters such as "(" and
    # non-ASCII letters; confirm this matches how DBpedia spells its URIs.
    encode_name = quote(person.replace(' ', '_'))
    dbpedia_resource = "http://dbpedia.org/resource/" + encode_name
    query = "SELECT ?predicate ?object WHERE { <" + dbpedia_resource + "> ?predicate ?object.}"
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    subj = "<" + dbpedia_resource + ">"
    return [
        {
            'subject': subj,
            'predicate': binding["predicate"]["value"],
            'object': binding["object"]["value"],
        }
        for binding in results["results"]["bindings"]
    ]

In [44]:
import logging

In [46]:
# Send INFO-and-above log records to a file next to the notebook.
logging.basicConfig(level=logging.INFO, filename='rdf_extraction.log')
def save_rdf_triple(category, names, output_dir):
    """Fetch the RDF triples for each name and write them to one JSON file per name.

    Args:
        category: Label for the group of names.
            NOTE(review): currently unused; files go straight into
            ``output_dir`` - confirm whether a per-category subfolder was meant.
        names: Iterable of resource names passed to ``get_rdf_triplets``.
        output_dir: Directory that receives ``<name>.json``; created if missing.

    Returns:
        None. Names without triples are logged and skipped; any error for a
        single name is reported and does not stop the remaining names.
    """
    os.makedirs(output_dir, exist_ok=True)

    for name in names:
        try:
            triplets = get_rdf_triplets(name)

            if not triplets:
                logging.warning(f"No triples found for {name}. Skipping...")
                continue

            # Path separators and other characters invalid in filenames would
            # make open() fail and the name be dropped, so replace them.
            safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
            output_file = os.path.join(output_dir, f"{safe_name}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(triplets, f, indent=4)
        except Exception as e:
            # Best effort per name: record the traceback in the log as well as
            # showing the short message in the cell output.
            logging.exception(f"Error processing {name}")
            print(f"Error processing {name}: {e}")

In [47]:
# Export one JSON file of DBpedia triples per scraped sculptor name.
save_rdf_triple(category='Sculptors', names=sculptors, output_dir='sculptors_rdf')

In [41]:
# Export one JSON file of DBpedia triples per scraped computer-scientist name.
# The category label is spelled like the 'Computer_Scientists' label used elsewhere.
save_rdf_triple('Computer_Scientists', computers, 'Computer_sc_rdf')


KeyboardInterrupt: 