In [None]:
import pickle
from rdflib import Graph
import requests
from typing import Union, Dict, List
import re

In [None]:
# Create an empty rdflib graph and load the triples from the local file into it.
g = Graph()
# NOTE(review): the file carries an .nt extension but is parsed with
# format="turtle" — presumably intentional; confirm whether format="nt" was meant.
g.parse("14_graph.nt", format="turtle")

In [10]:
def extract_wikidata_id(uri: str) -> str:
    """
    Extract Wikidata ID (Q or P number) from a URI or direct ID string.

    The ID has to appear as a stand-alone token, i.e. it must not be glued to
    other letters, digits or underscores. This keeps strings such as 'MP3'
    from being misread as the property 'P3'.

    Args:
        uri (str): Wikidata URI or ID (e.g., 'http://www.wikidata.org/entity/Q42',
                  'http://www.wikidata.org/prop/direct/P495', 'Q42', or 'P495')

    Returns:
        str: The extracted Wikidata ID (the first stand-alone match in the input)

    Raises:
        ValueError: If the input is not a string or doesn't contain a valid
            Wikidata ID
    """
    # Non-string input (e.g. None) is reported with the documented ValueError
    # instead of leaking a TypeError from the regex engine.
    if isinstance(uri, str):
        # Word boundaries on both sides prevent carving an ID out of a longer token.
        match = re.search(r'\b[QP]\d+\b', uri)
        if match:
            # Always return the matched text, never the raw input, so stray
            # whitespace such as a trailing newline cannot end up in the ID.
            return match.group(0)

    raise ValueError(f"Invalid Wikidata identifier: {uri}")

def get_wikidata_label(uri: str, timeout: float = 10.0) -> str:
    """
    Convert a Wikidata URI to human readable text by querying the Wikidata API.
    Works for both entities (Q) and properties (P).

    Lookup problems that are not network failures are reported through the
    returned string rather than an exception:
      * invalid identifier  -> the ValueError message from extract_wikidata_id
      * unknown ID          -> "ID <id> not found"
      * no English label    -> "No English label found for <id>"

    Args:
        uri (str): Wikidata URI or ID (e.g., 'http://www.wikidata.org/entity/Q42',
                  'http://www.wikidata.org/prop/direct/P495', 'Q42', or 'P495')
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        str: Human readable label for the entity or property. For properties
            the datatype is appended in parentheses when the API returns one.

    Raises:
        requests.exceptions.RequestException: If the API request fails
            (connection error, timeout, HTTP error status).
    """
    try:
        entity_id = extract_wikidata_id(uri)
    except ValueError as e:
        # Invalid input is surfaced as text so bulk callers keep running.
        return str(e)

    is_property = entity_id.startswith('P')

    params = {
        'action': 'wbgetentities',
        'ids': entity_id,
        'format': 'json',
        # Only properties carry a datatype, so it is requested just for them.
        'props': 'labels|datatype' if is_property else 'labels',
        'languages': 'en',
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(
            'https://www.wikidata.org/w/api.php', params=params, timeout=timeout
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise requests.exceptions.RequestException(f"Failed to fetch data: {str(e)}") from e

    entity = data['entities'].get(entity_id) if 'entities' in data else None
    # The API answers for unknown IDs with a stub entry flagged "missing".
    if entity is None or 'missing' in entity:
        return f"ID {entity_id} not found"

    english = entity.get('labels', {}).get('en')
    if english is None:
        return f"No English label found for {entity_id}"

    label = english['value']
    if is_property and 'datatype' in entity:
        label = f"{label} ({entity['datatype']})"
    return label

def batch_convert_uris(uris: List[str], timeout: float = 10.0) -> Dict[str, str]:
    """
    Convert multiple Wikidata URIs to human readable text using batched requests.
    Works for both entities (Q) and properties (P).

    Every input URI gets an entry in the result. Problems are reported as text:
      * invalid identifier -> the ValueError message from extract_wikidata_id
      * request failure    -> "Failed to fetch: <reason>" for the whole batch
      * unknown ID         -> "ID <id> not found"
      * no English label   -> "No English label found for <id>"

    Args:
        uris (List[str]): List of Wikidata URIs or IDs
        timeout (float): Seconds to wait for each API request before giving up.

    Returns:
        Dict[str, str]: Dictionary mapping URIs to their human readable labels
    """
    api_url = 'https://www.wikidata.org/w/api.php'
    batch_size = 50  # IDs sent per wbgetentities request

    results = {}
    # Reverse index: Wikidata ID -> input URIs that resolve to it. Lets each
    # API answer be written back without rescanning every input, and makes
    # sure every ID is requested only once.
    uris_by_id = {}

    for uri in uris:
        try:
            wikidata_id = extract_wikidata_id(uri)
        except ValueError as e:
            results[uri] = str(e)
            continue
        holders = uris_by_id.setdefault(wikidata_id, [])
        if uri not in holders:
            holders.append(uri)

    def record(wikidata_id, text):
        """Store `text` as the result for every input URI that maps to `wikidata_id`."""
        for owner in uris_by_id.get(wikidata_id, ()):
            results[owner] = text

    q_ids = [wid for wid in uris_by_id if wid.startswith('Q')]
    p_ids = [wid for wid in uris_by_id if not wid.startswith('Q')]

    # Process both Q and P IDs in separate batches
    for id_list in (q_ids, p_ids):
        for start in range(0, len(id_list), batch_size):
            batch = id_list[start:start + batch_size]
            params = {
                'action': 'wbgetentities',
                'ids': '|'.join(batch),
                'format': 'json',
                'props': 'labels|datatype',  # datatype is only returned for properties
                'languages': 'en',
            }

            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.get(api_url, params=params, timeout=timeout)
                response.raise_for_status()
                data = response.json()
            except requests.exceptions.RequestException as e:
                # On error, mark all URIs in this batch as failed and move on.
                for wikidata_id in batch:
                    record(wikidata_id, f"Failed to fetch: {str(e)}")
                continue

            # API-level errors come back without an 'entities' key; treat that
            # as "nothing returned" so the affected IDs fall through to the
            # not-found handling below instead of raising KeyError.
            for wikidata_id, entity_data in data.get('entities', {}).items():
                if 'missing' in entity_data:
                    # Stub entry the API returns for IDs that do not exist.
                    record(wikidata_id, f"ID {wikidata_id} not found")
                    continue

                english = entity_data.get('labels', {}).get('en')
                if english is None:
                    record(wikidata_id, f"No English label found for {wikidata_id}")
                    continue

                label = english['value']
                # Add datatype for properties
                if wikidata_id.startswith('P') and 'datatype' in entity_data:
                    label = f"{label} ({entity_data['datatype']})"
                record(wikidata_id, label)

    # Valid IDs the API never answered for still get an explicit message.
    for wikidata_id, holders in uris_by_id.items():
        for uri in holders:
            results.setdefault(uri, f"ID {wikidata_id} not found")

    return results


{'http://www.wikidata.org/entity/Q850522': 'Little Women', 'http://www.wikidata.org/entity/Q4717778': 'Alex Shaffer'}


In [None]:
# Initialize dictionaries to store triples
subject_predicate_to_object = {}
predicate_object_to_subject = {}
subject_object_to_predicate = {}

uri_to_label = {}
label_to_uri = {}

# Parse the graph and populate the dictionaries.
# Each index maps a key pair to a list, so multiple values for the same key are kept.
for s, p, o in g:
    subject_predicate_to_object.setdefault((s, p), []).append(o)
    predicate_object_to_subject.setdefault((p, o), []).append(s)
    subject_object_to_predicate.setdefault((s, o), []).append(p)

    for term in (s, p, o):
        # Look each distinct term up only once. Terms repeat across many
        # triples, and every lookup of a valid ID is an HTTP round trip.
        if term not in uri_to_label:
            uri_to_label[term] = get_wikidata_label(term)

        # Reverse mapping; when several terms share a label the most recently
        # seen term wins.
        label_to_uri[uri_to_label[term]] = term


# Save the dictionaries to a file
with open('triples.pkl', 'wb') as f:
    pickle.dump(
        (
            subject_predicate_to_object,
            predicate_object_to_subject,
            subject_object_to_predicate,
            label_to_uri,
            uri_to_label,
        ),
        f,
    )


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x103928700>>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/atai/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [1]:
def get_object(subject, predicate):
    """Return the list of objects stored for (subject, predicate), or None if the pair is unknown."""
    key = (subject, predicate)
    return subject_predicate_to_object.get(key, None)

def get_subject(predicate, obj):
    """Return the list of subjects stored for (predicate, obj), or None if the pair is unknown."""
    key = (predicate, obj)
    return predicate_object_to_subject.get(key, None)

def get_predicate(subject, obj):
    """Return the list of predicates stored for (subject, obj), or None if the pair is unknown."""
    key = (subject, obj)
    return subject_object_to_predicate.get(key, None)