In [1]:
import requests
from difflib import SequenceMatcher
from typing import Optional, List, Dict, Tuple
import time
from urllib.parse import quote
import spacy
from spellchecker import SpellChecker
import rdflib
import re

In [11]:
class WikidataFinder:
    """
    Translate between natural language strings and Wikidata identifiers.

    Two directions are supported:

    * text -> URI: ``get_wikidata_uri`` searches the Wikidata API and picks the
      result whose label/description is most similar to the query.
    * URI -> text: ``get_wikidata_label`` fetches the English label of an
      entity (Q-id) or property (P-id).
    """

    BASE_URL = "https://www.wikidata.org/w/api.php"
    WIKIDATA_URI_PREFIX = "http://www.wikidata.org/entity/"
    # Seconds to wait for the Wikidata API; without a timeout a stalled
    # connection would block the caller forever.
    DEFAULT_TIMEOUT = 10.0

    def __init__(self, language: str = "en", max_results: int = 5,
                 similarity_threshold: float = 0.6,
                 timeout: float = DEFAULT_TIMEOUT):
        """
        Initialize the WikidataFinder.

        Args:
            language: Language code for search results (default: "en")
            max_results: Maximum number of search results to consider (default: 5)
            similarity_threshold: Minimum similarity score to consider a match (default: 0.6)
            timeout: Seconds to wait for each Wikidata API request (default: 10.0)
        """
        self.language = language
        self.max_results = max_results
        self.similarity_threshold = similarity_threshold
        self.timeout = timeout

    def extract_wikidata_id(self, uri: str) -> str:
        """
        Extract Wikidata ID (Q or P number) from a URI or direct ID string.

        Args:
            uri (str): Wikidata URI or ID (e.g., 'http://www.wikidata.org/entity/Q42',
                    'http://www.wikidata.org/prop/direct/P495', 'Q42', or 'P495')

        Returns:
            str: The first Q/P identifier found in ``uri``

        Raises:
            ValueError: If the URI doesn't contain a valid Wikidata ID
        """
        # A single search covers both the bare-ID and the full-URI case and
        # never returns surrounding whitespace along with the ID.
        match = re.search(r'[QP]\d+', uri)
        if match:
            return match.group(0)

        raise ValueError(f"Invalid Wikidata identifier: {uri}")

    def get_wikidata_label(self, uri: str) -> str:
        """
        Convert a Wikidata URI to human readable text by querying the Wikidata API.
        Works for both entities (Q) and properties (P).

        Args:
            uri (str): Wikidata URI or ID (e.g., 'http://www.wikidata.org/entity/Q42',
                    'http://www.wikidata.org/prop/direct/P495', 'Q42', or 'P495').
                    Any object is accepted; it is converted with ``str()`` first.

        Returns:
            str: The English label (properties get their datatype appended in
            parentheses). Lookup problems are reported as a message string
            rather than an exception: an invalid identifier, an ID missing from
            the response, or a missing English label.

        Raises:
            requests.exceptions.RequestException: If the API request fails
        """
        # Callers may pass rdflib URIRef objects; normalise to a plain string.
        uri = str(uri)
        try:
            entity_id = self.extract_wikidata_id(uri)
        except ValueError as e:
            # Deliberately best-effort: return the message instead of raising.
            return str(e)

        is_property = entity_id.startswith('P')

        params = {
            'action': 'wbgetentities',
            'ids': entity_id,
            'format': 'json',
            # Properties additionally request their datatype.
            'props': 'labels|datatype' if is_property else 'labels',
            'languages': 'en'
        }

        try:
            response = requests.get(self.BASE_URL, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            # Chain the original error so the root cause stays in the traceback.
            raise requests.exceptions.RequestException(f"Failed to fetch data: {str(e)}") from e

        # Check if entity/property exists
        if 'entities' not in data or entity_id not in data['entities']:
            return f"ID {entity_id} not found"

        entity = data['entities'][entity_id]
        if 'labels' not in entity or 'en' not in entity['labels']:
            return f"No English label found for {entity_id}"

        label = entity['labels']['en']['value']
        # For properties, append the datatype if available
        if is_property and 'datatype' in entity:
            label = f"{label} ({entity['datatype']})"
        return label

    def search_wikidata(self, query: str) -> List[Dict]:
        """
        Search Wikidata API for entities matching the query.

        Args:
            query: Text to search for.

        Returns:
            The list under the response's "search" key, or an empty list when
            the key is absent or the request fails (the error is printed).
        """
        params = {
            "action": "wbsearchentities",
            "format": "json",
            "language": self.language,
            "search": query,
            "limit": self.max_results,
        }

        try:
            response = requests.get(self.BASE_URL, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            return data.get("search", [])
        except requests.exceptions.RequestException as e:
            # Best-effort search: report the failure and behave like "no hits".
            print(f"Error searching Wikidata: {e}")
            return []

    def calculate_similarity(self, query: str, target: str) -> float:
        """
        Calculate case-insensitive string similarity between query and target
        using SequenceMatcher. Returns a ratio between 0.0 and 1.0.
        """
        return SequenceMatcher(None, query.lower(), target.lower()).ratio()

    def get_best_match(self, query: str, search_results: List[Dict]) -> Optional[Tuple[str, float]]:
        """
        Find the best matching entity from search results based on similarity score.

        Each result is scored as the larger of its label similarity and half of
        its description similarity, so a label match outweighs a description match.

        Args:
            query: The text that was searched for.
            search_results: Result dicts with "id" and optionally "label" and
                "description" keys.

        Returns:
            ``(entity_id, score)`` of the highest-scoring result that reaches
            ``self.similarity_threshold``, or None if no result qualifies.
        """
        best_match = None
        highest_score = 0.0

        for result in search_results:
            entity_id = result.get("id")
            if not entity_id:
                # A result without an id cannot be turned into a URI.
                continue

            # `or ""` also covers keys that are present but set to None.
            label_score = self.calculate_similarity(query, result.get("label") or "")
            desc_score = self.calculate_similarity(query, result.get("description") or "") * 0.5

            combined_score = max(label_score, desc_score)

            if combined_score > highest_score and combined_score >= self.similarity_threshold:
                highest_score = combined_score
                best_match = (entity_id, combined_score)

        return best_match

    def get_wikidata_uri(self, query: str) -> Optional[Dict]:
        """
        Convert a natural language string to a Wikidata URI.

        Args:
            query: Natural language string to search for

        Returns:
            Dictionary containing:
                - uri: Wikidata URI if found
                - confidence: Similarity score (rounded to 3 decimals)
                - label: Entity label
                - description: Entity description
            Returns None if the query is empty/blank or no match is found
        """
        if not query or not query.strip():
            return None

        # Rate limiting
        time.sleep(0.1)

        results = self.search_wikidata(query)
        if not results:
            return None

        best_match = self.get_best_match(query, results)
        if not best_match:
            return None

        entity_id, confidence = best_match

        # Look the winning result up again to report its label and description.
        match_details = next((r for r in results if r.get("id") == entity_id), {})

        return {
            "uri": f"{self.WIKIDATA_URI_PREFIX}{entity_id}",
            "confidence": round(confidence, 3),
            "label": match_details.get("label"),
            "description": match_details.get("description")
        }


In [12]:
import pickle

# Load the pickled triple data from 'triples.pkl' into `data`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'triples.pkl' if it comes from a trusted source.
with open('triples.pkl', 'rb') as file:
    data = pickle.load(file)

In [13]:
# `data` holds three lookup dictionaries, unpacked here by position.
# subject_object_to_predicate maps a (subject, object) URIRef pair to a list of
# predicate URIRefs (see the cell output below); the other two presumably
# follow the same pattern implied by their names — TODO confirm.
subject_predicate_to_object, predicate_object_to_subject, subject_object_to_predicate = data

In [14]:
# Display the mapping to inspect its structure:
# (subject URIRef, object URIRef) -> list of predicate URIRefs
subject_object_to_predicate

{(rdflib.term.URIRef('http://www.wikidata.org/entity/Q22674394'),
  rdflib.term.URIRef('http://www.wikidata.org/entity/Q11424')): [rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P31')],
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q516483'),
  rdflib.term.URIRef('http://www.wikidata.org/entity/Q16')): [rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P27')],
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q4196443'),
  rdflib.term.URIRef('http://www.wikidata.org/entity/Q580231')): [rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P166')],
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q3335591'),
  rdflib.term.URIRef('http://www.wikidata.org/entity/Q376749')): [rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P19')],
 (rdflib.term.URIRef('http://www.wikidata.org/entity/Q1420651'),
  rdflib.term.URIRef('http://www.wikidata.org/entity/Q20644797')): [rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P1981')],
 (rdflib.term.URIRef('htt

In [15]:
# Shared finder instance; later cells call it as the global `finder`.
finder = WikidataFinder()

# Smoke test: resolve a single term to its best-matching Wikidata URI.
example_query = "cast"
result = finder.get_wikidata_uri(example_query)
if result:
    print(f"URI: {result['uri']}")
else:
    # Name the term that failed; the message previously ended after "for".
    print(f"\nNo match found for '{example_query}'")

URI: http://www.wikidata.org/entity/Q37135831


In [16]:
from transformers import pipeline

# Load the pre-trained NER pipeline (BERT-large fine-tuned on CoNLL-03 English).
# With aggregation_strategy="simple" each result carries an 'entity_group' and a
# merged 'word', which extract_ner relies on.
# NOTE(review): no `device` argument is passed, so per the warning logged below
# the model runs on CPU even when a GPU is available — confirm this is intended.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [17]:
def extract_ner(sentence):
    """
    Return the named entities found in ``sentence``.

    Runs the module-level ``ner_pipeline`` on the text and collects the
    'word' of every result whose 'entity_group' is PER, LOC, ORG or MISC,
    in the order the pipeline reports them.
    """
    accepted_groups = ['PER', 'LOC', 'ORG', 'MISC']

    names = []
    for span in ner_pipeline(sentence):
        if span['entity_group'] in accepted_groups:
            names.append(span['word'])
    return names

# Sanity check: the person and both place names should be returned.
extract_ner("Steven Spielberg is a famous director who was born in Cincinnati, Ohio.")

['Steven Spielberg', 'Cincinnati', 'Ohio']

In [18]:
# Keywords that signal which movie-related attribute a single-entity question
# asks about. Every item needs its own trailing comma: adjacent string literals
# are silently concatenated, which previously merged 'released'/'screenwriter'
# and 'box office'/'film' into two bogus entries.
common_movie_related_entities = [
    'director', 'actor', 'actress', 'actors', 'actresses', 'cast', 'revenue', 'released',
    'screenwriter', 'producer', 'production', 'genre', 'plot', 'rating', 'awards', 'budget', 'box office',
    'film', 'movie', 'citizenship', 'birth', 'birthplace', 'death', 'deathplace', 'national',
]

In [24]:
def query_dictionary(uri_1, uri_2):
    """
    Look up how two Wikidata URIs are related in the loaded triple indexes.

    The pair is tried as a key in subject_predicate_to_object,
    subject_object_to_predicate and predicate_object_to_subject (in that
    order), first as (uri_1, uri_2) and then swapped. The first hit is turned
    into a human readable label via ``finder.get_wikidata_label``.

    Args:
        uri_1: First Wikidata URI (string or URIRef).
        uri_2: Second Wikidata URI (string or URIRef).

    Returns:
        str: Label of the first matching value, or a fixed message when the
        pair is not present in any index.
    """
    first = rdflib.term.URIRef(uri_1)
    second = rdflib.term.URIRef(uri_2)

    lookup_tables = (
        subject_predicate_to_object,
        subject_object_to_predicate,
        predicate_object_to_subject,
    )

    # Given order first, then swapped, so argument order does not matter.
    for key in ((first, second), (second, first)):
        for table in lookup_tables:
            if key not in table:
                continue
            value = table[key]
            # Index values can be collections of URIs (the displayed
            # subject_object_to_predicate holds lists). Pick the first element
            # explicitly instead of relying on str(list) being regex-scanned.
            if isinstance(value, (list, tuple, set, frozenset)):
                if not value:
                    continue
                value = next(iter(value))
            return finder.get_wikidata_label(value)

    return "No relationship found between the two entities."

def answer_query(query):
    """
    Answer a natural language question about one or two named entities.

    * One entity: the first word of the query found in
      ``common_movie_related_entities`` (compared case-insensitively, with
      punctuation stripped) that resolves on Wikidata is paired with the entity.
    * Two entities: the relationship between them is looked up directly.

    Args:
        query: The question text.

    Returns:
        str: The label returned by ``query_dictionary``, or an explanatory
        message when the entities cannot be extracted or resolved.
    """
    entities = extract_ner(query)
    if not entities:
        return "No entities found in the query."

    if len(entities) == 1:
        entity_match = None
        for raw_word in query.split():
            # Remove all punctuation and special characters
            word = ''.join(ch for ch in raw_word if ch.isalnum()).lower()
            if word not in common_movie_related_entities:
                continue

            # Resolve the entity once, and only after a keyword is present,
            # so queries without keywords cost no network call.
            if entity_match is None:
                entity_match = finder.get_wikidata_uri(entities[0])
                if not entity_match:
                    return f"Could not find '{entities[0]}' on Wikidata."

            keyword_match = finder.get_wikidata_uri(word)
            if keyword_match:
                return query_dictionary(entity_match['uri'], keyword_match['uri'])
            # Keyword did not resolve; try the next keyword instead of failing.

        return "Could not determine what the query is asking about."

    if len(entities) == 2:
        matches = [finder.get_wikidata_uri(name) for name in entities]
        for name, match in zip(entities, matches):
            if not match:
                return f"Could not find '{name}' on Wikidata."
        return query_dictionary(matches[0]['uri'], matches[1]['uri'])

    # More than two entities is ambiguous; say so rather than return None.
    return "Queries with more than two entities are not supported."
        
# Demo: a question naming two entities, so answer_query takes its
# two-entity branch and looks up the relationship between them.
query = "What is the relationship between Peter MacNeill and Canada?"
print("I believe that the answer to that question is: ", answer_query(query))


country of citizenship (wikibase-item)
