In [1]:
import qwikidata
import requests
import json
import numpy as np
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# URL of the Wikidata API; search(), keyword() and family() send their requests here.
API_endpoint = 'https://www.wikidata.org/w/api.php'
# Table of already-known entity labels read from disk; keyword() reads its 'id' and
# 'name' columns and appends newly fetched labels to it.
keywords = pd.read_csv('files/keywords.csv')

In [37]:
from collections import deque

# In-memory cache mapping entity ID -> label, filled by keyword() so that repeated
# lookups of the same ID skip both the keywords DataFrame scan and the API call.
keyword_cache = {}


def search(query: str):
    """Yield the IDs of Wikidata entities matching a free-text query.

    Sends a single ``wbsearchentities`` request (English) to ``API_endpoint``
    and yields the ``id`` of every hit in the order the API returned them.
    Because this is a generator, the request is only sent once the first item
    is requested.

    Args:
        query: Text to search for.

    Yields:
        str: The entity ID of each search hit.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': 'en',
        'search': query,
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(API_endpoint, params=params, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()

    for hit in response.json()['search']:
        yield str(hit['id'])


        
def keyword(query: str):
    """Return the English label of a Wikidata entity, caching every lookup.

    Lookup order:
      1. the in-memory ``keyword_cache``;
      2. the ``keywords`` DataFrame (columns ``id`` and ``name``);
      3. the Wikidata ``wbgetentities`` API.

    A label fetched from the API is stored in ``keyword_cache``, appended to
    ``keywords`` and written back to ``files/keywords.csv``.

    Args:
        query: Wikidata entity ID.

    Returns:
        The English label. When the API has no English label for ``query``
        (unknown entity, error payload, or label missing), ``query`` itself is
        returned as a string; that fallback is cached in memory only and never
        written to the CSV.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    global keywords  # rebound below when a freshly fetched label is appended

    # 1. In-memory cache
    if query in keyword_cache:
        return keyword_cache[query]

    # 2. keywords DataFrame (one vectorised comparison instead of building a set per call)
    known = keywords.loc[keywords['id'] == query, 'name']
    if not known.empty:
        result = known.values[0]
        keyword_cache[query] = result
        return result

    # 3. Wikidata API; only English labels are requested to keep the payload small.
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'labels',
        'ids': query,
    }
    response = requests.get(API_endpoint, params=params, timeout=30)
    response.raise_for_status()

    entity = response.json().get('entities', {}).get(str(query), {})
    labels = entity.get('labels')
    # Guard against a missing or non-mapping 'labels' field.
    english = labels.get('en') if isinstance(labels, dict) else None
    output = english.get('value') if isinstance(english, dict) else None

    if output is None:
        # No usable label: fall back to the ID so callers still get a string.
        fallback = str(query)
        keyword_cache[query] = fallback
        return fallback

    # Update cache and keywords DataFrame, then persist the table.
    keyword_cache[query] = output
    new_df = pd.DataFrame({'id': [query], 'name': [output]})
    keywords = pd.concat([keywords, new_df], ignore_index=True)
    keywords.to_csv('files/keywords.csv', index=False)

    return output

def _first_claim_field(claims, prop, field, default=None):
    """Return ``field`` from the value of the first ``prop`` claim, or ``default`` if absent."""
    first_claim = (claims.get(prop) or [{}])[0]
    value = first_claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
    if isinstance(value, dict):
        return value.get(field, default)
    return default


def _claim_item_ids(claims, prop):
    """Return the item IDs of all ``prop`` claims, skipping claims that carry no item value."""
    item_ids = []
    for claim in claims.get(prop) or []:
        value = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
        item_id = value.get('id') if isinstance(value, dict) else None
        if item_id:
            item_ids.append(item_id)
    return item_ids


def family(person):
    """Retrieve personal and family details for a person from Wikidata.

    ``person`` is resolved to an entity ID by taking the first hit of
    ``search(person)``; the entity is then fetched with ``wbgetentities`` and
    its claims are flattened into a plain dictionary.

    Args:
        person: Search text or entity ID identifying the person.

    Returns:
        dict with the keys:
            name          English label, or the entity ID when there is none
            id            resolved entity ID
            birth_date    P569 time string, or "Unknown"
            birth_place   P19 item ID (not a label), or "Unknown"
            death_date    P570 time string, or None
            death_place   label of the P20 item via keyword(), or "Unknown"
            gender        P21 item ID, or None
            occupations   labels of all P106 items via keyword()
            citizenships  labels of all P27 items via keyword()
            spouses       item IDs of all P26 claims
            children      item IDs of all P40 claims
            father        P22 item ID, or None
            mother        P25 item ID, or None
        Claims without a concrete value (no ``datavalue``) are skipped rather
        than raising.

    Raises:
        ValueError: If the search returns no entity for ``person``.
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If the API response does not contain the resolved entity.
    """
    entity_id = next(search(person), None)
    if entity_id is None:
        # A bare StopIteration from next() would be hard to diagnose inside a crawl.
        raise ValueError(f"No Wikidata entity found for {person!r}")

    entity_params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'labels|claims',
        'ids': entity_id
    }

    response = requests.get(API_endpoint, params=entity_params, timeout=30)
    response.raise_for_status()
    entity = response.json()['entities'][entity_id]

    claims = entity.get('claims')
    if not isinstance(claims, dict):
        claims = {}

    labels = entity.get('labels')
    english = labels.get('en') if isinstance(labels, dict) else None
    name = english.get('value') if isinstance(english, dict) else None

    # Resolve occupation / citizenship labels (also warms the keyword cache).
    occupations = [keyword(i) for i in _claim_item_ids(claims, 'P106')]
    citizenships = [keyword(i) for i in _claim_item_ids(claims, 'P27')]

    # Only ask keyword() for a label when the death-place claim names an item.
    death_place_id = _first_claim_field(claims, 'P20', 'id')

    personal_info = {
        "name": name or entity_id,
        "id": entity_id,
        "birth_date": _first_claim_field(claims, 'P569', 'time', "Unknown"),
        "birth_place": _first_claim_field(claims, 'P19', 'id', "Unknown"),
        "death_date": _first_claim_field(claims, 'P570', 'time', None),
        "death_place": keyword(death_place_id) if death_place_id else "Unknown",
        "gender": _first_claim_field(claims, 'P21', 'id'),
        "occupations": occupations,
        "citizenships": citizenships,
        "spouses": _claim_item_ids(claims, 'P26'),
        "children": _claim_item_ids(claims, 'P40'),
        "father": _first_claim_field(claims, 'P22', 'id'),
        "mother": _first_claim_field(claims, 'P25', 'id'),
    }

    return personal_info

def neighbours(info):
    """List the IDs of a person's direct relatives.

    Spouses come first, then children, then father and mother; empty or
    missing entries are dropped.
    """
    relatives = info['spouses'] + info['children']
    relatives = relatives + [info['father'], info['mother']]
    return [relative_id for relative_id in relatives if relative_id]


def BFS(start_id, max_depth=2):
    """
    Breadth-first search over family relationships, limited to a given depth.

    Each reachable person is looked up once with ``family()``; the relatives
    returned by ``neighbours()`` are explored next, one level deeper.

    Args:
        start_id (str): The ID of the starting person.
        max_depth (int | None): Maximum distance from the start to explore
            (default: 2). Pass ``None`` for no limit; a negative value yields
            an empty result.

    Returns:
        dict: Maps each visited person ID to their personal info, which gains
              a ``"distance_from_start"`` entry. Insertion order is the order
              in which people were visited.
    """
    output = {}
    if max_depth is not None and max_depth < 0:
        return output

    # Marking nodes when they are enqueued (rather than dequeued) keeps duplicates
    # out of the queue; the first enqueue is always at the node's minimum depth.
    seen = {start_id}
    queue = deque([(start_id, 0)])  # Queue stores (current_node, depth)

    while queue:
        current_node, depth = queue.popleft()

        person_info = family(current_node)
        person_info["distance_from_start"] = depth  # Annotate with depth
        output[current_node] = person_info

        # Relatives of a node at the depth limit would exceed it; don't enqueue them.
        if max_depth is not None and depth >= max_depth:
            continue

        for neighbor in neighbours(person_info):
            if neighbor not in seen:
                seen.add(neighbor)
                queue.append((neighbor, depth + 1))

    return output


# Crawl the family graph around entity Q317521 with the default depth limit
# (this issues live API requests through family()).
bfs = BFS('Q317521') 


In [53]:
import pandas as pd
import networkx as nx

# Assuming you already have the BFS result in `bfs_output` as a dictionary of people's information.

def generate_family_table(bfs_output):
    """
    Build a one-row-per-person table from BFS output.

    Every person in ``bfs_output`` gets a row, and so does every spouse they
    list. A spouse who is also a key of ``bfs_output`` is emitted only once;
    a spouse who is not gets a placeholder row with unknown sex, an empty
    dict as name and no parents.

    Args:
        bfs_output (dict): Output from the BFS function: person ID -> personal
            info with (optionally) ``name``, ``gender``, ``father``, ``mother``
            and ``spouses``.

    Returns:
        pd.DataFrame: Columns ``ID``, ``S``, ``Name``, ``FID``, ``MID``.
            ``S`` is 'M' (gender Q6581097), 'F' (gender Q6581072) or 'U' for
            anything else, including a missing gender.
            ``FID`` / ``MID`` are the father / mother IDs, or '' when unknown.
            ``Name`` is an empty dict when no name is available, which
            consumers can use to detect "label still needs to be looked up".
    """
    sex_codes = {'Q6581097': 'M', 'Q6581072': 'F'}
    columns = ['ID', 'S', 'Name', 'FID', 'MID']

    def build_row(person_id, info):
        return [
            person_id,                                 # ID
            sex_codes.get(info.get('gender'), 'U'),    # S
            info.get('name', {}),                      # Name
            info.get('father') or '',                  # FID ('' rather than None)
            info.get('mother') or '',                  # MID ('' rather than None)
        ]

    # Keyed by ID so each person appears once, in order of first mention.
    rows = {}
    for person_id, info in bfs_output.items():
        rows.setdefault(person_id, build_row(person_id, info))
        for spouse_id in info.get('spouses', []):
            rows.setdefault(spouse_id, build_row(spouse_id, bfs_output.get(spouse_id, {})))

    return pd.DataFrame(list(rows.values()), columns=columns)

def tree(ID):
    """Draw the family tree around a person and open the rendered image.

    ``ID`` is resolved with ``search()`` (first hit wins), the family graph is
    crawled with ``BFS()`` at its default depth, turned into a table with
    ``generate_family_table()`` and rendered with graphviz: one box per
    person (pink for 'F', blue for 'M', grey otherwise) and one edge from
    each known parent to the child. The graph is written to the file
    ``corleone`` (JPG) and opened with ``Digraph.view()``.

    Args:
        ID: Search text or entity ID of the person at the centre of the tree.

    Raises:
        ValueError: If the search returns no entity for ``ID``.
    """
    from graphviz import Digraph

    entity_id = next(search(ID), None)
    if entity_id is None:
        raise ValueError(f"No Wikidata entity found for {ID!r}")

    people = generate_family_table(BFS(entity_id)).to_dict('records')

    # Note: the first positional argument is the graph *name*, not the layout engine.
    f = Digraph('neato', format='jpg', encoding='utf8', filename='corleone',
                node_attr={'style': 'filled'},
                graph_attr={"concentrate": "true", "splines": "ortho"})
    f.attr('node', shape='box')

    colours = {'F': 'lightpink', 'M': 'lightblue'}

    # One node per person; the table may list the same ID more than once.
    drawn_nodes = set()
    for row in people:
        person_id = row['ID']
        if person_id in drawn_nodes:
            continue
        drawn_nodes.add(person_id)

        name = row['Name']
        if isinstance(name, dict):
            # A dict means the table had no name for this person: look the label up.
            name = keyword(person_id)
        elif not isinstance(name, str):
            name = str(name)

        f.node(person_id, label=name, color=colours.get(row['S'], 'lightgray'))

    # Parent -> child edges: mothers first, then fathers; unknown parents are skipped.
    drawn_edges = set()
    for parent_column in ('MID', 'FID'):
        for row in people:
            parent_id = row[parent_column]
            if not (isinstance(parent_id, str) and parent_id):
                continue  # '', None or NaN: no parent recorded
            edge = (parent_id, row['ID'])
            if edge in drawn_edges:
                continue
            drawn_edges.add(edge)
            f.edge(parent_id, str(row['ID']), label='')

    f.view()



# Build and open the family tree for the search term 'Elon musk'
# (performs live API requests and opens the rendered image).
tree('Elon musk')

In [39]:
#f = Digraph('neato', format='jpg', encoding='utf8', filename='corleone', node_attr={'style': 'filled'},  graph_attr={"concentrate": "true", "splines":"ortho"})
#f.attr('node', shape='box')
#for index, row in df.iterrows():
#    f.node(row['ID'],
#           label=
#             row['Name'],
#           _attributes={'color':'lightpink' if row['S']=='F' else 'lightblue'if row['S']=='M' else 'lightgray'})
#for index, row in df.iterrows():
#    f.edge(str(row["ParentID"]), str(row["ID"]), label='')  
#f.view()