In [None]:
import pandas as pd
from yfiles_jupyter_graphs import GraphWidget
from IPython.display import display

# --- Configuration: directory holding the parquet tables and the table names ---
INPUT_DIR = "your input directory here"
ENTITY_TABLE = "entities"
COMMUNITY_TABLE = "communities"
RELATIONSHIP_TABLE = "relationships"
COMMUNITY_REPORT_TABLE = "community_reports"


def _read_table(table_name):
    """Read ``<INPUT_DIR>/<table_name>.parquet`` into a DataFrame."""
    return pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")


# --- Load the four tables the rest of the notebook works with ---
entity_df = _read_table(ENTITY_TABLE)
community_df = _read_table(COMMUNITY_TABLE)
relationship_df = _read_table(RELATIONSHIP_TABLE)
community_report_df = _read_table(COMMUNITY_REPORT_TABLE)

In [None]:
def find_related_communities(keyword, regex=False):
    """Return the ids of community reports whose title mentions ``keyword``.

    The search is case-insensitive and looks for ``keyword`` anywhere in the
    ``title`` column of the module-level ``community_report_df``. Rows with a
    missing title never match.

    Args:
        keyword: Text to look for. By default it is matched as a literal
            substring, so characters such as ``+``, ``(`` or ``.`` are taken
            verbatim instead of being parsed as regular-expression syntax
            (which could raise or produce unintended matches).
        regex: Pass ``True`` to interpret ``keyword`` as a regular expression.

    Returns:
        list: ``human_readable_id`` values of the matching reports, in the
        order they appear in ``community_report_df``.
    """
    title_matches = community_report_df['title'].str.contains(
        keyword, case=False, na=False, regex=regex
    )
    return community_report_df.loc[title_matches, 'human_readable_id'].tolist()

# community_hr_ids = find_related_communities("Palladia Limited")
# print(f"Found {len(community_hr_ids)} communities: {community_hr_ids}")

Found 3 communities: [208, 76, 16]


In [None]:
# Function to get data for a specific community
def get_community_data(community_id):
    """Collect the entities and relationships that belong to one community.

    The community is looked up in the module-level ``community_df`` (column
    ``community``); its ``entity_ids`` / ``relationship_ids`` cells are then
    resolved against ``entity_df`` and ``relationship_df``. If several rows
    match, only the first one is used.

    Args:
        community_id: Value compared against ``community_df['community']``.

    Returns:
        tuple: ``(entities, relationships, community_mapping, entity_id_map)``

        * ``entities`` - rows of ``entity_df`` whose ``id``,
          ``human_readable_id`` or ``title`` is one of the community's entity
          ids or an endpoint (``source`` / ``target``) of one of its
          relationships.
        * ``relationships`` - rows of ``relationship_df`` whose ``id`` is in
          the community's ``relationship_ids``.
        * ``community_mapping`` - every collected identifier (entity ids plus
          relationship endpoints) mapped to ``community_id``.
        * ``entity_id_map`` - alias (``human_readable_id``, ``title`` and
          ``id``) -> entity ``id``; missing aliases are left out.

        ``(None, None, None, None)`` when no row matches ``community_id``
        (a message is printed in that case).
    """
    community_row = community_df[community_df['community'] == community_id]
    if community_row.empty:
        print(f"No data found for community {community_id}.")
        return None, None, None, None

    entity_ids = _id_cell_to_set(community_row['entity_ids'].iloc[0])
    relationship_ids = _id_cell_to_set(community_row['relationship_ids'].iloc[0])

    relationships = relationship_df[relationship_df['id'].isin(relationship_ids)]

    # Relationship endpoints are added so that every entity an edge touches
    # is pulled in, even if it is not listed on the community row itself.
    entity_ids.update(relationships['source'])
    entity_ids.update(relationships['target'])

    # The collected identifiers may be ids, human-readable ids or titles,
    # so all three columns are checked.
    entities = entity_df[
        entity_df['id'].isin(entity_ids)
        | entity_df['human_readable_id'].isin(entity_ids)
        | entity_df['title'].isin(entity_ids)
    ]

    community_mapping = {entity_id: community_id for entity_id in entity_ids}

    # Alias -> canonical id. Aliases are written in the order
    # human_readable_id, title, id so that on a key collision the later
    # write wins, exactly as when iterating row by row.
    entity_id_map = {}
    for canonical_id, readable_id, title in zip(
        entities['id'], entities['human_readable_id'], entities['title']
    ):
        # A missing alias must not become a NaN/None key pointing at None.
        if pd.notna(readable_id):
            entity_id_map[readable_id] = canonical_id
        if pd.notna(title):
            entity_id_map[title] = canonical_id
        entity_id_map[canonical_id] = canonical_id

    return entities, relationships, community_mapping, entity_id_map


def _id_cell_to_set(cell):
    """Turn a list-like table cell into a set; a missing (None/NaN) cell becomes an empty set."""
    if cell is None or isinstance(cell, float):
        return set()
    return set(cell)

# Function to assign colors based on community
def get_color(community_id, community_ids):
    """Pick a hex color for a community from a fixed ten-color palette.

    The color is chosen by the position of ``community_id`` inside
    ``community_ids``; positions past the end of the palette wrap around.

    Args:
        community_id: The community to color.
        community_ids: Sequence of known community ids (must support ``.index``).

    Returns:
        str: A ``#RRGGBB`` color string.

    Raises:
        ValueError: If ``community_id`` is not present in ``community_ids``.
    """
    palette = (
        '#FF5733', '#33FF57', '#3357FF', '#FF33A1', '#A133FF',
        '#33FFF5', '#FFC107', '#8BC34A', '#FF9800', '#9C27B0',
    )
    position = community_ids.index(community_id)
    return palette[position % len(palette)]

# Function to get title for an ID
def get_title_for_id(entity_id, entities, entity_id_map):
    """Resolve an identifier to something readable for log messages.

    ``entity_id`` is translated through ``entity_id_map`` and the result is
    looked up in the ``id`` column of ``entities``. The first matching row's
    ``title`` is returned, or its ``human_readable_id`` when the title is
    missing.

    Args:
        entity_id: Identifier to resolve (any key of ``entity_id_map``).
        entities: DataFrame with ``id``, ``title`` and ``human_readable_id`` columns.
        entity_id_map: Mapping from alias to entity ``id``.

    Returns:
        The title (or human-readable id) of the entity, or ``str(entity_id)``
        when the identifier cannot be resolved or both values are missing.
    """
    if entity_id not in entity_id_map:
        return str(entity_id)

    canonical_id = entity_id_map[entity_id]
    if canonical_id not in entities['id'].values:
        return str(entity_id)

    matches = entities[entities['id'] == canonical_id]
    if matches.empty:
        return str(entity_id)

    display_name = matches['title'].iloc[0]
    if pd.isna(display_name):
        # No title on record: fall back to the human-readable id.
        display_name = matches['human_readable_id'].iloc[0]

    if pd.notna(display_name):
        return display_name
    return str(entity_id)

# Function to create the graph
def create_graph(entities, relationships, community_mapping, entity_id_map):
    """Build a directed ``GraphWidget`` from entity and relationship tables.

    Args:
        entities: DataFrame with ``id``, ``title``, ``type``, ``degree`` and
            ``description`` columns; one node is created per row.
        relationships: DataFrame with ``id``, ``source``, ``target``,
            ``description`` and ``weight`` columns; one edge is created per
            row that survives the checks below.
        community_mapping: Mapping from entity ``id`` to community id. Entities
            that are not in the mapping get community ``-1`` and are drawn grey.
        entity_id_map: Mapping used to translate a relationship's ``source`` /
            ``target`` value into an entity ``id``. Values without an entry
            are used as they are.

    Returns:
        GraphWidget: Widget with nodes, edges, style mappings and an organic
        layout applied.

    A relationship is skipped (and a message printed) when either mapped
    endpoint is not a string, when an endpoint is not among the created nodes,
    or when both endpoints resolve to the same node.
    """
    w = GraphWidget()

    # Distinct community ids; a community's position in this list selects its color.
    unique_communities = list(set(community_mapping.values()))

    # One node per entity, carrying the fields the style mapping and tooltip need.
    nodes = [
        {
            'id': entity['id'],
            'properties': {
                'label': entity['title'],
                'type': entity['type'],
                'degree': entity['degree'],
                'community': community_mapping.get(entity['id'], -1),
                'description': entity['description'],
            },
        }
        for _, entity in entities.iterrows()
    ]

    # Built once so each endpoint check is a set lookup instead of rebuilding
    # the full id list twice per relationship. Only string ids are kept
    # because only string endpoints are ever checked against it.
    node_ids = {node['id'] for node in nodes if isinstance(node['id'], str)}

    edges = []
    for _, rel in relationships.iterrows():
        source_id = rel['source']
        target_id = rel['target']

        mapped_source = entity_id_map.get(source_id, source_id)
        mapped_target = entity_id_map.get(target_id, target_id)

        if not (isinstance(mapped_source, str) and isinstance(mapped_target, str)):
            print(f"Skipping edge {rel['id']} - Invalid mapped IDs: Source={mapped_source}, Target={mapped_target}")
            continue

        if mapped_source not in node_ids or mapped_target not in node_ids:
            source_title = get_title_for_id(mapped_source, entities, entity_id_map)
            target_title = get_title_for_id(mapped_target, entities, entity_id_map)
            print(f"Skipping edge {rel['id']} - Source or Target not found in nodes: Source={source_title}, Target={target_title}")
            continue

        if mapped_source == mapped_target:
            print(f"Skipping edge {rel['id']} - Self-loop detected: Source={mapped_source}, Target={mapped_target}")
            continue

        edges.append({
            'id': rel['id'],
            'start': mapped_source,
            'end': mapped_target,
            'properties': {
                'label': rel['description'],
                'weight': rel['weight'],
            },
        })

    def node_styles_mapping(node):
        """Color a node by community and scale it by degree."""
        community_id = node['properties']['community']
        degree = node['properties']['degree']
        color = get_color(community_id, unique_communities) if community_id != -1 else '#808080'
        size = min(50, max(10, degree * 2))  # Clamp size to the 10..50 range
        return {
            'color': color,
            'scaleFactor': size / 10,  # i.e. a scale factor between 1 and 5
            'label': node['properties']['label'],
            'tooltip': node['properties']['description'],
        }

    def edge_styles_mapping(edge):
        """Draw every edge in black, labelled with its description."""
        return {
            'color': '#000000',
            'label': edge['properties']['label'],
        }

    w.nodes = nodes
    w.edges = edges
    w.node_styles_mapping = node_styles_mapping
    w.edge_styles_mapping = edge_styles_mapping
    w.directed = True

    # Arrange the graph with the widget's organic layout.
    w.organic_layout()

    return w

def visualize_community(community_id=208):
    """Display the graph of a single community (community 208 by default).

    The community's data is fetched with ``get_community_data``; when that
    reports no data (its first return value is ``None``) nothing is displayed.

    Args:
        community_id: Id of the community to draw.
    """
    nodes_df, edges_df, membership, alias_map = get_community_data(community_id)
    if nodes_df is None:
        return None

    display(create_graph(nodes_df, edges_df, membership, alias_map))

# Run the visualization for community 208
# visualize_community(208)

In [None]:
# Fetch the data for community 208 into notebook-level variables so it can be
# inspected in later cells (this statement itself does not display the tables).
entities, relationships, community_mapping, entity_id_map = get_community_data(208)

In [28]:
# Build and display the graph widget for community 208.
visualize_community(208)

GraphWidget(layout=Layout(height='500px', width='100%'))