In [4]:
pip install python-louvain


Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
df = pd.read_csv("/kaggle/input/muse-cleanned/ds_books_220_final.csv", names=["sub","rel","obj"])
df.head()

Unnamed: 0,sub,rel,obj
0,head,type,tail
1,Draco Malfoy,spouse,Astoria Greengrass
2,Astoria Greengrass,spouse,Draco Malfoy
3,Wizengamot,chairperson,Arthur
4,Lucius Malfoy,student,Arthur Weasley


In [5]:
#!/usr/bin/env python3
"""
Build an undirected graph from [subject, relation, object] triplets,
compute a variety of graph metrics, and save the graph to a pickle file.
"""

import pandas as pd
import networkx as nx
import pickle
import numpy as np
from collections import Counter

# ─── Community detection & modularity setup ─────────────────────────────────
try:
    # Preferred install: python-louvain
    import community.community_louvain as community_louvain
    _HAS_LOUVAIN = True
except ImportError:
    try:
        # Alternate package name
        import community_louvain as community_louvain
        _HAS_LOUVAIN = True
    except ImportError:
        community_louvain = None
        _HAS_LOUVAIN = False

if _HAS_LOUVAIN:
    detect_communities = community_louvain.best_partition
else:
    from networkx.algorithms.community import greedy_modularity_communities
    def detect_communities(G):
        """
        Returns a dict mapping node -> community_id,
        based on NetworkX greedy_modularity_communities.
        """
        comms = greedy_modularity_communities(G)
        mapping = {}
        for cid, community in enumerate(comms):
            for node in community:
                mapping[node] = cid
        return mapping

# ─── 1. Load CSV ─────────────────────────────────────────────────────────────
# Replace with your actual path if different:
df = pd.read_csv('/kaggle/input/muse-cleanned/ds_news_220_final.csv', names=['subject','relation','object'])
print("done reading")
# ─── 2. Build undirected graph ──────────────────────────────────────────────
G = nx.Graph()
for _, row in df.iterrows():
    s, r, o = row['subject'], row['relation'], row['object']
    G.add_node(s)
    G.add_node(o)
    G.add_edge(s, o, relation=r)

# ─── 3. Basic counts ─────────────────────────────────────────────────────────
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# ─── 4. Distribution of relation types ──────────────────────────────────────
relation_counts = Counter(data['relation'] for _, _, data in G.edges(data=True))

# ─── 5. Number of node types ────────────────────────────────────────────────
node_type_attr   = nx.get_node_attributes(G, 'type')  # if you've set G.nodes[n]['type']
node_type_counts = Counter(node_type_attr.values())
num_node_types   = len(node_type_counts)

# ─── 6. Degree metrics ──────────────────────────────────────────────────────
degrees = dict(G.degree())
degree_stats = {
    'mean': np.mean(list(degrees.values())),
    'max':  np.max(list(degrees.values())),
    'min':  np.min(list(degrees.values())),
}
avg_degree = degree_stats['mean']

# Person-only node degree (requires G.nodes[n]['type']=='Person')
person_nodes    = [n for n, d in G.nodes(data=True) if d.get('type') == 'Person']
person_degrees  = {n: G.degree(n) for n in person_nodes}

# ─── 7. Directed-style in/out degrees ────────────────────────────────────────
# In undirected graphs, in = out = degree
avg_out_degree = avg_degree
avg_in_degree  = avg_degree
print("done directed style")

# ─── 8. Coverage metrics ────────────────────────────────────────────────────
covered_nodes   = sum(1 for n in G if G.degree(n) > 0)
node_coverage   = covered_nodes / num_nodes if num_nodes else 0

components      = list(nx.connected_components(G))
giant_comp      = max(components, key=len) if components else set()
graph_coverage  = len(giant_comp) / num_nodes if num_nodes else 0
print("done coverage")

# ─── 9. Average hops (shortest path) ───────────────────────────────────────
if nx.is_connected(G):
    avg_hops = nx.average_shortest_path_length(G)
elif giant_comp:
    subG     = G.subgraph(giant_comp)
    avg_hops = nx.average_shortest_path_length(subG)
else:
    avg_hops = None

# ─── 10. Community detection → partition, communities list ─────────────────
partition     = detect_communities(G)      # dict: node -> comm_id
# build list-of-sets for NetworkX fallback
comm_sets     = {}
for node, cid in partition.items():
    comm_sets.setdefault(cid, set()).add(node)
communities   = list(comm_sets.values())
num_clusters  = len(communities)

# ─── 11. Modularity ─────────────────────────────────────────────────────────
if _HAS_LOUVAIN:
    # python-louvain expects (partition_dict, graph)
    modularity = community_louvain.modularity(partition, G)
else:
    # NetworkX modularity expects (graph, list_of_sets)
    from networkx.algorithms.community import modularity as nx_modularity
    modularity = nx_modularity(G, communities)
print("done moudulatrity")

# ─── 12. Conductance ────────────────────────────────────────────────────────
from networkx.algorithms.cuts import conductance
cond_values     = [conductance(G, c, set(G.nodes()) - c) for c in communities] if communities else []
avg_conductance = np.mean(cond_values) if cond_values else None
print("done conductance")

# ─── 13. Centrality & PageRank ───────────────────────────────────────────────
deg_centrality = nx.degree_centrality(G)
mean_cent      = np.mean(list(deg_centrality.values())) if deg_centrality else 0
max_cent       = np.max(list(deg_centrality.values())) if deg_centrality else 0
min_cent       = np.min(list(deg_centrality.values())) if deg_centrality else 0

pagerank       = nx.pagerank(G) if G.number_of_nodes() else {}
top_pr         = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:10]
bottom_pr      = sorted(pagerank.items(), key=lambda x: x[1])[:10]

# ─── 14. Clustering/transitivity ────────────────────────────────────────────
transitivity   = nx.transitivity(G)
print("done transifitivty")

# ─── 15. Package up metrics ─────────────────────────────────────────────────
metrics = {
    'num_nodes':               num_nodes,
    'num_edges':               num_edges,
    'relation_distribution':   dict(relation_counts),
    'num_node_types':          num_node_types,
    'node_type_counts':        dict(node_type_counts),
    'degree_stats':            degree_stats,
    'avg_out_degree':          avg_out_degree,
    'avg_in_degree':           avg_in_degree,
    'person_only_degrees':     person_degrees,
    'node_coverage':           node_coverage,
    'graph_coverage':          graph_coverage,
    'avg_hops':                avg_hops,
    'num_clusters':            num_clusters,
    'modularity':              modularity,
    'avg_conductance':         avg_conductance,
    'mean_centrality':         mean_cent,
    'max_centrality':          max_cent,
    'min_centrality':          min_cent,
    'top_10_pagerank':         top_pr,
    'bottom_10_pagerank':      bottom_pr,
    'transitivity':            transitivity
}

import json
print(json.dumps(metrics, indent=2))

# ─── 16. Save graph to pickle ────────────────────────────────────────────────


done reading
done directed style
done coverage
done moudulatrity
done conductance
done transifitivty


TypeError: Object of type int64 is not JSON serializable

In [6]:
metrics

{'num_nodes': 424,
 'num_edges': 241,
 'relation_distribution': {'type': 1,
  'occupation': 4,
  'member of sports team': 10,
  'participant': 7,
  'located in the administrative territorial entity': 12,
  'employer': 6,
  'member of': 7,
  'director': 1,
  'member of political party': 4,
  'part of': 4,
  'publication date': 2,
  'point in time': 17,
  'has effect': 3,
  'chairperson': 1,
  'instance of': 23,
  'field of work': 1,
  'subclass of': 13,
  'location': 6,
  'owner of': 2,
  'presenter': 1,
  'elevation above sea level': 2,
  'mother': 3,
  'position held': 9,
  'date of death': 2,
  'participating team': 2,
  'start time': 2,
  'country': 15,
  'operator': 3,
  'subsidiary': 1,
  'winner': 2,
  'performer': 3,
  'parent organization': 2,
  'has part': 11,
  'applies to jurisdiction': 3,
  'contains administrative territorial entity': 2,
  'shares border with': 2,
  'diplomatic relation': 1,
  'headquarters location': 1,
  'spouse': 4,
  'owned by': 6,
  'twinned administr

In [7]:
with open('graph.pkl', 'wb') as f:
    pickle.dump(G, f)

print("✔ Graph and metrics computed; graph saved to graph.pkl")


✔ Graph and metrics computed; graph saved to graph.pkl
