<a href="https://colab.research.google.com/github/MonaFaghfouri/Topic_Modeling/blob/main/Topic_Modeling_Graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pin networkx and matplotlib, then force-reinstall numpy and pandas.
!pip install  networkx==2.8 matplotlib==3.5.3
!pip install --upgrade --force-reinstall numpy pandas
import os
# Send signal 9 (SIGKILL) to this process; presumably meant to restart the
# Colab runtime so the reinstalled packages are loaded — TODO confirm.
# NOTE(review): the process dies here, so the remaining lines of this cell
# look unreachable unless they are run separately.
os.kill(os.getpid(), 9)
# Pin numpy to 1.24.4 and kill the process again (same pattern as above).
!pip install numpy==1.24.4 --force-reinstall
import os
os.kill(os.getpid(), 9)
# Packages used later for shaping and bidi-reordering the node labels.
!pip install arabic-reshaper python-bidi

In [None]:
# Force-reinstall numpy pinned to 1.24.4.
!pip install numpy==1.24.4 --force-reinstall
import os
# Send signal 9 (SIGKILL) to this process; presumably meant to restart the
# Colab runtime so the pinned numpy is loaded — TODO confirm.
os.kill(os.getpid(), 9)


In [None]:

#!pip install --quiet pandas networkx matplotlib python-louvain openpyxl arabic-reshaper python-bidi

# ---- Import libraries ----
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import community.community_louvain as community_louvain
from google.colab import files
import ast
import arabic_reshaper
from bidi.algorithm import get_display
import matplotlib.font_manager as fm

# ---- Upload font file ----
# files.upload() returns a mapping keyed by uploaded filename; the first key
# is used as the font path.
print("Please upload the B-Nazanin.ttf font file")
uploaded_font = files.upload()
font_path = next(iter(uploaded_font))  # filename
font_prop = fm.FontProperties(fname=font_path)

# ---- Upload Excel file ----
print("Please upload the Excel file containing tokenized tweets in the second column...")
uploaded = files.upload()


# ---- Read and parse data ----
def _parse_token_cell(cell):
    """Parse one spreadsheet cell into a Python literal.

    Returns None instead of raising when the cell text is not a valid
    literal (for example a blank cell, which reads as the text 'nan'), so a
    single bad row cannot abort the whole load.
    """
    try:
        return ast.literal_eval(str(cell))
    except (ValueError, TypeError, SyntaxError):
        return None


df = pd.read_excel(next(iter(uploaded)))
# The second column is expected to hold the text form of a token list,
# e.g. "['word1', 'word2']".
parsed = df.iloc[:, 1].astype(str).apply(_parse_token_cell).tolist()
# Keep only non-empty lists; unparseable cells and other literals are dropped.
texts = [doc for doc in parsed if isinstance(doc, list) and len(doc) > 0]

dropped = len(parsed) - len(texts)
if dropped:
    print(f"⚠️ Skipped {dropped} row(s) that were empty or not a token list.")

# ---- Helper function to verify token format ----
def assert_tokenized_lists(texts):
    """Check that every row is a list whose items are all strings.

    Args:
        texts: iterable of rows to validate.

    Raises:
        ValueError: on the first row that is not a list, or that contains a
            non-string token. The message includes the row index and the row.

    Prints a confirmation message when every row passes.
    """
    for row_number, row in enumerate(texts):
        if not isinstance(row, list):
            raise ValueError(f"Row {row_number} is not a list: {row}")
        for token in row:
            if not isinstance(token, str):
                raise ValueError(f"Row {row_number} contains non-string tokens: {row}")
    print("✅ All rows are valid lists of string tokens.")

# Stop here with a ValueError if any row is not a list of string tokens.
assert_tokenized_lists(texts)

# ---- Function to build word co-occurrence graph ----
def construct_word_cooccurrence_graph(dataset, window_size=5):
    """Build an undirected, weighted word co-occurrence graph.

    Args:
        dataset: iterable of token sequences (one per document).
        window_size: how many positions to the left and to the right of a
            token count as its context.

    Returns:
        An nx.Graph whose edges carry a 'weight' attribute.

    Note:
        Every position is paired with each other position in its window, so
        a pair of positions is visited once from each side and its edge
        weight grows by 2 per co-occurrence. A word that repeats inside its
        own window yields a self-loop edge.
    """
    graph = nx.Graph()
    for doc in dataset:
        n_tokens = len(doc)
        for center, word in enumerate(doc):
            window_start = max(0, center - window_size)
            window_end = min(n_tokens, center + window_size + 1)
            for pos in range(window_start, window_end):
                if pos == center:
                    continue
                neighbour = doc[pos]
                if not graph.has_edge(word, neighbour):
                    graph.add_edge(word, neighbour, weight=1)
                else:
                    graph[word][neighbour]['weight'] += 1
    return graph

# ---- Graph pruning function ----
def prune_graph(graph, min_frequency=5, max_connections_per_node=30):
    """Return a pruned copy of the graph; the input graph is left untouched.

    Steps, in order:
      1. drop edges whose 'weight' is below min_frequency;
      2. drop nodes whose remaining degree is below min_frequency
         (the same threshold is reused for degree);
      3. reduce what is left to its k-core with
         k = min(3, max_connections_per_node).

    Args:
        graph: weighted nx.Graph (every edge needs a 'weight' attribute).
        min_frequency: threshold for both edge weight and node degree.
        max_connections_per_node: only used as an upper bound on k above.

    Returns:
        The pruned graph. If no nodes survive step 2 a warning is printed
        and the empty graph is returned without computing the k-core.
    """
    pruned = graph.copy()

    # Collect first, then remove, so the edge view is not mutated mid-iteration.
    weak_edges = [
        (u, v)
        for u, v, attrs in pruned.edges(data=True)
        if attrs['weight'] < min_frequency
    ]
    pruned.remove_edges_from(weak_edges)

    sparse_nodes = [node for node, degree in pruned.degree() if degree < min_frequency]
    pruned.remove_nodes_from(sparse_nodes)

    if pruned.number_of_nodes() == 0:
        print("⚠️ Graph is empty after pruning.")
        return pruned

    core_order = min(3, max_connections_per_node)
    return nx.k_core(pruned, k=core_order)

# ---- Community detection ----
def detect_communities(graph, resolution=1.0):
    """Partition the graph into communities with the Louvain method.

    Args:
        graph: nx.Graph to partition.
        resolution: forwarded to community_louvain.best_partition.

    Returns:
        The mapping produced by best_partition, or an empty dict when
        nx.is_empty(graph) is true (the graph has no edges).
    """
    has_edges = not nx.is_empty(graph)
    if has_edges:
        return community_louvain.best_partition(graph, resolution=resolution)
    return {}

# ---- Graph visualization ----
def visualize_graph(graph, partition, font_prop):
    """Draw the graph coloured by community, save it as a PNG and download it.

    Args:
        graph: nx.Graph to draw; node names are used as label text.
        partition: mapping node -> integer community id. When it is empty a
            warning is printed and nothing is drawn.
        font_prop: matplotlib FontProperties used for the title and, by
            family name, for the node labels.

    Side effects:
        Writes "community_graph.png" to the working directory, shows the
        figure, and triggers a browser download through google.colab.files.
    """
    if not partition:
        print("⚠️ Empty graph or no communities found.")
        return

    # Local import keeps the fix self-contained within this function.
    import matplotlib.font_manager as font_manager

    # nx.draw only receives the font *family name*. A font loaded from a file
    # is unknown to matplotlib's font manager until it is registered, and an
    # unknown family makes the labels fall back to the default font.
    font_file = font_prop.get_file()
    if font_file:
        font_manager.fontManager.addfont(font_file)

    # Fixed seed so the layout is reproducible between runs.
    pos = nx.spring_layout(graph, seed=42)
    # One colour per community id (ids are assumed to start at 0).
    cmap = plt.get_cmap('viridis', max(partition.values()) + 1)

    plt.figure(figsize=(15, 10))
    nx.draw(
        graph, pos,
        node_color=[partition[n] for n in graph.nodes()],
        node_size=80, cmap=cmap,
        with_labels=True,
        # Reshape letters and reorder them for display so right-to-left
        # script renders correctly.
        labels={n: get_display(arabic_reshaper.reshape(n)) for n in graph.nodes()},
        font_family=font_prop.get_name(),
        font_size=9,
        edge_color="lightgray", alpha=0.7
    )
    plt.title("Word Co-occurrence Graph with Communities", fontproperties=font_prop, fontsize=16)
    # Save before show(): the figure may be cleared once it has been shown.
    plt.savefig("community_graph.png", dpi=300, bbox_inches='tight')
    plt.show()

    # Automatically download the PNG file
    files.download("community_graph.png")

# ---- Extract community keywords ----
def identify_community_keywords(graph, partition, top_n=10):
    """Pick the most central words of each community.

    Args:
        graph: nx.Graph containing the partitioned nodes.
        partition: mapping node -> community id.
        top_n: maximum number of words returned per community.

    Returns:
        Dict mapping community id -> list of up to top_n nodes, ordered by
        descending degree centrality measured inside that community's
        subgraph. Ties keep the order in which nodes appear in the partition.
    """
    members_by_comm = {}
    for word, comm_id in partition.items():
        if comm_id not in members_by_comm:
            members_by_comm[comm_id] = []
        members_by_comm[comm_id].append(word)

    keywords = {}
    for comm_id, members in members_by_comm.items():
        # Centrality is computed within the community, not the whole graph.
        scores = nx.degree_centrality(graph.subgraph(members))
        by_score = sorted(members, key=lambda w: scores.get(w, 0), reverse=True)
        keywords[comm_id] = by_score[:top_n]
    return keywords

# ---- Find optimal resolution ----
def find_optimal_resolution(graph, target_num_communities=10, start_resolution=0.1, step=0.1, max_iterations=50):
    """Raise the resolution step by step until few enough communities remain.

    Args:
        graph: nx.Graph passed to detect_communities.
        target_num_communities: accept the first partition with at most this
            many communities.
        start_resolution: resolution tried first.
        step: amount added to the resolution after each rejected attempt.
        max_iterations: maximum number of attempts.

    Returns:
        (partition, resolution) for the first accepted attempt, or
        ({}, resolution) after max_iterations rejected attempts, where
        resolution is the value that would have been tried next.

    Each attempt prints the resolution and the community count.
    """
    current = start_resolution
    for _attempt in range(max_iterations):
        candidate = detect_communities(graph, current)
        community_count = len(set(candidate.values()))
        print(f"Resolution {current:.2f} → {community_count} communities")
        if community_count > target_num_communities:
            # Too many communities: try again with a larger resolution.
            current += step
            continue
        return candidate, current
    print("⚠️ Could not find optimal resolution within limits.")
    return {}, current

# ---- Main Execution ----
# Build the raw co-occurrence graph and strip self-loops before pruning.
word_graph = construct_word_cooccurrence_graph(texts)
word_graph.remove_edges_from(nx.selfloop_edges(word_graph))
pruned_graph = prune_graph(word_graph)

if pruned_graph.number_of_nodes() > 0:
    # Search for a partition with at most 10 communities.
    partition, res = find_optimal_resolution(pruned_graph, target_num_communities=10)
    if not partition:
        print("❌ No communities detected.")
    else:
        visualize_graph(pruned_graph, partition, font_prop)

        # List the most central words of every community.
        keywords = identify_community_keywords(pruned_graph, partition)
        for comm_id, words in keywords.items():
            print(f"🔹 Community {comm_id}: {words}")
else:
    print("❌ No usable graph after pruning. Try adjusting thresholds.")


In [None]:
# 📌 Display central words based on Degree and Betweenness Centrality (without arabic_reshaper)

def display_top_words_by_centrality(graph, top_n=30):
    """Print the top_n nodes ranked by degree, then betweenness, centrality.

    Args:
        graph: nx.Graph to analyse. If nx.is_empty(graph) is true a warning
            is printed and nothing else happens.
        top_n: number of rows to print.

    Nodes are ordered by descending degree centrality; betweenness
    centrality breaks ties, also descending. Node names are printed as-is.
    """
    if nx.is_empty(graph):
        print("⚠️ Graph is empty.")
        return

    degree_scores = nx.degree_centrality(graph)
    betweenness_scores = nx.betweenness_centrality(graph)

    rows = []
    for node in graph.nodes():
        rows.append((node, degree_scores[node], betweenness_scores[node]))
    # Stable sort: nodes equal on both scores keep graph order.
    rows.sort(key=lambda row: (row[1], row[2]), reverse=True)

    print("📊 Top Words by Centrality:")
    for word, deg, btw in rows[:top_n]:
        print(f"{word}: Degree={deg:.4f}, Betweenness={btw:.4f}")

# 🟢 Run on pruned_graph, which is created by the main execution cell above
# (that cell must have been run first).
display_top_words_by_centrality(pruned_graph)

