In [None]:
import os
import pandas as pd
import jieba
import re
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
import logging
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
from collections import defaultdict
from sentence_transformers import SentenceTransformer

plt.rcParams['font.sans-serif'] = ['SimHei'] # Chinese text display (SimHei font for matplotlib)

# Stop words: the custom list below plus the words read from STOPWORDS_PATH
custom_stopwords = [
    "Ë±ÜÂåÖ", "DS", "‰∏ãËΩΩ", "ÊñáÂøÉ", "ÁôæÂ∫¶", "‰∏ÄË®Ä", "ÂÜÖÂÆπ", "‰∏úË•ø", "ÂÅ∑Á¨ë", "ÊÑüËßâ", "ÁúüÁöÑ", "Ê®°Âûã",
    "Â•≥Â•ó", "Áî∑Â•ó", "Áé´Áë∞", "ÊçÇËÑ∏", "ÂìàÂìà", "Êéí‰Ωç", "Ê≠£Ëµõ", "ÈáèÂ≠ê", "Áî∑Èûã", "Èõ∑ËØ∫", "Áî∑Ë£§", "Ê∞∏‰πÖ",
    "Âìà", "ÂìàÂìàÂìà", "ÂìàÂìàÂìàÂìà"
]
# Stop-word file, one word per line (read by load_chinese_stopwords)
STOPWORDS_PATH = "stopwords1893.txt"

# Directory holding the "<platform>_combined.csv" inputs; outputs go to a
# "sankey_outputs" sub-directory that is created at import time.
# NOTE(review): on Windows os.path.join("E:", name) yields the drive-relative
# path "E:name" (relative to the current directory on drive E), not "E:\name"
# - confirm this is intended.
save_dir = "E:"
output_dir = os.path.join(save_dir, "sankey_outputs")
os.makedirs(output_dir, exist_ok=True)

# Platform keys; each is used to build the input file name and to look up
# its time column and topic count below.
platforms = ['dy', 'xhs', 'tieba']
# CSV column holding the comment text
comment_column = 'content'
# CSV column holding the comment timestamp, per platform
time_column_map = {
    'dy': 'time',
    'xhs': 'time',
    'tieba': 'time'
}
optimal_k_dict = {'dy': 4, 'xhs': 6, 'tieba': 6}  # number of topics per platform
# pandas period frequency used to bucket comments into time periods
time_freq = 'M'
# Periods with fewer documents than this are not modelled
min_docs_per_period = 20
# Minimum topic similarity for a link to be kept in the Sankey data
similarity_threshold = 0.05

def load_chinese_stopwords(filepath, custom_list):
    """Build the stop-word set from a custom list and an optional file.

    Args:
        filepath: Path of a UTF-8 text file with one stop word per line.
        custom_list: Iterable of stop words that are always included.

    Returns:
        A set containing every entry of ``custom_list`` plus each non-blank,
        whitespace-stripped line of the file. If the file is missing or
        reading it fails, a message is printed and whatever was collected
        up to that point is returned.
    """
    merged = set(custom_list)
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                token = raw_line.strip()
                if not token:
                    # Blank line: nothing to add
                    continue
                merged.add(token)
    except FileNotFoundError:
        print(f"Ë≠¶Âëä: Ê†áÂáÜÂÅúÁî®ËØçÊñá‰ª∂Êú™ÊâæÂà∞: '{filepath}'„ÄÇ‰ªÖ‰ΩøÁî®Ëá™ÂÆö‰πâÂÅúÁî®ËØç„ÄÇ")
    except Exception as e:
        # Best effort: any other read/decode problem is reported, not raised
        print(f"Âä†ËΩΩÂÅúÁî®ËØçÊñá‰ª∂ '{filepath}' Êó∂Âá∫Èîô: {e}„ÄÇ‰ªÖ‰ΩøÁî®Ëá™ÂÆö‰πâÂÅúÁî®ËØç„ÄÇ")
    return merged

def preprocess_text(text, stop_words_set):
    """Clean one comment and split it into tokens with jieba.

    URLs are removed, every character outside the range U+4E00-U+9FA5 is
    replaced by a space, whitespace is collapsed, and the remainder is
    segmented with ``jieba.lcut``.

    Args:
        text: Raw comment; anything that is not a ``str`` yields ``[]``.
        stop_words_set: Container of tokens to drop.

    Returns:
        List of non-blank tokens that are not in ``stop_words_set``; ``[]``
        when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return []

    without_urls = re.sub(r"http[s]?://\S+", "", text)
    han_only = re.sub(r"[^\u4e00-\u9fa5]", " ", without_urls)
    collapsed = re.sub(r"\s+", " ", han_only).strip()
    if not collapsed:
        return []

    kept = []
    for token in jieba.lcut(collapsed):
        # Drop empty / whitespace-only tokens
        if not token or not token.strip():
            continue
        if token in stop_words_set:
            continue
        kept.append(token)
    return kept


def parse_time(time_str):
    """Convert one raw timestamp cell into a ``pandas.Timestamp``.

    Accepted inputs:
      * missing values (``None``/NaN/NaT) -> ``pd.NaT``;
      * numbers (Python or NumPy scalars) interpreted as Unix epoch seconds
        when they lie between 1990-01-01 (631152000) and five years from
        now, or as epoch milliseconds when they lie in the same window
        scaled by 1000; any other number -> ``pd.NaT``;
      * strings, first matched against a fixed list of explicit formats and
        then handed to ``pd.to_datetime`` as a fallback.

    Args:
        time_str: Raw value from the time column.

    Returns:
        A ``pd.Timestamp`` (a ``datetime`` subclass) on success, otherwise
        ``pd.NaT``. The function never raises for unparseable values.
    """
    if pd.isna(time_str):
        return pd.NaT

    if isinstance(time_str, (int, float, np.integer, np.floating)):
        lower = 631152000  # 1990-01-01 00:00:00 UTC, in seconds
        upper = datetime.now().timestamp() + 31536000 * 5  # now + 5 years
        try:
            if lower < time_str < upper:
                return pd.to_datetime(time_str, unit='s')
            # A value in the same window scaled by 1000 cannot be a valid
            # seconds timestamp, so it is unambiguously milliseconds.
            if lower * 1000 < time_str < upper * 1000:
                return pd.to_datetime(time_str, unit='ms')
        except (ValueError, OverflowError, OSError):
            return pd.NaT
        return pd.NaT

    if isinstance(time_str, str):
        text = time_str.strip()
        # Only formats that carry a year are listed: the year-less patterns
        # that used to be in this list were skipped on every call, so such
        # strings have always been handled by the fallback below.
        formats = (
            "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S", "%Y-%m-%d %H:%M",
            "%Y/%m/%d %H:%M", "%Y-%m-%d", "%Y/%m/%d", "%YÂπ¥%mÊúà%dÊó• %H:%M",
            "%YÂπ¥%mÊúà%dÊó•",
        )
        for fmt in formats:
            try:
                # Wrap in Timestamp so every success path returns the same
                # type; an out-of-bounds date raises a ValueError subclass
                # here and ends up as NaT via the fallback.
                return pd.Timestamp(datetime.strptime(text, fmt))
            except ValueError:
                continue
        try:
            return pd.to_datetime(text)
        except (ValueError, TypeError, OverflowError):
            # OverflowError: the dateutil parser behind pandas can raise it
            # for absurdly large numeric fields.
            return pd.NaT

    return pd.NaT

# --- Jensen-Shannon divergence (JSD) similarity helpers
def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence ``sum(p * ln(p / q))``.

    Zeros in either input are replaced by ``1e-10`` before dividing, and any
    non-positive ratio is replaced by ``1e-10`` before taking the logarithm,
    so the result is always finite for finite inputs.

    Args:
        p: Array-like of weights (not modified).
        q: Array-like of weights with the same shape as ``p`` (not modified).

    Returns:
        The divergence in nats as a NumPy float.
    """
    epsilon = 1e-10
    p = np.asarray(p, dtype=np.float64)
    q = np.asarray(q, dtype=np.float64)
    # np.where builds new arrays, so the caller's data is left alone
    p = np.where(p == 0, epsilon, p)
    q = np.where(q == 0, epsilon, q)
    # q has no zeros left at this point, so a plain division is safe
    ratio = p / q
    log_ratio = np.log(np.where(ratio > 0, ratio, epsilon))
    return np.sum(p * log_ratio)


def _as_distribution(weights):
    """Return a normalised copy of ``weights`` (uniform if the sum is ~0)."""
    total = np.sum(weights)
    if total > 1e-9:
        # Out-of-place division: never write into the caller's array
        return weights / total
    return np.ones_like(weights) / len(weights)


def js_divergence(p, q):
    """Return the Jensen-Shannon divergence between ``p`` and ``q`` in bits.

    Inputs are padded with zeros to a common length and normalised to sum to
    one (an input whose sum is below ``1e-9`` is treated as uniform).

    Args:
        p: 1-D array-like of non-negative weights (not modified).
        q: 1-D array-like of non-negative weights (not modified).

    Returns:
        ``0.5 * (KL(p||m) + KL(q||m)) / ln(2)`` with ``m = (p + q) / 2``.
    """
    p = np.asarray(p, dtype=np.float64)
    q = np.asarray(q, dtype=np.float64)
    if len(p) != len(q):
        max_len = max(len(p), len(q))
        p = np.pad(p, (0, max_len - len(p)))
        q = np.pad(q, (0, max_len - len(q)))
    # np.asarray returns the caller's own array when it is already float64,
    # so normalisation must not be done in place.
    p = _as_distribution(p)
    q = _as_distribution(q)
    m = 0.5 * (p + q)
    jsd = 0.5 * (kl_divergence(p, m) + kl_divergence(q, m))
    return jsd / np.log(2)

def get_topic_word_dist(topic_model, topic_id, combined_vocab):
    """Project one topic's word weights onto a shared vocabulary.

    Args:
        topic_model: Object whose ``get_topic(topic_id)`` returns a sequence
            of ``(word, weight)`` pairs, or a falsy value when it has nothing
            for that id.
        topic_id: Identifier passed straight through to ``get_topic``.
        combined_vocab: Ordered sequence of words; slot ``i`` of the result
            belongs to ``combined_vocab[i]``.

    Returns:
        Float NumPy array of length ``len(combined_vocab)``. Every slot
        starts at ``1e-10``; the slot of each topic word found in the
        vocabulary holds ``weight / total_weight``. The all-``1e-10`` array
        is returned when the topic is missing/empty or its weights sum to 0.
    """
    word_dist = np.full(len(combined_vocab), 1e-10)
    topic = topic_model.get_topic(topic_id)
    if not topic:
        return word_dist

    total_weight = sum(weight for _, weight in topic)
    if total_weight == 0:
        return word_dist

    # Build the word -> position map once so each topic word costs a single
    # hash lookup instead of two linear scans of the vocabulary list.
    position = {}
    for idx, word in enumerate(combined_vocab):
        position.setdefault(word, idx)  # first occurrence wins, like list.index

    for word, weight in topic:
        idx = position.get(word)
        if idx is not None:
            word_dist[idx] = weight / total_weight
    return word_dist

def calculate_topic_similarity_bertopic(model1, model2, k1, k2):
    """Compare the topics of two fitted models pairwise.

    Topic ids ``0 .. k1-1`` of ``model1`` and ``0 .. k2-1`` of ``model2`` are
    projected onto the union of their topic words and compared with the
    Jensen-Shannon divergence; similarity is ``1 - JSD`` clipped to [0, 1].

    Args:
        model1: Model exposing ``get_topic(topic_id)`` for the earlier period.
        model2: Model exposing ``get_topic(topic_id)`` for the later period.
        k1: Number of topic ids to read from ``model1``.
        k2: Number of topic ids to read from ``model2``.

    Returns:
        ``(k1, k2)`` NumPy array of similarities. Rows/columns belonging to
        topic ids for which ``get_topic`` returns nothing stay at 0.
    """
    topics1 = [model1.get_topic(tid) or [] for tid in range(k1)]
    topics2 = [model2.get_topic(tid) or [] for tid in range(k2)]

    # Every word that appears in any of the inspected topics
    combined_vocab = sorted(
        {word for topic in topics1 + topics2 for word, _ in topic}
    )

    # A missing topic would otherwise turn into an all-epsilon vector; two
    # such vectors are identical, giving JSD 0 and a bogus similarity of 1.0.
    # Those ids are therefore marked None and left at similarity 0.
    dists1 = [
        get_topic_word_dist(model1, tid, combined_vocab) if topic else None
        for tid, topic in enumerate(topics1)
    ]
    # Computed once up front instead of once per row of the matrix
    dists2 = [
        get_topic_word_dist(model2, tid, combined_vocab) if topic else None
        for tid, topic in enumerate(topics2)
    ]

    similarity_matrix_js = np.zeros((k1, k2))
    for i, dist1 in enumerate(dists1):
        if dist1 is None:
            continue
        for j, dist2 in enumerate(dists2):
            if dist2 is None:
                continue
            similarity = 1.0 - js_divergence(dist1, dist2)
            similarity_matrix_js[i, j] = max(0, min(1, similarity))
    return similarity_matrix_js

def plot_sankey(platform_name, sankey_data, all_nodes, period_labels, k_value, similarity_threshold, output_dir):
    """Show the topic-evolution Sankey diagram of one platform.

    Args:
        platform_name: Platform key; used in messages, the title and the
            (currently unused) output file name.
        sankey_data: DataFrame with ``source_idx``, ``target_idx`` and
            ``value`` columns, one row per link.
        all_nodes: Node labels in index order. Labels are split on ``'_T'``:
            the first piece is shown as the period and the last piece must
            be an integer topic index.
        period_labels: Not used by this function.
        k_value: Topic count shown in the title and checked against the
            palette size.
        similarity_threshold: Threshold shown in the title.
        output_dir: Directory used to build the SVG path.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    if sankey_data.empty:
        print(f"Âπ≥Âè∞ {platform_name} Ê≤°ÊúâË∂≥Â§üÁöÑÊï∞ÊçÆÁªòÂà∂SankeyÂõæ„ÄÇ")
        return

    palette = px.colors.qualitative.Vivid
    palette_size = len(palette)
    if k_value > palette_size:
        print(f"Ë≠¶Âëä: Âπ≥Âè∞ {platform_name} ÁöÑ K ÂÄº ({k_value}) Â§ß‰∫éÈ¢ÑÈÄâÈ¢úËâ≤ÊùøÂ§ßÂ∞è„ÄÇÈ¢úËâ≤Â∞Ü‰ºöÂæ™ÁéØ‰ΩøÁî®„ÄÇ")

    # One (colour, visible label, hover text) triple per node
    node_styles = []
    for label in all_nodes:
        try:
            pieces = label.split('_T')
            topic_no = int(pieces[-1])
            style = (
                palette[topic_no % palette_size],  # same topic -> same colour
                str(topic_no),
                f"Period: {pieces[0]}<br>Topic: {topic_no}",
            )
        except (ValueError, IndexError, KeyError):
            # Label does not end in an integer topic index
            style = ('grey', '?', label)
        node_styles.append(style)
    node_colors = [colour for colour, _, _ in node_styles]
    node_display_labels = [shown for _, shown, _ in node_styles]
    node_hover_labels = [hover for _, _, hover in node_styles]

    link = {
        'source': sankey_data['source_idx'].tolist(),
        'target': sankey_data['target_idx'].tolist(),
        'value': sankey_data['value'].tolist(),
        'color': 'rgba(180, 180, 180, 0.35)',
        'hovertemplate': 'Similarity: %{value:.3f}<extra></extra>',
    }
    node = {
        'pad': 12,
        'thickness': 18,
        'line': {'color': "black", 'width': 0.6},
        'label': node_display_labels,
        'color': node_colors,
        'customdata': node_hover_labels,
        'hovertemplate': '<b>%{customdata}</b><extra></extra>',
    }

    fig = go.Figure()
    fig.add_trace(go.Sankey(arrangement='snap', node=node, link=link))

    title_text = (
        f"<b>{platform_name.upper()} Platform: Topic Evolution (K={k_value}, Min Similarity={similarity_threshold})</b>"
    )
    fig.update_layout(
        title={
            'text': title_text,
            'font': {'size': 16, 'family': "Arial, sans-serif", 'color': 'black'},
            'x': 0.5,
            'xanchor': 'center',
        },
        font={'size': 11, 'family': "Arial, sans-serif"},
        margin={'l': 30, 'r': 30, 't': 70, 'b': 30},
        plot_bgcolor='white',
        height=600,
    )
    fig.show()

    svg_filepath = os.path.join(
        output_dir,
        f"{platform_name}_k{k_value}_sim{similarity_threshold}_bertopic_evolution.svg",
    )
    try:
        # NOTE(review): the actual export is disabled (the original call was
        # fig.write_image(svg_filepath, format='svg', width=1200, height=600),
        # commented out with a remark that saving may be buggy). The message
        # below is therefore printed even though no file is written, and
        # svg_filepath is currently unused.
        print(f"  ÊàêÂäü‰øùÂ≠ò SVG ÂõæÂÉè„ÄÇ")
    except (ImportError, ValueError) as e:
        print(f"\n[!] ÈîôËØØÔºöÊó†Ê≥ï‰øùÂ≠ò SVG ÂõæÂÉè„ÄÇ")
        print(f"    ËØ∑Á°Æ‰øùÂ∑≤ÂÆâË£Ö 'kaleido' ÂåÖ (pip install -U kaleido)„ÄÇ")
        print(f"    ÂÖ∑‰ΩìÈîôËØØ: {e}\n")
    except Exception as e:
        print(f"\n[!] ‰øùÂ≠ò SVG Êó∂ÂèëÁîüÊú™Áü•ÈîôËØØ: {e}\n")

if __name__ == "__main__":
    # Script entry point. For every platform: load its CSV, parse and filter
    # timestamps, tokenise the comments, fit one BERTopic model per time
    # period, link the topics of consecutive periods by JSD similarity, and
    # finally draw one Sankey diagram per platform.
    jieba.setLogLevel(logging.ERROR)
    print("Ê≠£Âú®Âä†ËΩΩÂÅúÁî®ËØç...")
    stop_words = load_chinese_stopwords(STOPWORDS_PATH, custom_stopwords)
    all_platforms_sankey_data = {}
    # The sentence-embedding model is identical for every period and
    # platform, so it is loaded once (lazily, inside the per-period try so a
    # load failure is still reported there) and then reused.
    embedding_model = None

    for platform in platforms:
        print(f"\n{'='*20} ÂºÄÂßãÂ§ÑÁêÜÂπ≥Âè∞: {platform} {'='*20}")
        file_path = os.path.join(save_dir, f"{platform}_combined.csv")
        time_col = time_column_map[platform]
        optimal_k = optimal_k_dict.get(platform)
        if optimal_k is None:
            print(f"ÈîôËØØ: Âπ≥Âè∞ {platform} Êú™Âú® optimal_k_dict ‰∏≠ÊâæÂà∞ÂØπÂ∫îÁöÑ K ÂÄº„ÄÇË∑≥ËøáÊ≠§Âπ≥Âè∞„ÄÇ")
            continue
        print(f"Âπ≥Âè∞ {platform} ‰ΩøÁî® K = {optimal_k}")

        # 1. Load the data, parse timestamps and restrict the date range
        try:
            print("Ê≠£Âú®Âä†ËΩΩÊï∞ÊçÆÊñá‰ª∂:")
            df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
            print(f"ÊàêÂäüÂä†ËΩΩÊï∞ÊçÆÔºåÂÖ± {len(df)} Êù°ËÆ∞ÂΩï„ÄÇ")
            if comment_column not in df.columns or time_col not in df.columns:
                print(f"ÈîôËØØ: Êñá‰ª∂ '{file_path}' Áº∫Â∞ëÂøÖÈúÄÁöÑÂàó ('{comment_column}' Êàñ '{time_col}')„ÄÇ")
                continue
            df.dropna(subset=[comment_column], inplace=True)
            print(f"ÁßªÈô§Á©∫ËØÑËÆ∫ÂêéÂâ©‰Ωô: {len(df)} Êù°ËÆ∞ÂΩï„ÄÇ")
            if df.empty:
                continue
            print(f"Ê≠£Âú®Ëß£ÊûêÊó∂Èó¥Âàó '{time_col}'...")
            original_time_count = len(df)
            df['datetime'] = df[time_col].apply(parse_time)
            df.dropna(subset=['datetime'], inplace=True)
            valid_time_count = len(df)
            print(f"Êó∂Èó¥Ëß£ÊûêÂÆåÊàê„ÄÇÊúâÊïàÊó∂Èó¥ËÆ∞ÂΩï: {valid_time_count} (ÁßªÈô§‰∫Ü {original_time_count - valid_time_count} Êù°Êó†ÊïàÊó∂Èó¥)")
            if df.empty:
                print(f"Âπ≥Âè∞ {platform} Ê≤°ÊúâÊúâÊïàÁöÑÂ∏¶Êó∂Èó¥Êà≥ÁöÑËØÑËÆ∫Êï∞ÊçÆ„ÄÇ")
                continue
            df.sort_values('datetime', inplace=True)
            df.reset_index(drop=True, inplace=True)
            # Analysis window: start inclusive, end exclusive
            start_date = pd.to_datetime("2024-02-01")
            end_date = pd.to_datetime("2025-04-01")
            df = df[(df['datetime'] >= start_date) & (df['datetime'] < end_date)].copy()
            if df.empty:
                print(f"Âπ≥Âè∞ {platform} Âú®ÊåáÂÆöÊó∂Èó¥ËåÉÂõ¥ ({start_date.date()} to {end_date.date()}) ÂÜÖÊ≤°ÊúâÊï∞ÊçÆÔºåË∑≥Ëøá„ÄÇ")
                continue
            print(f"Êó∂Èó¥ËåÉÂõ¥Á≠õÈÄâ ({start_date.date()} to {end_date.date()}) ÂêéÂâ©‰Ωô: {len(df)} Êù°ËÆ∞ÂΩï„ÄÇ")
        except FileNotFoundError:
            print(f"ÈîôËØØ: Êñá‰ª∂Êú™ÊâæÂà∞ '{file_path}'„ÄÇ")
            continue
        except pd.errors.EmptyDataError:
            print(f"ÈîôËØØ: Êñá‰ª∂ '{file_path}' ‰∏∫Á©∫„ÄÇ")
            continue
        except Exception as e:
            print(f"Âä†ËΩΩÊàñÂàùÊ≠•Â§ÑÁêÜÊñá‰ª∂ '{file_path}' Êó∂ÂèëÁîüÈîôËØØ: {e}„ÄÇ")
            continue

        # 2. Text preprocessing (tokenise once; reused per period below)
        print("ÂºÄÂßãËøõË°åÊñáÊú¨È¢ÑÂ§ÑÁêÜ...")
        df['processed_text'] = df[comment_column].apply(lambda x: preprocess_text(x, stop_words))
        # .copy() so the column assignment in step 3 targets a real frame
        # rather than a filtered view of the previous one
        df = df[df['processed_text'].apply(len) > 0].copy()
        print(f"ÊñáÊú¨È¢ÑÂ§ÑÁêÜÂÆåÊàê„ÄÇÊúâÊïàÊñáÊ°£Êï∞Èáè: {len(df)}")
        if df.empty:
            print(f"Âπ≥Âè∞ {platform} Âú®È¢ÑÂ§ÑÁêÜÂêéÊ≤°ÊúâÊúâÊïàÁöÑÊñáÊú¨Êï∞ÊçÆ„ÄÇ")
            continue

        # 3. Group by time period and keep periods with enough documents
        print(f"ÊåâÊó∂Èó¥È¢ëÁéá '{time_freq}' ÂØπÊï∞ÊçÆËøõË°åÂàÜÁªÑ...")
        df['time_period'] = df['datetime'].dt.to_period(time_freq)
        grouped_data = df.groupby('time_period')
        valid_periods = {period: group for period, group in grouped_data if len(group) >= min_docs_per_period}
        print(f"ÂÖ±Êúâ {len(grouped_data)} ‰∏™ÂéüÂßãÊó∂Èó¥ÊÆµÔºåËøáÊª§ÂêéÂâ©‰Ωô {len(valid_periods)} ‰∏™ÊúâÊïàÊó∂Èó¥ÊÆµ (ÊñáÊ°£Êï∞ >= {min_docs_per_period})„ÄÇ")
        if len(valid_periods) < 2:
            print(f"Âπ≥Âè∞ {platform} ÁöÑÊúâÊïàÊó∂Èó¥ÊÆµÂ∞ë‰∫é 2 ‰∏™ÔºåÊó†Ê≥ïËøõË°åÊºîÂåñÂàÜÊûê„ÄÇ")
            continue

        # 4. One BERTopic model per period
        print(f"‰∏∫ÊØè‰∏™ÊúâÊïàÊó∂Èó¥ÊÆµËÆ≠ÁªÉ BERTopic Ê®°Âûã (K={optimal_k})...")
        period_models = {}
        sorted_periods = sorted(valid_periods.keys())
        for period in sorted_periods:
            period_df = valid_periods[period]
            print(f"\n====== Ë∞ÉËØïÔºöÊó∂Èó¥ÊÆµ {period} ======")
            # Debug: sample of the raw comments of this period
            print(f"{period} ÂéüÂßãËØÑËÆ∫Ââç5Êù°Ôºö")
            print(period_df[comment_column].head(5).tolist())

            # Debug: number of comments in this period
            print(f"{period} period ÂéüÂßãËØÑËÆ∫Êï∞Èáè: {len(period_df)}")

            # Reuse the tokens computed in step 2 instead of running jieba a
            # second time; rows with empty token lists were already dropped.
            seg_list = period_df['processed_text'].tolist()
            print(f"{period} period ÂàÜËØçÂêéÊ†∑‰æãÂâç5Êù°Ôºö")
            print(seg_list[:5])

            # Debug: count of documents that are empty after tokenisation
            empty_cnt = sum(1 for words in seg_list if not words)
            print(f"{period} period ÂàÜËØçÂêéÂÖ®Á©∫ÊñáÊú¨Êï∞: {empty_cnt}")

            # Join tokens with spaces; the vectorizer splits on whitespace
            valid_texts = [" ".join(words) for words in seg_list if words]
            print(f"{period} ÂàÜËØçÊãºÊé•ÂêéÈùûÁ©∫Êù°Êï∞: {len(valid_texts)}")
            print(f"{period} ÂàÜËØçÊãºÊé•ÂêéÊ†∑‰æãÂâç5Êù°: {valid_texts[:5]}")

            # Nothing usable left: skip training for this period
            if not valid_texts or all(not txt.strip() for txt in valid_texts):
                print(f"  Ë∑≥ËøáÊó∂Èó¥ÊÆµ {period}: ÂàÜËØçÊãºÊé•ÂêéÊó†ÊúâÊïàÊñáÊú¨ÔºàÂÖ®Ë¢´ËøáÊª§ÊàñÂÖ®ÊòØÂÅúÁî®ËØçÔºâ")
                continue
            try:
                if embedding_model is None:
                    embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
                vectorizer_model = CountVectorizer(tokenizer=lambda x: x.split(), token_pattern=None)
                topic_model = BERTopic(
                    embedding_model=embedding_model,
                    nr_topics=optimal_k,
                    vectorizer_model=vectorizer_model,
                    calculate_probabilities=True,
                    verbose=False,
                    min_topic_size=5
                )
                topics, probs = topic_model.fit_transform(valid_texts)
                # Number of topics actually found, excluding the -1 entry
                actual_k = len([tid for tid in topic_model.get_topic_info()['Topic'] if tid != -1])
                if topic_model.get_topics():
                    period_models[period] = {
                        'model': topic_model,
                        'topics': topics,
                        'probs': probs,
                        'actual_k': actual_k
                    }
                    print(f"    Êó∂Èó¥ÊÆµ {period} ÁöÑBERTopicÊ®°ÂûãËÆ≠ÁªÉÂÆåÊàê„ÄÇ‰∏ªÈ¢òÊï∞: {actual_k}")
                else:
                    print(f"    Ë≠¶Âëä: {period} Ê≤°ÊúâÂ≠¶Âá∫ÊúâÊïà‰∏ªÈ¢ò„ÄÇ")
            except Exception as e:
                print(f"  Â§ÑÁêÜÊó∂Èó¥ÊÆµ {period} Êó∂Âá∫Èîô: {e}")
                if period in period_models:
                    del period_models[period]

        # 5. Topic similarity between consecutive trained periods
        print("ËÆ°ÁÆóÁõ∏ÈÇªÊó∂Èó¥ÊÆµÁöÑ‰∏ªÈ¢òÁõ∏‰ººÂ∫¶...")
        sankey_links = []
        valid_trained_periods = sorted([p for p in sorted_periods if p in period_models])
        if len(valid_trained_periods) < 2:
            print(f"Âπ≥Âè∞ {platform} ËÆ≠ÁªÉÊàêÂäüÁöÑÊ®°Âûã‰∏çË∂≥ 2 ‰∏™Êó∂Èó¥ÊÆµÔºåÊó†Ê≥ïËÆ°ÁÆóÊºîÂåñ„ÄÇ")
            continue
        for period1_key, period2_key in zip(valid_trained_periods, valid_trained_periods[1:]):
            print(f"  ÊØîËæÉ: {period1_key} -> {period2_key}")
            model1_data = period_models[period1_key]
            model2_data = period_models[period2_key]
            # Compare only the topics each model really produced. Using the
            # requested K here would include ids that do not exist, and two
            # non-existent topics compare as identical (similarity 1.0).
            # NOTE(review): assumes BERTopic numbers real topics 0..actual_k-1
            # - confirm against the installed BERTopic version.
            k1 = model1_data['actual_k']
            k2 = model2_data['actual_k']
            try:
                similarity_matrix = calculate_topic_similarity_bertopic(
                    model1_data['model'], model2_data['model'], k1, k2
                )
                for src_topic_idx in range(similarity_matrix.shape[0]):
                    for tgt_topic_idx in range(similarity_matrix.shape[1]):
                        similarity = similarity_matrix[src_topic_idx, tgt_topic_idx]
                        if similarity >= similarity_threshold:
                            sankey_links.append({
                                'source': f"{period1_key}_T{src_topic_idx}",
                                'target': f"{period2_key}_T{tgt_topic_idx}",
                                'value': similarity
                            })
            except Exception as e:
                print(f"    ËÆ°ÁÆó {period1_key} Âíå {period2_key} ‰πãÈó¥Áõ∏‰ººÂ∫¶Êó∂Âá∫Èîô: {e}")

        if not sankey_links:
            print(f"Âπ≥Âè∞ {platform} Ê≤°ÊúâËÆ°ÁÆóÂá∫È´ò‰∫éÈòàÂÄº {similarity_threshold} ÁöÑ‰∏ªÈ¢òÁõ∏‰ººÂ∫¶ÈìæÊé•„ÄÇÊó†Ê≥ïÁîüÊàêSankeyÂõæ„ÄÇ")
            continue

        # 6. Prepare the Sankey data: nodes ordered by (period, topic index)
        sankey_df = pd.DataFrame(sankey_links)
        all_nodes = sorted(set(sankey_df['source'].tolist() + sankey_df['target'].tolist()),
                           key=lambda x: (str(x.split('_T')[0]), int(x.split('_T')[1])))
        period_labels_obj = sorted({pd.Period(node.split('_T')[0], freq=time_freq) for node in all_nodes})
        period_labels_str = [str(p) for p in period_labels_obj]
        node_map = {node_label: i for i, node_label in enumerate(all_nodes)}
        sankey_df['source_idx'] = sankey_df['source'].map(node_map)
        sankey_df['target_idx'] = sankey_df['target'].map(node_map)
        all_platforms_sankey_data[platform] = {
            'sankey_df': sankey_df,
            'all_nodes': all_nodes,
            'period_labels': period_labels_str,
            'k_value': optimal_k
        }
        print(f"Âπ≥Âè∞ {platform} ÁöÑ Sankey Êï∞ÊçÆÂáÜÂ§áÂÆåÊàê„ÄÇÂÖ± {len(all_nodes)} ‰∏™ËäÇÁÇπÔºå{len(sankey_df)} ‰∏™ÈìæÊé•„ÄÇ")

    # 7. Draw one Sankey diagram per platform that produced links
    print(f"\n{'='*20} ÂºÄÂßãÁªòÂà∂ Sankey Âõæ {'='*20}")
    if not all_platforms_sankey_data:
        print("Ê≤°ÊúâÂèØÁî®‰∫éÁªòÂà∂ Sankey ÂõæÁöÑÊï∞ÊçÆ„ÄÇ")
    else:
        for platform, data in all_platforms_sankey_data.items():
            print(f"\nÊ≠£Âú®ÁªòÂà∂Âπ≥Âè∞ {platform} ÁöÑ Sankey Âõæ...")
            plot_sankey(platform,
                        data['sankey_df'],
                        data['all_nodes'],
                        data['period_labels'],
                        data['k_value'],
                        similarity_threshold,
                        output_dir)

    print("\nÊâÄÊúâÂπ≥Âè∞Â§ÑÁêÜÂÆåÊØï„ÄÇ")

Ê≠£Âú®Âä†ËΩΩÂÅúÁî®ËØç...

Âπ≥Âè∞ dy ‰ΩøÁî® K = 4
Ê≠£Âú®Âä†ËΩΩÊï∞ÊçÆÊñá‰ª∂:
ÊàêÂäüÂä†ËΩΩÊï∞ÊçÆÔºåÂÖ± 22431 Êù°ËÆ∞ÂΩï„ÄÇ
ÁßªÈô§Á©∫ËØÑËÆ∫ÂêéÂâ©‰Ωô: 22213 Êù°ËÆ∞ÂΩï„ÄÇ
Ê≠£Âú®Ëß£ÊûêÊó∂Èó¥Âàó 'time'...
Êó∂Èó¥Ëß£ÊûêÂÆåÊàê„ÄÇÊúâÊïàÊó∂Èó¥ËÆ∞ÂΩï: 22213 (ÁßªÈô§‰∫Ü 0 Êù°Êó†ÊïàÊó∂Èó¥)
Êó∂Èó¥ËåÉÂõ¥Á≠õÈÄâ (2024-02-01 to 2025-04-01) ÂêéÂâ©‰Ωô: 19800 Êù°ËÆ∞ÂΩï„ÄÇ
ÂºÄÂßãËøõË°åÊñáÊú¨È¢ÑÂ§ÑÁêÜ...
ÊñáÊú¨È¢ÑÂ§ÑÁêÜÂÆåÊàê„ÄÇÊúâÊïàÊñáÊ°£Êï∞Èáè: 17632
ÊåâÊó∂Èó¥È¢ëÁéá 'M' ÂØπÊï∞ÊçÆËøõË°åÂàÜÁªÑ...
ÂÖ±Êúâ 14 ‰∏™ÂéüÂßãÊó∂Èó¥ÊÆµÔºåËøáÊª§ÂêéÂâ©‰Ωô 14 ‰∏™ÊúâÊïàÊó∂Èó¥ÊÆµ (ÊñáÊ°£Êï∞ >= 20)„ÄÇ
‰∏∫ÊØè‰∏™ÊúâÊïàÊó∂Èó¥ÊÆµËÆ≠ÁªÉ BERTopic Ê®°Âûã (K=4)...

2024-02 ÂéüÂßãËØÑËÆ∫Ââç5Êù°Ôºö
['Ai‰∏çËÉΩÂíåÁ¨¨‰∫î‰∫∫Ê†ºÊØî[ÊçÇËÑ∏]', 'Âçé‰∏∫Ëá™Â∏¶ÁöÑÂú®ÊâãÊú∫ÈáåÂπ≤ÂòõÈùûÂæóÊêûËøô‰∏™', 'ÂÆÉËÉΩÁÆóÂá∫ÂèåËâ≤ÁêÉÂºÄÂ•ñÁªìÊûúÂêó[Âë≤Áâô][Âë≤Áâô][Âë≤Áâô]', 'ÂàöÂàöÊé•Âà∞ÈÄöÁü•ÔºåÊâãÊú∫Ë¢´Ê∑òÊ±∞‰∫Ü[ÂæÆÁ¨ë]', 'ËÉΩÁÆóÂá∫‰ªäÊôöÊéí‰æã‰∫îÁöÑÂºÄÂ•ñÁªìÊûúÂêóÔºü']
2024-02 period ÂéüÂßãËØÑËÆ∫Êï∞Èáè: 370
2024-02 period ÂàÜËØçÂêéÊ†∑‰æãÂâç5Êù°Ôºö
[['Á¨¨‰∫î', 

  ÊàêÂäü‰øùÂ≠ò SVG ÂõæÂÉè„ÄÇ

Ê≠£Âú®ÁªòÂà∂Âπ≥Âè∞ xhs ÁöÑ Sankey Âõæ...


  ÊàêÂäü‰øùÂ≠ò SVG ÂõæÂÉè„ÄÇ

Ê≠£Âú®ÁªòÂà∂Âπ≥Âè∞ tieba ÁöÑ Sankey Âõæ...


  ÊàêÂäü‰øùÂ≠ò SVG ÂõæÂÉè„ÄÇ

ÊâÄÊúâÂπ≥Âè∞Â§ÑÁêÜÂÆåÊØï„ÄÇ
