In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os

# --- User configuration ---
save_dir = "E:"  # directory that holds the "<platform>_combined.csv" inputs
platforms = ['dy', 'xhs', 'tieba']
comment_column = 'content'  # kept for reference; not read in this cell
sentiment_column = 'sentiment'
start_date_str = '2024-02-01'
end_date_str = '2025-3-31'
agg_freq = '2D'  # pandas offset alias used when resampling
output_plot_filename = os.path.join(save_dir, "sentiment_evolution_plot.svg")

# --- Font setup ---
# Assigning to rcParams never raises when a font is missing, so a try/except
# fallback around the assignment can never trigger. Supplying an ordered list
# instead lets matplotlib fall through to the next family by itself.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False  # draw "-" with an ASCII hyphen

# --- Sentiment label -> numeric score ---
sentiment_mapping = {
    'Very Positive': 0.5,
    'Positive': 0.25,
    'Neutral': 0,
    'Negative': -0.25,
    'Very Negative': -0.5
}

# --- Per-platform column names (like_col=None means "no like counts") ---
platform_configs = {
    'dy':    {'time_col': 'create_time', 'like_col': 'like_count'},
    'xhs':   {'time_col': 'time', 'like_col': 'like_count'},
    'tieba': {'time_col': 'publish_time', 'like_col': None}
}

# --- Time range (used for the x-axis limits of the plot) ---
try:
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)
except ValueError:
    print("错误：开始或结束日期格式不正确。")
    # exit() is the interactive-shell helper; raise SystemExit directly so the
    # script stops with a non-zero status.
    raise SystemExit(1)

# platform name -> resampled sentiment Series, filled by the processing loop
all_platform_sentiment = {}

# --- Data processing ---
# For every platform: load its CSV, parse timestamps, map sentiment labels to
# numeric scores, weight them by likes and resample to one mean per bin.
print("开始处理数据...")
for platform in platforms:
    print(f"--- 处理平台: {platform} ---")
    config = platform_configs[platform]
    time_col = config['time_col']
    like_col = config['like_col']
    file_path = os.path.join(save_dir, f"{platform}_combined.csv")
    print(f"读取文件: {file_path}")

    try:
        # Only load the columns that are actually used.
        use_cols = [time_col, sentiment_column]
        if like_col:
            use_cols.append(like_col)

        if not os.path.exists(file_path):
            print(f"警告: 文件未找到 {file_path}。跳过。")
            continue

        df = pd.read_csv(file_path, usecols=use_cols, low_memory=False)
        print(f"原始数据行数: {len(df)}")

        # --- Timestamp cleaning ---
        print(f"处理时间列: {time_col}")
        # The .str accessor needs text; non-text columns are cast first.
        if not (pd.api.types.is_string_dtype(df[time_col])
                or pd.api.types.is_object_dtype(df[time_col])):
            df[time_col] = df[time_col].astype(str)
        # Collapse runs of whitespace, then trim both ends.
        cleaned_time_col = (
            df[time_col]
            .fillna('')
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )

        # --- Timestamp parsing ---
        # 'dy' is parsed with an explicit format; the other platforms rely on
        # pandas' inference. errors='coerce' maps unparseable values to NaT.
        datetime_format = '%Y-%m-%d %H:%M:%S' if platform == 'dy' else None
        try:
            df[time_col] = pd.to_datetime(cleaned_time_col, format=datetime_format, errors='coerce')
        except ValueError as e:
            print(f"解析时间错误: {e}")
            df[time_col] = pd.NaT

        num_total_invalid = df[time_col].isna().sum()
        print(f"无效时间值: {num_total_invalid}")

        df.dropna(subset=[time_col], inplace=True)
        print(f"有效时间数据行数: {len(df)}")

        if df.empty:
            print(f"警告：平台 {platform} 时间清洗后无数据。")
            # Nothing left to score; skip instead of falling through to a
            # second, misleading "no data after mapping" warning.
            continue
        df[time_col] = df[time_col].dt.normalize()  # drop the time-of-day part

        # --- Sentiment scores ---
        print(f"处理情感列: {sentiment_column}")
        df['sentiment_score'] = df[sentiment_column].map(sentiment_mapping)
        df.dropna(subset=['sentiment_score'], inplace=True)  # unknown labels
        if df.empty:
            print(f"警告: 平台 {platform} 映射情感分数后无数据。")
            continue

        # --- Like-weighted sentiment ---
        print("计算加权情感分数...")
        if like_col:
            likes = pd.to_numeric(df[like_col], errors='coerce').fillna(0).clip(lower=0)
            df[like_col] = likes
            # Use 1 + log1p(likes) rather than log1p(likes): with the bare
            # log1p every comment that has 0 likes is multiplied by 0 and its
            # sentiment is erased while it still counts in the mean.
            df['weighted_sentiment'] = df['sentiment_score'] * (1 + np.log1p(likes))
        else:
            df['weighted_sentiment'] = df['sentiment_score']
        # Cap each comment's contribution so outliers cannot dominate a bin.
        df['weighted_sentiment'] = df['weighted_sentiment'].clip(lower=-0.35, upper=0.35)

        # --- Aggregate over time ---
        print(f"按时间聚合 (频率: {agg_freq})...")
        df.set_index(time_col, inplace=True)
        sentiment_ts = df['weighted_sentiment'].resample(agg_freq).mean()
        if sentiment_ts.empty:
            print(f"警告: 平台 {platform} 重采样后结果为空。")
            continue

        # Bins without comments are NaN: carry the last value forward, then
        # back-fill any leading gap.
        sentiment_ts = sentiment_ts.ffill().bfill()
        if sentiment_ts.isna().all():
            print(f"警告: 平台 {platform} 聚合后全部为 NaN。")
            continue

        all_platform_sentiment[platform] = sentiment_ts
        print(f"平台 {platform} 处理完成。聚合后时间点: {len(sentiment_ts)}")

    except FileNotFoundError:
        print(f"错误: 文件未找到 {file_path}")
    except KeyError as e:
        print(f"错误: 列名错误 - {e}。")
    except Exception as e:
        # Keep going with the remaining platforms, but show the full trace.
        print(f"错误: 处理平台 {platform} 时发生错误: {e}")
        import traceback
        traceback.print_exc()

# --- Plotting ---
# Draw one smoothed line per platform and save the figure as SVG.
print("\n开始绘制情感演变图...")
if not all_platform_sentiment:
    print("错误：无有效数据，无法绘图。")
else:
    plt.figure(figsize=(18, 9))
    ax = plt.gca()
    colors = ['#1f77b4', '#ff7f0e', 'green']
    # Cycle through the palette so that configuring more platforms than
    # colors cannot raise IndexError.
    color_map = {
        name: colors[idx % len(colors)]
        for idx, name in enumerate(all_platform_sentiment)
    }

    for platform, sentiment_ts in all_platform_sentiment.items():
        if sentiment_ts.empty:
            print(f"警告: 平台 {platform} 无聚合数据可绘制。")
            continue
        # Centered 7-bin rolling mean; min_periods=1 keeps the series ends.
        rolling_mean_ts = sentiment_ts.rolling(window=7, center=True, min_periods=1).mean()
        plt.plot(rolling_mean_ts.index, rolling_mean_ts.values,
                 label=f"{platform.upper()} 平台",
                 color=color_map[platform],
                 linewidth=2,
                 linestyle='-')

    plt.xlim(start_date, end_date)
    plt.title('各平台对国产大模型情感演变趋势', fontsize=18, pad=20)
    plt.xlabel('日期', fontsize=14, labelpad=10)
    plt.ylabel('平均加权情感得分', fontsize=14, labelpad=10)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

    # Pick tick density from the aggregation frequency and plotted span.
    if agg_freq == 'D':
        total_days = (end_date - start_date).days
        if total_days > 180:
            ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
            ax.xaxis.set_minor_locator(mdates.WeekdayLocator(interval=1))
        elif total_days > 60:
            ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
            ax.xaxis.set_minor_locator(mdates.WeekdayLocator(interval=1))
        else:
            ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
            ax.xaxis.set_minor_locator(mdates.DayLocator(interval=1))
    elif agg_freq == 'W':
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
        ax.xaxis.set_minor_locator(mdates.WeekdayLocator(interval=1))
    else:
        ax.xaxis.set_major_locator(mdates.AutoDateLocator(minticks=8, maxticks=15))

    plt.xticks(rotation=30, ha='right', fontsize=11)
    plt.yticks(fontsize=11)
    plt.axhline(0, color='grey', linestyle='--', linewidth=0.8, alpha=0.7)
    # Legend sits outside the axes; tight_layout reserves the right margin.
    plt.legend(title="平台", fontsize=12, title_fontsize=13, loc='upper left', bbox_to_anchor=(1.02, 1))
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout(rect=[0, 0, 0.9, 1])

    try:
        plt.savefig(output_plot_filename, dpi=300, bbox_inches='tight')
        print(f"图像已保存至: {output_plot_filename}")
    except Exception as e:
        # A failed save should not prevent the figure from being shown.
        print(f"错误: 保存图像失败 - {e}")

    plt.show()

print("\n处理完成。")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
from datetime import timedelta
import warnings
import math
import traceback 
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

# --- User configuration ---
# Assigning to rcParams never raises when a font is missing, so a try/except
# fallback around the assignment can never trigger. An ordered list lets
# matplotlib fall through to the next family by itself.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False  # draw "-" with an ASCII hyphen

save_dir = "E"  # directory that holds the "<platform>_combined.csv" inputs
platforms = ['dy', 'xhs', 'tieba']
sentiment_column = 'sentiment'
start_date_str = '2024-02-01'
end_date_str = '2025-04-01'
agg_freq = '2D'  # pandas offset alias used when resampling
forecast_months = 2  # forecast horizon, in calendar months
forecast_output_dir = os.path.join(save_dir, f"forecast_results_transformer_lstm_{forecast_months}mo")

# Model / training hyper-parameters
time_step = 15  # length of the input window fed to the model
hidden_size = 64
encoder_layers = 3
lstm_layers = 1
dropout = 0.1
epochs = 60
batch_size = 16
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if not os.path.exists(forecast_output_dir):
    # exist_ok guards against the directory appearing between check and create.
    os.makedirs(forecast_output_dir, exist_ok=True)
    print(f"已创建目录: {forecast_output_dir}")

# Fixed seeds for reproducible weight initialisation and batch shuffling.
torch.manual_seed(42)
np.random.seed(42)
warnings.filterwarnings("ignore")  # NOTE(review): silences every warning globally

print("\n--- 开始加载和预处理数据 ---")

# Sentiment label -> numeric score
sentiment_mapping = {
    'Very Positive': 0.5,
    'Positive': 0.25,
    'Neutral': 0,
    'Negative': -0.25,
    'Very Negative': -0.5
}

# Per-platform column names (like_col=None means "no like counts")
platform_configs = {
    'dy':    {'time_col': 'create_time', 'like_col': 'like_count'},
    'xhs':   {'time_col': 'time', 'like_col': 'like_count'},
    'tieba': {'time_col': 'publish_time', 'like_col': None}
}

try:
    start_date_filter = pd.to_datetime(start_date_str)
    end_date_filter = pd.to_datetime(end_date_str)
    print(f"原始数据过滤范围: {start_date_filter.date()} 至 {end_date_filter.date()}")
except ValueError:
    print("错误：日期格式不正确。")
    # exit() is the interactive-shell helper; raise SystemExit directly so the
    # script stops with a non-zero status.
    raise SystemExit(1)

# platform name -> resampled sentiment Series, filled by the loading loop
all_platform_sentiment = {}

# For every platform: load its CSV, parse and filter timestamps, map sentiment
# labels to scores and build a like-weighted mean per `agg_freq` bin.
for platform in platforms:
    print(f"\n--- 处理平台: {platform} ---")
    config = platform_configs[platform]
    time_col = config['time_col']
    like_col = config['like_col']
    file_path = os.path.join(save_dir, f"{platform}_combined.csv")
    print(f"读取文件: {file_path}")

    try:
        # Only load the columns that are actually used.
        use_cols = [time_col, sentiment_column]
        if like_col:
            use_cols.append(like_col)

        if not os.path.exists(file_path):
            print(f"警告: 文件未找到 {file_path}。跳过。")
            continue

        df = pd.read_csv(file_path, usecols=use_cols, low_memory=False)
        print(f"原始数据行数: {len(df)}")

        # --- Timestamp cleaning ---
        # The .str accessor needs text; non-text columns are cast first.
        if not pd.api.types.is_string_dtype(df[time_col]) and not pd.api.types.is_object_dtype(df[time_col]):
            df[time_col] = df[time_col].astype(str)
        df[time_col] = df[time_col].fillna('')
        try:
            df[time_col] = df[time_col].str.strip()
            if pd.api.types.is_string_dtype(df[time_col]):
                # Collapse internal whitespace runs to a single space.
                df[time_col] = df[time_col].str.replace(r'\s+', ' ', regex=True)
                df[time_col] = df[time_col].str.strip()
        except AttributeError:
            # Best effort: continue with the uncleaned column.
            print(f"警告: 无法清理时间列。")

        # --- Timestamp parsing ---
        # 'dy' is parsed with an explicit format; the other platforms rely on
        # pandas' inference. errors='coerce' maps unparseable values to NaT.
        datetime_format = '%Y-%m-%d %H:%M:%S' if platform == 'dy' else None
        try:
            df[time_col] = pd.to_datetime(df[time_col], format=datetime_format, errors='coerce')
        except ValueError as e:
            print(f"错误: 时间解析失败: {e}")
            df[time_col] = pd.NaT

        num_total_invalid = df[time_col].isna().sum()
        if num_total_invalid > 0:
            print(f"无效时间值: {num_total_invalid}")

        original_rows = len(df)
        df.dropna(subset=[time_col], inplace=True)
        removed_count = original_rows - len(df)
        if removed_count > 0:
            print(f"删除无效时间行: {removed_count}")
        print(f"有效时间数据行: {len(df)}")
        if df.empty:
            print(f"警告: 平台 {platform} 时间清洗后无数据。")
            continue

        # .copy() makes the filtered frame independent, so the column
        # assignments below do not write into a slice of the original frame.
        in_range = (df[time_col] >= start_date_filter) & (df[time_col] <= end_date_filter)
        df = df[in_range].copy()
        print(f"日期范围内数据行: {len(df)}")
        if df.empty:
            print(f"警告: 平台 {platform} 日期范围内无数据。")
            continue

        # --- Sentiment scores ---
        df['sentiment_score'] = df[sentiment_column].map(sentiment_mapping)
        df.dropna(subset=['sentiment_score'], inplace=True)  # unknown labels
        print(f"情感有效数据行: {len(df)}")
        if df.empty:
            print(f"警告: 平台 {platform} 映射情感分数后无数据。")
            continue

        # --- Weights ---
        if like_col:
            # Missing / non-numeric likes count as 0; negative values are clamped.
            likes = pd.to_numeric(df[like_col], errors='coerce').fillna(0).clip(lower=0)
            df[like_col] = likes
            # NOTE(review): comments with 0 likes get weight log1p(0) == 0 and
            # therefore do not contribute to the weighted mean at all.
            df['weight'] = np.log1p(likes)
        else:
            df['weight'] = 1.0
        df['weighted_sum_component'] = df['sentiment_score'] * df['weight']

        # --- Aggregate over time ---
        print(f"按时间聚合 (频率: {agg_freq})...")
        df.set_index(time_col, inplace=True)
        resampled_data = df.resample(agg_freq).agg(
            weighted_sum=('weighted_sum_component', 'sum'),
            total_weight=('weight', 'sum')
        )

        # Weighted mean per bin; a bin with zero total weight becomes NaN
        # (dividing by the NaN that .where() puts in place of 0).
        total_weight = resampled_data['total_weight']
        sentiment_ts = (
            resampled_data['weighted_sum'] / total_weight.where(total_weight != 0)
        ).rename('weighted_sum')

        # Carry the last value forward, then back-fill any leading gap.
        sentiment_ts = sentiment_ts.ffill().bfill()
        final_nan_count = sentiment_ts.isna().sum()
        if final_nan_count > 0:
            print(f"警告: 填充后仍有 NaN: {final_nan_count}")

        sentiment_ts = sentiment_ts.clip(lower=-0.35, upper=0.35)

        if sentiment_ts.isna().all() or sentiment_ts.empty:
            print(f"警告: 平台 {platform} 聚合后无有效数据。")
            continue

        all_platform_sentiment[platform] = sentiment_ts
        print(f"平台 {platform} 处理完成。聚合时间点: {len(sentiment_ts)}")
        print(f"时间范围: {sentiment_ts.index.min().date()} 至 {sentiment_ts.index.max().date()}")

    except FileNotFoundError:
        print(f"错误: 文件未找到 {file_path}")
    except KeyError as e:
        print(f"错误: 列名错误 - {e}")
        traceback.print_exc()
    except Exception as e:
        # Keep going with the remaining platforms, but show the full trace.
        print(f"错误: 发生意外错误: {e}")
        traceback.print_exc()

class SentimentDataset(Dataset):
    """Sliding-window dataset over a 1-D sequence.

    Sample ``i`` is the pair ``(data[i : i + seq_length], data[i + seq_length])``,
    both converted to ``float32`` tensors.

    Args:
        data: Indexable, sliceable sequence of numbers (e.g. a 1-D array).
        seq_length: Number of consecutive values in each input window.
    """

    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        # One sample per position that still has a target value after it.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, index):
        """Return ``(input_window, target)`` for ``index``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            # Without this, a negative index would slice a wrong/empty window.
            index += size
        if not 0 <= index < size:
            raise IndexError(f"index out of range for dataset of size {size}")
        window_end = index + self.seq_length
        input_seq = self.data[index:window_end]
        target_val = self.data[window_end]
        return (
            torch.tensor(input_seq, dtype=torch.float32),
            torch.tensor(target_val, dtype=torch.float32),
        )

class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to a tensor.

    The table has shape ``(max_len, d_model)``: even columns hold sines, odd
    columns hold cosines. ``forward`` indexes it by ``x.size(0)``, so dim 0 of
    the input is treated as the position axis.
    """

    def __init__(self, d_model, max_len=500):
        super().__init__()
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        freqs = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * freqs

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so only the first d_model // 2 frequencies are used.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer, not parameter: follows the module across devices, no gradient.
        self.register_buffer('pe', table)

    def forward(self, x):
        seq_len = x.size(0)
        # unsqueeze(1) inserts an axis so the table broadcasts over dim 1 of x.
        encoding = self.pe[:seq_len, :].unsqueeze(1)
        return x + encoding.to(x.device)

class TransformerLSTM(nn.Module):
    """Causal Transformer encoder followed by an LSTM and a linear head.

    ``forward`` takes a batch-first tensor ``(batch, seq_len, input_feature_size)``
    and returns one prediction per sequence, shape ``(batch, 1)``.

    Args:
        input_feature_size: Number of features per time step.
        hidden_size: Model width; must be divisible by ``nhead``.
        num_encoder_layers: Number of Transformer encoder layers.
        num_lstm_layers: Number of stacked LSTM layers.
        dropout: Dropout used in the encoder (and between LSTM layers when
            there is more than one).
        nhead: Number of attention heads.
        max_len: Longest sequence the positional encoding supports. This used
            to be derived from the module-level ``time_step``; it is now an
            explicit argument so the class does not depend on a global.
    """

    def __init__(self, input_feature_size=1, hidden_size=64, num_encoder_layers=3,
                 num_lstm_layers=1, dropout=0.1, nhead=8, max_len=500):
        super().__init__()
        self.input_linear = nn.Linear(input_feature_size, hidden_size)
        self.pos_encoder = PositionalEncoding(hidden_size, max_len=max_len)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=nhead,
            dim_feedforward=hidden_size * 4,
            dropout=dropout,
            batch_first=False,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_lstm_layers,
            # nn.LSTM only applies dropout between layers, so it is set to 0
            # for a single layer.
            dropout=dropout if num_lstm_layers > 1 else 0,
            batch_first=False,
        )
        self.output_linear = nn.Linear(hidden_size, 1)
        self.init_weights()

    def init_weights(self):
        """Initialise the output head: zero bias, uniform weights in [-0.1, 0.1]."""
        initrange = 0.1
        self.output_linear.bias.data.zero_()
        self.output_linear.weight.data.uniform_(-initrange, initrange)

    def _generate_square_subsequent_mask(self, sz, device=None):
        """Return an ``(sz, sz)`` float mask: 0 on/below the diagonal, -inf above.

        Built directly on ``device`` to avoid a host-to-device copy per call.
        """
        return torch.triu(torch.full((sz, sz), float('-inf'), device=device), diagonal=1)

    def forward(self, src):
        # (batch, seq, feat) -> (seq, batch, feat); the sub-modules are seq-first.
        src = src.permute(1, 0, 2)
        src = self.input_linear(src)
        src = self.pos_encoder(src)
        mask = self._generate_square_subsequent_mask(src.size(0), device=src.device)
        transformer_output = self.transformer_encoder(src, mask=mask)
        _, (hn, _) = self.lstm(transformer_output)
        # Final hidden state of the last LSTM layer summarises the sequence.
        last_hidden_state = hn[-1, :, :]
        return self.output_linear(last_hidden_state)

def train_model(model, train_loader, optimizer, criterion, device, epoch, total_epochs):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module to train; called on each input batch.
        train_loader: Iterable yielding ``(input_seq, target_val)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss function called as ``criterion(output, target)``.
        device: Device the batches are moved to.
        epoch: Zero-based epoch index (progress-bar label only).
        total_epochs: Total number of epochs (progress-bar label only).

    Returns:
        Mean of the per-batch losses, or ``nan`` when the loader yields no
        batches (e.g. ``drop_last=True`` with fewer samples than one batch),
        instead of raising ``ZeroDivisionError``.
    """
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    pbar = tqdm(train_loader, desc=f"训练 Epoch {epoch+1}/{total_epochs}", leave=False)
    for input_seq, target_val in pbar:
        # Append a trailing axis: features for the input, output dim for the target.
        input_seq = input_seq.unsqueeze(-1).to(device)
        target_val = target_val.unsqueeze(-1).to(device)

        optimizer.zero_grad()
        output = model(input_seq)
        loss = criterion(output, target_val)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        num_batches += 1
        pbar.set_postfix({'批次损失': f'{batch_loss:.4f}'})

    if num_batches == 0:
        return float('nan')
    return epoch_loss / num_batches

def predict_future(model, initial_sequence, steps, scaler, device):
    """Forecast ``steps`` values autoregressively and undo the scaling.

    Each prediction is appended to the input window (dropping the oldest
    value) and fed back in for the next step.

    Args:
        model: Module called on a ``(1, window, 1)`` tensor; its output must
            hold a single value.
        initial_sequence: 1-D tensor with the most recent scaled values.
        steps: Number of future values to produce.
        scaler: Object with ``inverse_transform`` accepting an ``(n, 1)`` array.
        device: Device used for inference.

    Returns:
        1-D NumPy array of ``steps`` unscaled predictions; empty when
        ``steps <= 0`` (the scaler is not called in that case).
    """
    if steps <= 0:
        return np.empty(0)

    model.eval()
    predictions_scaled = []
    # Move the window to `device` so concatenating new predictions (created on
    # `device` below) cannot fail on a device mismatch.
    current_sequence = initial_sequence.clone().to(device)
    with torch.no_grad():
        for _ in range(steps):
            input_tensor = current_sequence.unsqueeze(0).unsqueeze(-1)
            pred_val = model(input_tensor).item()
            predictions_scaled.append(pred_val)
            new_sequence_member = torch.tensor([pred_val], dtype=torch.float32, device=device)
            # Slide the window: drop the oldest value, append the prediction.
            current_sequence = torch.cat((current_sequence[1:], new_sequence_member), dim=0)

    predictions_scaled_np = np.array(predictions_scaled).reshape(-1, 1)
    predictions_unscaled = scaler.inverse_transform(predictions_scaled_np)
    return predictions_unscaled.flatten()

# Per-platform results, keyed by platform name.
all_platform_forecasts = {}   # forecast Series (future index)
all_platform_scalers = {}     # fitted MinMaxScaler
all_platform_histories = {}   # historical Series used for training

print(f"\n--- 开始训练并预测 ---")
print(f"--- 使用设备: {device} ---")

if not all_platform_sentiment:
    print("\n错误：无数据，无法训练和预测。")
    # exit() is the interactive-shell helper; raise SystemExit directly.
    raise SystemExit(1)

platform_pbar = tqdm(all_platform_sentiment.items(), desc="处理平台")
for platform, ts_data in platform_pbar:
    platform_pbar.set_postfix({'当前平台': platform.upper()})
    # At least one full window plus one target value is required.
    if len(ts_data) < time_step + 1:
        print(f"平台 {platform} 数据量不足，跳过。")
        continue

    # Scale to [0, 1]; the scaler is kept to map forecasts back afterwards.
    scaler = MinMaxScaler(feature_range=(0, 1))
    data_scaled = scaler.fit_transform(ts_data.values.reshape(-1, 1)).flatten()
    all_platform_scalers[platform] = scaler
    all_platform_histories[platform] = ts_data

    full_dataset = SentimentDataset(data_scaled, time_step)
    if len(full_dataset) == 0:
        print(f"警告: 平台 {platform} 数据量不足，跳过。")
        continue

    # Dropping the last partial batch is only safe when at least one full
    # batch exists; otherwise the loader would be empty and no training (and
    # no mean loss) would be possible.
    train_loader = DataLoader(
        full_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=len(full_dataset) >= batch_size,
    )
    print(f"平台 {platform}: 训练样本数 {len(full_dataset)}")

    model = TransformerLSTM(input_feature_size=1,
                            hidden_size=hidden_size,
                            num_encoder_layers=encoder_layers,
                            num_lstm_layers=lstm_layers,
                            dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    print(f"开始训练 {platform} ({epochs} epochs)...")
    train_losses = []
    for epoch in range(epochs):
        epoch_loss = train_model(model, train_loader, optimizer, criterion, device, epoch, epochs)
        train_losses.append(epoch_loss)
        # Report every 10th epoch and the final one.
        if (epoch + 1) % 10 == 0 or epoch == epochs - 1:
            print(f"{platform} - Epoch {epoch+1}/{epochs}, 损失: {epoch_loss:.6f}")

    print(f"模型训练完成，开始预测 {platform} ...")
    # Seed the forecast with the most recent `time_step` scaled values.
    last_sequence_scaled = data_scaled[-time_step:]
    initial_input_tensor = torch.tensor(last_sequence_scaled, dtype=torch.float32).to(device)

    # Future index: one step after the last observation up to
    # `forecast_months` calendar months later, at the aggregation frequency.
    last_date = ts_data.index[-1]
    forecast_end_date = last_date + pd.DateOffset(months=forecast_months)
    next_pred_start_date = last_date + pd.Timedelta(agg_freq)
    future_index = pd.date_range(start=next_pred_start_date, end=forecast_end_date, freq=agg_freq)
    num_forecast_steps = len(future_index)

    if num_forecast_steps <= 0:
        print(f"警告: 平台 {platform} 预测步数为 0，跳过。")
        continue

    print(f"预测未来 {num_forecast_steps} 步 ({agg_freq}), {future_index[0].date()} 到 {future_index[-1].date()}...")

    forecast_values_unscaled = predict_future(model, initial_input_tensor, num_forecast_steps, scaler, device)
    forecast_series = pd.Series(forecast_values_unscaled, index=future_index)
    all_platform_forecasts[platform] = forecast_series
    print(f"平台 {platform} 预测完成。")

# Draw every platform's history (solid) and forecast (dashed) on one figure.
print("\n--- 绘制预测图 ---")

if all_platform_forecasts or all_platform_histories:
    plt.figure(figsize=(18, 9))
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
    platform_keys = list(all_platform_histories.keys())
    # History and forecast of one platform share a color; the palette cycles.
    platform_color_map = {
        name: colors[idx % len(colors)] for idx, name in enumerate(platform_keys)
    }

    print("绘制历史数据...")
    plotted_history = False
    for platform, history_ts in all_platform_histories.items():
        if history_ts.empty:
            print(f"警告: 平台 {platform} 无历史数据。")
            continue
        plt.plot(history_ts.index, history_ts.values,
                 label=f'{platform.upper()} 历史情感',
                 color=platform_color_map.get(platform, 'gray'),
                 linewidth=1.5, alpha=0.8)
        plotted_history = True
    if not plotted_history:
        print("警告：无历史数据被绘制。")

    print("绘制预测数据...")
    plot_end_date = None  # latest forecast date seen so far
    plotted_forecast = False
    for platform, forecast_ts in all_platform_forecasts.items():
        if forecast_ts.empty:
            if platform in all_platform_histories:
                print(f"警告: 平台 {platform} 无预测数据。")
            continue
        plt.plot(forecast_ts.index, forecast_ts.values,
                 label=f'{platform.upper()} 预测情感',
                 color=platform_color_map.get(platform, 'black'),
                 linestyle='--', marker='o', markersize=4, linewidth=2.0)
        current_max_date = forecast_ts.index[-1]
        if plot_end_date is None or current_max_date > plot_end_date:
            plot_end_date = current_max_date
        plotted_forecast = True
    if not plotted_forecast:
        print("警告：无预测数据被绘制。")

    plt.title(f'各平台情感演变与未来 {forecast_months} 个月预测 (Transformer-LSTM)', fontsize=18, pad=20)
    plt.xlabel('日期', fontsize=14, labelpad=10)
    plt.ylabel('平均加权情感得分', fontsize=14, labelpad=10)
    plt.axhline(0, color='grey', linestyle=':', linewidth=1.0, alpha=0.8)

    # x-range: earliest history point to latest forecast point; fall back to
    # the configured dates when there is nothing to derive them from.
    non_empty_histories = [ts for ts in all_platform_histories.values() if not ts.empty]
    if non_empty_histories:
        plot_start_date = min(ts.index[0] for ts in non_empty_histories)
    else:
        plot_start_date = pd.to_datetime(start_date_str)

    if plot_end_date is None:
        if non_empty_histories:
            plot_end_date = max(ts.index[-1] for ts in non_empty_histories)
        else:
            plot_end_date = pd.to_datetime(end_date_str)
    plot_end_date = plot_end_date + timedelta(days=5)  # right-hand padding

    plt.xlim(plot_start_date, plot_end_date)
    axis = plt.gca().xaxis
    axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    axis.set_major_locator(mdates.AutoDateLocator(minticks=10, maxticks=20))
    plt.xticks(rotation=30, ha='right', fontsize=11)
    plt.yticks(fontsize=11)
    # Legend sits outside the axes; tight_layout reserves the right margin.
    plt.legend(title="平台", fontsize=12, title_fontsize=13, loc='upper left', bbox_to_anchor=(1.02, 1))
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout(rect=[0, 0, 0.88, 1])

    forecast_plot_filename = os.path.join(forecast_output_dir, f"ALL_platforms_sentiment_forecast_TransformerLSTM_{forecast_months}mo.svg")
    try:
        plt.savefig(forecast_plot_filename, dpi=300, bbox_inches='tight')
        print(f"预测图已保存至: {forecast_plot_filename}")
    except Exception as e:
        print(f"错误: 保存图像失败 - {e}")
        traceback.print_exc()

    plt.show()
else:
    print("错误：无历史或预测结果可绘制。")

print("\n--- 处理完成 ---")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import os
from datetime import datetime

# --- User configuration ---
# Directory that holds the input CSV files
save_dir = "E"
# Platforms to process
platforms = ['dy', 'xhs', 'tieba']
# Name of the comment-text column (reference only; processing below uses the
# sentiment, like and time columns)
comment_column = 'content'
# Name of the sentiment-label column
sentiment_column = 'sentiment'
# Start and end dates ('YYYY-MM-DD'); adjust as needed
start_date_str = '2024-02-01'
end_date_str = '2025-3-31'
# Aggregation frequency ('D' daily, 'W' weekly, 'M' monthly, '2D' two days, ...)
agg_freq = 'D'
# Output image path
output_plot_filename = os.path.join(save_dir, "sentiment_evolution_plot.svg")
# Output data path template (one file per platform)
output_data_filename_template = os.path.join(save_dir, "{platform}_sentiment_timeseries_export_1.csv")

# --- Core processing ---
# 1. Fonts for Chinese text.
# Assigning to rcParams never raises when a font is missing, so a try/except
# fallback around the assignment can never trigger. An ordered list lets
# matplotlib fall through to the next family by itself.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False  # draw "-" with an ASCII hyphen

# 2. Sentiment label -> numeric score
sentiment_mapping = {
    'Very Positive': 0.6,
    'Positive': 0.3,
    'Neutral': 0,
    'Negative': -0.25,
    'Very Negative': -0.5
}

# 3. Per-platform column names (like_col=None means "no like counts")
platform_configs = {
    'dy':    {'time_col': 'create_time', 'like_col': 'like_count'},
    'xhs':   {'time_col': 'time', 'like_col': 'like_count'},
    'tieba': {'time_col': 'publish_time', 'like_col': None}
}

# 4. Start and end timestamps
try:
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)
except ValueError:
    print(f"错误：开始 ({start_date_str}) 或结束日期 ({end_date_str}) 格式不正确，请使用 'YYYY-MM-DD' 格式。")
    # exit() is the interactive-shell helper; raise SystemExit directly so the
    # script stops with a non-zero status.
    raise SystemExit(1)

# platform name -> resampled sentiment Series, filled by the processing loop
all_platform_sentiment = {}

# 5. Process each platform in turn: load its CSV, parse timestamps, score and
#    weight the sentiment, resample to `agg_freq`, then export the series.
print("开始处理数据...")
for platform in platforms:
    print(f"\n--- 正在处理平台: {platform} ---") # leading newline separates platforms in the log
    config = platform_configs[platform]
    time_col = config['time_col']
    like_col = config['like_col']
    # Input file name is derived directly from the platform name
    file_path = os.path.join(save_dir, f"{platform}_combined.csv")
    print(f"读取文件: {file_path}")

    try:
        # Read only the columns that are needed, to save memory
        use_cols = [time_col, sentiment_column]
        if like_col:
            use_cols.append(like_col)

        if not os.path.exists(file_path):
             print(f"警告: 文件未找到 {file_path}。跳过此平台。")
             continue

        df = pd.read_csv(file_path, usecols=use_cols, low_memory=False)
        print(f"原始数据行数: {len(df)}")

        # --- Cleaning and preprocessing ---
        # a. Time column
        print(f"处理时间列: {time_col}")
        # Cast non-text columns to str so the .str accessor below works,
        # then collapse whitespace runs and trim both ends.
        if not pd.api.types.is_string_dtype(df[time_col]) and not pd.api.types.is_object_dtype(df[time_col]):
             df[time_col] = df[time_col].astype(str)
        df[time_col] = df[time_col].fillna('')
        cleaned_time_col = df[time_col].str.replace(r'\s+', ' ', regex=True).str.strip()

        # 'dy' is parsed with an explicit format; other platforms rely on
        # pandas' inference. errors='coerce' turns unparseable values into NaT.
        datetime_format = None
        if platform == 'dy':
             datetime_format = '%Y-%m-%d %H:%M:%S'
             df[time_col] = pd.to_datetime(cleaned_time_col, format=datetime_format, errors='coerce')
        else:
             df[time_col] = pd.to_datetime(cleaned_time_col, errors='coerce')

        invalid_time_mask = df[time_col].isna()
        num_total_invalid = invalid_time_mask.sum()
        if num_total_invalid > 0:
             print(f"时间列转换后发现 {num_total_invalid} 个无效值 (NaT)。")
             # (no further diagnostics are printed here)

        # Drop rows whose timestamp could not be parsed and report how many.
        original_rows = len(df)
        df.dropna(subset=[time_col], inplace=True)
        removed_count = original_rows - len(df)
        if removed_count > 0:
             print(f"已根据 '{time_col}' 列删除 {removed_count} 行包含无效时间的数据。")
        print(f"处理后剩余有效时间数据行数: {len(df)}")

        if not df.empty:
            # Truncate to midnight so only the calendar date remains.
            df[time_col] = df[time_col].dt.normalize()
        else:
             print(f"警告：平台 {platform} 在时间清洗后没有剩余数据。")
             continue # nothing left for this platform; move on to the next one


        # c. Sentiment column: map labels to scores, drop unmapped labels
        print(f"处理情感列: {sentiment_column}")
        df['sentiment_score'] = df[sentiment_column].map(sentiment_mapping)
        original_rows = len(df)
        df.dropna(subset=['sentiment_score'], inplace=True)
        print(f"映射情感分数并行删除无效情感后行数: {len(df)} (移除了 {original_rows - len(df)} 行)")
        if df.empty:
            print(f"警告: 平台 {platform} 在映射情感分数后没有有效数据。跳过后续处理。")
            continue

        # d. Weighted sentiment score
        print("计算加权情感分数...")
        if like_col:
            print(f"使用点赞列: {like_col}")
            # Non-numeric / missing likes count as 0; negatives are clamped to 0.
            df[like_col] = pd.to_numeric(df[like_col], errors='coerce')
            df[like_col] = df[like_col].fillna(0)
            if (df[like_col] < 0).any():
                df[like_col] = df[like_col].clip(lower=0)
            # Factor 1 + log1p(likes): a comment with 0 likes keeps its raw
            # score, and more likes increase the factor logarithmically.
            df['weighted_sentiment'] = df['sentiment_score'] * (1+np.log1p(df[like_col]))
            # Limit the range of each comment's contribution
            df['weighted_sentiment'] = df['weighted_sentiment'].clip(lower=-0.35, upper=0.35)
            print(f"已将加权情感得分限制在 [-0.35, 0.35] 范围内。")
        else:
            print("无点赞列，使用原始情感分数。")
            df['weighted_sentiment'] = df['sentiment_score']
            df['weighted_sentiment'] = df['weighted_sentiment'].clip(lower=-0.35, upper=0.35)
            print("已将原始情感得分(作为加权得分)限制在 [-0.35, 0.35] 范围内。")


        # --- Aggregate over time ---
        print(f"按时间聚合 (频率: {agg_freq})...")
        if df.empty or 'weighted_sentiment' not in df.columns:
             print(f"警告: 平台 {platform} 没有有效的加权情感数据进行聚合。")
             continue

        # Mean weighted score per `agg_freq` bin; bins without rows are NaN.
        df.set_index(time_col, inplace=True)
        sentiment_ts = df['weighted_sentiment'].resample(agg_freq).mean()

        if sentiment_ts.empty:
            print(f"警告: 平台 {platform} 重采样后结果为空。")
            continue

        # Fill gaps: carry the last value forward, then back-fill leading NaNs.
        print(f"填充前 {platform} 时间序列长度: {len(sentiment_ts)}, NaN数量: {sentiment_ts.isna().sum()}")
        sentiment_ts.ffill(inplace=True)
        sentiment_ts.bfill(inplace=True)
        print(f"填充后 {platform} 时间序列长度: {len(sentiment_ts)}, NaN数量: {sentiment_ts.isna().sum()}")

        if sentiment_ts.isna().all():
             print(f"警告: 平台 {platform} 的聚合时间序列在填充后仍然全部是 NaN。跳过此平台。")
             continue

        all_platform_sentiment[platform] = sentiment_ts
        print(f"平台 {platform} 处理完成。聚合后时间点数量: {len(sentiment_ts)}")

        # --- Export the aggregated time series ---
        if not sentiment_ts.empty:
            try:
                # Turn the Series into a two-column DataFrame for export:
                # the index holds the timestamps, the values hold the scores.
                df_to_export = pd.DataFrame({
                    '时间': sentiment_ts.index, # time column
                    '平均加权情感得分': sentiment_ts.values # score column
                })
                # Format the time column as 'YYYY-MM-DD' text
                df_to_export['时间'] = df_to_export['时间'].dt.strftime('%Y-%m-%d')

                # Build the export file name for this platform
                export_filename = output_data_filename_template.format(platform=platform)
                # utf_8_sig writes a BOM so that Excel detects the encoding
                df_to_export.to_csv(export_filename, index=False, encoding='utf_8_sig')
                print(f"已将 {platform.upper()} 平台聚合后的情感时间序列导出至: {export_filename}")
            except Exception as e:
                print(f"错误: 导出平台 {platform} 的时间序列数据时发生错误: {e}")
        else:
            # Not reachable: an empty series already hit `continue` above.
            # Kept for completeness.
            print(f"平台 {platform} 的聚合时间序列为空，跳过导出。")
        # --- End of export ---

    except FileNotFoundError:
        print(f"错误: 文件未找到 {file_path}")
    except KeyError as e:
        print(f"错误: 处理平台 {platform} 时列名错误 - {e}。请检查CSV文件中的列名是否与配置匹配。")
    except Exception as e:
        # Report the failure with a traceback and continue with the next platform.
        print(f"错误: 处理平台 {platform} 时发生意外错误: {e}")
        import traceback
        traceback.print_exc()

# 6. Plotting: one smoothed line per platform, saved as SVG.
print("\n开始绘制情感演变图...")
if not all_platform_sentiment:
    print("错误：没有成功处理任何平台的数据，无法绘图。请检查之前的处理步骤和数据。")
else:
    plt.figure(figsize=(18, 9))
    ax = plt.gca()

    colors = ['#1f77b4', '#ff7f0e', 'green']
    # Cycle through the palette so that configuring more platforms than
    # colors cannot raise IndexError.
    color_map = {
        name: colors[idx % len(colors)]
        for idx, name in enumerate(all_platform_sentiment)
    }

    for platform, sentiment_ts in all_platform_sentiment.items():
        if sentiment_ts.empty:
            print(f"警告: 平台 {platform} 没有聚合后的数据可供绘制。")
            continue
        # Centered 7-bin rolling mean; min_periods=1 keeps the series ends.
        rolling_mean_ts = sentiment_ts.rolling(window=7, center=True, min_periods=1).mean()
        plt.plot(rolling_mean_ts.index, rolling_mean_ts.values,
                 label=f"{platform.upper()} 平台",
                 color=color_map[platform],
                 linewidth=2,
                 linestyle='-')

    plt.xlim(start_date, end_date)
    # plt.ylim(-0.5, 0.5)  # uncomment to pin the y-axis range

    plt.title('各平台对国产大模型情感演变趋势', fontsize=18, pad=20)
    plt.xlabel('日期', fontsize=14, labelpad=10)
    plt.ylabel('平均加权情感得分', fontsize=14, labelpad=10)

    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    # Pick tick density from the aggregation frequency and plotted span.
    if agg_freq == 'D':
        total_days = (end_date - start_date).days
        if total_days > 180:
            ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
            ax.xaxis.set_minor_locator(mdates.WeekdayLocator(interval=1))
        elif total_days > 60:
            ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
            ax.xaxis.set_minor_locator(mdates.WeekdayLocator(interval=1))
        else:
            ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
            ax.xaxis.set_minor_locator(mdates.DayLocator(interval=1))
    elif agg_freq == 'W':
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
        ax.xaxis.set_minor_locator(mdates.WeekdayLocator(interval=1))
    else:
        ax.xaxis.set_major_locator(mdates.AutoDateLocator(minticks=8, maxticks=15))

    plt.xticks(rotation=30, ha='right', fontsize=11)
    plt.yticks(fontsize=11)
    plt.axhline(0, color='grey', linestyle='--', linewidth=0.8, alpha=0.7)
    # Legend sits outside the axes; tight_layout reserves the right margin.
    plt.legend(title="平台", fontsize=12, title_fontsize=13, loc='upper left', bbox_to_anchor=(1.02, 1))
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout(rect=[0, 0, 0.9, 1])

    try:
        plt.savefig(output_plot_filename, dpi=300, bbox_inches='tight')
        print(f"图像已保存至: {output_plot_filename}")
    except Exception as e:
        # A failed save should not prevent the figure from being shown.
        print(f"错误: 保存图像失败 - {e}")

    plt.show()

print("\n处理完成。")