In [24]:
# 导入所需的函数
import os
from pathlib import Path
import sys

# 获取当前 Notebook 的路径
notebook_path = os.getcwd()

print("当前 Notebook 的路径：", notebook_path)

# 获取项目根目录的路径
project_root = os.path.abspath(os.path.join(notebook_path, ".."))

# 切换当前工作目录到项目根目录
# 判断当前工作目录的后缀是否为"Pattern-Recognition"
if not notebook_path.endswith("Pattern-Recognition"):
    os.chdir(project_root)

# 将项目根目录添加到 Python 模块搜索路径
if project_root not in sys.path:
    sys.path.append(project_root)

print("当前工作目录：", os.getcwd())
print("当前的 Python 路径：")
print(sys.path)

import pandas as pd
import numpy as np
from src.data_preprocessing import prepare_lstm_data
from sklearn.discriminant_analysis import StandardScaler


def get_project_root():
    """
    获取项目根目录的 Path 对象。
    """
    return Path(os.getcwd())

当前 Notebook 的路径： d:\pycharm_workplace\Pattern-Recognition
当前工作目录： d:\pycharm_workplace\Pattern-Recognition
当前的 Python 路径：
['d:\\Anaconda3\\python312.zip', 'd:\\Anaconda3\\DLLs', 'd:\\Anaconda3\\Lib', 'd:\\Anaconda3', '', 'd:\\Anaconda3\\Lib\\site-packages', 'd:\\Anaconda3\\Lib\\site-packages\\win32', 'd:\\Anaconda3\\Lib\\site-packages\\win32\\lib', 'd:\\Anaconda3\\Lib\\site-packages\\Pythonwin', 'd:\\Anaconda3\\Lib\\site-packages\\setuptools\\_vendor', 'd:\\pycharm_workplace\\Pattern-Recognition', 'd:\\pycharm_workplace']


In [None]:
import joblib


def prepare_lstm_data(df: pd.DataFrame, sequence_length=10):
    """
    准备LSTM模型所需的数据格式。

    :param df: 预处理后的 DataFrame
    :param sequence_length: 序列长度

    :return: X, y 数据
    """
    # 读取高频内容
    processed_dir = get_project_root() / "data" / "processed"
    high_freq_file_path = (
        processed_dir / "high_freq_contents" / "high_freq_contents.csv"
    )
    high_freq_contents = pd.read_csv(high_freq_file_path)
    top_contents = high_freq_contents["content_id"].tolist()

    # 过滤高频内容的数据
    df_high = df[df["content_id"].isin(top_contents)]

    # 创建 pivot 表
    pivot_df = df_high.pivot_table(
        index="timestamp",
        columns="content_id",
        values="request_count",
        aggfunc="sum",
        fill_value=0,
    )

    # 确保时间序列连续
    pivot_df = pivot_df.asfreq("h", fill_value=0)

    # 归一化
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(pivot_df)

    # 保存 scaler
    scaler_path = processed_dir / "scaler.pkl"
    joblib.dump(scaler, scaler_path)
    print(f"Scaler 已保存到 {scaler_path}")

    # 构建序列数据
    X, y = [], []
    for i in range(len(scaled_data) - sequence_length):
        X.append(scaled_data[i : i + sequence_length])
        y.append(scaled_data[i + sequence_length])

    X = np.array(X)
    y = np.array(y)

    print(f"LSTM 输入数据形状：X={X.shape}, y={y.shape}")

    return X, y, scaler

In [26]:
from src.data_preprocessing import generate_synthetic_data, preprocess_data


df_raw = generate_synthetic_data()

df_processed, top_contents = preprocess_data(df_raw)

prepare_lstm_data(df_processed)

合成数据已保存到 D:\pycharm_workplace\Pattern-Recognition\data\raw
预处理后的数据已保存到 D:\pycharm_workplace\Pattern-Recognition\data\processed
高频内容已保存到 D:\pycharm_workplace\Pattern-Recognition\data\processed\high_freq_contents\high_freq_contents.csv
Scaler 已保存到 d:\pycharm_workplace\Pattern-Recognition\data\processed\scaler.pkl
LSTM 输入数据形状：X=(14, 10, 50), y=(14, 50)


  pivot_df = pivot_df.asfreq("H", fill_value=0)


(array([[[-0.66435534,  0.35624362, -1.39890195, ...,  2.2639618 ,
          -0.28997616, -1.14574311],
         [-0.66435534, -0.23749575,  0.10235868, ..., -0.8409001 ,
           0.34269909, -1.14574311],
         [ 1.05937743,  1.06873087, -1.39890195, ..., -0.8409001 ,
           0.97537434,  0.30151134],
         ...,
         [ 0.71463088,  0.94998299,  0.23883692, ..., -0.8409001 ,
          -0.9226514 , -0.42211588],
         [-1.61240836, -0.83123512,  0.5117934 , ...,  0.19405387,
          -0.28997616,  1.02513857],
         [-1.18147517, -0.83123512, -1.39890195, ..., -0.32342311,
          -0.28997616,  0.30151134]],
 
        [[-0.66435534, -0.23749575,  0.10235868, ..., -0.8409001 ,
           0.34269909, -1.14574311],
         [ 1.05937743,  1.06873087, -1.39890195, ..., -0.8409001 ,
           0.97537434,  0.30151134],
         [-0.31960878, -1.06873087,  1.87657578, ...,  0.19405387,
           2.24072484, -1.14574311],
         ...,
         [-1.61240836, -0.8312351