In [1]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy.stats import zscore
import pandas as pd
import os

In [2]:
# 1. Load the rolling data.
# np.load on an .npz archive returns an NpzFile that keeps the archive open;
# the context manager closes it as soon as the arrays have been read, so the
# file handle is not held for the rest of the kernel session.
# NOTE: `data` is closed after this block -- use the extracted arrays below.
with np.load('processed_data/processed_data_rolling.npz') as data:
    # Regression task data
    X_reg_scaled = data['X_reg_scaled']
    y_reg = data['y_reg']

    # Classification task data
    X_cls_scaled = data['X_cls_scaled']
    y_cls = data['y_cls']

# Create the output directory for the feature files (no-op if it already exists)
os.makedirs('features_rolling', exist_ok=True)


In [3]:
# 2. PCA on the regression features, keeping 20 components.
# random_state pins the solvers that rely on randomness (scikit-learn may
# select one automatically for large inputs), so a fresh Restart & Run All
# reproduces the same projection. It is ignored by deterministic solvers.
pca = PCA(n_components=20, random_state=42)
X_pca = pca.fit_transform(X_reg_scaled)

In [4]:
# 3-1. Regression: score features with f_regression and keep the top 10
selector_reg = SelectKBest(f_regression, k=10)
selector_reg.fit(X_reg_scaled, y_reg)
X_selected_reg = selector_reg.transform(X_reg_scaled)

# 4-1. Classification: score features with f_classif and keep the top 10
selector_cls = SelectKBest(f_classif, k=10)
selector_cls.fit(X_cls_scaled, y_cls)
X_selected_cls = selector_cls.transform(X_cls_scaled)


In [5]:
# 3-2. Regression: keep the 50 highest-scoring features (f_regression).
# Note: this rebinds `selector_reg`, replacing the k=10 selector fitted earlier.
selector_reg = SelectKBest(k=50, score_func=f_regression).fit(X_reg_scaled, y_reg)
X_selected_reg_50 = selector_reg.transform(X_reg_scaled)

# 4-2. Classification: keep the 50 highest-scoring features (f_classif).
# Note: this rebinds `selector_cls`, replacing the k=10 selector fitted earlier.
selector_cls = SelectKBest(k=50, score_func=f_classif).fit(X_cls_scaled, y_cls)
X_selected_cls_50 = selector_cls.transform(X_cls_scaled)

In [6]:
# Print the shape of every reduced feature matrix produced so far
for _label, _matrix in (
    ("回归数据:", X_selected_reg),
    ("分类数据:", X_selected_cls),
    ("回归数据:", X_selected_reg_50),
    ("分类数据:", X_selected_cls_50),
    ("回归数据:", X_pca),
):
    print(_label, _matrix.shape)

回归数据: (72054, 10)
分类数据: (72370, 10)
回归数据: (72054, 50)
分类数据: (72370, 50)
回归数据: (72054, 20)


**清洗10 Best Reg**

In [21]:
# Wrap the regression target in a named Series so it becomes a labelled column
y_reg_series = pd.Series(y_reg, name='Number of nights in CITY')

# Put the 10 selected features and the target side by side in one DataFrame
df_reg = pd.concat([pd.DataFrame(X_selected_reg), y_reg_series], axis=1)

# Absolute z-score of every column (features and the target alike);
# keep only the rows in which all columns stay below the threshold
threshold = 3.0
z_scores_reg = np.abs(zscore(df_reg))
df_reg_cleaned = df_reg.loc[(z_scores_reg < threshold).all(axis=1)]

# Split the surviving rows back into X and y arrays
X_reg_cleaned = df_reg_cleaned.drop(columns=y_reg_series.name).to_numpy()
y_reg_cleaned = df_reg_cleaned[y_reg_series.name].to_numpy()

**清洗10 Best Cls**

In [22]:
# Wrap the classification labels (y_cls) in a named Series
y_cls_series = pd.Series(y_cls, name='Purpose of visit to CITY')

# Combine the 10 selected features with the labels; the label is the last column
df_cls = pd.concat([pd.DataFrame(X_selected_cls), y_cls_series], axis=1)

# Z-score filter on the feature columns only -- the label column is excluded
threshold = 3.0
z_scores_cls = np.abs(zscore(df_cls.drop(columns=y_cls_series.name)))
df_cls_cleaned = df_cls.loc[(z_scores_cls < threshold).all(axis=1)]

# Split the surviving rows back into cleaned X and y arrays
X_cls_cleaned = df_cls_cleaned.drop(columns=y_cls_series.name).to_numpy()
y_cls_cleaned = df_cls_cleaned.iloc[:, -1].to_numpy()

**清洗50 Best Reg**

In [23]:
# Regression target as a Series named 'target'
# (this rebinds y_reg_series with a different column name)
y_reg_series = pd.Series(y_reg, name='target')

# 50 selected features plus the target in one DataFrame (target is the last column)
df_reg_50 = pd.concat(
    [pd.DataFrame(X_selected_reg_50), y_reg_series],
    axis=1,
)

# Absolute z-scores over all columns, target included
threshold = 3.0  # adjustable
z_scores_reg_50 = np.abs(zscore(df_reg_50))

# Keep a row only if every one of its columns is below the threshold
_inlier_rows_reg_50 = (z_scores_reg_50 < threshold).all(axis=1)
df_reg_cleaned_50 = df_reg_50.loc[_inlier_rows_reg_50]

# Split the cleaned frame back into features and target
X_reg_cleaned_50 = df_reg_cleaned_50.drop(columns=y_reg_series.name).to_numpy()
y_reg_cleaned_50 = df_reg_cleaned_50[y_reg_series.name].to_numpy()


**清洗50 Best Cls**

In [24]:
# Classification labels as a Series named 'target'
# (this rebinds y_cls_series with a different column name)
y_cls_series = pd.Series(y_cls, name='target')

# 50 selected features plus the labels in one DataFrame (label is the last column)
df_cls_50 = pd.concat(
    [pd.DataFrame(X_selected_cls_50), y_cls_series],
    axis=1,
)

# Absolute z-scores of the feature columns only; the label column is left out
threshold = 3.0
_features_cls_50 = df_cls_50.drop(columns=y_cls_series.name)
z_scores_cls_50 = np.abs(zscore(_features_cls_50))

# Keep a row only if every one of its features is below the threshold
_inlier_rows_cls_50 = (z_scores_cls_50 < threshold).all(axis=1)
df_cls_cleaned_50 = df_cls_50.loc[_inlier_rows_cls_50]

# Split the cleaned frame back into features and labels
X_cls_cleaned_50 = df_cls_cleaned_50.drop(columns=y_cls_series.name).to_numpy()
y_cls_cleaned_50 = df_cls_cleaned_50[y_cls_series.name].to_numpy()

In [25]:
# Print the shape of each outlier-filtered feature matrix (10-best, then 50-best)
for _label, _matrix in zip(
    ("回归数据:", "分类数据:") * 2,
    (X_reg_cleaned, X_cls_cleaned, X_reg_cleaned_50, X_cls_cleaned_50),
):
    print(_label, _matrix.shape)

回归数据: (58938, 10)
分类数据: (56643, 10)
回归数据: (39702, 50)
分类数据: (45583, 50)


In [26]:
# 5. LDA (for the classification data).
# scikit-learn allows at most min(n_classes - 1, n_features) discriminant
# components. The previous bound used n_features - 1, which under-sized the
# projection (or produced an invalid 0) when there are few features.
n_classes = len(np.unique(y_cls))
lda = LinearDiscriminantAnalysis(n_components=min(n_classes - 1, X_cls_scaled.shape[1]))
X_lda = lda.fit_transform(X_cls_scaled, y_cls)

In [27]:
# === Save the processed results ===
# Each archive stores one feature matrix together with its matching target.

# PCA projection of the regression features
np.savez('features_rolling/pca_features.npz',
         X_pca=X_pca,
         y_reg=y_reg)

# SelectKBest (k=10), unfiltered
np.savez('features_rolling/select10best_regression_features.npz',
         X_selected_reg=X_selected_reg,
         y_reg=y_reg)

np.savez('features_rolling/select10best_classification_features.npz',
         X_selected_cls=X_selected_cls,
         y_cls=y_cls)

# SelectKBest (k=50), unfiltered
np.savez('features_rolling/select50best_regression_features.npz',
         X_selected_reg_50=X_selected_reg_50,
         y_reg=y_reg)

np.savez('features_rolling/select50best_classification_features.npz',
         X_selected_cls_50=X_selected_cls_50,
         y_cls=y_cls)

# SelectKBest (k=10) after z-score outlier filtering
np.savez('features_rolling/select10best_cleaned_regression_features.npz',
         X_reg_cleaned=X_reg_cleaned,
         y_reg=y_reg_cleaned)

np.savez('features_rolling/select10best_cleaned_classification_features.npz',
         X_cls_cleaned=X_cls_cleaned,
         y_cls=y_cls_cleaned)

# SelectKBest (k=50) after z-score outlier filtering.
# These arrays were computed by the cleaning cells but previously never
# written to disk; they are saved with the same naming scheme as the k=10 sets.
np.savez('features_rolling/select50best_cleaned_regression_features.npz',
         X_reg_cleaned_50=X_reg_cleaned_50,
         y_reg=y_reg_cleaned_50)

np.savez('features_rolling/select50best_cleaned_classification_features.npz',
         X_cls_cleaned_50=X_cls_cleaned_50,
         y_cls=y_cls_cleaned_50)

# LDA projection of the classification features
np.savez('features_rolling/lda_features.npz',
         X_lda=X_lda,
         y_cls=y_cls)



In [28]:
# Print the dimensions of each feature set, one labelled line per set
_shape_report = {
    "PCA 降维后数据维度:": X_pca,
    "SelectKBest-回归特征维度:": X_selected_reg,
    "SelectKBest-分类特征维度:": X_selected_cls,
    "SelectKBest_清洗-回归特征维度:": X_reg_cleaned,
    "SelectKBest_清洗-分类特征维度:": X_cls_cleaned,
    "SelectKBest_50-回归特征维度:": X_selected_reg_50,
    "SelectKBest_50-分类特征维度:": X_selected_cls_50,
    "LDA 降维后特征维度:": X_lda,
}
for _label, _matrix in _shape_report.items():
    print(_label, _matrix.shape)

PCA 降维后数据维度: (72054, 20)
SelectKBest-回归特征维度: (72054, 10)
SelectKBest-分类特征维度: (72370, 10)
SelectKBest_清洗-回归特征维度: (58938, 10)
SelectKBest_清洗-分类特征维度: (56643, 10)
SelectKBest_50-回归特征维度: (72054, 50)
SelectKBest_50-分类特征维度: (72370, 50)
LDA 降维后特征维度: (72370, 3)
