In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# 1. Load data
# Read the prepared CSV (the path is relative to the notebook's working
# directory and, judging by its name, belongs to station 1037A). The file must
# contain every feature column listed below as well as the target column.
csv_path = '../data/DataProcess/station/1037A/1037A_final.csv'
data = pd.read_csv(csv_path)

# Name of the target column.
target_column = 'Target'

# Names of the feature columns, one per line, in the order they are selected.
feature_columns = [
    'PM10',
    'SO2',
    'NO2',
    'O3',
    'CO',
    'Dew_Point_2m',
    'Relative_Humidity_2m(%)',
    'Temperature',
    'Wind_Direction_10m',
    'Wind_Speed_10m(km/h)',
    'Diffuse_Radiation(W/m2)',
    'Direct_Radiation(W/m2)',
    'Precipitation(mm)',
    'Shortwave_Radiation(W/m2)',
    'Surface_Pressure(hPa)',
]

# Split the frame into the feature matrix X and the target vector y.
X = data.loc[:, feature_columns]
y = data.loc[:, target_column]



In [None]:

# 2. Standardize the features
# PCA is driven by variance, so every feature is first rescaled with
# StandardScaler to keep large-valued columns from dominating the components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. PCA dimensionality reduction
n_components = 2  # number of principal components to keep
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# 4. Report the explained-variance ratio of each component and the running total
print("主成分贡献率：", pca.explained_variance_ratio_)
print("累计贡献率：", np.cumsum(pca.explained_variance_ratio_))

# 5. Combine the reduced features with the target column
# Derive the column names from the actual number of components so that
# changing `n_components` above does not require editing this list.
pc_names = [f'PC{i}' for i in range(1, X_pca.shape[1] + 1)]
# Build the frame on X's index: the column assignment below aligns on index
# labels, so a fresh 0..n-1 index would mismatch rows (or produce NaN) whenever
# `data` carries a non-default index.
pca_result = pd.DataFrame(X_pca, columns=pc_names, index=X.index)
pca_result[target_column] = y

# Save the result (the index is not written to the file)
pca_result.to_csv('pca_result.csv', index=False)
print("降维后的数据已保存到 'pca_result.csv'")

In [3]:
# Pearson correlation of every feature column with the target, computed
# column by column; the result is a Series indexed by feature name.
correlations = X.corrwith(other=y, method='pearson')

# Print the signed coefficients. A larger absolute value means a stronger
# linear association between that feature and the target.
print("特征对目标的相关系数：\n", correlations)

特征对目标的相关系数：
 PM10                         0.726276
SO2                          0.344972
NO2                          0.560962
O3                          -0.212365
CO                           0.557015
Dew_Point_2m                -0.067953
Relative_Humidity_2m(%)      0.226425
Temperature                 -0.233247
Wind_Direction_10m          -0.033997
Wind_Speed_10m(km/h)        -0.159207
Diffuse_Radiation(W/m2)     -0.138968
Direct_Radiation(W/m2)      -0.130599
Precipitation(mm)           -0.053576
Shortwave_Radiation(W/m2)   -0.141990
Surface_Pressure(hPa)        0.105757
dtype: float64
