In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
 # Load data
data = pd.read_csv(r"D:\DL_Homework\Kaggle2_Titanic\统计建模\数据\data-1.csv", encoding='gb18030')
 # Convert date column to datetime format and extract year, month, and day
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day
 # Set date column as index and convert certain columns to float
data = data.set_index('date')
cols_to_convert = ['AQI指数', 'PM2.5', 'PM10', 'O3', 'no2', 'so2', 'co', 'T', 'Po', 'U', 'Ff', 'VV', 'RRR', 'year', 'month', 'day']
data[cols_to_convert] = data[cols_to_convert].astype(float)
# data.isnull().sum()

In [10]:
 # Define lagged features and rolling statistics
lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
rolling_windows = [3, 7, 14]
data_rolled = data.copy()
for lag in lags:
    data_shifted = data.copy().shift(lag)
    data_shifted.columns = [f"{col}_lag{lag}" for col in data_shifted.columns]
    data_rolled = pd.concat([data_rolled, data_shifted], axis=1)
for col in data.columns:
    for window in rolling_windows:
        data_rolled[f"{col}_MA{window}"] = data[col].rolling(window).mean().shift(1)
 # Split data into training and testing sets
train = data_rolled[data_rolled.index < '2022-01-01']
test = data_rolled[data_rolled.index >= '2022-01-01']
 # Define base models
rf_model = RandomForestRegressor(n_estimators=100)
gb_model = GradientBoostingRegressor(n_estimators=100)
lr_model = LinearRegression()
 # Define time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
 # Define pipeline for base models
rf_pipeline = Pipeline([('rf', rf_model)])
gb_pipeline = Pipeline([('gb', gb_model)])
lr_pipeline = Pipeline([('lr', lr_model)])
 # Train and validate base models using time-series cross-validation
rf_scores = []
gb_scores = []
lr_scores = []

train_cv = train.iloc[train_index]
val_cv = train.iloc[val_index]
train_cv_x = train_cv.drop('AQI指数', axis=1)
train_cv_y = train_cv['AQI指数']
print(train_cv_x.isnull().sum())
for train_index, val_index in tscv.split(train):
     # Fit random forest pipeline
    rf_pipeline.fit(train_cv_x, train_cv_y)
    rf_pred = rf_pipeline.predict(val_cv.drop('AQI指数', axis=1))
    rf_scores.append(mean_absolute_error(val_cv['AQI指数'], rf_pred))
     # Fit gradient boosting pipeline
    gb_pipeline.fit(train_cv_x, train_cv_y)
    gb_pred = gb_pipeline.predict(val_cv.drop('AQI指数', axis=1))
    gb_scores.append(mean_absolute_error(val_cv['AQI指数'], gb_pred))
     # Fit linear regression pipeline
    lr_pipeline.fit(train_cv_x, train_cv_y)
    lr_pred = lr_pipeline.predict(val_cv.drop('AQI指数', axis=1))
    lr_scores.append(mean_absolute_error(val_cv['AQI指数'], lr_pred))
print(f"Random Forest MAE: {np.mean(rf_scores)}")
print(f"Gradient Boosting MAE: {np.mean(gb_scores)}")
print(f"Linear Regression MAE: {np.mean(lr_scores)}")
 # Stack base model predictions and train meta-model
train_rf_pred = rf_pipeline.predict(train.drop('AQI指数', axis=1))
train_gb_pred = gb_pipeline.predict(train.drop('AQI指数', axis=1))
train_lr_pred = lr_pipeline.predict(train.drop('AQI指数', axis=1))
train_meta = pd.DataFrame({'rf_pred': train_rf_pred, 'gb_pred': train_gb_pred, 'lr_pred': train_lr_pred})
meta_model = LinearRegression()
meta_model.fit(train_meta, train['AQI指数'])
 # Generate final predictions and evaluate performance on test set
test_rf_pred = rf_pipeline.predict(test.drop('AQI指数', axis=1))
test_gb_pred = gb_pipeline.predict(test.drop('AQI指数', axis=1))
test_lr_pred = lr_pipeline.predict(test.drop('AQI指数', axis=1))
test_meta = pd.DataFrame({'rf_pred': test_rf_pred, 'gb_pred': test_gb_pred, 'lr_pred': test_lr_pred})
final_pred = meta_model.predict(test_meta)
print(f"Test set MAE: {mean_absolute_error(test['AQI指数'], final_pred)}")

PM2.5          0
PM10           0
O3             0
no2            0
so2            0
              ..
month_MA7      7
month_MA14    14
day_MA3        3
day_MA7        7
day_MA14      14
Length: 223, dtype: int64


ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values