In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional
from keras.callbacks import EarlyStopping
from keras.regularizers import l2
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor

# Compute regression evaluation metrics.
def calculate_metrics(y_true, y_pred):
    """Return MAE, MSE, RMSE, MAPE (in percent) and R^2 for a set of predictions.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values, same shape as ``y_true``.

    Returns:
        Tuple ``(mae, mse, rmse, mape, r2)``. ``mape`` is computed only over
        entries whose ground-truth value is non-zero and is ``nan`` when no
        such entry exists.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # A percentage error is undefined where the target is exactly zero, and
    # min-max scaled targets contain exact zeros. Dividing by them would turn
    # the whole metric into inf/nan, so those entries are left out.
    nonzero = y_true != 0
    if nonzero.any():
        relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = float("nan")

    r2 = r2_score(y_true, y_pred)
    return mae, mse, rmse, mape, r2

# Build and compile the LSTM model (L2-regularised, adjusted architecture).
def build_lstm_model(input_shape, output_shape):
    """Create the recurrent network used for rent prediction.

    The network stacks two bidirectional LSTM layers and one plain LSTM layer,
    each followed by dropout, and ends with a dense output layer. It is
    compiled with the RMSprop optimizer and a mean-squared-error loss.

    Args:
        input_shape: Shape tuple forwarded to the first LSTM layer.
        output_shape: Number of units in the final dense layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    l2_strength = 0.001
    dropout_rate = 0.2  # reduced dropout

    first_lstm = LSTM(
        64,
        return_sequences=True,
        input_shape=input_shape,
        kernel_regularizer=l2(l2_strength),
    )
    second_lstm = LSTM(32, return_sequences=True, kernel_regularizer=l2(l2_strength))
    # Simplified final recurrent layer: emits only the last time step.
    last_lstm = LSTM(16, return_sequences=False, kernel_regularizer=l2(l2_strength))

    layer_stack = [
        Bidirectional(first_lstm),
        Dropout(dropout_rate),
        Bidirectional(second_lstm),
        Dropout(dropout_rate),
        last_lstm,
        Dropout(dropout_rate),
        Dense(output_shape),  # output layer
    ]

    network = Sequential(layer_stack)
    network.compile(optimizer='RMSprop', loss='mean_squared_error')
    return network

# Train an LSTM model with early stopping on the supplied validation data.
def train_and_evaluate_lstm(X_train, y_train, X_test, y_test):
    """Fit a freshly built LSTM model and return it.

    Args:
        X_train: Training inputs; axes 1 and 2 define the model input shape.
        y_train: Training targets; axis 1 defines the number of outputs.
        X_test: Inputs used as validation data during fitting.
        y_test: Targets used as validation data during fitting.

    Returns:
        The fitted model, with the weights of the best validation epoch
        restored by the early-stopping callback.
    """
    # NOTE(review): the "test" arrays drive early stopping, so scores later
    # computed on the same arrays are not a fully independent estimate.
    window_shape = (X_train.shape[1], X_train.shape[2])
    n_outputs = y_train.shape[1]
    network = build_lstm_model(window_shape, n_outputs)

    stopper = EarlyStopping(
        monitor='val_loss',
        patience=15,
        restore_best_weights=True,
    )
    network.fit(
        X_train,
        y_train,
        epochs=150,
        batch_size=64,
        validation_data=(X_test, y_test),
        callbacks=[stopper],
        verbose=0,
    )
    return network

# Predict with both models and blend the results with a weighted average.
def predict_combined_model(X_test, model_lstm, model_xgb, lstm_weight=0.5):
    """Return the weighted blend of the LSTM and XGBoost predictions.

    Args:
        X_test: Input array; passed as-is to ``model_lstm`` and flattened to
            ``(n_samples, -1)`` for ``model_xgb``.
        model_lstm: Object exposing ``predict`` for the unflattened input.
        model_xgb: Object exposing ``predict`` for the flattened input.
        lstm_weight: Weight of the LSTM prediction; the XGBoost prediction
            receives ``1 - lstm_weight``.

    Returns:
        Array with the same shape as the LSTM prediction.

    Raises:
        ValueError: If the two predictions do not hold the same number of
            elements.
    """
    y_pred_lstm = np.asarray(model_lstm.predict(X_test))
    flat_inputs = X_test.reshape(X_test.shape[0], -1)
    # A tabular regressor may return a 1-D array for a single target while the
    # network returns (n_samples, 1); blending those directly would broadcast
    # to (n_samples, n_samples). Align the shapes explicitly instead.
    y_pred_xgb = np.asarray(model_xgb.predict(flat_inputs)).reshape(y_pred_lstm.shape)
    return lstm_weight * y_pred_lstm + (1 - lstm_weight) * y_pred_xgb

# Load the property features and the rent table.
grouped_avg_properties_sa2_df = pd.read_csv('d:/Users/25453/Documents/GitHub/https---github.com-liamhodg-MAST30034_Python.git/project-2-group-real-estate-industry-project-5/grouped_avg_properties_sa2(3).csv')
increase_rates_df = pd.read_csv('d:/Users/25453/Documents/GitHub/https---github.com-liamhodg-MAST30034_Python.git/project-2-group-real-estate-industry-project-5/Updated_Rent_Data (1).csv')

# Keep only the SA2 areas that appear in both tables.
# NOTE(review): both tables are filtered but never sorted or merged on the
# area name, so row i of one table is only the same area as row i of the
# other if the CSVs already share order and have no duplicates — confirm.
common_sa2_names = np.intersect1d(grouped_avg_properties_sa2_df['SA2_NAME21'], increase_rates_df['District'])
grouped_avg_properties_sa2_df = grouped_avg_properties_sa2_df[grouped_avg_properties_sa2_df['SA2_NAME21'].isin(common_sa2_names)].reset_index(drop=True)
increase_rates_df = increase_rates_df[increase_rates_df['District'].isin(common_sa2_names)].reset_index(drop=True)

# Extract the rent columns and the external (per-area) features.
rent_columns = [col for col in increase_rates_df.columns if 'Rent' in col]
rent_data = increase_rates_df[rent_columns].values
external_features = grouped_avg_properties_sa2_df[['Distance (km)', 'School Distance (km)', 'Distance to Closest Shopping Center (km)', 'Price']].values

# Min-max normalisation, with separate scalers for rents and features.
scaler_rent = MinMaxScaler()
scaler_features = MinMaxScaler()
rent_data_scaled = scaler_rent.fit_transform(rent_data)
external_features_scaled = scaler_features.fit_transform(external_features)

# Fill missing values: backward fill first, then forward fill what is left.
# DataFrame.bfill()/ffill() replace fillna(method=...), which is deprecated
# since pandas 2.1.
rent_data_filled = pd.DataFrame(rent_data_scaled).bfill().ffill().values

# Build the model inputs: a sliding window of rent rows plus the external
# features of the target row repeated on every step of the window.
# NOTE(review): the window slides over the rows of the rent table, which are
# keyed by 'District' above, so each "sequence" spans consecutive areas
# rather than consecutive periods — confirm this is intended.
sequence_length = 10  # sliding-window length
window_ends = range(sequence_length, rent_data_filled.shape[0])
X = np.array([
    np.concatenate(
        (
            rent_data_filled[end - sequence_length:end],
            np.tile(external_features_scaled[end], (sequence_length, 1)),
        ),
        axis=1,
    )
    for end in window_ends
])
y = np.array([rent_data_filled[end] for end in window_ends])

# Cross-validation and evaluation.
tscv = TimeSeriesSplit(n_splits=5)
metrics_list = []

for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the LSTM and XGBoost models (low learning rate, more trees).
    model_lstm = train_and_evaluate_lstm(X_train, y_train, X_test, y_test)
    model_xgb = XGBRegressor(n_estimators=200, learning_rate=0.01)
    model_xgb.fit(X_train.reshape(X_train.shape[0], -1), y_train)

    # Predict with the blend (LSTM weighted 60%) and score the fold.
    y_pred_combined = predict_combined_model(X_test, model_lstm, model_xgb, lstm_weight=0.6)
    mae, mse, rmse, mape, r2 = calculate_metrics(y_test, y_pred_combined)
    metrics_list.append([mae, mse, rmse, mape, r2])
    print(f"Fold {fold + 1} - MAE: {mae}, MSE: {mse}, RMSE: {rmse}, MAPE: {mape}%, R^2: {r2}")

# Report the metrics averaged over the folds.
metrics_avg = np.mean(metrics_list, axis=0)
print(f"Cross-validated MAE: {metrics_avg[0]}, MSE: {metrics_avg[1]}, RMSE: {metrics_avg[2]}, MAPE: {metrics_avg[3]}%, R^2: {metrics_avg[4]}")

# Train the final models on the full data set.
# NOTE(review): the training data is reused as validation data here, so early
# stopping effectively monitors the training loss.
model_final_lstm = train_and_evaluate_lstm(X, y, X, y)
model_final_xgb = XGBRegressor(n_estimators=200, learning_rate=0.01)
model_final_xgb.fit(X.reshape(X.shape[0], -1), y)

# Predict on the whole data set with the blended model.
y_pred_final = predict_combined_model(X, model_final_lstm, model_final_xgb, lstm_weight=0.6)

# Undo the normalisation for predictions and ground truth.
y_pred_final = scaler_rent.inverse_transform(y_pred_final)
y_true_final = scaler_rent.inverse_transform(y)

# Compute the final (in-sample) evaluation metrics.
mae_final, mse_final, rmse_final, mape_final, r2_final = calculate_metrics(y_true_final, y_pred_final)

# Print the final evaluation metrics.
print(f"Final model MAE: {mae_final}")
print(f"Final model MSE: {mse_final}")
print(f"Final model RMSE: {rmse_final}")
print(f"Final model MAPE: {mape_final}%")
print(f"Final model R-squared: {r2_final}")

# Forecast the next 12 quarters for every area with the LSTM alone, feeding
# each prediction back in as the newest step of the input window.
predicted_rent_prices_all = []
future_steps = 12
# The seed window does not depend on the area, so it is built once.
# NOTE(review): every area therefore starts from the same last
# `sequence_length` rent rows; only the external features differ.
seed_rent_window = rent_data_filled[-sequence_length:]
for area_features in external_features_scaled:
    tiled_features = np.tile(area_features, (sequence_length, 1))
    X_input = np.concatenate((seed_rent_window, tiled_features), axis=1)
    X_input = np.expand_dims(X_input, axis=0)
    step_features = area_features.reshape(1, 1, -1)

    predicted_rent_sequence = []
    for _ in range(future_steps):
        predicted_rent = model_final_lstm.predict(X_input)
        new_rent_data = predicted_rent.reshape(1, 1, -1)
        new_rent_with_features = np.concatenate((new_rent_data, step_features), axis=2)
        # Drop the oldest step and append the newly predicted one.
        X_input = np.concatenate([X_input[:, 1:, :], new_rent_with_features], axis=1)
        # NOTE(review): only the first rent column of each prediction is kept.
        predicted_rent_sequence.append(scaler_rent.inverse_transform(predicted_rent)[0][0])

    predicted_rent_prices_all.append(predicted_rent_sequence)

# Arrange the forecasts as one row per area and one column per quarter end.
predicted_rent_prices_all = np.array(predicted_rent_prices_all)
# NOTE(review): newer pandas releases prefer the alias 'QE' over 'Q'.
future_dates = pd.date_range(start='2025-01-01', periods=future_steps, freq='Q').strftime('%Y-%m-%d')
predicted_rent_prices_df = pd.DataFrame(predicted_rent_prices_all, columns=future_dates)
predicted_rent_prices_df.insert(0, 'SA2_NAME', grouped_avg_properties_sa2_df['SA2_NAME21'])
predicted_rent_prices_df.to_csv('predicted_rent_prices_2025_2027_optimized_ensemble.csv', index=False)

# Show the first forecast rows.
print(predicted_rent_prices_df.head())


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional
from keras.callbacks import EarlyStopping
from keras.regularizers import l2
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor

# Compute regression evaluation metrics.
def calculate_metrics(y_true, y_pred):
    """Return MAE, MSE, RMSE, MAPE (in percent) and R^2 for a set of predictions.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values, same shape as ``y_true``.

    Returns:
        Tuple ``(mae, mse, rmse, mape, r2)``. ``mape`` is computed only over
        entries whose ground-truth value is non-zero and is ``nan`` when no
        such entry exists.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # A percentage error is undefined where the target is exactly zero, and
    # min-max scaled targets contain exact zeros. Dividing by them would turn
    # the whole metric into inf/nan, so those entries are left out.
    nonzero = y_true != 0
    if nonzero.any():
        relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = float("nan")

    r2 = r2_score(y_true, y_pred)
    return mae, mse, rmse, mape, r2

# Build and compile the LSTM model (larger layers, tuned dropout and L2).
def build_lstm_model(input_shape, output_shape):
    """Create the recurrent network used for rent prediction.

    The network stacks two bidirectional LSTM layers and one plain LSTM layer,
    each followed by dropout, and ends with a dense output layer. It is
    compiled with the Adam optimizer and a mean-squared-error loss.

    Args:
        input_shape: Shape tuple forwarded to the first LSTM layer.
        output_shape: Number of units in the final dense layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    l2_strength = 0.001
    dropout_rate = 0.3  # higher dropout to curb overfitting

    first_lstm = LSTM(
        128,
        return_sequences=True,
        input_shape=input_shape,
        kernel_regularizer=l2(l2_strength),
    )
    second_lstm = LSTM(64, return_sequences=True, kernel_regularizer=l2(l2_strength))
    # Final recurrent layer: emits only the last time step.
    last_lstm = LSTM(32, return_sequences=False, kernel_regularizer=l2(l2_strength))

    layer_stack = [
        Bidirectional(first_lstm),
        Dropout(dropout_rate),
        Bidirectional(second_lstm),
        Dropout(dropout_rate),
        last_lstm,
        Dropout(dropout_rate),
        Dense(output_shape),  # output layer
    ]

    network = Sequential(layer_stack)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

# Train an LSTM model with early stopping (longer patience, smaller batches).
def train_and_evaluate_lstm(X_train, y_train, X_test, y_test):
    """Fit a freshly built LSTM model and return it.

    Args:
        X_train: Training inputs; axes 1 and 2 define the model input shape.
        y_train: Training targets; axis 1 defines the number of outputs.
        X_test: Inputs used as validation data during fitting.
        y_test: Targets used as validation data during fitting.

    Returns:
        The fitted model, with the weights of the best validation epoch
        restored by the early-stopping callback.
    """
    # NOTE(review): the "test" arrays drive early stopping, so scores later
    # computed on the same arrays are not a fully independent estimate.
    window_shape = (X_train.shape[1], X_train.shape[2])
    n_outputs = y_train.shape[1]
    network = build_lstm_model(window_shape, n_outputs)

    # Larger patience so training is not cut off too early.
    stopper = EarlyStopping(
        monitor='val_loss',
        patience=20,
        restore_best_weights=True,
    )
    network.fit(
        X_train,
        y_train,
        epochs=200,
        batch_size=32,
        validation_data=(X_test, y_test),
        callbacks=[stopper],
        verbose=0,
    )
    return network

# Predict with both models and blend the results (LSTM weighted lower).
def predict_combined_model(X_test, model_lstm, model_xgb, lstm_weight=0.4):  # XGBoost gets the larger share; LSTM 40%
    """Return the weighted blend of the LSTM and XGBoost predictions.

    Args:
        X_test: Input array; passed as-is to ``model_lstm`` and flattened to
            ``(n_samples, -1)`` for ``model_xgb``.
        model_lstm: Object exposing ``predict`` for the unflattened input.
        model_xgb: Object exposing ``predict`` for the flattened input.
        lstm_weight: Weight of the LSTM prediction; the XGBoost prediction
            receives ``1 - lstm_weight``.

    Returns:
        Array with the same shape as the LSTM prediction.

    Raises:
        ValueError: If the two predictions do not hold the same number of
            elements.
    """
    y_pred_lstm = np.asarray(model_lstm.predict(X_test))
    flat_inputs = X_test.reshape(X_test.shape[0], -1)
    # A tabular regressor may return a 1-D array for a single target while the
    # network returns (n_samples, 1); blending those directly would broadcast
    # to (n_samples, n_samples). Align the shapes explicitly instead.
    y_pred_xgb = np.asarray(model_xgb.predict(flat_inputs)).reshape(y_pred_lstm.shape)
    return lstm_weight * y_pred_lstm + (1 - lstm_weight) * y_pred_xgb

# Load the property features and the rent table.
grouped_avg_properties_sa2_df = pd.read_csv('d:/Users/25453/Documents/GitHub/https---github.com-liamhodg-MAST30034_Python.git/project-2-group-real-estate-industry-project-5/grouped_avg_properties_sa2(3).csv')
increase_rates_df = pd.read_csv('d:/Users/25453/Documents/GitHub/https---github.com-liamhodg-MAST30034_Python.git/project-2-group-real-estate-industry-project-5/Updated_Rent_Data (1).csv')

# Keep only the SA2 areas that appear in both tables.
# NOTE(review): both tables are filtered but never sorted or merged on the
# area name, so row i of one table is only the same area as row i of the
# other if the CSVs already share order and have no duplicates — confirm.
common_sa2_names = np.intersect1d(grouped_avg_properties_sa2_df['SA2_NAME21'], increase_rates_df['District'])
grouped_avg_properties_sa2_df = grouped_avg_properties_sa2_df[grouped_avg_properties_sa2_df['SA2_NAME21'].isin(common_sa2_names)].reset_index(drop=True)
increase_rates_df = increase_rates_df[increase_rates_df['District'].isin(common_sa2_names)].reset_index(drop=True)

# Extract the rent columns and the external (per-area) features.
rent_columns = [col for col in increase_rates_df.columns if 'Rent' in col]
rent_data = increase_rates_df[rent_columns].values
external_features = grouped_avg_properties_sa2_df[['Distance (km)', 'School Distance (km)', 'Distance to Closest Shopping Center (km)', 'Price']].values

# Min-max normalisation, with separate scalers for rents and features.
scaler_rent = MinMaxScaler()
scaler_features = MinMaxScaler()
rent_data_scaled = scaler_rent.fit_transform(rent_data)
external_features_scaled = scaler_features.fit_transform(external_features)

# Fill missing values: backward fill first, then forward fill what is left.
# DataFrame.bfill()/ffill() replace fillna(method=...), which is deprecated
# since pandas 2.1.
rent_data_filled = pd.DataFrame(rent_data_scaled).bfill().ffill().values

# Build the model inputs: a sliding window of rent rows plus the external
# features of the target row repeated on every step of the window.
# NOTE(review): the window slides over the rows of the rent table, which are
# keyed by 'District' above, so each "sequence" spans consecutive areas
# rather than consecutive periods — confirm this is intended.
sequence_length = 12  # sliding-window length
window_ends = range(sequence_length, rent_data_filled.shape[0])
X = np.array([
    np.concatenate(
        (
            rent_data_filled[end - sequence_length:end],
            np.tile(external_features_scaled[end], (sequence_length, 1)),
        ),
        axis=1,
    )
    for end in window_ends
])
y = np.array([rent_data_filled[end] for end in window_ends])

# Cross-validation and evaluation.
tscv = TimeSeriesSplit(n_splits=5)
metrics_list = []

for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the LSTM and XGBoost models.
    model_lstm = train_and_evaluate_lstm(X_train, y_train, X_test, y_test)

    # XGBoost tuning: more trees, larger max_depth and min_child_weight.
    model_xgb = XGBRegressor(n_estimators=300, learning_rate=0.01, max_depth=5, min_child_weight=3)
    model_xgb.fit(X_train.reshape(X_train.shape[0], -1), y_train)

    # Predict with the blend (LSTM weight lowered to 40%) and score the fold.
    y_pred_combined = predict_combined_model(X_test, model_lstm, model_xgb, lstm_weight=0.4)
    mae, mse, rmse, mape, r2 = calculate_metrics(y_test, y_pred_combined)
    metrics_list.append([mae, mse, rmse, mape, r2])
    print(f"Fold {fold + 1} - MAE: {mae}, MSE: {mse}, RMSE: {rmse}, MAPE: {mape}%, R²: {r2}")

# Report the metrics averaged over the folds.
metrics_avg = np.mean(metrics_list, axis=0)
print(f"Cross-validated MAE: {metrics_avg[0]}, MSE: {metrics_avg[1]}, RMSE: {metrics_avg[2]}, MAPE: {metrics_avg[3]}%, R²: {metrics_avg[4]}")

# Train the final models on the full data set.
# NOTE(review): the training data is reused as validation data here, so early
# stopping effectively monitors the training loss.
model_final_lstm = train_and_evaluate_lstm(X, y, X, y)
model_final_xgb = XGBRegressor(n_estimators=300, learning_rate=0.01, max_depth=5, min_child_weight=3)
model_final_xgb.fit(X.reshape(X.shape[0], -1), y)

# Predict on the whole data set with the blended model.
y_pred_final = predict_combined_model(X, model_final_lstm, model_final_xgb, lstm_weight=0.4)

# Undo the normalisation for predictions and ground truth.
y_pred_final = scaler_rent.inverse_transform(y_pred_final)
y_true_final = scaler_rent.inverse_transform(y)

# Compute the final (in-sample) evaluation metrics.
mae_final, mse_final, rmse_final, mape_final, r2_final = calculate_metrics(y_true_final, y_pred_final)

# Print the final evaluation metrics.
print(f"Final model MAE: {mae_final}")
print(f"Final model MSE: {mse_final}")
print(f"Final model RMSE: {rmse_final}")
print(f"Final model MAPE: {mape_final}%")
print(f"Final model R²: {r2_final}")

# Forecast the next 12 quarters for every area with the LSTM alone, feeding
# each prediction back in as the newest step of the input window.
predicted_rent_prices_all = []
future_steps = 12
# The seed window does not depend on the area, so it is built once.
# NOTE(review): every area therefore starts from the same last
# `sequence_length` rent rows; only the external features differ.
seed_rent_window = rent_data_filled[-sequence_length:]
for area_features in external_features_scaled:
    tiled_features = np.tile(area_features, (sequence_length, 1))
    X_input = np.concatenate((seed_rent_window, tiled_features), axis=1)
    X_input = np.expand_dims(X_input, axis=0)
    step_features = area_features.reshape(1, 1, -1)

    predicted_rent_sequence = []
    for _ in range(future_steps):
        predicted_rent = model_final_lstm.predict(X_input)
        new_rent_data = predicted_rent.reshape(1, 1, -1)
        new_rent_with_features = np.concatenate((new_rent_data, step_features), axis=2)
        # Drop the oldest step and append the newly predicted one.
        X_input = np.concatenate([X_input[:, 1:, :], new_rent_with_features], axis=1)
        # NOTE(review): only the first rent column of each prediction is kept.
        predicted_rent_sequence.append(scaler_rent.inverse_transform(predicted_rent)[0][0])

    predicted_rent_prices_all.append(predicted_rent_sequence)

# Arrange the forecasts as one row per area and one column per quarter end.
predicted_rent_prices_all = np.array(predicted_rent_prices_all)
# NOTE(review): newer pandas releases prefer the alias 'QE' over 'Q'.
future_dates = pd.date_range(start='2025-01-01', periods=future_steps, freq='Q').strftime('%Y-%m-%d')
predicted_rent_prices_df = pd.DataFrame(predicted_rent_prices_all, columns=future_dates)
predicted_rent_prices_df.insert(0, 'SA2_NAME', grouped_avg_properties_sa2_df['SA2_NAME21'])
predicted_rent_prices_df.to_csv('predicted_rent_prices_2025_2027_optimized_ensemble_2.csv', index=False)

# Show the first forecast rows.
print(predicted_rent_prices_df.head())
