### 思路：先将网格数据中的缺失值（主要是 Water input 和 Tem）用 0 填充，然后用先前训练好的 RandomForest 模型进行预测。

In [1]:
# 加载包
import os
import time
import pandas as pd
import numpy as np
from joblib import load
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
# Folder holding the training workbooks (*.xlsx). The workbook names are used
# further down to locate the matching saved column order, transformer and
# model files under this same folder.
file_path = '/Users/xihuan/Desktop/土壤信息数据集/4.数据整理/prediction/'
xlsx_file_names = list(filter(lambda name: name.endswith('.xlsx'), os.listdir(file_path)))
xlsx_file_names

['clear_datas124.xlsx',
 'clear_datas134.xlsx',
 'clear_datas123.xlsx',
 'Supplementary(6.2晚).xlsx',
 'clear_datas1234.xlsx',
 'Supplementary(6.8晚).xlsx',
 'clear_datas3.xlsx',
 'clear_datas234.xlsx',
 'Supplementary(6.6晚).xlsx',
 'clear_datas3(9.2).xlsx']

In [3]:
# Folder holding the per-grid input tables; keep only files named
# grid_num_*.csv.
file_path_grid = '/Users/xihuan/Desktop/土壤信息数据集/4.数据整理/compile_data_result/'
file_names = [
    entry
    for entry in os.listdir(file_path_grid)
    if entry.endswith('.csv') and entry.startswith('grid_num_')
]

In [None]:
# For every training workbook: load its saved feature order, model and
# transformer, predict EF for every grid CSV, and write one CSV of per-grid
# mean predictions.

# Categorical predictors that are one-hot encoded; every other column of a
# grid file is treated as numeric. Loop-invariant, so defined once here.
col_categorical = ['Fertilizer type', 'Nitrogen placement', 'Soil tillage practices', 'Crop type']

# Raw / one-hot column names -> short names used in the saved column order.
new_names = {'Fertilizer type_AN': 'Ftype(AN)',
             'Fertilizer type_EEF': 'Ftype(EEF)',
             'Fertilizer type_Manure': 'Ftype(Manure)',
             'Fertilizer type_Others': 'Ftype(Others)',
             'Fertilizer type_U': 'Ftype(U)',
             'Nitrogen placement_DPM': 'NP(DPM)',
             'Nitrogen placement_Mix': 'NP(Mix)',
             'Nitrogen placement_SBC': 'NP(SBC)',
             'Soil tillage practices_CT': 'STP(CT)',
             'Soil tillage practices_NT': 'STP(NT)',
             'Crop type_Maize': 'Ctype(Maize)',
             'Crop type_Rice': 'Ctype(Rice)',
             'Crop type_Wheat': 'Ctype(Wheat)',
             'Fertilizer application time': 'FAT',
             'CEC': 'CEC',
             'Clay': 'Clay',
             'pH': 'pH',
             'TN': 'TN',
             'BD': 'BD',
             'SOC': 'SOC',
             'Water input': 'Water input',
             'Tem': 'Tem'}

# Short names of the one-hot columns only. A grid file that lacks one category
# level produces no dummy column for it, so these are the columns that may be
# legitimately absent and can be filled with 0.
dummy_prefixes = tuple(f'{col}_' for col in col_categorical)
dummy_columns = {short for raw, short in new_names.items() if raw.startswith(dummy_prefixes)}

for xlsx_file in xlsx_file_names:
    # Workbook name without its extension. Computed once so the f-strings below
    # do not nest identical quotes (which only parses on Python >= 3.12).
    model_tag = xlsx_file.rsplit('.', 1)[0]

    # Load the training column order, the fitted model and the target transformer.
    col_list = list(load(file_path+f'/save_file/x_columns({model_tag}).joblib'))
    RF1 = load(file_path+f'/save_file/random_forest_model({model_tag}).joblib')
    pt = load(file_path+f'/save_file/pt_transformer({model_tag}).joblib')

    # Start of the timed section (seconds since the epoch).
    start_time = time.time()

    grid_num = []
    EF_predicted = []
    for filename in file_names:

        # Missing values in the grid table are replaced by 0 before encoding.
        df_clear = pd.read_csv(file_path_grid+filename)
        df_clear = df_clear.fillna(0)

        # Everything that is not categorical is carried over as numeric.
        col_numerical = list(set(df_clear.columns)-set(col_categorical))

        # One-hot encode the categorical columns, then append the numeric ones.
        df1 = pd.get_dummies(df_clear[col_categorical]).astype(np.int64)
        df1[col_numerical] = df_clear[col_numerical]

        # Binarise the number of fertilizer applications: 1 when applied at
        # least twice, otherwise 0.
        df1['Fertilizer application time'] = (df1['Fertilizer application time'] >= 2).astype(np.int64)

        df1.rename(columns=new_names, inplace=True)

        # Add all-zero columns for category levels that do not occur in this
        # grid file; without this the column selection below raises KeyError.
        # Missing numeric columns are NOT filled and still raise.
        for col in col_list:
            if col in dummy_columns and col not in df1.columns:
                df1[col] = 0

        # Put the columns in the exact order used by the training set.
        df1 = df1.loc[:, col_list]

        # Predict on the transformed scale.
        EFs_pred = RF1.predict(df1)

        # Map predictions back to the original scale with the saved transformer
        # (described as a Box-Cox transform in the original notes).
        EF_pred = pt.inverse_transform(EFs_pred.reshape(-1, 1)).ravel()

        # Grid id is the number between the last '_' and the file extension.
        grid_num.append(int(filename.rsplit('_',1)[1].rsplit('.',1)[0]))
        EF_predicted.append(round(EF_pred.mean(),2))

    result_predicted = pd.DataFrame({'grid_num':grid_num, 'EF_predicted': EF_predicted}).sort_values(by='grid_num', ascending=True).reset_index(drop=True)
    result_predicted.to_csv(file_path+f"/EF_prediction/EF_predicted({model_tag}).csv", index=False)

    # Report the elapsed time for this model in minutes.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"耗时: {elapsed_time/60:.2f} 分")

In [4]:
# Predict EF for every grid CSV with ONE selected model and store the per-grid
# mean and standard deviation of the predictions.

# NOTE(review): os.listdir returns entries in arbitrary order, so selecting the
# workbook by position is fragile; confirm that index -5 is the intended
# workbook before relying on the output file.
xlsx_file = xlsx_file_names[-5]

# Workbook name without its extension. Computed once so the f-strings below do
# not nest identical quotes (which only parses on Python >= 3.12).
model_tag = xlsx_file.rsplit('.', 1)[0]

# Load the training column order, the fitted model and the target transformer.
col_list = list(load(file_path+f'/save_file/x_columns({model_tag}).joblib'))
RF1 = load(file_path+f'/save_file/random_forest_model({model_tag}).joblib')
pt = load(file_path+f'/save_file/pt_transformer({model_tag}).joblib')

# Categorical predictors that are one-hot encoded; every other column of a grid
# file is treated as numeric. Loop-invariant, so defined once here.
col_categorical = ['Fertilizer type', 'Nitrogen placement', 'Soil tillage practices', 'Crop type']

# Raw / one-hot column names -> short names used in the saved column order.
new_names = {'Fertilizer type_AN': 'Ftype(AN)',
             'Fertilizer type_EEF': 'Ftype(EEF)',
             'Fertilizer type_Manure': 'Ftype(Manure)',
             'Fertilizer type_Others': 'Ftype(Others)',
             'Fertilizer type_U': 'Ftype(U)',
             'Nitrogen placement_DPM': 'NP(DPM)',
             'Nitrogen placement_Mix': 'NP(Mix)',
             'Nitrogen placement_SBC': 'NP(SBC)',
             'Soil tillage practices_CT': 'STP(CT)',
             'Soil tillage practices_NT': 'STP(NT)',
             'Crop type_Maize': 'Ctype(Maize)',
             'Crop type_Rice': 'Ctype(Rice)',
             'Crop type_Wheat': 'Ctype(Wheat)',
             'Fertilizer application time': 'FAT',
             'CEC': 'CEC',
             'Clay': 'Clay',
             'pH': 'pH',
             'TN': 'TN',
             'BD': 'BD',
             'SOC': 'SOC',
             'Water input': 'Water input',
             'Tem': 'Tem'}

# Short names of the one-hot columns only. A grid file that lacks one category
# level produces no dummy column for it, so these are the columns that may be
# legitimately absent and can be filled with 0.
dummy_prefixes = tuple(f'{col}_' for col in col_categorical)
dummy_columns = {short for raw, short in new_names.items() if raw.startswith(dummy_prefixes)}

# Start of the timed section (seconds since the epoch).
start_time = time.time()

grid_num = []
EF_predicted = []
std_predicted = []
for filename in file_names:

    # Missing values in the grid table are replaced by 0 before encoding.
    df_clear = pd.read_csv(file_path_grid+filename)
    df_clear = df_clear.fillna(0)

    # Everything that is not categorical is carried over as numeric.
    col_numerical = list(set(df_clear.columns)-set(col_categorical))

    # One-hot encode the categorical columns, then append the numeric ones.
    df1 = pd.get_dummies(df_clear[col_categorical]).astype(np.int64)
    df1[col_numerical] = df_clear[col_numerical]

    # Binarise the number of fertilizer applications: 1 when applied at least
    # twice, otherwise 0.
    df1['Fertilizer application time'] = (df1['Fertilizer application time'] >= 2).astype(np.int64)

    df1.rename(columns=new_names, inplace=True)

    # Add all-zero columns for category levels that do not occur in this grid
    # file; without this the column selection below raises KeyError.
    # Missing numeric columns are NOT filled and still raise.
    for col in col_list:
        if col in dummy_columns and col not in df1.columns:
            df1[col] = 0

    # Put the columns in the exact order used by the training set.
    df1 = df1.loc[:, col_list]

    # Predict on the transformed scale.
    EFs_pred = RF1.predict(df1)

    # Map predictions back to the original scale with the saved transformer
    # (described as a Box-Cox transform in the original notes).
    EF_pred = pt.inverse_transform(EFs_pred.reshape(-1, 1)).ravel()

    # Grid id is the number between the last '_' and the file extension.
    grid_num.append(int(filename.rsplit('_',1)[1].rsplit('.',1)[0]))
    EF_predicted.append(round(EF_pred.mean(),2))
    # Spread of the back-transformed predictions within the grid file.
    std_predicted.append(round(EF_pred.std(),2))

result_predicted = pd.DataFrame({'grid_num':grid_num, 'EF_predicted': EF_predicted, 'std_predicted': std_predicted}).sort_values(by='grid_num', ascending=True).reset_index(drop=True)
result_predicted.to_csv(file_path+f"/EF_prediction/EF_predicted({model_tag}).csv", index=False)

# Report the elapsed time in minutes.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"耗时: {elapsed_time/60:.2f} 分")

耗时: 6.23 分
