In [1]:
'''
Author: jyyd23@mails.tsinghua.edu.cn
Date: 2024-04-28 11:51:04
LastEditors: jyyd23@mails.tsinghua.edu.cn
LastEditTime: 2024-05-09 16:35:29
FilePath: CAMS_downscale\cams_train_pred.ipynb
'''

import pandas as pd
import numpy as np
import netCDF4 as nc
import scipy.io as sio
import time
import dataframe_image as dfi
import os, sys
os.chdir(sys.path[0])
from tqdm import trange, tqdm
import seaborn as sns

import lightgbm as lgb
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import neighbors, svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor

from joblib import dump, load
import concurrent.futures
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, explained_variance_score
from sklearn.model_selection import GridSearchCV, cross_val_score,cross_val_predict,cross_validate, RandomizedSearchCV

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Apply seaborn's theme and render figures at 600 dpi.
sns.set(rc={'figure.dpi': 600})

# Four independent module-level lists, one per evaluation metric
# (explained variance, MAE, MSE, R^2). Nothing in the cells shown here
# appends to them.
evsall, maeall, mseall, r2all = [], [], [], []


def pred_plot(y_test, y_pred, resid):
    """Print and return regression metrics for a set of predictions.

    Args:
        y_test: Observed target values.
        y_pred: Predicted target values.
        resid: Accepted for call compatibility; not used by this function.

    Returns:
        Tuple ``(mse, mae, r2, evs)``: mean squared error, mean absolute
        error, R^2 score and explained variance score.
    """
    print('----------------------------------')
    # Insertion order of the dict fixes both the print order and the
    # order of the returned tuple.
    scores = {
        'MSE: ': mean_squared_error(y_test, y_pred),
        'MAE: ': mean_absolute_error(y_test, y_pred),
        'r2 score: ': r2_score(y_test, y_pred),
        'Explained_variance: ': explained_variance_score(y_test, y_pred),
    }
    for label, value in scores.items():
        print(label, value)
    return tuple(scores.values())
def predpnc(model_fit, x_test, y_test, pncdata2020):
    """Predict with a fitted model and append predictions and residuals to a table.

    The metrics for the prediction are printed via ``pred_plot``.

    Args:
        model_fit: Fitted estimator exposing ``predict(x_test)``.
        x_test: Feature matrix passed to ``model_fit.predict``.
        y_test: Observed targets, in the same row order as ``x_test``.
        pncdata2020: DataFrame with one row per sample of ``x_test``; the
            prediction and residual columns are appended to its right.

    Returns:
        A new DataFrame: ``pncdata2020`` followed by the prediction
        column(s) and the residual (``prediction - y_test``) column(s).

    Raises:
        ValueError: If the number of predictions differs from the number
            of rows in ``pncdata2020``.
    """
    pred = model_fit.predict(x_test)
    resid = pred - y_test
    # pred_plot prints the metrics; its return value is not needed here.
    pred_plot(y_test, pred, resid)
    pred = pd.DataFrame(pred)
    resid = pd.DataFrame(resid)
    # pd.concat(axis=1) aligns on index labels. `pred` gets a fresh
    # RangeIndex while `resid` inherits y_test's index, so any non-default
    # index would silently misalign rows and introduce NaNs. Pin both to the
    # target table's index so rows are matched by position.
    pred.index = pncdata2020.index
    resid.index = pncdata2020.index
    data2020_pred = pd.concat([pncdata2020, pred, resid], axis=1)
    return data2020_pred

In [5]:
pollution = 'PM2.5'

# Each station contributes one contiguous block of hourly rows:
# 8760 rows (365 * 24) used for training, followed by 8784 rows (366 * 24)
# used for testing.
# NOTE(review): presumably a non-leap year followed by leap-year 2020, given
# the `data2020` naming further down -- confirm against the CSV.
HOURS_TRAIN = 8760
HOURS_TEST = 8784
HOURS_PER_STATION = HOURS_TRAIN + HOURS_TEST

trainpath = '../dataset/trainpredata/trainData2312/'+ pollution + '_trainData.csv'
traindata = pd.read_csv(trainpath)
stations = ['BAS', 'BER', 'HAE', 'LUG', 'RIG', 'LAU', 'ZUE', 'DUE', 'SIO', 'MAG',
            'PAY', 'TAN', 'CHA', 'DAV', 'JUN']
# One station label per row, repeated for the whole per-station block.
sta = pd.DataFrame([name for name in stations for _ in range(HOURS_PER_STATION)])
# PM2.5
traindata = pd.concat([traindata, sta], axis=1)
traindata.columns = ['cams', 'radiation', 'temperature', 'precipitation', 'humidity',
                       'Speed', 'road', 'hour', 'month', 'weekday', 'measurements', 'sta']
# OTHERS
traindata = traindata.apply(pd.to_numeric,errors="ignore")
# Put the station label first and the target last so that `iloc[:, 1:-1]`
# selects exactly the feature columns.
traindata = traindata[['sta', 'cams', 'radiation', 'temperature', 'precipitation', 'humidity',
                       'Speed', 'road', 'hour', 'month', 'weekday', 'measurements']]

# Split every station block into its leading training part and trailing test
# part. Iterate over the actual number of stations (the loop previously ran
# 16 times for 15 stations).
slices_train = []
slices_test = []
for i in range(len(stations)):
    start_train = i * HOURS_PER_STATION
    end_train = start_train + HOURS_TRAIN
    slices_train.append(traindata.iloc[start_train:end_train, :])
    start_test = end_train
    end_test = start_test + HOURS_TEST
    slices_test.append(traindata.iloc[start_test:end_test, :])
traindata_train = pd.concat(slices_train)
traindata_test = pd.concat(slices_test)

# Drop rows containing NaN or +/-inf in any column.
traindata = traindata.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how='any')
traindata_train = traindata_train.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how='any')
traindata_test = traindata_test.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how='any')
# Fresh 0..n-1 index so later column-wise concatenation lines up by row.
traindata_train = traindata_train.reset_index(drop=True)
traindata_test = traindata_test.reset_index(drop=True)

# Fit the scaler on the training split only: fitting on the full dataset
# would leak test-set statistics into the preprocessing step.
scaler = StandardScaler().fit(traindata_train.iloc[:, 1:-1])
x_train = scaler.transform(traindata_train.iloc[:, 1:-1])
y_train = traindata_train.iloc[:, -1]
x_test = scaler.transform(traindata_test.iloc[:, 1:-1])
y_test = traindata_test.iloc[:, -1]

print('traindata_train:', traindata_train.shape)
print('traindata_test:', traindata_test.shape)
print('x_train:', x_train.shape)
print('y_train:', y_train.shape)
print('x_test:', x_test.shape)
print('y_test:', y_test.shape)

traindata_train: (105024, 12)
traindata_test: (113994, 12)
x_train: (105024, 10)
y_train: (105024,)
x_test: (113994, 10)
y_test: (113994,)


In [3]:
# Gradient boosting (scikit-learn): 100 trees, fixed seed.
gbr_reg = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr_reg.fit(x_train, y_train)

# LightGBM regressor: 100 trees, fixed seed, 20 worker threads.
lgb_reg = lgb.LGBMRegressor(random_state=42, n_estimators=100, n_jobs=20)
lgb_reg.fit(x_train, y_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1589
[LightGBM] [Info] Number of data points in the train set: 105024, number of used features: 10
[LightGBM] [Info] Start training from score 1.050845


In [1]:
# Score each fitted model on the test split; the second call appends its
# prediction/residual columns to the table produced by the first.
data2020_pred8 = predpnc(gbr_reg, x_test, y_test, traindata_test)
data2020_pred9 = predpnc(lgb_reg, x_test, y_test, data2020_pred8)

# Final column labels: the twelve input columns followed by one
# prediction/residual pair per model.
input_columns = ['sta', 'cams', 'radiation', 'temperature', 'precipitation',
                 'humidity', 'Speed', 'road', 'hour', 'month', 'weekday',
                 'measurements']
prediction_columns = ['gbr_pred', 'gbr_resid', 'lgb_pred', 'lgb_resid']
data2020_pred9.columns = input_columns + prediction_columns

pred_table_path = '../out/pred_table/'
# data2020_pred9.to_csv(pred_table_path + pollution + '_data2020_pred.csv')