# 毕业论文`GRAD`
## 建立随机森林模型，定量评估各因子的影响

*`Evan`*\
*`2023-11-16`*

---

In [1]:
import os
import sys
import warnings

import numpy as np
import pandas as pd

# The project-local modules live under src/, so extend the search path first.
sys.path.append('../../src/')
from namelist import *
from RandomForest import read_data, rf_importance

# silence the warning note
warnings.filterwarnings("ignore")

In [2]:
# Two groups of years that are modelled separately and then compared,
# both restricted to the same month.
lowyears = [2014, 2015, 2016]
highyears = [2019, 2021, 2022]
month = 'Sep'

# Cities to process; every city is handled on its own.
regions = [
    'Zhaoqing', 'Huizhou', 'Guangzhou',
    'Foshan', 'Dongguan', 'Shenzhen',
    'Zhongshan', 'Jiangmen', 'Zhuhai',
]

# Folder holding the input data.
# NOTE(review): `datadir` is not defined in this notebook; presumably it is
# provided by the star import from `namelist` -- confirm.
datapath = datadir + 'Contribution/data/'

# Predictor list and target variable.
# ('PBLH' was commented out of the original list and is still excluded.)
variants = [
    'SFC_TMP', 'SOL_RAD', 'RH', 'PRES',
    'WSPD10', 'WDIR10', 'CloudFRAC',
    'NO2', 'VOC', 'PM25', 'ISOP',
]
target = 'O3'

# 白天（08:00–17:00）

In [3]:
# Daytime run: for every city, call `rf_importance` on the low-year and the
# high-year samples and store both results side by side in one Excel file.

# The output folder is the same for every city, so it is defined once here.
# Creating it up front keeps `to_excel` from failing when the folder is missing
# (e.g. on a fresh machine); `exist_ok=True` makes re-running the cell harmless.
filepath = 'D:/data/Graduation/Contribution/RandomForest_output/TMP/'
os.makedirs(filepath, exist_ok=True)

for region in regions:
    print(f'{region}\n')

    print('Reading data')
    df_low  = read_data(lowyears, month, region, datapath)
    df_high = read_data(highyears, month, region, datapath)

    # Keep only the rows whose time of day lies between 08:00 and 17:00
    # (`between_time` includes both end points by default).
    diurnal_low  = df_low.between_time('8:00', '17:00')
    diurnal_high = df_high.between_time('8:00', '17:00')

    print('Random Forest Modeling ...\n')
    low_importance = rf_importance(diurnal_low, variants, target)
    print('\n===============================================\n')
    high_importance = rf_importance(diurnal_high, variants, target)

    # Put the two results next to each other and rename the columns.
    dfoutput = pd.concat([low_importance, high_importance], axis=1)
    dfoutput.columns = ['Low', 'High']

    filename = f'RF_{region}_diurnal.xlsx'
    dfoutput.to_excel(filepath + filename, index=True)

Zhaoqing

Reading data
Random Forest Modeling ...

Mean Squared Error (Random Forest): 191.87992302352495
R-squared (Random Forest): 0.7877584618444918
RH: 0.25495126394671774
WDIR10: 0.17597563172790126
PM25: 0.14021215649415622
SFC_TMP: 0.09269746994067113
VOC: 0.08348418676081747
NO2: 0.07489029596624988
SOL_RAD: 0.04789310517050026
PRES: 0.036092380185622934
WSPD10: 0.03362988689011021
ISOP: 0.03128793951279243
CloudFRAC: 0.028885683404460447


Mean Squared Error (Random Forest): 129.3913089605949
R-squared (Random Forest): 0.8565836875824455
RH: 0.3001139911251374
PM25: 0.18983162368490653
SFC_TMP: 0.11786766804313391
WDIR10: 0.09746658297826823
VOC: 0.06786967561662535
NO2: 0.06015653344162816
SOL_RAD: 0.04136665631875862
PRES: 0.0357834770590364
WSPD10: 0.03327177993459639
CloudFRAC: 0.02999978311098834
ISOP: 0.026272228686920692
Huizhou

Reading data
Random Forest Modeling ...

Mean Squared Error (Random Forest): 166.93737922586718
R-squared (Random Forest): 0.8328922840352717


读取各城市的输出数据，整合到同一个 Excel 工作簿中（每个城市一个工作表）

In [4]:
infilepath = 'D:/data/Graduation/Contribution/RandomForest_output/TMP/'
outfilepath = 'D:/data/Graduation/Contribution/RandomForest_output/'

# Collect the per-city daytime files into a single workbook, one sheet per city.
# Using the writer as a context manager closes it even when reading or writing
# one of the city files raises, so the output file is never left open.
with pd.ExcelWriter(outfilepath + 'RF_cities_diurnal.xlsx', engine='xlsxwriter') as writer:
    for region in regions:
        df = pd.read_excel(infilepath + f'RF_{region}_diurnal.xlsx', index_col=0)
        # The city name doubles as the sheet name.
        df.to_excel(writer, sheet_name=region, index=True)

# 夜间（18:00–次日 07:00）

In [5]:
# Night-time run: for every city, call `rf_importance` on the low-year and the
# high-year samples and store both results side by side in one Excel file.

# The output folder is the same for every city, so it is defined once here.
# Creating it up front keeps `to_excel` from failing when the folder is missing
# (e.g. on a fresh machine); `exist_ok=True` makes re-running the cell harmless.
filepath = 'D:/data/Graduation/Contribution/RandomForest_output/TMP/'
os.makedirs(filepath, exist_ok=True)

for region in regions:
    print(f'{region}\n')

    print('Reading data')
    df_low  = read_data(lowyears, month, region, datapath)
    df_high = read_data(highyears, month, region, datapath)

    # The start time is later than the end time, so `between_time` selects the
    # window that wraps around midnight: 18:00 through 07:00, end points included.
    nocturnal_low  = df_low.between_time('18:00', '7:00')
    nocturnal_high = df_high.between_time('18:00', '7:00')

    print('Random Forest Modeling ...\n')
    low_importance = rf_importance(nocturnal_low, variants, target)
    print('\n===============================================\n')
    high_importance = rf_importance(nocturnal_high, variants, target)

    # Put the two results next to each other and rename the columns.
    dfoutput = pd.concat([low_importance, high_importance], axis=1)
    dfoutput.columns = ['Low', 'High']

    filename = f'RF_{region}_nocturnal.xlsx'
    dfoutput.to_excel(filepath + filename, index=True)

Zhaoqing

Reading data
Random Forest Modeling ...

Mean Squared Error (Random Forest): 184.85562636240488
R-squared (Random Forest): 0.7835636879152241
PM25: 0.2876015697036314
SFC_TMP: 0.11010365410529155
WSPD10: 0.08896379641006422
ISOP: 0.0885992840179367
NO2: 0.08418715247068669
VOC: 0.07887972144977701
CloudFRAC: 0.07369099575445716
WDIR10: 0.06121407483858166
RH: 0.058494911680546866
PRES: 0.04573833980504829
SOL_RAD: 0.022526499763978584


Mean Squared Error (Random Forest): 161.4181258964532
R-squared (Random Forest): 0.8274386856389494
RH: 0.2690735476336382
NO2: 0.16390782308201285
SFC_TMP: 0.12819927939626913
PM25: 0.08414814516781505
PRES: 0.07786879732477722
VOC: 0.05880300680562945
WSPD10: 0.05779465302971772
CloudFRAC: 0.05327715955106732
WDIR10: 0.05060910477359901
ISOP: 0.0403242884049565
SOL_RAD: 0.01599419483051737
Huizhou

Reading data
Random Forest Modeling ...

Mean Squared Error (Random Forest): 132.73412297196484
R-squared (Random Forest): 0.8524999216897543
PM2

In [6]:
infilepath = 'D:/data/Graduation/Contribution/RandomForest_output/TMP/'
outfilepath = 'D:/data/Graduation/Contribution/RandomForest_output/'

# Collect the per-city night-time files into a single workbook, one sheet per city.
# Using the writer as a context manager closes it even when reading or writing
# one of the city files raises, so the output file is never left open.
with pd.ExcelWriter(outfilepath + 'RF_cities_nocturnal.xlsx', engine='xlsxwriter') as writer:
    for region in regions:
        df = pd.read_excel(infilepath + f'RF_{region}_nocturnal.xlsx', index_col=0)
        # The city name doubles as the sheet name.
        df.to_excel(writer, sheet_name=region, index=True)