# 毕业论文`GRAD`
## 建立随机森林模型，定量评估各变量的影响

*`Evan`*\
*`2023-11-16`*

---

In [1]:
import os
import sys
import warnings

import numpy as np
import pandas as pd

# Project-local modules live under src/; the path must be extended before
# they are imported.
sys.path.append('../../src/')
from namelist import *
from RandomForest import read_data, rf_importance

# silence the warning note
warnings.filterwarnings("ignore")

In [2]:
# Two groups of years that are modelled separately and compared; every run
# uses the same calendar month.
lowyears = [2014, 2015, 2016]
highyears = [2019, 2021, 2022]
month = 'Sep'

# Cities that are processed one after another.
regions = [
    'Zhaoqing', 'Huizhou', 'Guangzhou',
    'Foshan', 'Dongguan', 'Shenzhen',
    'Zhongshan', 'Jiangmen', 'Zhuhai',
]

# NOTE(review): `datadir` is not defined in this notebook; presumably it is
# supplied by the `from namelist import *` star import -- confirm.
datapath = datadir + 'Contribution/data/'

# Predictor variables and the target variable handed to the model.
variants = [
    'SFC_TMP', 'SOL_RAD', 'RH', 'PRES',
    'WSPD10', 'WDIR10', 'CloudFRAC',
    'NO2', 'VOC', 'PM25', 'ISOP',
]  # 'PBLH' is currently left out
target = 'O3'

## 读取数据，建立模型

In [4]:
# Directory that receives one importance workbook per city. It does not depend
# on the loop variable, so it is defined once, and it is created up front so
# that a fresh run does not fail on the first `to_excel` call because the
# folder is missing.
filepath = 'D:/data/Graduation/Contribution/RandomForest_output/TMP/'
os.makedirs(filepath, exist_ok=True)

for region in regions:
    print(f'{region}\n')

    # Load the two year groups for this city separately.
    print('Reading data')
    df_low  = read_data(lowyears, month, region, datapath)
    df_high = read_data(highyears, month, region, datapath)

    # One model per year group. Judging by the recorded output, each
    # `rf_importance` call also prints its MSE, R-squared and the ranked
    # feature importances.
    print('Random Forest Modeling ...\n')
    low_importance = rf_importance(df_low, variants, target)
    print('\n===============================================\n')
    high_importance = rf_importance(df_high, variants, target)

    # Put both results side by side and label the columns by year group.
    dfoutput = pd.concat([low_importance, high_importance], axis=1)
    dfoutput.columns = ['Low', 'High']

    # One workbook per city: RF_<region>.xlsx
    filename = f'RF_{region}.xlsx'
    dfoutput.to_excel(filepath + filename, index=True)

Zhaoqing

Reading data
Random Forest Modeling ...

Mean Squared Error (Random Forest): 231.65407992846542
R-squared (Random Forest): 0.8367354034736854
NO2: 0.38273924839637763
PM25: 0.12951627298480325
SFC_TMP: 0.09353119526174329
SOL_RAD: 0.08353486424913024
WDIR10: 0.07454623933482572
VOC: 0.05480517593642803
ISOP: 0.053126735184855334
RH: 0.03770740265859169
CloudFRAC: 0.03293074073212307
WSPD10: 0.03027412427271536
PRES: 0.02728800098840617


Mean Squared Error (Random Forest): 144.70899032863883
R-squared (Random Forest): 0.8844286065968553
NO2: 0.3273241739261955
RH: 0.15391974944872938
SOL_RAD: 0.114671300804807
SFC_TMP: 0.11411707339745814
PM25: 0.08561149433300759
VOC: 0.049401493110048184
WDIR10: 0.04048387329831366
PRES: 0.03571864733942737
CloudFRAC: 0.033646020196897865
WSPD10: 0.02318244303643256
ISOP: 0.021923731108682943
Huizhou

Reading data
Random Forest Modeling ...

Mean Squared Error (Random Forest): 131.26586443357976
R-squared (Random Forest): 0.9111830855245912

## 读取输出数据，整合到一张表中

In [3]:
# Collect the per-city workbooks (RF_<region>.xlsx under TMP/) into a single
# workbook, RF_cities.xlsx, with one sheet per city.
infilepath = 'D:/data/Graduation/Contribution/RandomForest_output/TMP/'
outfilepath = 'D:/data/Graduation/Contribution/RandomForest_output/'

# NOTE: despite its name, `dfs` holds the input file paths (one per region, in
# the same order as `regions`), not DataFrames. The name is kept unchanged in
# case other cells refer to it.
dfs = [infilepath + f'RF_{region}.xlsx' for region in regions]

# The context manager closes (and thereby saves) the workbook even if reading
# or writing one of the sheets raises, so the file handle is never leaked.
with pd.ExcelWriter(outfilepath + 'RF_cities.xlsx', engine='xlsxwriter') as writer:
    for region, workbook_path in zip(regions, dfs):
        # The first column of each per-city file is the index written earlier.
        df = pd.read_excel(workbook_path, index_col=0)
        sheet_name = region
        df.to_excel(writer, sheet_name=sheet_name, index=True)