In [20]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from lightgbm.sklearn import LGBMRegressor
from sklearn.model_selection import KFold,StratifiedKFold
import time
from tqdm.notebook import tqdm
import os

import warnings
# Notebook-wide settings: mute every warning and raise pandas' display limits
# so wide/long frames are shown in full.
warnings.filterwarnings(action='ignore')

for option_name, limit in (('display.max_rows', 800), ('display.max_columns', 400)):
    pd.set_option(option_name, limit)

In [21]:
# Read the train and test CSVs and stack them (train rows first) into one
# frame with a fresh 0..n-1 index, so features are engineered once for both.
df_train = pd.read_csv(r'data/环境空气质量评价挑战赛_复赛数据集/train.csv')
df_test = pd.read_csv(r'data/环境空气质量评价挑战赛_复赛数据集/test.csv')
df = pd.concat([df_train, df_test], ignore_index=True)

# Output directory for submission files; exist_ok makes the call a no-op when
# the directory is already there (no separate check-then-create step).
result_path = './sub'
os.makedirs(result_path, exist_ok=True)

## 特征工程

### IAQI计算

In [22]:
def IAQI_cal(x,xbins):
    """Turn one concentration reading into a sub-index by piecewise-linear interpolation.

    ``xbins`` holds eight breakpoints (indices 0..7) that delimit seven
    concentration bands. The reading is interpolated linearly inside the band
    it falls in and the result is rounded up with ``np.ceil``.

    Parameters
    ----------
    x : number
        Concentration reading.
    xbins : sequence of 8 numbers
        Band breakpoints.

    Returns
    -------
    numpy float
        The rounded-up sub-index. 500 once ``x`` reaches ``xbins[7]``.
        99999 (after printing ``x`` and ``'error'``) when no comparison
        succeeds, which is what happens for NaN.
    """
    # (position of the band's lower breakpoint, index span of the band,
    #  index value at the band's lower breakpoint)
    upper_bands = (
        (1, 50, 50),
        (2, 50, 100),
        (3, 50, 150),
        (4, 100, 200),
        (5, 100, 300),
        (6, 100, 400),
    )

    if x < xbins[1]:
        # The lowest band has no lower-bound test, so readings below xbins[0]
        # are extrapolated on the same line.
        return np.ceil((x - xbins[0]) / (xbins[1] - xbins[0]) * 50)

    for k, span, base in upper_bands:
        if xbins[k] <= x < xbins[k + 1]:
            return np.ceil((x - xbins[k]) / (xbins[k + 1] - xbins[k]) * span + base)

    if x >= xbins[7]:
        return np.ceil(500)

    # Nothing matched: report the offending value and return the sentinel.
    print(x, 'error')
    return np.ceil(99999)

# Breakpoint tables, one per pollutant: eight values delimiting the seven
# bands IAQI_cal interpolates over, e.g. IAQI_cal(243, PM2).
PM2 = [0,35,75,115,150,250,350,500]
PM10 = [0,50,150,250,350,420,500,600]
SO2 = [0,50,150,475,800,1600,2100,2620]
CO = [0,2,4,14,24,36,48,60]
NO2 = [0,40,80,180,280,565,750,940]
# The last two O3 breakpoints are equal, so the final band is empty and any
# reading >= 10000 maps straight to 500.
O3 = [0,100,160,215,265,800,10000,10000]
col_bin = ['PM2', 'PM10', 'SO2', 'CO', 'NO2', 'O3']
col_IAQI = []

# Explicit data-column -> breakpoint-table mapping. This replaces resolving
# the table by variable name with eval() once per row.
bins_by_col = {
    'PM2_5': PM2,
    'PM10': PM10,
    'SO2': SO2,
    'CO': CO,
    'NO2': NO2,
    'O3': O3,
}

# Compute the sub-index of every pollutant; the overall IAQI is the largest one.
for col, xbin in bins_by_col.items():
    df[f'{col}_I'] = df[col].apply(lambda x: IAQI_cal(x, xbin))
    col_IAQI.append(f'{col}_I')
df['IAQI'] = df[col_IAQI].max(axis=1)

cols_ = []
# 1 where NO2 is the pollutant that sets the overall IAQI, else 0.
df['NO2flag'] = (df['NO2_I'] == df['IAQI']).astype(int)
# Each sub-index relative to IAQI raised to the power 1.1.
for col in col_IAQI:
    df[f'{col}-'] = df[col]/np.power(df['IAQI'],1.1)
    cols_.append(f'{col}-')

### 质量等级分解

In [23]:
# Bucket the overall IAQI into grades 1-6.
# Each entry is (lower bound, exclusive; upper bound, inclusive; grade).
# Grade 6 runs up to 500, the largest regular value IAQI_cal returns; stopping
# at 400 left rows with 400 < IAQI <= 500 without any grade (NaN).
# Values above 500 (the 99999 error sentinel) are still left ungraded.
grade_bounds = [
    (-np.inf, 50, 1),
    (50, 100, 2),
    (100, 150, 3),
    (150, 200, 4),
    (200, 300, 5),
    (300, 500, 6),
]
for lower, upper, grade in grade_bounds:
    df.loc[(lower<df['IAQI'])&(df['IAQI']<=upper),'质量等级'] = grade

# Regression target: the gap between the reported AQI and the computed IAQI.
df['label'] = df['AQI'] - df['IAQI']
# Difference between the PM10 and PM2.5 sub-indices.
df['PM-'] = df['PM10_I']-df['PM2_5_I']

### 特征筛选

In [24]:
# Split the combined frame again: rows with an AQI are training rows, rows
# without one are the test rows to predict.
# The training part is copied and re-indexed to 0..n-1 because later cells
# address it with positional fold indices via .loc and write a new column
# into it; a bare filtered slice would only be correct if no row was dropped.
has_target = df['AQI'].notnull()
df_train = df[has_target].copy().reset_index(drop=True)
df_test = df[~has_target].reset_index(drop=True)

# Columns kept out of the model: identifiers, targets, raw concentrations,
# raw sub-indices and helper columns.
excluded_cols = [
    'date', 'AQI', 'label','IPRC','PM-',
    'PM2_5_I', 'PM10_I', 'SO2_I', 'CO_I', 'NO2_I', 'O3_I',
    'PM2_5', 'PM10', 'SO2', 'NO2', 'CO', 'O3','SO2_I-', 'CO_I-',
    'PM10_mean','Levels', 'new_label', 'IAQI_sum/AQI', 'w', 'b','NO2flag',
]
feats = [x for x in df.columns if x not in excluded_cols]

print(feats)


['IAQI', 'PM2_5_I-', 'PM10_I-', 'NO2_I-', 'O3_I-', '质量等级']


## model

In [25]:
def build_model_lgb(trn_x, trn_y, val_x, val_y):
    """Fit one LightGBM regressor on a fold and predict both of its parts.

    Parameters
    ----------
    trn_x, trn_y : training features and target for this fold.
    val_x, val_y : held-out features and target, used as the evaluation set
        for early stopping.

    Returns
    -------
    tuple
        ``(trn_pred, val_pred, model)``: predictions on the training part,
        predictions on the held-out part, and the fitted regressor.
    """
    # Values after '#' in the original were earlier settings:
    # learning_rate 0.10361, num_leaves 82, reg_lambda 0.6.
    # NOTE(review): subsample is set without a bagging frequency; confirm
    # that row subsampling is actually active with this configuration.
    hyper_params = dict(
        learning_rate=0.701398375,
        boosting_type='gbdt',
        n_estimators=10000,
        objective='mse',
        subsample=0.7,
        colsample_bytree=0.5,
        num_leaves=352,
        reg_lambda=4,
        n_jobs=-1,
        random_state=2020,
    )
    regressor = LGBMRegressor(**hyper_params)

    # NOTE(review): early_stopping_rounds / verbose as fit() keywords depend
    # on the installed LightGBM version; confirm before upgrading.
    regressor.fit(
        trn_x,
        trn_y,
        eval_set=[(val_x, val_y)],
        eval_metric='mse',
        early_stopping_rounds=1000,
        verbose=False,
    )

    # Original note: the best tree limit is produced by early stopping.
    fitted_on_train = regressor.predict(trn_x)
    fitted_on_valid = regressor.predict(val_x)

    return fitted_on_train, fitted_on_valid, regressor

# Per-fold metrics and running averages filled in by the loop below.
Val_rmses = []
Trn_rmses = []
R2s = []
pred_y = 0        # fold-averaged residual prediction for the test rows
importance = 0    # fold-averaged gain importance, ordered like feats
feat_importance = pd.DataFrame()

fold_num = 5
skf = KFold(n_splits=fold_num, shuffle=True, random_state=2021)

# tqdm wraps the splitter itself and is told the number of folds, so the bar
# shows real progress instead of an unknown total.
for i, (trn_idx, val_idx) in enumerate(tqdm(skf.split(df_train[feats], df_train['label']), total=fold_num)):
    # KFold yields row positions; translate them into index labels so the
    # .loc lookups stay correct even if df_train's index is not 0..n-1.
    trn_rows, val_rows = df_train.index[trn_idx], df_train.index[val_idx]

    trn_x, trn_y = df_train.loc[trn_rows,feats].reset_index(drop=True), df_train.loc[trn_rows,'label'].reset_index(drop=True)
    val_x, val_y = df_train.loc[val_rows,feats].reset_index(drop=True), df_train.loc[val_rows,'label'].reset_index(drop=True)
    trn_pred, val_pred, model = build_model_lgb(trn_x, trn_y, val_x, val_y)

    # The model predicts AQI - IAQI; add IAQI back to score on the AQI scale.
    trn_pred_label = trn_pred + df_train.loc[trn_rows,'IAQI']
    val_pred_label = val_pred + df_train.loc[val_rows,'IAQI']
    Trn_rmse = np.power(mean_squared_error(df_train.loc[trn_rows,'AQI'], trn_pred_label), 0.5)
    Val_rmse = np.power(mean_squared_error(df_train.loc[val_rows,'AQI'], val_pred_label), 0.5)
    Val_rmses.append(Val_rmse)
    Trn_rmses.append(Trn_rmse)
    R_2 = r2_score(df_train.loc[val_rows,'label'],val_pred)
    # Keep the out-of-fold prediction for every training row.
    df_train.loc[val_rows,'pred'] = val_pred_label
    R2s.append(round(R_2,2))

    # Average over the actual number of folds (was a hard-coded 5).
    importance += model.booster_.feature_importance(importance_type='gain')/fold_num

    pred_y += model.predict(df_test[feats])/fold_num

Trn_rmse_mean = np.mean(Trn_rmses)
Valr_rmse_mean = np.mean(Val_rmses)
# Previously a bare expression whose value was discarded; keep it in a name.
R2_mean = np.mean(R2s)
print('Trn_rmse: %.3f'% Trn_rmse_mean,' Val_rmse: %.3f' % Valr_rmse_mean)
# Cell output: RMSE of the pooled out-of-fold predictions.
np.power(mean_squared_error(df_train['AQI'], df_train['pred']), 0.5)

0it [00:00, ?it/s]

0.434

Trn_rmse: 2.014  Val_rmse: 2.614


2.90927707429499

In [26]:
# Override the out-of-fold prediction for low-IAQI rows with a linear formula
# in 'PM10_I-': one set of coefficients for IAQI <= 40, another for
# 40 <= IAQI <= 45 (the first rule wins at exactly 40); other rows keep 'pred'.
iaqi = df_train['IAQI']
pm10_share = df_train['PM10_I-']
df_train['pred'] = np.where(
    iaqi <= 40,
    48.615*pm10_share - 34.869 + iaqi,
    np.where(
        (iaqi >= 40) & (iaqi <= 45),
        55.613*pm10_share - 39.112 + iaqi,
        df_train['pred'],
    ),
)

# RMSE against the true AQI after the override; shown as the cell output.
score = np.power(mean_squared_error(df_train['AQI'], df_train['pred']), 0.5)
score

2.853745366660779

In [27]:
# Test-set AQI: fold-averaged residual prediction added back onto IAQI.
df_test['AQI'] = pred_y + df_test['IAQI']
iaqi = df_test['IAQI']

# Correction 1: low-IAQI rows use a linear formula in 'PM10_I-'
# (IAQI <= 40, then 40 <= IAQI <= 45; the first rule wins at exactly 40).
pm10_share = df_test['PM10_I-']
df_test['AQI'] = np.where(
    iaqi <= 40,
    48.615*pm10_share - 34.869 + iaqi,
    np.where(
        (iaqi >= 40) & (iaqi <= 45),
        55.613*pm10_share - 39.112 + iaqi,
        df_test['AQI'],
    ),
)

# Correction 2: where NO2flag is 1 and IAQI > 50, take the PM10 sub-index.
no2_sets_index = (df_test['NO2flag'] == 1) & (iaqi > 50)
df_test['AQI'] = np.where(no2_sets_index, df_test['PM10_I'], df_test['AQI'])

# Correction 3: large 'PM-' values use a linear formula in 'PM-'.
# NOTE(review): a value of exactly 75 matches neither rule and keeps the
# current AQI; confirm whether one of the bounds should be inclusive.
pm_gap = df_test['PM-']
df_test['AQI'] = np.where(
    pm_gap > 75,
    0.5*pm_gap - 24.5 + iaqi,
    np.where(
        (pm_gap >= 70) & (pm_gap < 75),
        pm_gap - 69 + iaqi,
        df_test['AQI'],
    ),
)

In [28]:
# Use an earlier submission file as the template, replace its AQI column with
# the corrected test predictions (aligned on the row index) and save the result.
# NOTE(review): this requires './sub/IPRC_lgb_Trn159_Val177.csv' to exist
# already; nothing in the visible cells creates it.
sub = pd.read_csv('./sub/IPRC_lgb_Trn159_Val177.csv').assign(AQI=df_test['AQI'])
sub.to_csv('./sub/sub.csv', index=False)