In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

import os
from utils import *
from settings import station_locale, data_root, output_folder, report_folder, station_dist, station_angle

from sklearn.externals import joblib
from sklearn.model_selection import train_test_split 

from xgboost import XGBRegressor

In [2]:
# Relative folder locations used by the cells below.
model_folder = '../models/'      # fitted models are dumped here with joblib
# NOTE(review): this rebinds the `report_folder` name imported from `settings`
# in the import cell - confirm the override is intentional.
report_folder = '../report_rc/'
data_folder = '../outputs/'      # 'train_set.zip' is read from this folder

In [3]:
# Read the training set and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv call - TODO confirm).
df = (
    pd.read_csv(os.path.join(data_folder, 'train_set.zip'))
    .drop(columns=['Unnamed: 0'])
)

In [4]:
# Keep only the rows whose `datetime_label` exceeds 5000, then derive two
# feature columns from the existing `date` and `time` columns:
#   * periodic_solar_term - get_solar_term(date, periodicity=True)
#   * periodic_time       - get_periodic_time_label(time)
# (both helpers are expected to come from `utils`; their exact encoding is
# not visible in this notebook).
#
# The filter and the new columns are built in a single query/assign chain so
# that the columns are added to a fresh frame. Assigning columns directly onto
# the result of `df.query(...)` writes into a filtered slice and triggers
# pandas' SettingWithCopyWarning.
df = (
    df.query('datetime_label>5000')
    .assign(
        periodic_solar_term=lambda frame: frame['date'].map(
            lambda x: get_solar_term(x, periodicity=True)
        ),
        periodic_time=lambda frame: frame['time'].map(
            lambda x: get_periodic_time_label(x)
        ),
    )
)

In [5]:
# x,y = construct_train_xy(df, 'aotizhongxin_aq', df['station_id'].unique())

In [6]:
# x.head()

In [7]:
def _fit_gblinear(features, target, n_estimators, max_depth):
    """Fit one XGBRegressor with the linear booster and a 0.1 learning rate.

    Parameters
    ----------
    features : training inputs, passed to ``fit`` unchanged.
    target : training target, passed to ``fit`` unchanged.
    n_estimators : forwarded to ``XGBRegressor``.
    max_depth : forwarded to ``XGBRegressor``.
        NOTE(review): ``max_depth`` is documented by XGBoost as a tree-booster
        parameter, so with ``booster='gblinear'`` it is presumably ignored and
        the variants below differ only in ``n_estimators`` - confirm whether
        ``gbtree`` was intended.

    Returns
    -------
    The fitted ``XGBRegressor``.
    """
    return XGBRegressor(
        booster='gblinear',
        n_estimators=n_estimators,
        learning_rate=0.1,
        max_depth=max_depth,
    ).fit(features, target)


station_list = df['station_id'].unique()

# Create the output folder before any training happens: otherwise the first
# joblib.dump fails only after all six models of the first station were fitted.
os.makedirs(model_folder, exist_ok=True)

# NOTE(review): `joblib` is imported from `sklearn.externals` in the import
# cell; that module is no longer shipped by recent scikit-learn releases -
# confirm the environment, or switch to a plain `import joblib`.
for station_id in station_list:

    x, y = construct_train_xy(df, station_id, station_list, only_x=False)

    # NOTE(review): the numeric suffixes are not consistent across pollutants:
    # for PM2.5, `_2` is the 2000-round model and `_3` the 1500-round one,
    # while for O3 it is the other way round. The names are kept as they are
    # because they determine the saved file names.
    xgb_pm25_1 = _fit_gblinear(x, y[['PM2.5']], n_estimators=1000, max_depth=5)
    xgb_pm25_3 = _fit_gblinear(x, y[['PM2.5']], n_estimators=1500, max_depth=3)
    xgb_pm25_2 = _fit_gblinear(x, y[['PM2.5']], n_estimators=2000, max_depth=1)

    xgb_pm10_1 = _fit_gblinear(x, y[['PM10']], n_estimators=1000, max_depth=5)

    xgb_o3_2 = _fit_gblinear(x, y[['O3']], n_estimators=1500, max_depth=3)
    xgb_o3_3 = _fit_gblinear(x, y[['O3']], n_estimators=2000, max_depth=1)

    # Persist every fitted model as '<station>_<suffix>.joblib', where
    # <station> is the part of the station id before the first underscore.
    # The pm10_2, pm10_3 and o3_1 variants are not trained, so nothing is
    # written for them.
    station_name = station_id.split('_')[0]
    fitted_models = [
        ('pm25_1', xgb_pm25_1),
        ('pm25_2', xgb_pm25_2),
        ('pm25_3', xgb_pm25_3),
        ('pm10_1', xgb_pm10_1),
        ('o3_2', xgb_o3_2),
        ('o3_3', xgb_o3_3),
    ]
    for suffix, model in fitted_models:
        joblib.dump(model, os.path.join(model_folder, f'{station_name}_{suffix}.joblib'))
    

In [None]:
# Fit a Ridge regressor (alpha=0.01) on the O3 target of every station.
# NOTE(review): `Ridge` is not imported by name in any visible cell; unless
# `from utils import *` happens to provide it, this cell raises NameError.
# Presumably sklearn.linear_model.Ridge is meant - confirm and import it
# explicitly in the import cell.
station_list = df['station_id'].unique()

for station_id in station_list:
    
    x, y = construct_train_xy(df, station_id, station_list, only_x=False)
    # NOTE(review): `rg_o3` is rebound on every iteration and, within the
    # visible lines, never saved or evaluated, so only the model of the last
    # station remains after the loop.
    rg_o3 = Ridge(alpha=0.01).fit(x, y[['O3']])