In [1]:
# using pandas to process data
from collections import Counter
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
%matplotlib inline
from scipy.stats import pearsonr

# 1. 拼接grid_weather数据

In [2]:
# Step 1: read the two raw grid-weather exports, bring both to the same column
# layout indexed by (stationName, utc_time), then stack them into one table.

# Step 1.1: 'gridWeather_201701-201803.csv' -- the coordinate columns are dropped.
gridWeather_1 = pd.read_csv('data/gridWeather_201701-201803.csv', parse_dates=['utc_time'], index_col=False)
gridWeather_1.columns = ['stationName', 'longitude', 'latitude', 'utc_time', 'temperature', 'pressure', 'humidity', 'wind_direction', 'wind_speed']
gridWeather_1 = gridWeather_1.drop(['longitude', 'latitude'], axis=1).set_index(['stationName', 'utc_time'])

# Step 1.2: 'gridWeather_201804.csv' -- this export has a different header, so
# the columns are renamed positionally and the 'id' / 'weather' columns are dropped.
gridWeather_2 = pd.read_csv('data/gridWeather_201804.csv', parse_dates=['time'], index_col=False)
gridWeather_2.columns = ['id', 'stationName', 'utc_time','weather', 'temperature', 'pressure', 'humidity', 'wind_direction', 'wind_speed']
gridWeather_2 = gridWeather_2.drop(['id', 'weather'], axis=1).set_index(['stationName', 'utc_time'])

# Step 1.3: stack both periods and persist the combined table.
gridWeather = pd.concat([gridWeather_1, gridWeather_2])
gridWeather.to_csv('data/gridWeather.csv')

In [3]:
# Reload the combined table from disk. No index_col is given, so the station
# name and timestamp come back as ordinary columns (see the preview below).
gridWeather = pd.read_csv('data/gridWeather.csv')
gridWeather.head()

Unnamed: 0,stationName,utc_time,temperature,pressure,humidity,wind_direction,wind_speed
0,beijing_grid_000,2017-01-01 00:00:00,-5.47,984.73,76.6,53.71,3.53
1,beijing_grid_001,2017-01-01 00:00:00,-5.53,979.33,75.4,43.59,3.11
2,beijing_grid_002,2017-01-01 00:00:00,-5.7,963.14,71.8,0.97,2.75
3,beijing_grid_003,2017-01-01 00:00:00,-5.88,946.94,68.2,327.65,3.84
4,beijing_grid_004,2017-01-01 00:00:00,-5.34,928.8,58.81,317.85,6.14


In [4]:
# Maps each air-quality station name ("*_aq") to the grid-weather cell whose
# measurements are used for it (presumably the nearest grid point -- confirm
# against how this table was generated). Several stations share one grid cell.
near_stations = {'aotizhongxin_aq': 'beijing_grid_304',
 'badaling_aq': 'beijing_grid_224',
 'beibuxinqu_aq': 'beijing_grid_263',
 'daxing_aq': 'beijing_grid_301',
 'dingling_aq': 'beijing_grid_265',
 'donggaocun_aq': 'beijing_grid_452',
 'dongsi_aq': 'beijing_grid_303',
 'dongsihuan_aq': 'beijing_grid_324',
 'fangshan_aq': 'beijing_grid_238',
 'fengtaihuayuan_aq': 'beijing_grid_282',
 'guanyuan_aq': 'beijing_grid_282',
 'gucheng_aq': 'beijing_grid_261',
 'huairou_aq': 'beijing_grid_349',
 'liulihe_aq': 'beijing_grid_216',
 'mentougou_aq': 'beijing_grid_240',
 'miyun_aq': 'beijing_grid_392',
 'miyunshuiku_aq': 'beijing_grid_414',
 'nansanhuan_aq': 'beijing_grid_303',
 'nongzhanguan_aq': 'beijing_grid_324',
 'pingchang_aq': 'beijing_grid_264',
 'pinggu_aq': 'beijing_grid_452',
 'qianmen_aq': 'beijing_grid_303',
 'shunyi_aq': 'beijing_grid_368',
 'tiantan_aq': 'beijing_grid_303',
 'tongzhou_aq': 'beijing_grid_366',
 'wanliu_aq': 'beijing_grid_283',
 'wanshouxigong_aq': 'beijing_grid_303',
 'xizhimenbei_aq': 'beijing_grid_283',
 'yanqin_aq': 'beijing_grid_225',
 'yizhuang_aq': 'beijing_grid_323',
 'yongdingmennei_aq': 'beijing_grid_303',
 'yongledian_aq': 'beijing_grid_385',
 'yufa_aq': 'beijing_grid_278',
 'yungang_aq': 'beijing_grid_239',
 'zhiwuyuan_aq': 'beijing_grid_262'}

In [5]:
def load_grid_meo_data(meo_df, useful_stations):
    '''
    Split a long-format grid weather table into one frame per air-quality station.

    Parameters
    ----------
    meo_df : DataFrame
        Must contain the columns 'stationName' and 'utc_time'; every other
        column is treated as a measurement. The frame is NOT modified, so
        the calling cell can safely be re-run.
    useful_stations : dict of {aq_station : meo_station}
        Stations whose meo_station does not occur in meo_df are skipped.

    Returns
    -------
    meo_dataset : DataFrame
        Copy of meo_df indexed by the parsed timestamps (index name 'time'),
        with the 'utc_time' column removed.
    stations : set
        Every station name present in meo_df.
    meo_stations : dict of {aq_station : DataFrame}
        Rows of the matching meo_station, without 'stationName', with every
        column prefixed by "<aq_station>_". The index labels are converted
        to strings.
    '''

    # Build the time-indexed table on a derived copy; the previous in-place
    # edits destroyed the caller's 'utc_time' column.
    meo_dataset = (
        meo_df
        .assign(time=pd.to_datetime(meo_df['utc_time']))
        .set_index("time")
        .drop("utc_time", axis=1)
    )

    # names of all stations
    stations = set(meo_dataset['stationName'])

    # per air-quality-station weather frames
    meo_stations = {}

    for aq_station_name, meo_station_name in useful_stations.items():
        if meo_station_name not in stations:
            continue

        is_station = meo_dataset["stationName"] == meo_station_name
        meo_station = meo_dataset[is_station].drop("stationName", axis=1)
        if "None" in meo_station.columns:
            meo_station = meo_station.drop("None", axis=1)

        # Prefix every measurement with the air-quality station name so the
        # per-station frames can later be concatenated column-wise.
        names_dict = {name: aq_station_name + "_" + name for name in meo_station.columns}

        # index=str turns the timestamps into 'YYYY-MM-DD HH:MM:SS' strings;
        # this is kept on purpose because callers parse/look up the index as text.
        meo_stations[aq_station_name] = meo_station.rename(index=str, columns=names_dict)

    return meo_dataset, stations, meo_stations

In [6]:
# Split the combined grid weather table into one frame per station listed in near_stations.
bj_grid_meo_dataset, stations, bj_meo_stations = load_grid_meo_data(gridWeather, near_stations)

In [7]:
bj_grid_meo_dataset.head() # the full grid weather table, now indexed by time

Unnamed: 0_level_0,stationName,temperature,pressure,humidity,wind_direction,wind_speed
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-01,beijing_grid_000,-5.47,984.73,76.6,53.71,3.53
2017-01-01,beijing_grid_001,-5.53,979.33,75.4,43.59,3.11
2017-01-01,beijing_grid_002,-5.7,963.14,71.8,0.97,2.75
2017-01-01,beijing_grid_003,-5.88,946.94,68.2,327.65,3.84
2017-01-01,beijing_grid_004,-5.34,928.8,58.81,317.85,6.14


In [8]:
# Use one station to work out the time span covered by the grid data.
df = bj_meo_stations['dingling_aq']

# The index labels are strings, so parse both ends before subtracting.
min_time, max_time = (
    datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    for stamp in (df.index.min(), df.index.max())
)
delta_all = max_time - min_time

print("最早的日期：", df.index.min())
print("最晚的日期：", df.index.max())
# +1 because both end points are hourly slots themselves.
print("在网格数据数据时间段内，总共应该有 %d 个小时节点。" %(delta_all.total_seconds()/3600 + 1))

最早的日期： 2017-01-01 00:00:00
最晚的日期： 2018-04-30 23:00:00
在网格数据数据时间段内，总共应该有 11640 个小时节点。


In [9]:
# bj_meo_stations is a dict with 35 entries, one per air-quality station.
# Each value is a DataFrame; this one has shape (11518, 5).
bj_meo_stations['dingling_aq']

Unnamed: 0_level_0,dingling_aq_temperature,dingling_aq_pressure,dingling_aq_humidity,dingling_aq_wind_direction,dingling_aq_wind_speed
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01 00:00:00,-6.52,978.4600,75.38,320.99,4.29
2017-01-01 01:00:00,-3.57,978.3500,62.43,316.58,4.49
2017-01-01 02:00:00,-0.62,978.2400,49.47,312.57,4.71
2017-01-01 03:00:00,2.33,978.1400,36.52,308.94,4.96
2017-01-01 04:00:00,3.40,977.5700,33.31,297.12,3.66
2017-01-01 05:00:00,4.46,977.0100,30.09,274.69,2.66
2017-01-01 06:00:00,5.52,976.4500,26.88,239.04,2.39
2017-01-01 07:00:00,2.63,976.4500,37.79,245.22,2.43
2017-01-01 08:00:00,-0.26,976.4500,48.71,251.15,2.49
2017-01-01 09:00:00,-3.15,976.4500,59.62,256.73,2.58


In [10]:
stations # a set (not a dict) of all grid station names; 651 entries according to the original note

{'beijing_grid_000',
 'beijing_grid_001',
 'beijing_grid_002',
 'beijing_grid_003',
 'beijing_grid_004',
 'beijing_grid_005',
 'beijing_grid_006',
 'beijing_grid_007',
 'beijing_grid_008',
 'beijing_grid_009',
 'beijing_grid_010',
 'beijing_grid_011',
 'beijing_grid_012',
 'beijing_grid_013',
 'beijing_grid_014',
 'beijing_grid_015',
 'beijing_grid_016',
 'beijing_grid_017',
 'beijing_grid_018',
 'beijing_grid_019',
 'beijing_grid_020',
 'beijing_grid_021',
 'beijing_grid_022',
 'beijing_grid_023',
 'beijing_grid_024',
 'beijing_grid_025',
 'beijing_grid_026',
 'beijing_grid_027',
 'beijing_grid_028',
 'beijing_grid_029',
 'beijing_grid_030',
 'beijing_grid_031',
 'beijing_grid_032',
 'beijing_grid_033',
 'beijing_grid_034',
 'beijing_grid_035',
 'beijing_grid_036',
 'beijing_grid_037',
 'beijing_grid_038',
 'beijing_grid_039',
 'beijing_grid_040',
 'beijing_grid_041',
 'beijing_grid_042',
 'beijing_grid_043',
 'beijing_grid_044',
 'beijing_grid_045',
 'beijing_grid_046',
 'beijing_gri

# 2. 重复值分析
识别重复值数量，并将重复值去掉

In [11]:
# For every station, report how many rows repeat an earlier timestamp and keep
# only the first row for each timestamp.
for station in bj_meo_stations.keys():
    df = bj_meo_stations[station]
    # True for every row whose index label has already appeared above it.
    is_repeat = df.index.duplicated(keep='first')
    # The count is the number of rows removed; the former "after - before"
    # difference would have printed a negative number for real duplicates.
    print('%s重复数量：%d'%(station, is_repeat.sum()))
    bj_meo_stations[station] = df[~is_repeat]
# Result on this data set: no duplicated timestamps.

yizhuang_aq重复数量：0
donggaocun_aq重复数量：0
wanliu_aq重复数量：0
gucheng_aq重复数量：0
liulihe_aq重复数量：0
huairou_aq重复数量：0
fengtaihuayuan_aq重复数量：0
beibuxinqu_aq重复数量：0
qianmen_aq重复数量：0
yanqin_aq重复数量：0
miyunshuiku_aq重复数量：0
pingchang_aq重复数量：0
daxing_aq重复数量：0
yongledian_aq重复数量：0
aotizhongxin_aq重复数量：0
tiantan_aq重复数量：0
guanyuan_aq重复数量：0
shunyi_aq重复数量：0
miyun_aq重复数量：0
fangshan_aq重复数量：0
pinggu_aq重复数量：0
dongsihuan_aq重复数量：0
dingling_aq重复数量：0
mentougou_aq重复数量：0
wanshouxigong_aq重复数量：0
nansanhuan_aq重复数量：0
badaling_aq重复数量：0
tongzhou_aq重复数量：0
nongzhanguan_aq重复数量：0
yufa_aq重复数量：0
xizhimenbei_aq重复数量：0
yungang_aq重复数量：0
yongdingmennei_aq重复数量：0
zhiwuyuan_aq重复数量：0
dongsi_aq重复数量：0


# 3. 缺失值分析
3.1 判断有没有局部缺失

In [12]:
# Report, per station, whether any single cell of its frame is null.
for station, frame in bj_meo_stations.items():
    print(station, frame.isnull().any().any())
# Result: no cell-level missing values, so no interpolation is needed at this stage.

yizhuang_aq False
donggaocun_aq False
wanliu_aq False
gucheng_aq False
liulihe_aq False
huairou_aq False
fengtaihuayuan_aq False
beibuxinqu_aq False
qianmen_aq False
yanqin_aq False
miyunshuiku_aq False
pingchang_aq False
daxing_aq False
yongledian_aq False
aotizhongxin_aq False
tiantan_aq False
guanyuan_aq False
shunyi_aq False
miyun_aq False
fangshan_aq False
pinggu_aq False
dongsihuan_aq False
dingling_aq False
mentougou_aq False
wanshouxigong_aq False
nansanhuan_aq False
badaling_aq False
tongzhou_aq False
nongzhanguan_aq False
yufa_aq False
xizhimenbei_aq False
yungang_aq False
yongdingmennei_aq False
zhiwuyuan_aq False
dongsi_aq False


3.2 整体性缺失情况的处理：

（1）如果连续缺失时长<=5小时，就进行补全；
（2）如果超过5小时，用NAN进行补全（整张表中唯一会出现NAN的情况）

In [13]:
# For every station, compare the number of hourly slots between its first and
# last timestamp (both inclusive) with the number of rows actually present.
for station, frame in bj_meo_stations.items():
    first_time, last_time = (
        datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
        for stamp in (frame.index.min(), frame.index.max())
    )
    expected_rows = (last_time - first_time).total_seconds()/3600 + 1
    print('%s 缺失时间节点数量是 %d'%(station, expected_rows - frame.shape[0]))

yizhuang_aq 缺失时间节点数量是 122
donggaocun_aq 缺失时间节点数量是 122
wanliu_aq 缺失时间节点数量是 122
gucheng_aq 缺失时间节点数量是 122
liulihe_aq 缺失时间节点数量是 122
huairou_aq 缺失时间节点数量是 122
fengtaihuayuan_aq 缺失时间节点数量是 122
beibuxinqu_aq 缺失时间节点数量是 122
qianmen_aq 缺失时间节点数量是 122
yanqin_aq 缺失时间节点数量是 122
miyunshuiku_aq 缺失时间节点数量是 122
pingchang_aq 缺失时间节点数量是 122
daxing_aq 缺失时间节点数量是 122
yongledian_aq 缺失时间节点数量是 122
aotizhongxin_aq 缺失时间节点数量是 123
tiantan_aq 缺失时间节点数量是 122
guanyuan_aq 缺失时间节点数量是 122
shunyi_aq 缺失时间节点数量是 122
miyun_aq 缺失时间节点数量是 122
fangshan_aq 缺失时间节点数量是 122
pinggu_aq 缺失时间节点数量是 122
dongsihuan_aq 缺失时间节点数量是 122
dingling_aq 缺失时间节点数量是 122
mentougou_aq 缺失时间节点数量是 123
wanshouxigong_aq 缺失时间节点数量是 122
nansanhuan_aq 缺失时间节点数量是 122
badaling_aq 缺失时间节点数量是 122
tongzhou_aq 缺失时间节点数量是 122
nongzhanguan_aq 缺失时间节点数量是 122
yufa_aq 缺失时间节点数量是 122
xizhimenbei_aq 缺失时间节点数量是 122
yungang_aq 缺失时间节点数量是 122
yongdingmennei_aq 缺失时间节点数量是 122
zhiwuyuan_aq 缺失时间节点数量是 122
dongsi_aq 缺失时间节点数量是 122


3.3 整体缺失补充

In [14]:
# Row/column count of every station frame before the missing hours are filled in.
for station, frame in bj_meo_stations.items():
    print(station, frame.shape)
# Most stations are (11518, 5); aotizhongxin_aq and mentougou_aq are (11517, 5).

yizhuang_aq (11518, 5)
donggaocun_aq (11518, 5)
wanliu_aq (11518, 5)
gucheng_aq (11518, 5)
liulihe_aq (11518, 5)
huairou_aq (11518, 5)
fengtaihuayuan_aq (11518, 5)
beibuxinqu_aq (11518, 5)
qianmen_aq (11518, 5)
yanqin_aq (11518, 5)
miyunshuiku_aq (11518, 5)
pingchang_aq (11518, 5)
daxing_aq (11518, 5)
yongledian_aq (11518, 5)
aotizhongxin_aq (11517, 5)
tiantan_aq (11518, 5)
guanyuan_aq (11518, 5)
shunyi_aq (11518, 5)
miyun_aq (11518, 5)
fangshan_aq (11518, 5)
pinggu_aq (11518, 5)
dongsihuan_aq (11518, 5)
dingling_aq (11518, 5)
mentougou_aq (11517, 5)
wanshouxigong_aq (11518, 5)
nansanhuan_aq (11518, 5)
badaling_aq (11518, 5)
tongzhou_aq (11518, 5)
nongzhanguan_aq (11518, 5)
yufa_aq (11518, 5)
xizhimenbei_aq (11518, 5)
yungang_aq (11518, 5)
yongdingmennei_aq (11518, 5)
zhiwuyuan_aq (11518, 5)
dongsi_aq (11518, 5)


In [15]:
# Make every station frame contain one row for every hour between its first and
# last timestamp. A missing hour is inserted either as a linear interpolation
# between the nearest existing rows on both sides (short gaps) or as an
# all-NaN row (long gaps).
delta = datetime.timedelta(hours=1)

for station in bj_meo_stations.keys():
    df = bj_meo_stations[station].copy()
    # Template row used for hours that are not interpolated.
    nan_series = pd.Series({key:np.nan for key in df.columns})
    
    min_time = df.index.min()
    max_time = df.index.max()
    
    # The index labels are strings; parse both ends so the scan can step hourly.
    min_time = datetime.datetime.strptime(min_time, '%Y-%m-%d %H:%M:%S')
    max_time = datetime.datetime.strptime(max_time, '%Y-%m-%d %H:%M:%S')
    
    time = min_time
    
    while time <= max_time:
        time_str = datetime.date.strftime(time, '%Y-%m-%d %H:%M:%S')
        if time_str not in df.index:
            
            # Walk backwards hour by hour until a label present in df is found.
            # Rows inserted by earlier iterations of this scan are already in
            # df, so inside a multi-hour gap this finds the row that was just
            # filled in (interpolated or NaN), not the original measurement.
            found_for = False
            i = 0
            while not found_for:
                i += 1
                for_time = time - i * delta
                for_time_str = datetime.date.strftime(for_time, '%Y-%m-%d %H:%M:%S')
                if for_time_str in df.index:
                    for_row = df.loc[for_time_str]
                    for_step = i
                    found_for = True
            
            # Walk forwards hour by hour until a label present in df is found.
            # The scan never runs past max_time because that label exists.
            found_back = False
            j = 0
            while not found_back:
                j += 1
                back_time = time + j * delta
                back_time_str = datetime.date.strftime(back_time, '%Y-%m-%d %H:%M:%S')
                if back_time_str in df.index:
                    back_row = df.loc[back_time_str]
                    back_step = j
                    found_back = True
            
            # Number of hourly steps between the two bracketing rows.
            all_steps = for_step + back_step
            
            # NOTE(review): for the first missing hour of a gap with k missing
            # hours this is k + 1, so the branch below interpolates gaps of at
            # most 4 missing hours; the section text says "<= 5 hours" --
            # confirm which threshold is intended.
            if all_steps <=5:
                # Linear interpolation between the bracketing rows.
                delata_values = back_row - for_row
                df.loc[time_str] = for_row + (for_step/all_steps) * delata_values
            else:
                # Gap too long: insert an all-NaN row. Later hours of the same
                # gap use this row as for_row, so they evaluate to NaN as well.
                df.loc[time_str] = nan_series
        
        time += delta
    
    bj_meo_stations[station] = df

    print('%s: length of data is %d' % (station, df.shape[0]))  

yizhuang_aq: length of data is 11640
donggaocun_aq: length of data is 11640
wanliu_aq: length of data is 11640
gucheng_aq: length of data is 11640
liulihe_aq: length of data is 11640
huairou_aq: length of data is 11640
fengtaihuayuan_aq: length of data is 11640
beibuxinqu_aq: length of data is 11640
qianmen_aq: length of data is 11640
yanqin_aq: length of data is 11640
miyunshuiku_aq: length of data is 11640
pingchang_aq: length of data is 11640
daxing_aq: length of data is 11640
yongledian_aq: length of data is 11640
aotizhongxin_aq: length of data is 11640
tiantan_aq: length of data is 11640
guanyuan_aq: length of data is 11640
shunyi_aq: length of data is 11640
miyun_aq: length of data is 11640
fangshan_aq: length of data is 11640
pinggu_aq: length of data is 11640
dongsihuan_aq: length of data is 11640
dingling_aq: length of data is 11640
mentougou_aq: length of data is 11640
wanshouxigong_aq: length of data is 11640
nansanhuan_aq: length of data is 11640
badaling_aq: length of dat

最早的日期： 2017-01-01 00:00:00
最晚的日期： 2018-04-30 23:00:00
在网格数据数据时间段内，总共应该有 11640 个小时节点。

3.4 风向缺失值处理：暂时使用0替换缺失的风向

In [16]:
# 999017 is the value this section treats as "missing wind direction"; it is
# swapped for 0 for now. The replacement is applied to every column of each
# station frame, not only to the wind direction column.
for station, frame in bj_meo_stations.items():
    bj_meo_stations[station] = frame.replace(999017, 0)

3.5 拼接成整张表，并保存

In [17]:
# Put all per-station frames side by side, aligned on their shared time index.
bj_meo_stations_merged = pd.concat(list(bj_meo_stations.values()), axis=1)
bj_meo_stations_merged.shape # (11640, 175): 11640 hourly rows, 5 columns for each of the 35 stations

(11640, 175)

In [18]:
# Order the rows by their time label, then persist the merged table.
bj_meo_stations_merged = bj_meo_stations_merged.sort_index()
bj_meo_stations_merged.to_csv("data/bj_meo_data.csv")

# 4. 数据归一化

In [19]:
# Per-column summary statistics of the merged table; saved so the same
# 'mean' / 'std' rows can be reused when normalising other data.
describe = bj_meo_stations_merged.describe()
describe.to_csv("data/bj_meo_describe.csv")

In [20]:
# Standardise every column: subtract its mean, then divide by its standard deviation.
df_norm = (
    bj_meo_stations_merged
    .sub(describe.loc['mean'], axis='columns')
    .div(describe.loc['std'], axis='columns')
)
df_norm.to_csv("data/bj_meo_norm_data.csv")

# 5. 拼接预测日期数据

In [21]:
# Load 'gridWeather_20180501-20180502.csv' (the prediction-period file). The
# columns are renamed positionally and the 'id' / 'weather' columns are dropped.
gridWeather_3 = pd.read_csv('data/gridWeather_20180501-20180502.csv', parse_dates=['time'], index_col=False)
gridWeather_3.columns = ['id', 'stationName', 'utc_time','weather', 'temperature', 'pressure', 'humidity', 'wind_direction', 'wind_speed']
gridWeather_3 = gridWeather_3.drop(['id', 'weather'], axis=1)
gridWeather_3.head()

Unnamed: 0,stationName,utc_time,temperature,pressure,humidity,wind_direction,wind_speed
0,beijing_grid_000,2018-05-01,20.0,975.6105,52.0,76.64,4.38
1,beijing_grid_001,2018-05-01,20.0,962.0873,49.0,58.35,3.82
2,beijing_grid_002,2018-05-01,13.0,948.5641,46.0,36.82,3.75
3,beijing_grid_003,2018-05-01,13.0,933.2105,43.0,9.52,4.65
4,beijing_grid_004,2018-05-01,13.0,916.0265,42.0,350.17,6.88


In [22]:
# Same per-station split as for the training period, applied to the prediction-period
# table. Note that this rebinds `stations` to the station names found in gridWeather_3.
grid_meo_dataset, stations, meo_stations = load_grid_meo_data(gridWeather_3, near_stations)

In [27]:
# Merge the per-station frames into one wide table, ordered by time label.
meo_stations_merged = pd.concat(list(meo_stations.values()), axis=1)
meo_stations_merged.sort_index(inplace=True)

# Normalise with the statistics held in `describe` (computed earlier from
# bj_meo_stations_merged), not with statistics of the prediction data itself.
# To run this cell in a fresh kernel, reload those statistics first:
# describe = pd.read_csv('data/bj_meo_describe.csv')
# describe.set_index('Unnamed: 0', inplace=True)

df_norm = (meo_stations_merged - describe.loc['mean'])/describe.loc['std']
df_norm.to_csv('data/after_split/norm_data/bj_pred_meo_norm_data.csv')