In [3]:
# -*- coding: utf-8 -*-

import os
import sys
import xlrd

import folium
import numpy as np
import pandas as pd
from rtree import index

# Column-name groups. read_org() concatenates them, in this order, after the
# four leading columns (time, stationcode, longitude, latitude).
PM = ["PM10", "PM25"]
IONIC = [
    "F-", "Cl-", "NO3-", "SO42-",
    "Ca2+", "Na+", "K+", "NH4+", "Mg2+",
]
OCEC = ["OC", "EC"]
# NOTE(review): 'TI' (capital I) sits next to a separate 'Ti' entry -- possibly
# meant to be 'Tl'; confirm against the data provider before changing it.
METAL = [
    "Hg", "Br", "As", "Si", "Se", "Te", "V", "Ca", "Ti", "Ba",
    "Sc", "Pd", "Co", "Mo", "K", "Fe", "Pb", "TI", "Cu", "Al",
    "Cr", "Cs", "Ag", "Zn", "Sb", "Sn", "Mn", "Cd", "Ni", "Ga",
]

In [4]:
def read_txt(filename: str, sep:str=None):
    '''
        Read a delimited text file into a DataFrame, skipping malformed rows.

        Parameters:
            filename: path of the file to read; it must already exist.
            sep: field delimiter forwarded to ``pd.read_table``. With the
                Python parser engine used here, ``None`` asks pandas to
                detect the delimiter itself.

        Returns:
            The parsed ``pandas.DataFrame``. Rows whose field count does not
            match the header are dropped and reported by pandas.

        Raises:
            AssertionError: if ``filename`` does not exist.
    '''
    import inspect  # local import: only needed to probe the installed pandas API

    assert os.path.exists(filename), 'file not found: {}'.format(filename)

    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` is its documented replacement for the
    # "skip the row and report it" behaviour; fall back to the legacy keyword
    # only when the installed pandas predates `on_bad_lines`.
    if 'on_bad_lines' in inspect.signature(pd.read_table).parameters:
        bad_line_option = {'on_bad_lines': 'warn'}
    else:
        bad_line_option = {'error_bad_lines': False}

    df = pd.read_table(
        filename, sep=sep, encoding='utf-8', engine='python', **bad_line_option
    )

    return df

def read_org(filepath: str):
    '''
        Read one comma-separated *.txt data file into a DataFrame.

        Column names are supplied explicitly rather than read from the file:
        time, stationcode, longitude, latitude, followed by the names in
        PM, IONIC, OCEC and METAL (in that order).

        Parameters:
            filepath: path of the file to read; it must already exist.

        Returns:
            A ``pandas.DataFrame`` in which every -999.0 value has been
            replaced by NaN.

        Raises:
            AssertionError: if ``filepath`` does not exist.
    '''
    # The original check was `assert os.path.join(filepath)`, which only tests
    # that the string is non-empty and never notices a missing file.
    assert os.path.exists(filepath), 'file not found: {}'.format(filepath)

    columns = ['time','stationcode','longitude','latitude'] + PM + IONIC + OCEC + METAL
    data = pd.read_csv(filepath, sep=',', names=columns, encoding='utf-8')

    # -999.0 is replaced by NaN so it is treated as missing downstream.
    return data.replace(-999.0, np.nan)

In [5]:
# Locations used by the cells below, relative to the notebook's working directory.
nearby_file = "../data/nearby_envi.txt"  # read via read_txt()
qc_src = "../data/qc_data"               # directory holding the per-station *.txt files
qc_dst = "../data/qc_data_pm"            # NOTE(review): presumably the output directory -- not used in the cells shown

In [6]:
# Station-code lookup table. The first pair (110000006 -> 110000249) is the one
# exercised by the cells below: 110000006 is the stationcode of the qc file and
# 110000249 the station selected from the nearby table.
# NOTE(review): presumably every key is a qc stationcode and every value a
# nearby-table station code -- confirm. Several keys have 10 digits while the
# rest have 9 (e.g. 1320100001 alongside 320100001); verify they are intended.
mapping = {
    110000006: 110000249, 371500001: 371500053, 150100001: 150100054,
    1320100001: 320100054, 410200001: 410200062, 140100002: 140100063,
    340100001: 340100204, 410300001: 410300057, 610100002: 610100053,
    1320300001: 320300058, 130600003: 130600062, 130500001: 130500405,
    110000005: 110000252, 1640100001: 640100059, 130700002: 130700407,
    141000001: 141000072, 1411200001: 411200403, 130300002: 130300053,
    140500001: 140500053, 140300001: 140300053, 120000005: 120000186,
    320100001: 320100054, 370100001: 370100056, 110000016: 110000253,
    120000004: 120000186, 370300001: 370300052, 130600004: 130600063,
    140400001: 140400052, 430100001: 430100070, 131000005: 131000409,
    110000004: 110000246, 130900003: 130900403, 371400003: 371500053,
    130300001: 130300055, 131000003: 131000409, 1320200001: 320200104,
    110000010: 110000252, 130200001: 130200088, 110000002: 110000252,
    131100003: 131100408, 110000008: 131000409, 410500002: 410500058,
    130400002: 130400053, 370800001: 370800051, 110000009: 131000403,
    110000011: 110000249, 120000001: 120000306, 131000004: 131000403,
    410900001: 410900401, 410700002: 410700403, 130100005: 130100052,
    120000006: 110000253, 410100002: 410100052, 130700003: 130700401,
    130500002: 130500406, 410800001: 410800052, 130200002: 130200052,
    410500001: 410500058, 140800001: 140800095, 371700001: 371700052,
    410100003: 410100065, 371400002: 371400054, 1320500001: 320500055,
    110000003: 131000402, 350100002: 350100058, 131100002: 131100408,
    130600002: 130600061, 1320600001: 320600005, 410600001: 410600403,
    370100002: 370100052, 130100004: 130100057, 1340100001: 340100054,
    371600001: 371600051, 110000001: 110000246, 110000012: 110000246,
    1306000001: 110000249, 1307000001: 130700401, 1311000001: 131100401,
    3701000001: 370181002, 1200000004: 120000123, 120000003: 120000309,
    140100001: 140100057,
}

In [13]:
# Load the nearby-station table. read_txt() drops rows whose field count does
# not match the header; the messages printed below list the skipped lines.
neaby = read_txt(filename=nearby_file, sep=',')

2 fields in line 542519, saw 13
Skipping line 542643: Expected 12 fields in line 542643, saw 13
Skipping line 542695: Expected 12 fields in line 542695, saw 13
Skipping line 542756: Expected 12 fields in line 542756, saw 13
Skipping line 542954: Expected 12 fields in line 542954, saw 13
Skipping line 543476: Expected 12 fields in line 543476, saw 13
Skipping line 543617: Expected 12 fields in line 543617, saw 13
Skipping line 543830: Expected 12 fields in line 543830, saw 13
Skipping line 544234: Expected 12 fields in line 544234, saw 13
Skipping line 544764: Expected 12 fields in line 544764, saw 13
Skipping line 545111: Expected 12 fields in line 545111, saw 13
Skipping line 545267: Expected 12 fields in line 545267, saw 13
Skipping line 545459: Expected 12 fields in line 545459, saw 13
Skipping line 545903: Expected 12 fields in line 545903, saw 13
Skipping line 545904: Expected 12 fields in line 545904, saw 13
Skipping line 546190: Expected 12 fields in line 546190, saw 13
Skipping

In [15]:
# Preview the first rows of the nearby-station table loaded above.
neaby.head()

Unnamed: 0,时间,站点编号,PM2.5浓度,PM10浓度,CO浓度,NO2浓度,SO2浓度,O3浓度,O3 8小时浓度,AQI,空气质量等级,首要污染物
0,2019010100,110000246,54.0,91.0,1.1,76.0,21.0,3.0,3.0,74.0,良,细颗粒物(PM2.5)
1,2019010100,110000249,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,1.0,-999.0,—,—
2,2019010100,110000252,53.0,76.0,1.2,75.0,15.0,0.0,0.0,73.0,良,细颗粒物(PM2.5)
3,2019010100,110000253,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,8.0,-999.0,—,—
4,2019010100,120000123,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,10.0,-999.0,—,—


In [31]:
# Rows of nearby station 110000249 (the station `mapping` pairs with 110000006),
# keeping only the timestamp, station code and the two PM columns.
# The selection is turned into an independent copy and -999.0 is replaced
# without `inplace`, so neither this cell nor later column assignments on
# n_test operate on a frame pandas still tracks as a slice of `neaby`
# (the source of SettingWithCopyWarning).
n_test = (
    neaby.loc[neaby['站点编号'] == 110000249, ['时间', '站点编号', 'PM2.5浓度', 'PM10浓度']]
    .copy()
    .replace(-999.0, np.nan)
)
n_test.head()

Unnamed: 0,时间,站点编号,PM2.5浓度,PM10浓度
1,2019010100,110000249,,
67,2019010101,110000249,,
133,2019010102,110000249,,
199,2019010103,110000249,54.0,80.0
265,2019010104,110000249,50.0,66.0


In [21]:
# Load the qc file of station 110000006 from the qc_src directory and preview it.
org = read_org(filepath=os.path.join(qc_src, '110000006.txt'))
org.head()

Unnamed: 0,time,stationcode,longitude,latitude,PM10,PM25,F-,Cl-,NO3-,SO42-,...,Cr,Cs,Ag,Zn,Sb,Sn,Mn,Cd,Ni,Ga
0,2019-01-01:00,110000006,116.041,39.597,,,,,,,...,,,,,,,,,,
1,2019-01-01:01,110000006,116.041,39.597,,,,,,,...,,,,,,,,,,
2,2019-01-01:02,110000006,116.041,39.597,,,,,,,...,,,,,,,,,,
3,2019-01-01:03,110000006,116.041,39.597,,,,,,,...,,,,,,,,,,
4,2019-01-01:04,110000006,116.041,39.597,,,,,,,...,,,,,,,,,,


In [38]:
from datetime import datetime, timedelta

def parse_time(x):
    '''
        Convert a YYYYMMDDHH stamp (int or str) to a 'YYYY-MM-DD:HH' string.
    '''
    stamp = datetime.strptime(str(x), "%Y%m%d%H")
    return stamp.strftime('%Y-%m-%d:%H')

# Add a 'time' column in the 'YYYY-MM-DD:HH' format produced by parse_time so
# the nearby table can be joined with the qc data on it.
# assign() returns a new frame instead of writing a column into n_test, which
# was obtained by selecting from `neaby`; item assignment on such a frame
# raises SettingWithCopyWarning.
n_test = n_test.assign(time=n_test['时间'].apply(parse_time))
n_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8709 entries, 1 to 573230
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   时间       8709 non-null   int64  
 1   站点编号     8709 non-null   int64  
 2   PM2.5浓度  8565 non-null   float64
 3   PM10浓度   8135 non-null   float64
 4   time     8709 non-null   object 
dtypes: float64(2), int64(2), object(1)
memory usage: 728.2+ KB


In [39]:
# Keep only the join key, the station code and the two PM columns of the qc data.
o_test = org[
    ["time", "stationcode", "PM10", "PM25"]
]
o_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   time         8760 non-null   object 
 1   stationcode  8760 non-null   int64  
 2   PM10         0 non-null      float64
 3   PM25         0 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 273.9+ KB


In [42]:
# Outer-join the qc rows with the nearby-station rows on the shared 'time' key;
# timestamps present on only one side are kept and padded with NaN.
df = pd.merge(o_test, n_test, on='time', how='outer')
df.head()

Unnamed: 0,time,stationcode,PM10,PM25,时间,站点编号,PM2.5浓度,PM10浓度
0,2019-01-01:00,110000006,,,2019010000.0,110000249.0,,
1,2019-01-01:01,110000006,,,2019010000.0,110000249.0,,
2,2019-01-01:02,110000006,,,2019010000.0,110000249.0,,
3,2019-01-01:03,110000006,,,2019010000.0,110000249.0,54.0,80.0
4,2019-01-01:04,110000006,,,2019010000.0,110000249.0,50.0,66.0


In [46]:
# Column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8760 entries, 0 to 8759
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   time         8760 non-null   object 
 1   stationcode  8760 non-null   int64  
 2   PM10         0 non-null      float64
 3   PM25         0 non-null      float64
 4   时间           8709 non-null   float64
 5   站点编号         8709 non-null   float64
 6   PM2.5浓度      8565 non-null   float64
 7   PM10浓度       8135 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 935.9+ KB


In [59]:
# Row-wise maximum of the qc PM10 and the nearby-station PM10; NaN entries are
# skipped, so a row is NaN only when both values are missing.
df[['PM10', 'PM10浓度']].max(axis=1)

0        NaN
1        NaN
2        NaN
3       80.0
4       66.0
        ... 
8755    72.0
8756    95.0
8757    76.0
8758    63.0
8759    57.0
Length: 8760, dtype: float64

In [61]:
# Overwrite PM10 with the row-wise mean of the qc PM10 and the nearby-station
# PM10 (NaN entries are skipped, so a lone value is carried over unchanged).
# NOTE(review): the cell reads and overwrites the same column, so re-running it
# averages the already-averaged value with the nearby value again.
df['PM10'] = df[['PM10', 'PM10浓度']].mean(axis=1)
df

Unnamed: 0,time,stationcode,PM10,PM25,时间,站点编号,PM2.5浓度,PM10浓度,a
0,2019-01-01:00,110000006,,,2.019010e+09,110000249.0,,,
1,2019-01-01:01,110000006,,,2.019010e+09,110000249.0,,,
2,2019-01-01:02,110000006,,,2.019010e+09,110000249.0,,,
3,2019-01-01:03,110000006,80.0,,2.019010e+09,110000249.0,54.0,80.0,80.0
4,2019-01-01:04,110000006,66.0,,2.019010e+09,110000249.0,50.0,66.0,66.0
...,...,...,...,...,...,...,...,...,...
8755,2019-12-31:19,110000006,72.0,,2.019123e+09,110000249.0,45.0,72.0,72.0
8756,2019-12-31:20,110000006,95.0,,2.019123e+09,110000249.0,45.0,95.0,95.0
8757,2019-12-31:21,110000006,76.0,,2.019123e+09,110000249.0,43.0,76.0,76.0
8758,2019-12-31:22,110000006,63.0,,2.019123e+09,110000249.0,38.0,63.0,63.0
