In [24]:
import os
import re
import pandas as pd
from pandas import Series
from pandas import DataFrame
import json
from io import StringIO

# client = MongoClient('127.0.0.1', 27017)

# db = client['air_quality_model_hkust']
# sub_hour_weather_collection_name = 'subhour_weather_hkust'
# sub_hour_weather_collection = db[sub_hour_weather_collection_name]
# subhour_weather_station = db['subhour_weather_station']
# Feature names that aggregate_records averages when data_type is 'weather'.
weather_schemas = ['relative_humidity', 'temperature', 'wind_speed', 'wind_direction']
# Feature names that aggregate_records averages for any other data_type;
# generate_aggregation_collection also leaves these keys out of the top level
# of the documents it writes.
aqi_schemas = ['AQHI', 'AQI', 'CO', 'NO2', 'NOX', 'O3', 'PM10', 'PM2_5', 'SO2']

In [25]:
def find_nearby_stations(lat, lon, distance):
    """Return the codes of the AQI and weather stations near a point.

    The same ``$near`` GeoJSON query is run against the ``aqi_station`` and
    ``subhour_weather_station`` collections of the local
    ``air_quality_model_hkust`` database.

    Args:
        lat: Latitude of the query point.
        lon: Longitude of the query point.
        distance: Value passed to MongoDB as ``$maxDistance``.

    Returns:
        dict: ``{'AQI': [...], 'weather': [...]}`` where each list holds the
        ``station_code`` of every matching station, in the order the cursor
        yields them.
    """
    # NOTE(review): MongoClient is not imported in this block; it must already
    # be in scope (e.g. ``from pymongo import MongoClient`` in an earlier cell).
    client = MongoClient('127.0.0.1', 27017)
    try:
        db = client['air_quality_model_hkust']
        # Both lists are fully materialised before the client is closed.
        return {
            'AQI': _nearby_station_codes(db['aqi_station'], lat, lon, distance),
            'weather': _nearby_station_codes(db['subhour_weather_station'], lat, lon, distance),
        }
    finally:
        # This function runs once per processed record; without closing, each
        # call would leave an open client (and its connections) behind.
        client.close()


def _nearby_station_codes(collection, lat, lon, distance):
    """List the ``station_code`` of every document in ``collection`` near the point."""
    near_query = {
        'loc': {
            '$near': {
                '$geometry': {
                    'type': "Point",
                    # GeoJSON order is [longitude, latitude].
                    'coordinates': [lon, lat]
                },
                '$maxDistance': distance
            }
        }
    }
    return [station['station_code'] for station in collection.find(near_query)]

In [26]:
def aggregate_records(records, data_type = 'weather'):
    """Average each known feature over a collection of records.

    Args:
        records: Iterable of dict-like records. A feature's entry is either
            the reading itself or a dict whose ``'obs'`` key holds the reading.
        data_type: ``'weather'`` (compared case-insensitively) selects
            ``weather_schemas``; any other value, e.g. ``'aqi'``, selects
            ``aqi_schemas``.

    Returns:
        dict: Maps every feature of the selected schema list to the mean of
        the readings accepted by ``isfloat``, or to ``None`` when no reading
        was accepted for that feature.
    """
    # The selector used to require the exact lowercase string, so 'Weather'
    # silently fell through to the AQI schemas.
    is_weather = isinstance(data_type, str) and data_type.lower() == 'weather'
    all_schemas = weather_schemas if is_weather else aqi_schemas

    # Running totals per feature: sum of readings and number of readings.
    schema_map = {schema: {'sum': 0, 'num': 0} for schema in all_schemas}

    for record in records:
        for schema in record:
            # Keys outside the selected schema list (ids, timestamps, ...) are ignored.
            if schema not in schema_map:
                continue

            entry = record[schema]
            value = entry['obs'] if isinstance(entry, dict) else entry

            # NOTE(review): isfloat is defined elsewhere; presumably it reports
            # whether float(value) would succeed -- confirm.
            if isfloat(value):
                schema_map[schema]['sum'] += float(value)
                schema_map[schema]['num'] += 1
            elif value is not None:
                # Missing readings (None) are skipped silently; anything else
                # that is not numeric is reported.
                print("Error value")

    return {
        schema: (totals['sum'] / totals['num'] if totals['num'] != 0 else None)
        for schema, totals in schema_map.items()
    }

In [27]:
def query_spatial_temporal_record(lat, lon, distance, start_time, end_time):
    """Aggregate weather and AQI readings around a point within a time window.

    Looks up the stations near (lat, lon) with find_nearby_stations, gathers
    each station's records between start_time and end_time, and averages them
    with aggregate_records.

    Returns:
        dict: ``{"AQI": <AQI averages>, "weather": <weather averages>}``.
    """
    nearby = find_nearby_stations(lat=lat, lon=lon, distance=distance)
    aqi_codes, weather_codes = nearby['AQI'], nearby['weather']

    # Weather first: pool the records of every nearby weather station, then average.
    pooled_weather = [
        entry
        for code in weather_codes
        for entry in find_weather_records(station_code=code, start_time=start_time, end_time=end_time)
    ]
    weather_summary = aggregate_records(pooled_weather, data_type='weather')

    # Same procedure for the nearby AQI stations.
    pooled_aqi = [
        entry
        for code in aqi_codes
        for entry in find_AQI_records(station_code=code, start_time=start_time, end_time=end_time)
    ]
    aqi_summary = aggregate_records(pooled_aqi, data_type='aqi')

    return {"AQI": aqi_summary, "weather": weather_summary}

In [28]:
def query_spatial_temporal_record_by_station_code(station_code, distance, start_time, end_time):
    """Aggregate readings around the location of a known station.

    The station's coordinates are looked up in the ``aqi_station`` and then
    the ``subhour_weather_station`` configuration; when the code appears in
    both, the weather station's location is used.

    Args:
        station_code: Code of the station whose location is the query centre.
        distance: Search radius forwarded to query_spatial_temporal_record.
        start_time: Lower bound of the time window (inclusive).
        end_time: Upper bound of the time window (inclusive).

    Returns:
        The result of query_spatial_temporal_record for that location.

    Raises:
        ValueError: If ``station_code`` is in neither station configuration.
    """
    lat = None
    lon = None
    # Both configurations are always consulted, in this order, so a later
    # match overrides an earlier one.
    for config_name in ('aqi_station', 'subhour_weather_station'):
        station_config = get_stations_conf(config_name)
        if station_code in station_config:
            # 'loc' is stored as [longitude, latitude].
            lon, lat = station_config[station_code]['loc']

    if lat is None or lon is None:
        # Fail early with a clear message instead of sending None coordinates
        # into the geospatial query.
        raise ValueError('Unknown station code: {!r}'.format(station_code))

    return query_spatial_temporal_record(
        lat=lat, lon=lon, distance=distance, start_time=start_time, end_time=end_time
    )

In [29]:
def generate_aggregation_collection(time_range = 3600, distance = 5000):
    """Rebuild the aggregation collection from the enriched AQI records.

    For every document of ``air_quality_model_hkust_enrich`` (in ascending
    ``time`` order) the readings of the stations within ``distance`` of the
    record's station, and within half of ``time_range`` on either side of the
    record's timestamp, are averaged and stored next to the original record.

    The output collection is emptied first, so any previous contents are lost.

    Args:
        time_range: Full width of the time window, in the units of the
            records' ``time`` field (the default 3600 gives +/- 1800, which
            the original comment describes as +/- 30 min).
        distance: Search radius forwarded to the station lookup.

    The output collection is named
    ``aqi_aggregation_hkust_<distance>_<time_range // 2>_subhour``; with the
    defaults that is ``aqi_aggregation_hkust_5000_1800_subhour``.
    """
    # Previously both parameters were ignored in favour of hard-coded 1800 / 5000.
    half_window = time_range // 2
    batch_size = 100

    # NOTE(review): MongoClient and time are not imported in this block; they
    # must already be in scope from an earlier cell.
    client = MongoClient('127.0.0.1', 27017)
    try:
        db = client['air_quality_model_hkust']
        aqi_collection = db['air_quality_model_hkust_enrich']
        output_collection = db['aqi_aggregation_hkust_{}_{}_subhour'.format(distance, half_window)]

        # Collection.remove() is deprecated (and gone in PyMongo 4).
        output_collection.delete_many({})

        insert_cache = []
        process_number = 0
        start_time = time.time()
        for record in aqi_collection.find().sort('time'):
            AQI_station_code = record['station_code']
            current_time = record['time']
            aggregation_result = query_spatial_temporal_record_by_station_code(
                AQI_station_code, distance, current_time - half_window, current_time + half_window
            )

            process_number += 1
            if process_number % 100 == 0:
                print(process_number)

            # Drop the source _id so the output collection assigns its own.
            del record['_id']
            # Top level keeps only the non-pollutant fields; the full source
            # record is nested under 'station_record'.
            new_record = {key: value for key, value in record.items() if key not in aqi_schemas}
            new_record['station_type'] = 'AQI'
            new_record['aggregation_AQI'] = aggregation_result['AQI']
            new_record['aggregation_weather'] = aggregation_result['weather']
            new_record['station_record'] = record
            insert_cache.append(new_record)

            # Flush in batches and report how long the batch took.
            if len(insert_cache) == batch_size:
                output_collection.insert_many(insert_cache)
                insert_cache = []
                end_time = time.time()
                print(process_number, end_time - start_time)
                start_time = end_time

        # insert_many rejects an empty list, so only flush a non-empty remainder.
        if len(insert_cache) != 0:
            output_collection.insert_many(insert_cache)
    finally:
        client.close()

In [30]:
# Module-level MongoDB handles for the local 'air_quality_model_hkust' database.
# NOTE(review): MongoClient is not imported in the visible cells; it must
# already be in scope when this cell runs.
client = MongoClient('127.0.0.1', 27017)
db = client['air_quality_model_hkust']
# Queried by find_weather_records.
weather_collection = db['subhour_weather_hkust']
# NOTE(review): no function visible in this section reads this global;
# generate_aggregation_collection binds its own local of the same name.
aqi_collection = db['air_quality_model_hkust']

def find_weather_records(station_code, start_time, end_time):
    """Fetch one station's weather records inside a closed time interval.

    Queries the module-level ``weather_collection`` for documents whose
    ``station_code`` matches and whose ``time`` lies between ``start_time``
    and ``end_time`` (both bounds inclusive), and returns them as a list.
    """
    time_window = {'$gte': start_time, '$lte': end_time}
    query = {'station_code': station_code, 'time': time_window}
    return list(weather_collection.find(query))

In [32]:
# Rebuild the aggregation collection with the default arguments. This is a
# long-running step: the captured output logs progress every 100 records.
generate_aggregation_collection()

  if __name__ == '__main__':


100
100 1.9807953834533691
200
200 1.7067010402679443
300
300 1.2636487483978271
400
400 1.1775686740875244
500
500 1.2214674949645996
600
600 1.2898404598236084
700
700 1.4607222080230713
800
800 1.3455088138580322
900
900 1.3442332744598389
1000
1000 1.2062203884124756
1100
1100 1.3027544021606445
1200
1200 2.1433298587799072
1300
1300 1.3418076038360596
1400
1400 1.2816157341003418
1500
1500 1.1788694858551025
1600
1600 1.2122251987457275
1700
1700 1.1122143268585205
1800
1800 1.2062418460845947
1900
1900 1.1324224472045898
2000
2000 1.2013325691223145
2100
2100 1.2759203910827637
2200
2200 1.3264291286468506
2300
2300 2.2231080532073975
2400
2400 1.2977073192596436
2500
2500 1.4050259590148926
2600
2600 1.3325719833374023
2700
2700 1.2485127449035645
2800
2800 1.3410108089447021
2900
2900 1.337892770767212
3000
3000 1.519714593887329
3100
3100 1.3186087608337402
3200
3200 1.2430469989776611
3300
3300 2.050368070602417
3400
3400 1.2873821258544922
3500
3500 1.3854613304138184
3600
3

27500
27500 1.418025016784668
27600
27600 1.3436393737792969
27700
27700 1.900421142578125
27800
27800 1.442610502243042
27900
27900 1.522179365158081
28000
28000 1.4323694705963135
28100
28100 1.625213384628296
28200
28200 1.4387319087982178
28300
28300 1.3226268291473389
28400
28400 1.4304394721984863
28500
28500 1.452444076538086
28600
28600 2.4244384765625
28700
28700 1.3220243453979492
28800
28800 1.1466221809387207
28900
28900 1.1369335651397705
29000
29000 1.3942582607269287
29100
29100 1.4878802299499512
29200
29200 1.5136606693267822
29300
29300 1.6044292449951172
29400
29400 1.4716007709503174
29500
29500 2.2489891052246094
29600
29600 1.7157979011535645
29700
29700 1.4521427154541016
29800
29800 1.533803939819336
29900
29900 1.5047004222869873
30000
30000 1.22880220413208
30100
30100 1.3782873153686523
30200
30200 1.237536907196045
30300
30300 1.4077861309051514
30400
30400 1.4041523933410645
30500
30500 1.9083514213562012
30600
30600 1.2873542308807373
30700
30700 1.3090674

54200
54200 1.946174144744873
54300
54300 1.580162525177002
54400
54400 1.4085326194763184
54500
54500 1.331923246383667
54600
54600 1.3335919380187988
54700
54700 1.3809881210327148
54800
54800 1.192579984664917
54900
54900 1.1265673637390137
55000
55000 1.2532768249511719
55100
55100 1.2032203674316406
55200
55200 1.3447153568267822
55300
55300 1.358985185623169
55400
55400 1.7061126232147217
55500
55500 1.3846185207366943
55600
55600 1.2558395862579346
55700
55700 1.2612504959106445
55800
55800 1.2727129459381104
55900
55900 1.422781229019165
56000
56000 1.4705557823181152
56100
56100 1.3475236892700195
56200
56200 1.2840511798858643
56300
56300 1.484825849533081
56400
56400 1.7575347423553467
56500
56500 1.4314947128295898
56600
56600 1.3006269931793213
56700
56700 1.1895020008087158
56800
56800 1.1916682720184326
56900
56900 1.344813585281372
57000
57000 1.2346584796905518
57100
57100 1.1973850727081299
57200
57200 1.1621146202087402
57300
57300 1.3302340507507324
57400
57400 1.57

80900
80900 1.3963370323181152
81000
81000 1.359976053237915
81100
81100 1.5861310958862305
81200
81200 1.5905499458312988
81300
81300 2.2552330493927
81400
81400 1.4851994514465332
81500
81500 1.4569587707519531
81600
81600 1.4269344806671143
81700
81700 1.4508047103881836
81800
81800 1.6198985576629639
81900
81900 1.4751882553100586
82000
82000 1.4846301078796387
82100
82100 1.8909931182861328
82200
82200 2.0257408618927
82300
82300 1.6053173542022705
82400
82400 1.4449617862701416
82500
82500 1.4474859237670898
82600
82600 1.6343843936920166
82700
82700 1.6422083377838135
82800
82800 1.3962278366088867
82900
82900 1.3732776641845703
83000
83000 2.0048978328704834
83100
83100 1.8764467239379883
83200
83200 1.3568003177642822
83300
83300 1.5744097232818604
83400
83400 1.5506205558776855
83500
83500 1.4404799938201904
83600
83600 1.5370936393737793
83700
83700 1.474151611328125
83800
83800 1.2932627201080322
83900
83900 1.320575475692749
84000
84000 2.150451898574829
84100
84100 1.8372

107200
107200 1.628593921661377
107300
107300 1.5524780750274658
107400
107400 1.6538758277893066
107500
107500 1.8515377044677734
107600
107600 1.4114830493927002
107700
107700 1.6493802070617676
107800
107800 1.4817261695861816
107900
107900 1.534806728363037
108000
108000 1.566225290298462
108100
108100 1.4808335304260254
108200
108200 1.4048759937286377
108300
108300 1.5202739238739014
108400
108400 2.0956532955169678
108500
108500 1.4531564712524414
108600
108600 1.4807853698730469
108700
108700 1.350017786026001
108800
108800 1.441305160522461
108900
108900 1.4446308612823486
109000
109000 1.6888840198516846
109100
109100 1.5994043350219727
109200
109200 2.1189730167388916
109300
109300 1.7840421199798584
109400
109400 1.3063173294067383
109500
109500 1.314758539199829
109600
109600 1.4370923042297363
109700
109700 1.5565845966339111
109800
109800 1.5381457805633545
109900
109900 1.6494524478912354
110000
110000 1.5514888763427734
110100
110100 1.4034624099731445
110200
110200 2.

TypeError: documents must be a non-empty list