In [1]:
from library import isfloat
from library import modify_station_code
import os
import re
import pandas as pd
from pandas import Series
from pandas import DataFrame
import json
from io import StringIO
from pymongo import MongoClient
import pymongo
import time
# Single MongoDB connection shared by every handle below (local server, port 27017).
client = MongoClient(host='127.0.0.1', port=27017)

# Air-quality database: the record collection and the station collection.
aq_collection_name = 'air_quality_model_hkust_enrich'
aq_db = client['air_quality_model_hkust']
aq_station_collection = aq_db['aqi_station']
aq_collection = aq_db[aq_collection_name]

# Weather database: the sub-hourly record collection and the station collection.
sub_hour_weather_collection_name = 'subhour_weather_hkust'
weather_db = client['weather_hkust']
subhour_weather_station = weather_db['subhour_weather_station']
sub_hour_weather_collection = weather_db[sub_hour_weather_collection_name]

# Database that receives the fused (aggregated) output collections.
fusion_db = client['parise_hk_fusion_db']

# Record fields that are averaged for each kind of data.
aq_schemas = ['AQHI', 'AQI', 'CO', 'NO2', 'NOX', 'O3', 'PM10', 'PM2_5', 'SO2']
weather_schemas = ['relative_humidity', 'temperature', 'wind_speed', 'wind_direction']

def isNaN(num):
    """
    Tell whether a value is NaN.

    NaN is the only float that compares unequal to itself, so the test is a
    plain self-comparison; it therefore accepts any object (strings, None, ...)
    without raising.

    :param num: value to test
    :return: result of ``num != num``
    """
    differs_from_itself = num != num
    return differs_from_itself

In [2]:
class FusionDB:
    """
    Classmethod-only helper that fuses air-quality (AQ) and sub-hourly weather data.

    It reads from the module-level MongoDB handles:

    Collection(sub_hour_weather_collection): weather records, queried by station_code and time;
    Collection(subhour_weather_station): weather stations (station_code, loc);
    Collection(aq_collection): air-quality records, queried by station_code and time;
    Collection(aq_station_collection): air-quality stations (station_code, loc).

    For each air-quality record, generate_aggregation_collection averages the
    readings of all stations near the record's station over a time window and
    stores the result in fusion_db.

    Further improvement, aggregate the data by time (every five minutes)
    """

    @staticmethod
    def _station_collection(para):
        """Return the station collection for para ('weather' or 'aq'), or None when para is unknown."""
        if para == 'weather':
            return subhour_weather_station
        if para == 'aq':
            return aq_station_collection
        return None

    @staticmethod
    def _record_collection(para):
        """Return the record collection for para ('weather' or 'aq'), or None when para is unknown."""
        if para == 'weather':
            return sub_hour_weather_collection
        if para == 'aq':
            return aq_collection
        return None

    @classmethod
    def find_nearby_stations_by_type(cls, lat, lon, distance, para='weather'):
        """
        Find the stations of one type that lie around a point.

        :param lat: latitude of the query point
        :param lon: longitude of the query point
        :param distance: value passed as ``$maxDistance`` (metres for a GeoJSON
            ``$geometry`` query, assuming ``loc`` has a 2dsphere index -- TODO confirm)
        :param para: 'weather' or 'aq', selects which station collection is searched
        :return: list of station codes in the order the ``$near`` query yields them,
            or None (after printing an error) when para is not recognised
        """
        station_collection = cls._station_collection(para)
        # PyMongo collections do not support truth-value testing, so compare with None explicitly.
        if station_collection is None:
            print('Error parameter para', para)
            return None

        query = {
            'loc': {
                '$near': {
                    '$geometry': {
                        'type': "Point",
                        'coordinates': [lon, lat]
                    },
                    '$maxDistance': distance
                }
            }
        }
        return [station['station_code'] for station in station_collection.find(query)]

    @classmethod
    def find_nearby_stations(cls, lat, lon, distance):
        """
        Find both the air-quality and the weather stations around a point.

        :param lat: latitude of the query point
        :param lon: longitude of the query point
        :param distance: search radius, see find_nearby_stations_by_type
        :return: ``{'AQI': [station codes], 'weather': [station codes]}``
        """
        aqi_stations = cls.find_nearby_stations_by_type(lat, lon, distance, 'aq')
        weather_stations = cls.find_nearby_stations_by_type(lat, lon, distance, 'weather')
        return {'AQI': aqi_stations, 'weather': weather_stations}

    @classmethod
    def aggregate_records(cls, records, data_type='weather'):
        """
        Average every known field over a list of records.

        :param records: iterable of record dicts; a field value is either the
            reading itself or a dict whose 'obs' entry holds the reading
        :param data_type: 'weather' averages the fields in weather_schemas; any
            other value averages the fields in aq_schemas
        :return: dict mapping every field of the selected schema list to the mean
            of its usable readings, or to None when no usable reading was found
        """
        all_schemas = weather_schemas if data_type == 'weather' else aq_schemas
        # Running aggregation per field: {field: {"sum": xx, "num": xx}}
        schema_map = {schema: {'sum': 0, 'num': 0} for schema in all_schemas}

        for record in records:
            for schema, raw in record.items():
                if schema not in schema_map:
                    continue

                value = raw['obs'] if isinstance(raw, dict) else raw

                if isfloat(value):
                    # Convert before the NaN test: a textual 'nan' is not NaN as a
                    # string, but would turn the whole sum into NaN once converted.
                    number = float(value)
                    if not isNaN(number):
                        schema_map[schema]['sum'] += number
                        schema_map[schema]['num'] += 1
                elif value is not None:
                    print("Error value", value)

        output_schema = {}
        for schema, schema_obj in schema_map.items():
            output_schema[schema] = schema_obj['sum'] / schema_obj['num'] if schema_obj['num'] != 0 else None
        return output_schema

    @classmethod
    def find_weather_aq_records(cls, station_code, start_time, end_time, para="weather"):
        """
        Fetch the records of one station inside a closed time interval.

        :param station_code: value matched against the records' 'station_code'
        :param start_time: lower bound of 'time' (inclusive)
        :param end_time: upper bound of 'time' (inclusive)
        :param para: 'weather' or 'aq', selects which record collection is read
        :return: list of record dicts with the Mongo '_id' removed
        :raises ValueError: when para is not recognised
        """
        collection = cls._record_collection(para)
        if collection is None:
            raise ValueError("Unknown para {!r}, expected 'weather' or 'aq'".format(para))

        records = list(collection.find({
            'station_code': station_code,
            'time': {
                '$gte': start_time,
                '$lte': end_time
            }
        }))
        for record in records:
            record.pop('_id', None)
        return records

    @classmethod
    def query_spatial_temporal_record(cls, lat, lon, distance, start_time, end_time):
        """
        Average the weather and air-quality readings around a point in a time window.

        :param lat: latitude of the query point
        :param lon: longitude of the query point
        :param distance: search radius, see find_nearby_stations_by_type
        :param start_time: lower bound of 'time' (inclusive)
        :param end_time: upper bound of 'time' (inclusive)
        :return: ``{"AQI": {field: mean or None}, "weather": {field: mean or None}}``
        """
        station_obj = cls.find_nearby_stations(lat=lat, lon=lon, distance=distance)

        weather_records = []
        for station in station_obj['weather']:
            weather_records += cls.find_weather_aq_records(
                station_code=station, start_time=start_time, end_time=end_time, para='weather')
        weather_aggregation = cls.aggregate_records(weather_records, data_type='weather')

        AQI_records = []
        for station in station_obj['AQI']:
            AQI_records += cls.find_weather_aq_records(
                station_code=station, start_time=start_time, end_time=end_time, para='aq')
        AQI_aggregation = cls.aggregate_records(AQI_records, data_type='aqi')

        return {"AQI": AQI_aggregation, "weather": weather_aggregation}

    @classmethod
    def query_get_station_config(cls, para='weather'):
        """
        Load all stations of one type, keyed by their modified station code.

        :param para: 'weather' or 'aq', selects which station collection is read
        :return: dict mapping ``modify_station_code(station_code)`` to the station
            document (without '_id'); the first document wins when two stations
            map to the same modified code
        :raises ValueError: when para is not recognised
        """
        stations = cls._station_collection(para)
        if stations is None:
            raise ValueError("Unknown para {!r}, expected 'weather' or 'aq'".format(para))

        station_map = {}
        for station in stations.find():
            station.pop('_id', None)
            station_map.setdefault(modify_station_code(station['station_code']), station)
        return station_map

    @classmethod
    def _locate_station(cls, station_code):
        """
        Return ``(lon, lat)`` of a station, searching weather stations first.

        A code that exists in both station collections resolves to the weather
        station's location.

        :raises ValueError: when the code is found in neither station collection
        """
        for para in ('weather', 'aq'):
            station_config = cls.query_get_station_config(para)
            if station_code in station_config:
                lon, lat = station_config[station_code]['loc']
                return lon, lat
        raise ValueError("Unknown station code {!r}".format(station_code))

    @classmethod
    def query_spatial_temporal_record_by_station_code(cls, station_code, distance, start_time, end_time):
        """
        Average the readings around a station in a time window.

        :param station_code: key of the station in the station config (see
            query_get_station_config)
        :param distance: search radius, see find_nearby_stations_by_type
        :param start_time: lower bound of 'time' (inclusive)
        :param end_time: upper bound of 'time' (inclusive)
        :return: same structure as query_spatial_temporal_record
        :raises ValueError: when the station code is unknown (instead of sending
            empty coordinates to the database)
        """
        lon, lat = cls._locate_station(station_code)
        return cls.query_spatial_temporal_record(
            lat=lat, lon=lon, distance=distance, start_time=start_time, end_time=end_time)

    @classmethod
    def generate_aggregation_collection(cls, output_collection_name, pre_time=1800, after_time=1800, distance=5000):
        """
        Rebuild an output collection holding one fused document per air-quality record.

        The output collection is emptied first. Each document keeps the record's
        non-pollutant fields and adds 'station_type', 'aggregation_AQI',
        'aggregation_weather' and the source record itself as 'station_record'.
        Progress is printed every 100 records.

        :param output_collection_name: name of the collection inside fusion_db
        :param pre_time: how far before the record's 'time' the window starts
            (presumably seconds; the default matches the "30min" of the original note)
        :param after_time: how far after the record's 'time' the window ends
        :param distance: search radius around the record's station
        :return: None
        """
        batch_size = 100
        output_collection = fusion_db[output_collection_name]

        # delete_many replaces Collection.remove, which newer PyMongo releases no longer provide.
        output_collection.delete_many({})

        insert_cache = []
        # Station coordinates are looked up once per station code instead of
        # re-reading both station collections for every single record.
        station_locations = {}
        process_number = 0
        start_time = time.time()
        for record in aq_collection.find().sort('time'):
            aq_station_code = record['station_code']
            current_time = record['time']

            if aq_station_code not in station_locations:
                station_locations[aq_station_code] = cls._locate_station(aq_station_code)
            lon, lat = station_locations[aq_station_code]

            aggregation_result = cls.query_spatial_temporal_record(
                lat=lat, lon=lon, distance=distance,
                start_time=current_time - pre_time, end_time=current_time + after_time)

            process_number += 1
            if process_number % 100 == 0:
                print(process_number)

            record.pop('_id', None)
            # Pollutant readings stay only inside 'station_record'; everything else is copied up.
            new_record = {schema: value for schema, value in record.items() if schema not in aq_schemas}
            new_record['station_type'] = 'AQI'
            new_record['aggregation_AQI'] = aggregation_result['AQI']
            new_record['aggregation_weather'] = aggregation_result['weather']
            new_record['station_record'] = record
            insert_cache.append(new_record)

            if len(insert_cache) == batch_size:
                output_collection.insert_many(insert_cache)
                insert_cache = []
                end_time = time.time()
                print(process_number, end_time - start_time)
                start_time = end_time

        # Flush the last, partially filled batch.
        if insert_cache:
            output_collection.insert_many(insert_cache)


In [4]:
if __name__ == "__main__":
    # Window: 1800 before each record and nothing after it; radius argument 5000.
    run_options = {'pre_time': 1800, 'after_time': 0, 'distance': 5000}
    FusionDB.generate_aggregation_collection('aqi_aggregation_hkust_5000_1800_0_subhour', **run_options)



100
100 1.1768712997436523
200
200 1.018364429473877
300
300 1.015610694885254
400
400 0.9464619159698486
500
500 0.9729461669921875
600
600 0.939208984375
700
700 0.971153736114502
800
800 0.9434294700622559
900
900 0.9604125022888184
1000
1000 0.921412467956543
1100
1100 1.165008544921875
1200
1200 0.7264235019683838
1300
1300 0.8549158573150635
1400
1400 0.9733507633209229
1500
1500 1.066403865814209
1600
1600 1.0722379684448242
1700
1700 1.1087055206298828
1800
1800 1.1120390892028809
1900
1900 1.1002247333526611
2000
2000 1.0913565158843994
2100
2100 1.007148265838623
2200
2200 0.871814489364624
2300
2300 0.9784829616546631
2400
2400 1.0418648719787598
2500
2500 1.0274221897125244
2600
2600 1.0324101448059082
2700
2700 1.0866420269012451
2800
2800 1.1250197887420654
2900
2900 1.1057286262512207
3000
3000 1.1893105506896973
3100
3100 1.0491511821746826
3200
3200 0.7652413845062256
3300
3300 0.6149389743804932
3400
3400 0.9394285678863525
3500
3500 0.9145176410675049
3600
3600 0.935

27500
27500 0.8092813491821289
27600
27600 0.7999329566955566
27700
27700 0.885979175567627
27800
27800 1.0492384433746338
27900
27900 0.6417028903961182
28000
28000 0.9579551219940186
28100
28100 1.0484569072723389
28200
28200 1.069427490234375
28300
28300 1.0641248226165771
28400
28400 1.0694611072540283
28500
28500 0.7426679134368896
28600
28600 0.7581644058227539
28700
28700 0.6873393058776855
28800
28800 1.0669763088226318
28900
28900 1.1478772163391113
29000
29000 0.6728076934814453
29100
29100 0.6168177127838135
29200
29200 0.6311037540435791
29300
29300 0.5979588031768799
29400
29400 0.6856622695922852
29500
29500 0.6881673336029053
29600
29600 0.655404806137085
29700
29700 0.6367549896240234
29800
29800 0.6583316326141357
29900
29900 0.5914247035980225
30000
30000 0.5863232612609863
30100
30100 0.6363875865936279
30200
30200 0.7154326438903809
30300
30300 0.8486049175262451
30400
30400 1.0047447681427002
30500
30500 0.7426345348358154
30600
30600 1.0469954013824463
30700
30700

54200
54200 1.1885979175567627
54300
54300 1.0566861629486084
54400
54400 1.0648257732391357
54500
54500 1.0425095558166504
54600
54600 1.0390009880065918
54700
54700 1.0702311992645264
54800
54800 0.9180881977081299
54900
54900 0.8437488079071045
55000
55000 1.0423438549041748
55100
55100 1.0453696250915527
55200
55200 1.0223426818847656
55300
55300 1.0555663108825684
55400
55400 0.9481134414672852
55500
55500 1.0276126861572266
55600
55600 1.0791020393371582
55700
55700 0.9849903583526611
55800
55800 0.9133908748626709
55900
55900 0.7839334011077881
56000
56000 0.9275326728820801
56100
56100 0.9085793495178223
56200
56200 0.7439842224121094
56300
56300 0.7320444583892822
56400
56400 0.7316629886627197
56500
56500 0.714381217956543
56600
56600 0.7555088996887207
56700
56700 0.7485840320587158
56800
56800 1.0710582733154297
56900
56900 0.9500975608825684
57000
57000 0.920482873916626
57100
57100 1.2563400268554688
57200
57200 0.8247711658477783
57300
57300 0.9952285289764404
57400
5740

80800
80800 1.0434134006500244
80900
80900 1.033959150314331
81000
81000 1.03804349899292
81100
81100 1.1647963523864746
81200
81200 1.0811705589294434
81300
81300 0.7008428573608398
81400
81400 0.9485061168670654
81500
81500 1.0442492961883545
81600
81600 1.0416758060455322
81700
81700 1.038114309310913
81800
81800 1.0030663013458252
81900
81900 0.9090299606323242
82000
82000 1.0075373649597168
82100
82100 1.0104708671569824
82200
82200 1.0292258262634277
82300
82300 0.6962692737579346
82400
82400 0.5871422290802002
82500
82500 0.8819446563720703
82600
82600 0.9453659057617188
82700
82700 0.850581169128418
82800
82800 0.8968205451965332
82900
82900 0.9669013023376465
83000
83000 0.7993671894073486
83100
83100 0.6791026592254639
83200
83200 0.8362948894500732
83300
83300 0.945770263671875
83400
83400 0.836219072341919
83500
83500 0.5309963226318359
83600
83600 0.5167994499206543
83700
83700 0.5180158615112305
83800
83800 0.5249125957489014
83900
83900 0.507587194442749
84000
84000 0.51

107000
107000 0.7490777969360352
107100
107100 0.9092915058135986
107200
107200 1.0549790859222412
107300
107300 1.0468504428863525
107400
107400 0.942657470703125
107500
107500 0.651268720626831
107600
107600 0.5321660041809082
107700
107700 0.5400469303131104
107800
107800 0.5749714374542236
107900
107900 0.9225752353668213
108000
108000 1.093609094619751
108100
108100 1.0561234951019287
108200
108200 0.6045854091644287
108300
108300 0.8919703960418701
108400
108400 1.0281338691711426
108500
108500 1.001267910003662
108600
108600 1.0015416145324707
108700
108700 1.0455093383789062
108800
108800 0.9733459949493408
108900
108900 1.0195322036743164
109000
109000 1.006667137145996
109100
109100 1.1370339393615723
109200
109200 0.8492145538330078
109300
109300 0.9124484062194824
109400
109400 0.8549261093139648
109500
109500 0.7428624629974365
109600
109600 0.7645444869995117
109700
109700 0.7211382389068604
109800
109800 0.5980386734008789
109900
109900 0.67488694190979
110000
110000 0.6