In [3]:
from library import isfloat
from library import modify_station_code
import os
import re
import pandas as pd
from pandas import Series
from pandas import DataFrame
import json
from io import StringIO
from pymongo import MongoClient
import pymongo
import time
client = MongoClient('127.0.0.1', 27017)

aq_db = client['air_quality_model_hkust']
aq_collection_name = 'air_quality_model_hkust_enrich'
aq_collection = aq_db[aq_collection_name]
aq_station_collection = aq_db['aqi_station']


weather_db = client['weather_hkust']
sub_hour_weather_collection_name = 'subhour_weather_hkust'
sub_hour_weather_collection = weather_db[sub_hour_weather_collection_name]
subhour_weather_station = weather_db['subhour_weather_station']

fusion_db = client['parise_hk_fusion_db']


weather_schemas = ['relative_humidity', 'temperature', 'wind_speed', 'wind_direction']
aq_schemas = ['AQHI', 'AQI', 'CO', 'NO2', 'NOX', 'O3', 'PM10', 'PM2_5', 'SO2']

def isNaN(num):
    return num != num

In [10]:
class FusionDB:
    """
    This is a class consists of multiple static functions to create fusion database from different datasource.
    
    Collection(subhour_weather_hkust): store the weather data collected subhourly;
    Collection(subhour_weather_station): store the weather station, station code, station location;
    
    The weather with different parameter(wind, humudity, temperature) and from different stations are stored in different files,
    so this class first merge the data of same time and in same station together, and then store them into the database.  
    
    Further improvement, aggregate the data by time (every five minutes)
    """
    @classmethod
    def find_nearby_stations_by_type(self, lat,lon, distance, para = 'weather'):
        """
        This is a function initialize the collection of weather records
        :param station_code: code of station, should be of string type, specified in forecast_station_config
        :return: forecast data of the site
        """
        stations = []
        station_collection = subhour_weather_station if para == 'weather' else aq_station_collection if para == "aq" else None
        if station_collection == None:
            print('Error parameter para', para)
            return None
        for r in station_collection.find({
            'loc': {
                '$near': {
                    '$geometry': {
                        'type': "Point",
                        'coordinates': [lon, lat]
                    },
                    '$maxDistance': distance
                }
            }

        }):
            if "_id" in r:
                del r['_id']
            stations.append(r['station_code'])
            
        return stations
        
        
    @classmethod
    def find_nearby_stations(self, lat, lon, distance):
        # from pymongo import MongoClient
        aqi_stations = self.find_nearby_stations_by_type(lat, lon, distance, 'aq')
        weather_stations = self.find_nearby_stations_by_type(lat, lon, distance, 'weather')
        return {'AQI': aqi_stations, 'weather': weather_stations}
    
    @classmethod
    def aggregate_records(self, records, data_type = 'weather'):
        """
        data_type: AQI or Weather
        aggregation: {feature:{"sum":xx, "number"}}

        """
        all_schemas = weather_schemas if data_type == 'weather' else aq_schemas
        schema_map = {}
        for schema in all_schemas:
            schema_map[schema] = {'sum': 0, 'num': 0}
        for record in records:
            for schema in record:
                if schema not in schema_map:
                    continue

                value = record[schema]['obs'] if (type(record[schema]) == dict) else record[schema]
                
                if isfloat(value):
                    if not isNaN(value):
                        schema_map[schema]['sum'] += float(value)
                        schema_map[schema]['num'] += 1
                elif value != None:
                    print("Error value", value)

        output_schema = {}
        for schema in schema_map:
            schema_obj = schema_map[schema]
            output_schema[schema] = schema_obj['sum'] / schema_obj['num'] if schema_obj['num'] != 0 else None
        return output_schema
    
    
    @classmethod 
    def find_weather_aq_records(self, station_code, start_time, end_time, para = "weather"):
        collection = sub_hour_weather_collection if para == 'weather' else aq_collection if para == 'aq' else None
        records = collection.find({
            'station_code': station_code,
            'time': {
                '$gte': start_time,
                '$lte': end_time
            }
        })
        records = list(records)
        for r in records:
            if "_id" in r:
                del r['_id']
        return records
    
    
    @classmethod
    def query_spatial_temporal_record(self, lat, lon, distance, start_time, end_time):
        
        station_obj = self.find_nearby_stations(lat = lat, lon = lon, distance = distance)
        AQI_stations = station_obj['AQI']
        weather_stations = station_obj['weather']

        weather_records = []
        for station in weather_stations:
            weather_records += self.find_weather_aq_records(station_code=station, start_time = start_time, end_time = end_time, para = 'weather')  
        weather_aggregation = self.aggregate_records(weather_records, data_type='weather')


        AQI_records = []
        for station in AQI_stations:
            AQI_records += self.find_weather_aq_records(station_code=station, start_time = start_time, end_time = end_time, para = 'aq')
                    
        AQI_aggregation = self.aggregate_records(AQI_records, data_type='aqi')
        return {"AQI": AQI_aggregation, "weather": weather_aggregation}
        
   

    @classmethod
    def query_get_station_config(self, para = 'weather'):
        station_map = {}
        stations = subhour_weather_station if para == 'weather' else aq_station_collection if para == 'aq' else None
        for station in stations.find():
            station_code = station['station_code']
            if '_id' in station:
                del station['_id']
            m_station_code = modify_station_code(station_code)
            
            if m_station_code not in station_map:
                station_map[m_station_code] = station
        return station_map

    @classmethod 
    def query_spatial_temporal_record_by_station_code(self, station_code, distance, start_time, end_time):
        station_config = self.query_get_station_config('aq')
        lat = None
        lon = None
        station_type = None
        if station_code in station_config:
            [lon, lat] = station_config[station_code]['loc']
            station_type = 'AQI'

        station_config = self.query_get_station_config('weather')
        if station_code in station_config:
            [lon, lat] = station_config[station_code]['loc']
            station_type = 'weather'

        result = self.query_spatial_temporal_record(lat=lat, lon=lon, distance=distance, start_time=start_time, end_time=end_time)

        return result
    
    @classmethod
    def generate_aggregation_collection(self, output_collection_name, offset_time = 0, pre_time = 1800, after_time = 1800, distance = 5000):
        # from pymongo import MongoClient

        output_collection = fusion_db[output_collection_name]

        output_collection.remove({})

        insert_cache = []
        process_number = 0
        start_time = time.time()
        for record in aq_collection.find().sort('time'):
            aq_station_code = record['station_code']
            current_time = record['time']
            
            # Time range +,- 30min; distance: 5000
            aggregation_result = self.query_spatial_temporal_record_by_station_code(
                aq_station_code, 
                5000, 
                current_time + offset_time - pre_time, 
                current_time + offset_time + after_time)
  
            process_number += 1
            if process_number % 100 == 0:
                print(process_number)

            del record['_id']
            new_record = {}
            for schema in record:
                if schema not in aq_schemas:
                    new_record[schema] = record[schema]
            new_record['station_type'] = 'AQI'
            new_record['aggregation_AQI'] = aggregation_result['AQI']
            new_record['aggregation_weather'] = aggregation_result['weather']
            new_record['station_record'] = record
            insert_cache.append(new_record)
            if len(insert_cache) == 100:
                output_collection.insert_many(insert_cache)
                insert_cache = []
                end_time = time.time()
                print(process_number, end_time - start_time)
                start_time = end_time
        if len(insert_cache) != 0:
            output_collection.insert_many(insert_cache)


In [11]:
if __name__ == "__main__":
    FusionDB.generate_aggregation_collection('aqi_aggregation_hkust_5000_-10800_1800_0_subhour', offset_time = -10800 ,pre_time = 1800, after_time = 0, distance = 5000)



100
100 1.1765899658203125
200
200 1.353107213973999
300
300 1.1381397247314453
400
400 1.1791374683380127
500
500 1.0777547359466553
600
600 1.1096043586730957
700
700 1.056088924407959
800
800 0.7470500469207764
900
900 1.0349316596984863
1000
1000 1.1190967559814453
1100
1100 1.1397626399993896
1200
1200 1.1274406909942627
1300
1300 1.1463205814361572
1400
1400 1.0879311561584473
1500
1500 1.1139490604400635
1600
1600 1.1143145561218262
1700
1700 1.05586838722229
1800
1800 1.10255765914917
1900
1900 1.0988874435424805
2000
2000 1.031874179840088
2100
2100 1.0973358154296875
2200
2200 1.0920438766479492
2300
2300 1.0809073448181152
2400
2400 1.157745122909546
2500
2500 1.1261217594146729
2600
2600 1.1196763515472412
2700
2700 1.1219501495361328
2800
2800 1.094024419784546
2900
2900 1.1675970554351807
3000
3000 0.8542418479919434
3100
3100 0.7251613140106201
3200
3200 1.0715579986572266
3300
3300 1.0807981491088867
3400
3400 1.0794265270233154
3500
3500 1.0907480716705322
3600
3600 1.

27500
27500 1.035874366760254
27600
27600 1.054441213607788
27700
27700 1.0726392269134521
27800
27800 1.0603575706481934
27900
27900 1.0371620655059814
28000
28000 1.1114871501922607
28100
28100 1.045325756072998
28200
28200 0.9115614891052246
28300
28300 1.0100023746490479
28400
28400 1.0630943775177002
28500
28500 1.042585849761963
28600
28600 1.062206506729126
28700
28700 1.0550379753112793
28800
28800 1.0793664455413818
28900
28900 1.0908186435699463
29000
29000 1.047149419784546
29100
29100 1.0620927810668945
29200
29200 1.1679267883300781
29300
29300 0.9080426692962646
29400
29400 0.7696940898895264
29500
29500 1.0229554176330566
29600
29600 1.0477311611175537
29700
29700 1.0510742664337158
29800
29800 1.078223705291748
29900
29900 1.06266188621521
30000
30000 1.004258394241333
30100
30100 1.0071156024932861
30200
30200 1.0525598526000977
30300
30300 1.0489356517791748
30400
30400 1.0523662567138672
30500
30500 1.0559167861938477
30600
30600 1.0706839561462402
30700
30700 1.1165

54200
54200 1.119563341140747
54300
54300 1.0725877285003662
54400
54400 1.0869717597961426
54500
54500 1.0580427646636963
54600
54600 0.7215240001678467
54700
54700 0.9945023059844971
54800
54800 0.9853692054748535
54900
54900 0.9806709289550781
55000
55000 0.9420931339263916
55100
55100 0.9176180362701416
55200
55200 0.9917953014373779
55300
55300 1.0177452564239502
55400
55400 1.0311782360076904
55500
55500 1.0505321025848389
55600
55600 1.0787463188171387
55700
55700 1.0433952808380127
55800
55800 1.053649663925171
55900
55900 1.0548455715179443
56000
56000 3.638417959213257
56100
56100 5.230928182601929
56200
56200 2.1729674339294434
56300
56300 1.30234694480896
56400
56400 1.1044988632202148
56500
56500 1.1165118217468262
56600
56600 0.652850866317749
56700
56700 0.6680822372436523
56800
56800 1.0614960193634033
56900
56900 1.0308895111083984
57000
57000 1.0148065090179443
57100
57100 1.054635763168335
57200
57200 1.0251338481903076
57300
57300 0.9713723659515381
57400
57400 1.05

80800
80800 1.063889503479004
80900
80900 1.024012804031372
81000
81000 1.0438323020935059
81100
81100 1.0515596866607666
81200
81200 1.029402494430542
81300
81300 1.02339506149292
81400
81400 1.054384469985962
81500
81500 1.0208051204681396
81600
81600 1.0224967002868652
81700
81700 1.0601534843444824
81800
81800 1.080545425415039
81900
81900 5.214125633239746
82000
82000 4.591281175613403
82100
82100 1.084193468093872
82200
82200 1.001988410949707
82300
82300 0.5664505958557129
82400
82400 0.7722032070159912
82500
82500 0.9434168338775635
82600
82600 1.0553748607635498
82700
82700 1.0666899681091309
82800
82800 1.0383226871490479
82900
82900 1.0672674179077148
83000
83000 1.0484333038330078
83100
83100 0.9865882396697998
83200
83200 0.6914730072021484
83300
83300 1.0540120601654053
83400
83400 1.039747953414917
83500
83500 1.0578553676605225
83600
83600 0.5205135345458984
83700
83700 0.7718625068664551
83800
83800 0.9298367500305176
83900
83900 1.0083816051483154
84000
84000 0.969761

107000
107000 1.0317692756652832
107100
107100 1.0325121879577637
107200
107200 1.0464403629302979
107300
107300 1.0594885349273682
107400
107400 1.0398600101470947
107500
107500 1.0483911037445068
107600
107600 1.0517909526824951
107700
107700 1.057288646697998
107800
107800 1.054748773574829
107900
107900 1.0438802242279053
108000
108000 1.1975147724151611
108100
108100 1.123358964920044
108200
108200 5.236587762832642
108300
108300 4.879893779754639
108400
108400 1.1150257587432861
108500
108500 1.067345142364502
108600
108600 0.6814982891082764
108700
108700 0.8249626159667969
108800
108800 1.0396387577056885
108900
108900 1.0512123107910156
109000
109000 1.0510411262512207
109100
109100 1.0350542068481445
109200
109200 1.026313066482544
109300
109300 1.0513420104980469
109400
109400 1.0302915573120117
109500
109500 1.0758838653564453
109600
109600 1.040010929107666
109700
109700 1.0275940895080566
109800
109800 0.719846248626709
109900
109900 0.8723349571228027
110000
110000 1.065