In [120]:
from library import isfloat
from library import modify_station_code
import os
import re
import pandas as pd
from pandas import Series
from pandas import DataFrame
import json
from io import StringIO
from pymongo import MongoClient
import pymongo
import time
# Client for the MongoDB server at 127.0.0.1:27017; every handle below hangs off it.
client = MongoClient('127.0.0.1', 27017)

# Air-quality database: the observation collection and the station metadata collection.
aq_db = client['air_quality_model_hkust']
aq_collection_name = 'air_quality_model_hkust'
aq_collection = aq_db[aq_collection_name]
aq_station_collection = aq_db['aqi_station']


# Weather database: the sub-hourly observation collection and the station metadata collection.
weather_db = client['weather_hkust']
sub_hour_weather_collection_name = 'subhour_weather_hkust'
sub_hour_weather_collection = weather_db[sub_hour_weather_collection_name]
subhour_weather_station = weather_db['subhour_weather_station']

# Database that receives the fused (aggregated) output collections.
fusion_db = client['parise_hk_fusion_db']


# Record fields that are averaged when aggregating weather / air-quality records.
weather_schemas = ['relative_humidity', 'temperature', 'wind_speed', 'wind_direction']
aq_schemas = ['AQHI', 'AQI', 'CO', 'NO2', 'NOX', 'O3', 'PM10', 'PM2_5', 'SO2']

def isNaN(num):
    """Return the result of ``num != num``.

    A float NaN is the usual value that compares unequal to itself, so this
    yields True for NaN and False for ordinary numbers, strings and None.
    """
    differs_from_itself = num != num
    return differs_from_itself

In [128]:
class FusionDB:
    """
    Collection of class methods that fuse air-quality and weather observations.

    The methods read from the module-level MongoDB handles:

    * ``subhour_weather_station`` / ``aq_station_collection``: station metadata
      (``station_code`` and a ``loc`` field used for ``$near`` queries);
    * ``sub_hour_weather_collection`` / ``aq_collection``: observation records
      carrying ``station_code`` and ``time``.

    For a location and a time window, the observations of all nearby stations are
    averaged per feature (``weather_schemas`` / ``aq_schemas``).
    ``generate_aggregation_collection`` does this for every air-quality record and
    writes the result into a collection of ``fusion_db``.
    """

    @classmethod
    def _station_collection_for(cls, para):
        """Return the station metadata collection for 'weather' or 'aq', else None."""
        if para == 'weather':
            return subhour_weather_station
        if para == 'aq':
            return aq_station_collection
        return None

    @classmethod
    def _record_collection_for(cls, para):
        """Return the observation collection for 'weather' or 'aq', else None."""
        if para == 'weather':
            return sub_hour_weather_collection
        if para == 'aq':
            return aq_collection
        return None

    @classmethod
    def find_nearby_stations_by_type(cls, lat, lon, distance, para='weather'):
        """
        Find the stations of one type around a point.

        :param lat: latitude of the query point
        :param lon: longitude of the query point
        :param distance: value passed to ``$maxDistance`` of the ``$near`` query
            (metres for a GeoJSON point according to the MongoDB documentation)
        :param para: 'weather' or 'aq', selects the station collection
        :return: list of ``station_code`` values in the order returned by the query,
            or None (after printing a message) when ``para`` is not recognised
        """
        station_collection = cls._station_collection_for(para)
        # PyMongo collections do not support truth-value testing, so compare with None explicitly.
        if station_collection is None:
            print('Error parameter para', para)
            return None
        query = {
            'loc': {
                '$near': {
                    '$geometry': {
                        'type': "Point",
                        # GeoJSON order is longitude first, then latitude.
                        'coordinates': [lon, lat]
                    },
                    '$maxDistance': distance
                }
            }
        }
        return [station['station_code'] for station in station_collection.find(query)]

    @classmethod
    def find_nearby_stations(cls, lat, lon, distance):
        """
        Find both air-quality and weather stations around a point.

        :return: ``{'AQI': [station codes], 'weather': [station codes]}``
        """
        aqi_stations = cls.find_nearby_stations_by_type(lat, lon, distance, 'aq')
        weather_stations = cls.find_nearby_stations_by_type(lat, lon, distance, 'weather')
        return {'AQI': aqi_stations, 'weather': weather_stations}

    @classmethod
    def aggregate_records(cls, records, data_type='weather'):
        """
        Average every known feature over a list of records.

        :param records: iterable of record dicts; a feature value is either a plain
            value or a dict whose ``'obs'`` entry holds the value
        :param data_type: 'weather' selects ``weather_schemas``; any other value
            selects ``aq_schemas``
        :return: ``{feature: mean or None}``; None when no usable value was seen
        """
        all_schemas = weather_schemas if data_type == 'weather' else aq_schemas
        schema_map = {schema: {'sum': 0, 'num': 0} for schema in all_schemas}

        for record in records:
            for schema, raw in record.items():
                if schema not in schema_map:
                    continue

                value = raw['obs'] if isinstance(raw, dict) else raw

                if isfloat(value):
                    # Test for NaN after conversion so that a non-float value that
                    # converts to NaN (e.g. the string 'nan', if isfloat accepts it)
                    # cannot poison the running sum.
                    number = float(value)
                    if not isNaN(number):
                        schema_map[schema]['sum'] += number
                        schema_map[schema]['num'] += 1
                elif value is not None:
                    print("Error value", value)

        return {
            schema: (stats['sum'] / stats['num'] if stats['num'] != 0 else None)
            for schema, stats in schema_map.items()
        }

    @classmethod
    def find_weather_aq_records(cls, station_code, start_time, end_time, para="weather"):
        """
        Fetch the records of one station inside a closed time interval.

        :param station_code: value matched against the ``station_code`` field
        :param start_time: inclusive lower bound on the ``time`` field
        :param end_time: inclusive upper bound on the ``time`` field
        :param para: 'weather' or 'aq', selects the observation collection
        :return: list of record dicts with ``_id`` removed
        :raises ValueError: if ``para`` is neither 'weather' nor 'aq'
        """
        collection = cls._record_collection_for(para)
        if collection is None:
            raise ValueError("Unknown para {!r}; expected 'weather' or 'aq'".format(para))
        records = list(collection.find({
            'station_code': station_code,
            'time': {
                '$gte': start_time,
                '$lte': end_time
            }
        }))
        for record in records:
            record.pop('_id', None)
        return records

    @classmethod
    def query_spatial_temporal_record(cls, lat, lon, distance, start_time, end_time):
        """
        Aggregate air-quality and weather records around a point and time window.

        :return: ``{'AQI': {feature: mean}, 'weather': {feature: mean}}``
        """
        station_obj = cls.find_nearby_stations(lat=lat, lon=lon, distance=distance)

        weather_records = []
        for station in station_obj['weather']:
            weather_records += cls.find_weather_aq_records(
                station_code=station, start_time=start_time, end_time=end_time, para='weather')
        weather_aggregation = cls.aggregate_records(weather_records, data_type='weather')

        aqi_records = []
        for station in station_obj['AQI']:
            aqi_records += cls.find_weather_aq_records(
                station_code=station, start_time=start_time, end_time=end_time, para='aq')
        # Any data_type other than 'weather' selects the air-quality schemas.
        aqi_aggregation = cls.aggregate_records(aqi_records, data_type='aq')

        return {"AQI": aqi_aggregation, "weather": weather_aggregation}

    @classmethod
    def query_get_station_config(cls, para='weather'):
        """
        Load the station metadata of one type.

        :param para: 'weather' or 'aq'
        :return: dict keyed by ``modify_station_code(station_code)``; when several
            stations map to the same key the first one returned by the query wins
        :raises ValueError: if ``para`` is neither 'weather' nor 'aq'
        """
        stations = cls._station_collection_for(para)
        if stations is None:
            raise ValueError("Unknown para {!r}; expected 'weather' or 'aq'".format(para))
        station_map = {}
        for station in stations.find():
            station_code = station['station_code']
            station.pop('_id', None)
            station_map.setdefault(modify_station_code(station_code), station)
        return station_map

    @classmethod
    def query_spatial_temporal_record_by_station_code(cls, station_code, distance, start_time, end_time):
        """
        Aggregate the records around the location of a known station.

        The station is looked up in the air-quality configuration first and then in
        the weather configuration; if it exists in both, the weather location is used.

        :return: same structure as ``query_spatial_temporal_record``
        :raises ValueError: if ``station_code`` is in neither station configuration
        """
        lat = None
        lon = None
        for para in ('aq', 'weather'):
            station_config = cls.query_get_station_config(para)
            if station_code in station_config:
                lon, lat = station_config[station_code]['loc']

        # Without a location the $near query would be sent with null coordinates.
        if lat is None or lon is None:
            raise ValueError("Unknown station code {!r}".format(station_code))

        return cls.query_spatial_temporal_record(
            lat=lat, lon=lon, distance=distance, start_time=start_time, end_time=end_time)

    @classmethod
    def generate_aggregation_collection(cls, output_collection_name, time_range=3600, distance=5000):
        """
        Rebuild a fused collection with one document per air-quality record.

        All existing documents of the output collection are deleted first. Each new
        document holds the non-feature fields of the source record, the averaged
        air-quality and weather features of the stations within ``distance`` and
        within ``time_range / 2`` on either side of the record time, and the source
        record itself under ``station_record``.

        :param output_collection_name: name of the collection inside ``fusion_db``
        :param time_range: total width of the time window, in the units of the
            ``time`` field (presumably seconds -- confirm); the default 3600 gives
            the +/- 1800 window
        :param distance: search radius passed on to the ``$near`` station query
        """
        output_collection = fusion_db[output_collection_name]

        # Collection.remove() is deprecated and gone in PyMongo 4; delete_many is the replacement.
        output_collection.delete_many({})

        half_window = time_range / 2
        batch_size = 100

        insert_cache = []
        process_number = 0
        start_time = time.time()
        for record in aq_collection.find().sort('time'):
            aq_station_code = record['station_code']
            current_time = record['time']

            aggregation_result = cls.query_spatial_temporal_record_by_station_code(
                aq_station_code, distance, current_time - half_window, current_time + half_window)

            process_number += 1
            if process_number % 100 == 0:
                print(process_number)

            record.pop('_id', None)
            # Keep only the metadata fields at the top level; the raw feature values
            # stay available inside 'station_record'.
            new_record = {key: value for key, value in record.items() if key not in aq_schemas}
            new_record['station_type'] = 'AQI'
            new_record['aggregation_AQI'] = aggregation_result['AQI']
            new_record['aggregation_weather'] = aggregation_result['weather']
            new_record['station_record'] = record
            insert_cache.append(new_record)

            # Write in batches and report the time taken by each batch.
            if len(insert_cache) == batch_size:
                output_collection.insert_many(insert_cache)
                insert_cache = []
                end_time = time.time()
                print(process_number, end_time - start_time)
                start_time = end_time

        if insert_cache:
            output_collection.insert_many(insert_cache)


In [129]:
if __name__ == "__main__":
    # Script entry point: rebuild the fused collection named below, passing the
    # time_range and distance keyword arguments through to FusionDB.
    run_options = {'time_range': 3600, 'distance': 5000}
    FusionDB.generate_aggregation_collection('aqi_aggregation_hkust_5000_1800_0_subhour', **run_options)



100
100 0.8684239387512207
200
200 1.474116325378418
300
300 1.3455843925476074
400
400 1.399839162826538
500
500 1.3996379375457764
600
600 1.3703088760375977
700
700 1.3955011367797852
800
800 1.446476936340332
900
900 1.2221684455871582
1000
1000 0.987750768661499
1100
1100 1.1057090759277344
1200
1200 1.2240533828735352
1300
1300 1.2544405460357666
1400
1400 1.2618370056152344
1500
1500 1.3163838386535645
1600
1600 1.4300446510314941
1700
1700 1.3809726238250732
1800
1800 1.3948347568511963
1900
1900 1.4483058452606201
2000
2000 1.179119348526001
2100
2100 1.375117540359497
2200
2200 1.4054393768310547
2300
2300 1.387507677078247
2400
2400 1.3299570083618164
2500
2500 1.2775719165802002
2600
2600 1.354736566543579
2700
2700 1.3511664867401123
2800
2800 1.409775733947754
2900
2900 1.3716001510620117
3000
3000 5.177374601364136
3100
3100 4.113285303115845
3200
3200 1.3656859397888184
3300
3300 1.1862361431121826
3400
3400 1.144331455230713
3500
3500 1.3167695999145508
3600
3600 1.325

27500
27500 1.048729419708252
27600
27600 1.324446201324463
27700
27700 1.384800672531128
27800
27800 1.3439078330993652
27900
27900 1.3442800045013428
28000
28000 1.3742647171020508
28100
28100 1.4150936603546143
28200
28200 1.5735702514648438
28300
28300 0.8815550804138184
28400
28400 0.9261713027954102
28500
28500 1.3393220901489258
28600
28600 1.364042043685913
28700
28700 1.3656253814697266
28800
28800 1.3413217067718506
28900
28900 1.3370964527130127
29000
29000 1.338270902633667
29100
29100 1.2518980503082275
29200
29200 0.9053857326507568
29300
29300 1.2015430927276611
29400
29400 1.32535719871521
29500
29500 1.3440959453582764
29600
29600 1.3389642238616943
29700
29700 1.4089984893798828
29800
29800 1.3581008911132812
29900
29900 1.3423326015472412
30000
30000 1.3087706565856934
30100
30100 1.3767364025115967
30200
30200 1.108124017715454
30300
30300 1.2747836112976074
30400
30400 1.368767261505127
30500
30500 1.359025239944458
30600
30600 1.2817721366882324
30700
30700 1.3635

54200
54200 1.3126518726348877
54300
54300 1.363351821899414
54400
54400 1.3391048908233643
54500
54500 1.3244304656982422
54600
54600 1.331718921661377
54700
54700 1.3434209823608398
54800
54800 1.297435998916626
54900
54900 0.9094204902648926
55000
55000 1.0299115180969238
55100
55100 1.1392152309417725
55200
55200 1.185927391052246
55300
55300 1.080258846282959
55400
55400 1.3381686210632324
55500
55500 1.3554987907409668
55600
55600 1.3234047889709473
55700
55700 1.4225351810455322
55800
55800 1.0190849304199219
55900
55900 0.7937841415405273
56000
56000 1.1051256656646729
56100
56100 1.3547801971435547
56200
56200 1.3533198833465576
56300
56300 1.355855941772461
56400
56400 1.3461358547210693
56500
56500 1.3663256168365479
56600
56600 1.3180928230285645
56700
56700 1.1182835102081299
56800
56800 0.9947962760925293
56900
56900 1.0760712623596191
57000
57000 1.0401124954223633
57100
57100 1.2505919933319092
57200
57200 1.1971099376678467
57300
57300 1.1116082668304443
57400
57400 1.

80900
80900 1.2807672023773193
81000
81000 1.3155577182769775
81100
81100 1.336050271987915
81200
81200 1.2858848571777344
81300
81300 1.3437645435333252
81400
81400 1.1513822078704834
81500
81500 0.9859294891357422
81600
81600 1.1611342430114746
81700
81700 1.2679822444915771
81800
81800 1.3267936706542969
81900
81900 1.2907145023345947
82000
82000 1.290273666381836
82100
82100 1.3430309295654297
82200
82200 1.3517472743988037
82300
82300 1.3958394527435303
82400
82400 0.9216804504394531
82500
82500 1.3110852241516113
82600
82600 1.4626002311706543
82700
82700 1.3078227043151855
82800
82800 1.3215668201446533
82900
82900 1.3806562423706055
83000
83000 1.3508028984069824
83100
83100 1.3539676666259766
83200
83200 1.3439936637878418
83300
83300 0.7693984508514404
83400
83400 0.681471586227417
83500
83500 1.027383804321289
83600
83600 1.270096778869629
83700
83700 1.2803881168365479
83800
83800 1.2783799171447754
83900
83900 1.313983678817749
84000
84000 1.3401329517364502
84100
84100 1.

107100
107100 1.2874641418457031
107200
107200 1.049839973449707
107300
107300 1.1818761825561523
107400
107400 1.211759090423584
107500
107500 1.2656688690185547
107600
107600 1.2587878704071045
107700
107700 1.4337797164916992
107800
107800 0.9526326656341553
107900
107900 1.175140142440796
108000
108000 1.065549373626709
108100
108100 1.2781953811645508
108200
108200 1.375809669494629
108300
108300 1.3221428394317627
108400
108400 1.383638858795166
108500
108500 1.3177409172058105
108600
108600 1.4032905101776123
108700
108700 1.0762989521026611
108800
108800 0.8740987777709961
108900
108900 1.312175989151001
109000
109000 1.3054108619689941
109100
109100 1.2620809078216553
109200
109200 1.2844021320343018
109300
109300 1.3397254943847656
109400
109400 1.3225114345550537
109500
109500 1.2779769897460938
109600
109600 1.4260368347167969
109700
109700 1.0703575611114502
109800
109800 1.0213990211486816
109900
109900 1.3640613555908203
110000
110000 1.18719482421875
110100
110100 1.137