In [8]:
from library import *
import os
import re
import pandas as pd
from pandas import Series
from pandas import DataFrame
import json
from io import StringIO

client = MongoClient('127.0.0.1', 27017)

db = client['air_quality_model_hkust']
sub_hour_weather_collection_name = 'subhour_weather_hkust'
sub_hour_weather_collection = db[sub_hour_weather_collection_name]

In [9]:
def init_collection():
    indexs = []
    if sub_hour_weather_collection_name not in db.collection_names():
        sub_hour_weather_collection.create_index('time')
        sub_hour_weather_collection.create_index([("loc", pymongo.GEOSPHERE)])
    else:
        for index_agg in sub_hour_weather_collection.index_information():
            indexs.append(index_agg.split("_")[0])
        if 'time' not in indexs:
            sub_hour_weather_collection.create_index('time')
        if 'loc' not in indexs:
            sub_hour_weather_collection.create_index([("loc", pymongo.GEOSPHERE)])


In [10]:
def extract_para_and_station(file_path):
    """
    First three lines describe the fuke, fourth line describe schema, others for data
    :param file_path:
    :return:
    """

    with open(file_path) as input:
        # Read weather para, lon and lat from the first line
        line = input.readline()
        line = line.replace('"', '')
        line = re.sub("[\(\[].*?[\)\]]", "", line)
        segs = line.split(',')
        segs = [seg.strip() for seg in segs]
        weather = '_'.join(segs[0].lower().split(' '))
        lat = segs[1].split('=')[1]
        lon = segs[2].split('=')[1]

        # Read station from the second line
        line = input.readline()
        line = line.replace('"', '')
        segs = line.split(':')
        segs = [seg.strip() for seg in segs]
        station_code = segs[1]
        return {'weather': weather, 'lat': lat, 'lon': lon, 'station_code': station_code}


In [11]:
def parser_context_line(line, weather):
    segs = line.split(',')
    segs = [seg.strip() for seg in segs]
    parse_result = {}
    time_stamp = time.strptime(segs[0], "%Y/%m/%d %H:%M:%S")
    time_stamp = time.mktime(time_stamp)
    parse_result['time'] = time_stamp
    if weather != "wind":
        parse_result[weather] = segs[2]
    else:
        parse_result['wind_speed'] = segs[2]
        parse_result['wind_direction'] = segs[3]

    return parse_result

In [12]:
def generate_panda_dataframe(file_path):
    file_config = extract_para_and_station(file_path)
    with open(file_path) as input:

        df = pd.read_csv(StringIO(input.read()), skiprows=4, header=None)  
        if file_config['weather'] != 'wind':
            df = df[[0,2]]
            df.columns = ['time', file_config['weather']]
            df = df.assign(station_code = [file_config['station_code'] for _ in range(len(df))]) 
            
        else:
            df = df[[0,2,3]]
            df.columns = ['time','wind_speed', 'wind_direction']
            df = df.assign(station_code = [file_config['station_code'] for _ in range(len(df))])  
        return df, file_config
            

In [34]:
files = read_all_files()
source_df = DataFrame()
num = 3
station_df_map = {}
station_config_map = {}
for file in files:
    print(file.split('/')[-1])

    df, file_config = generate_panda_dataframe(file)
    station_code = file_config['station_code']
    station_config_map[station_code] = [float(file_config['lon']), float(file_config['lat'])]
    print(station_code)
    if station_code not in station_df_map:
        station_df_map[station_code] = []
    station_df_map[station_code].append(df)
    
print(station_config_map)

SH_RH_221836_1143006.csv
WGL_AWS
SH_RH_222011_1140267.csv
CCH_AWS
SH_RH_222483_1141708.csv
HKS_AWS
SH_RH_222911_1139069.csv
SLW_AWS
SH_RH_223036_1141719.csv
HKO_AWS
SH_RH_223094_1139219.csv
HKA_AWS
SH_RH_223132_1141704.csv
KP_AWS
SH_RH_223158_1142556.csv
JKB_AWS
SH_RH_223353_1142648.csv
USTTB
SH_RH_223369_1142690.csv
USTPR
SH_RH_223370_1142690.csv
USTPR2
SH_RH_223379_1142675.csv
PHSUP
SH_RH_223500_1141067.csv
CPH_AWS
SH_RH_223594_1142153.csv
TC_AWS
SH_RH_223703_1143125.csv
KSC_AWS
SH_RH_223772_1142717.csv
SKG_AWS
SH_RH_223922_1139742.csv
TUN_AWS
SH_RH_224031_1142086.csv
SHA_AWS
SH_RH_224031_1143233.csv
TYW_AWS
SH_RH_224111_1141247.csv
TMS_AWS
SH_RH_224339_1140850.csv
SEK_AWS
SH_RH_224458_1141789.csv
TPO_AWS
SH_RH_224667_1140089.csv
WLP_AWS
SH_RH_224706_1139811.csv
LFS_AWS
SH_RH_225019_1141111.csv
SSH_AWS
SH_RH_225306_1141536.csv
TKL_AWS
SH_RH_225482_1144258.csv
EPC_AWS
SH_RH_221836_1143006.csv
WGL_AWS
SH_RH_222011_1140267.csv
CCH_AWS
SH_RH_222483_1141708.csv
HKS_AWS
SH_RH_222860_113933

In [35]:
for station_code in station_df_map:
    print(station_code, len(station_df_map[station_code]))

WGL_AWS 6
CCH_AWS 6
HKS_AWS 6
SLW_AWS 6
HKO_AWS 6
HKA_AWS 6
KP_AWS 6
JKB_AWS 6
USTTB 2
USTPR 4
USTPR2 2
PHSUP 3
CPH_AWS 6
TC_AWS 6
KSC_AWS 4
SKG_AWS 6
TUN_AWS 6
SHA_AWS 6
TYW_AWS 4
TMS_AWS 6
SEK_AWS 6
TPO_AWS 4
WLP_AWS 6
LFS_AWS 6
SSH_AWS 4
TKL_AWS 6
EPC_AWS 6
TCAWS 2
PHSUP2 5
YLAWS 2
STL_AWS 4
NGP_AWS 4
VP1_AWS 2
HPV_AWS 2
HKP_AWS 2
SKW_AWS 2
SE_AWS 4
KWT_AWS 2
KLC_AWS 2
SSP_AWS 2
WTS_AWS 2
TAP_AWS 4
PLC_AWS 4
KAT_AWS 2
BHD_AWS 2
CCB_AWS 2
TO_AWS 2
YTS_AWS 2
NLS_AWS 2
GI_AWS 2
SF_AWS 2
NP_AWS 2
SHW_AWS 2
R2C_AWS 2
TMT_AWS 2
SC_AWS 2
SHL_AWS 2
TPK_AWS 2
SPAWS 1


In [36]:
station_dfs_merge = {}
for station_code in station_df_map:
    dfs = station_df_map[station_code]
    print(station_code, len(dfs))
    source_df = DataFrame()
    start_time = time.time()
    for temp_df in dfs:
        temp_df = temp_df.drop_duplicates(subset = ['time', 'station_code'])
        if source_df.empty == True:
            source_df = temp_df   
        else:
            source_df = pd.merge(source_df, temp_df, how='outer', on=['time', 'station_code'], suffixes=('_c', '_c'))
            temp_df = temp_df.drop_duplicates(subset = ['time', 'station_code'])
        print(time.time() - start_time)
    station_dfs_merge[station_code] = source_df


WGL_AWS 6
0.030512571334838867
0.09750008583068848
0.1965012550354004
0.2779989242553711
0.37199878692626953
0.45149970054626465
CCH_AWS 6
0.01949906349182129
0.08550071716308594
0.18249964714050293
0.2580227851867676
0.359999418258667
0.4485011100769043
HKS_AWS 6
0.020502090454101562
0.08800148963928223
0.1990036964416504
0.27950048446655273
0.38252758979797363
0.4720020294189453
SLW_AWS 6
0.011527299880981445
0.06049847602844238
0.13449835777282715
0.19299817085266113
0.2669999599456787
0.31099915504455566
HKO_AWS 6
0.023029804229736328
0.09349966049194336
0.19352340698242188
0.2720015048980713
0.33099937438964844
0.386000394821167
HKA_AWS 6
0.020998001098632812
0.10299992561340332
0.20799970626831055
0.2904982566833496
0.38199830055236816
0.4334986209869385
KP_AWS 6
0.022501230239868164
0.1075286865234375
0.23450088500976562
0.31400299072265625
0.39300060272216797
0.4525306224822998
JKB_AWS 6
0.01797175407409668
0.08247256278991699
0.18297171592712402
0.25997161865234375
0.362002611

In [20]:
deduplicate_dfs_map = {}
for station_code in station_dfs_merge:

    agg_df = station_dfs_merge[station_code]

    agg_df = agg_df.groupby(agg_df.columns, axis=1).max()
    
    for c in agg_df.columns:
        if c[-2:] == '_c':
            agg_df = agg_df.rename(columns = {c: c[:-2]})
    print(station_code, list(agg_df.columns))
    deduplicate_dfs_map[station_code] = agg_df


WGL_AWS ['relative_humidity', 'station_code', 'temperature', 'time', 'wind_direction', 'wind_speed']
CCH_AWS ['relative_humidity', 'station_code', 'temperature', 'time', 'wind_direction', 'wind_speed']
HKS_AWS ['relative_humidity', 'station_code', 'temperature', 'time', 'wind_direction', 'wind_speed']
SLW_AWS ['relative_humidity', 'station_code', 'temperature', 'time', 'wind_direction', 'wind_speed']
HKO_AWS ['relative_humidity', 'station_code', 'temperature', 'time', 'wind_direction', 'wind_speed']
HKA_AWS ['relative_humidity', 'station_code', 'temperature', 'time', 'wind_direction', 'wind_speed']
KP_AWS ['relative_humidity', 'station_code', 'temperature', 'time', 'wind_direction', 'wind_speed']
JKB_AWS ['relative_humidity', 'station_code', 'temperature', 'time', 'wind_direction', 'wind_speed']
USTTB ['relative_humidity', 'station_code', 'temperature', 'time']
USTPR ['relative_humidity', 'station_code', 'temperature', 'time', 'wind_direction', 'wind_speed']
USTPR2 ['relative_humidity'

In [44]:
# odo('accounts.csv', pd.DataFrame)

for station_code in deduplicate_dfs_map:
    print('Processing', station_code, 'of', len(deduplicate_dfs_map))
    current_df = deduplicate_dfs_map[station_code]
    current_df['time'] = current_df['time'].apply(lambda t:  time.mktime(time.strptime(t, "%Y/%m/%d %H:%M:%S")) if type(t) == str else t)
    current_df = current_df.assign(loc = [station_config_map[station_code] for _ in range(len(current_df))]) 
    
    current_df_T = current_df.T
    current_df_T_dict = current_df_T.to_dict()
    dict_arr = []
    for key in current_df_T_dict:
        dict_arr.append(current_df_T_dict[key])
    sub_hour_weather_collection.insert_many(dict_arr)


Processing WGL_AWS of 59
Processing CCH_AWS of 59
Processing HKS_AWS of 59
Processing SLW_AWS of 59
Processing HKO_AWS of 59
Processing HKA_AWS of 59
Processing KP_AWS of 59
Processing JKB_AWS of 59
Processing USTTB of 59
Processing USTPR of 59
Processing USTPR2 of 59
Processing PHSUP of 59
Processing CPH_AWS of 59
Processing TC_AWS of 59
Processing KSC_AWS of 59
Processing SKG_AWS of 59
Processing TUN_AWS of 59
Processing SHA_AWS of 59
Processing TYW_AWS of 59
Processing TMS_AWS of 59
Processing SEK_AWS of 59
Processing TPO_AWS of 59
Processing WLP_AWS of 59
Processing LFS_AWS of 59
Processing SSH_AWS of 59
Processing TKL_AWS of 59
Processing EPC_AWS of 59
Processing TCAWS of 59
Processing PHSUP2 of 59


ValueError: Buffer has wrong number of dimensions (expected 1, got 0)

In [None]:
data = {'k1':[1,2,3,4,5],'name':['a','b','c','d','e'], 'mark': [11,22,33,44,55], 'loc': [[1,1],[2,1],[3,1],[4,1],[5,1]]}
data2 = {'k1':[1,2,3,6,7],'name':['a','b','c','d','e'], 'mark': [111,122,133,144,155], 'loc': [[1,1],[2,2],[2,3],[4,3],[5,3]]}
data3 = {'k1':[1,2,3,6,7],'name':['a','b','c','d','e'], 'mark': [211,222,233,244,255]}
df1 = DataFrame(data)
df2 = DataFrame(data2)
df3 = DataFrame(data3)
print(df1)
print('\n')
print(df2)
print('\n')
print(df3)

In [None]:
test = pd.merge(df1, df2, how='outer', on=['k1', 'name'], suffixes=('_1', '_1'))
test = pd.merge(test, df3, how='outer', on=['k1', 'name'], copy = True, suffixes=('_1', '_1'))
test = pd.merge(test, df3, how='outer', on=['k1', 'name'], copy = True, suffixes=('_1', '_1'))
test = pd.merge(test, df3, how='outer', on=['k1', 'name'], copy = True, suffixes=('_1', '_1'))
# merge_df = pd.merge(merge_df, df3, how='outer', on=['k1', 'name'], copy = True, suffixes=('_', '_'))

# for c in merge_df.columns:
#     if c[-2:] == '_c':
#         merge_df = merge_df.rename(columns = {c: c[:-2]})
# print(' ')
# print(merge_df)
test


In [None]:
from odo import odo

In [None]:
data = {'k1':[1,2,3,4,5],'name':['a','b','c','d','e'], 'mark': [11,22,33,44,55]}
data2 = {'k1':[1,2,3,6,7],'name':['a','b','c','d','e'], 'mark': [111,122,133,144,155]}
data3 = {'k1':[1,2,3,6,7],'name':['a','b','c','d','e'], 'mark': [211,222,233,244,255]}
df1 = DataFrame(data)
df2 = DataFrame(data2)
df3 = DataFrame(data3)
test = pd.merge(df1, df2, how='outer', on=['k1', 'name'], suffixes=('', '_1'))
test = pd.merge(test, df3, how='outer', on=['k1', 'name'], suffixes=('', '_1'))
test = pd.merge(test, df3, how='outer', on=['k1', 'name'], suffixes=('', '_1'))
test = pd.merge(test, df3, how='outer', on=['k1', 'name'], suffixes=('', '_1'))

In [None]:
test.columns

In [None]:
# def maxx(arr):
#     m = -99999
#     for a in arr:
#         m = a if a > m else m
#     return m
test = test.groupby(test.columns, axis=1).max()

In [None]:
test.T


In [41]:
s = '123'
s2 = 123
type(s) == str
print(type(s2) == float)

False
