In [7]:
%matplotlib inline 
# The line above is an IPython magic command: it renders each plot inline, directly below the cell that produced it.
import numpy as np
import pandas as pd
import netCDF4 as nc
import datetime as dt
import pylab as pl
import os
import urllib2
import json
from dateutil.rrule import rrule, MONTHLY

In [3]:
def extract_station_data(filepath, station_id_to_extract, data_keys=('rainfall_amount', 'rainfall_amount_qc')):
    """Load station metadata and one station's observations from a netCDF file.

    Parameters
    ----------
    filepath : str
        Path of the netCDF file to open.
    station_id_to_extract
        Value searched for in the ``station_id`` metadata column; the first
        matching station is the one extracted.
    data_keys : iterable of str, optional
        Names of the observation variables to read. Variables whose name ends
        in ``'_qc'`` are read as character arrays and joined into strings; all
        others are copied as-is. ``'timestamp'`` is always loaded and does not
        need to be listed.

    Returns
    -------
    df_station : pandas.DataFrame
        The observation rows of the requested station, indexed by
        ``timestamp``. The former row index is kept as the ``index`` column.
    metadata : pandas.DataFrame
        One row per station, indexed by the file's ``station_id`` values.

    Raises
    ------
    IndexError
        If no station in the file has the requested ``station_id``.
    """
    # Materialise once so that sets/generators give a stable column order.
    data_keys = list(data_keys)
    # Metadata variables that are stored as arrays of single characters.
    string_keys = ['station_location', 'station_municipality', 'station_province', 'station_region']

    # Open the file that was actually passed in (not a notebook-level global).
    dset = nc.Dataset(filepath)
    try:
        # NOTE(review): assumes the first ten variables of the file are the
        # per-station metadata fields (and include 'station_id') -- confirm
        # against the file layout.
        metadata_keys = list(dset.variables.keys())[:10]

        # CREATE METADATA DATAFRAME
        metadata = pd.DataFrame(index=dset['station_id'][...], columns=metadata_keys)
        for key in string_keys:
            # each entry is a sequence of characters; join them into one string
            metadata[key] = ["".join(data).strip() for data in dset[key][...]]
        for key in metadata_keys:
            # string-valued columns were already filled above
            if key not in string_keys:
                metadata[key] = dset[key][...]

        # CREATE DATA DATAFRAME
        # every observation row is labelled with the value of 'stationIndex'
        df = pd.DataFrame(index=dset['stationIndex'][...].astype(int), columns=data_keys)
        df['timestamp'] = [dt.datetime.utcfromtimestamp(t) for t in dset['timestamp'][...]]
        for key in data_keys:
            if key == 'timestamp':
                continue
            if key.endswith('_qc'):
                # quality-control flags are character arrays, like the string metadata
                df[key] = ["".join(data).strip() for data in dset[key][...]]
            else:
                df[key] = dset[key][...]
    finally:
        # everything needed has been copied into the dataframes; release the file
        dset.close()

    # Position of the requested station inside the metadata table; this is the
    # value used to label the station's rows in `df`.
    matches = np.where(metadata['station_id'] == station_id_to_extract)[0]
    if len(matches) == 0:
        raise IndexError("station_id %r not found in %s" % (station_id_to_extract, filepath))
    station_idx = matches[0]

    # A list indexer always yields a DataFrame, even when the station has only
    # a single row (a scalar label would return a Series in that case and the
    # set_index below would fail).
    df_station = df.loc[[station_idx]]
    df_station = df_station.reset_index().set_index('timestamp')

    return df_station, metadata

In [9]:
# --- extraction parameters ---
# netCDF file to read (relative path)
fname = '../dataset/agws.01m.v1.201311.nc'
# id of the station whose records should be extracted
station_id_to_extract = 179
# variables to extract from the file
data_keys = [
    'rainfall_amount',
    'rainfall_amount_qc',
    'surface_air_pressure',
    'surface_air_pressure_qc',
]

# presumably the first and last month ('YYYY-MM') of the period of interest;
# their use is not visible in this cell -- confirm against the cells below
tstart, tend = '2012-01', '2012-05'