In [4]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 93 kB/s eta 0:00:01
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10


In [1]:

import datetime
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
# Folder holding the input workbook/CSVs, relative to the notebook's working directory.
DATA_DIR = "../data"

In [6]:
# Read every sheet of the workbook into a dict keyed by sheet name
# (flatten_sheet later stores the sheet name in its "gage" column).
fp = os.path.join(DATA_DIR, "hydrograph-excel-sheet-tp-cleaned.xlsx")
# Use ExcelFile as a context manager so the workbook handle is closed once
# all sheets are parsed instead of staying open for the kernel's lifetime.
with pd.ExcelFile(fp) as xl:
    gages = xl.sheet_names
    hydro_data = {s: xl.parse(s) for s in gages}

In [7]:
def flatten_sheet(sheet_name: str, src_data: dict, min_values: int = 100) -> pd.DataFrame:
    """Stack the side-by-side (time, ft, discharge) column groups of one sheet.

    Columns with ``min_values`` or fewer non-null cells (e.g. note columns)
    are dropped. The remaining columns must cycle through headers containing
    "time", "ft" and "discharge", starting with "time". They are consumed
    three at a time, renamed to ``time`` / ``ft`` / ``m3`` and concatenated
    vertically in left-to-right order. Missing values are not filled here;
    only rows that are empty across a whole group are removed.

    Parameters
    ----------
    sheet_name : str
        Key of the sheet in ``src_data``; also written to the ``gage`` column.
    src_data : dict
        Mapping of sheet name to the DataFrame parsed from that sheet.
    min_values : int, default 100
        A column is kept only if it has more than this many non-null values.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``ft``, ``m3`` and ``gage`` with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``sheet_name`` is not in ``src_data``.
    AssertionError
        If no column passes the threshold, or the kept columns are not in
        time -> ft -> discharge order.
    """
    src_df = src_data[sheet_name]

    # Vectorised non-null count per column; zip keeps the original column order.
    non_null = src_df.notna().sum()
    keep_cols = [c for c, count in zip(src_df.columns, non_null) if count > min_values]
    assert keep_cols, (
        f"{sheet_name}: no column has more than {min_values} non-null values"
    )

    # Compare on lower-cased text; str() guards against non-string headers.
    labels = [str(c).lower() for c in keep_cols]
    assert "time" in labels[0], (
        f"{sheet_name}: first data column {keep_cols[0]!r} is not a time column"
    )

    # Each header keyword dictates which keyword the next kept column must contain.
    correct_order = {"time": "ft", "ft": "discharge", "discharge": "time"}
    for current, following in zip(labels, labels[1:]):
        for key, should_be in correct_order.items():
            if key in current:
                assert should_be in following, (
                    f"{sheet_name}: column {current!r} should be followed by a "
                    f"{should_be!r} column, found {following!r}"
                )

    # Walk the kept columns three at a time and give every group the same names
    # so the groups line up when stacked.
    data_subsets = []
    for start_col in range(0, len(keep_cols), 3):
        df_columns = keep_cols[start_col:start_col + 3]
        subset = src_df[df_columns]
        rename = dict(zip(subset.columns, ["time", "ft", "m3"]))
        data_subsets.append(subset.rename(columns=rename).dropna(how="all"))

    # Combine to a single frame and tag every row with its sheet name.
    final = pd.concat(data_subsets).reset_index(drop=True)
    final["gage"] = sheet_name
    return final

In [8]:
# Flatten each gage's sheet, then stack the results into one long table
# with a fresh 0..n-1 index.
all_sheets = [flatten_sheet(sname, hydro_data) for sname in gages]
df = pd.concat(all_sheets, ignore_index=True)

In [9]:
# Sanity check: (rows, columns) of the combined table of all gage sheets.
df.shape

(83274, 4)

In [10]:
# Preview the first rows of the combined table (time, ft, m3, gage).
df.head()

Unnamed: 0,time,ft,m3,gage
0,1984-10-01,54.0,1.52911,11402000
1,1984-10-02,52.0,1.472476,11402000
2,1984-10-03,49.0,1.387525,11402000
3,1984-10-04,49.0,1.387525,11402000
4,1984-10-05,48.0,1.359209,11402000


In [11]:
# Convert the gage column (filled from the sheet names by flatten_sheet) to a
# numeric dtype so it can be used as the join key against the location tables,
# then count the rows available per gage.
df['gage'] = pd.to_numeric(df['gage'])
df['gage'].value_counts()
# Total of 7 gages

11402000    12418
11318500    12418
11266500    12418
11208000    12053
11185500    12053
11189500    11322
11202710    10592
Name: gage, dtype: int64

## Join the gage discharge table with the new gage lat/lon table (`gage_id_location.csv`)

In [13]:
# 1. Read the gage location table (gage_id_location.csv in DATA_DIR) and preview it.
gage_loc = pd.read_csv(os.path.join(DATA_DIR,'gage_id_location.csv'))
gage_loc.head()

Unnamed: 0,gage,ll_lon,ll_lat,tr_lon,tr_lat
0,11185500.0,-118.64478,35.85291,-118.209481,36.700635
1,11189500.0,-118.383732,35.728555,-118.003533,36.437843
2,11202710.0,-118.72619,36.085359,-118.5279,36.325132
3,11208000.0,-118.818577,36.520114,-118.610906,36.677516
4,11266500.0,-119.675871,37.593748,-119.257278,37.902601


In [15]:
# 2. Inner-join the discharge records with the location table on the gage id;
#    only gages present in both tables survive the join.
gage_data = pd.merge(df, gage_loc, on="gage", how="inner")
gage_data.head()

Unnamed: 0,time,ft,m3,gage,ll_lon,ll_lat,tr_lon,tr_lat
0,1984-10-01,54.0,1.52911,11402000,-121.157674,39.855478,-120.690823,40.049659
1,1984-10-02,52.0,1.472476,11402000,-121.157674,39.855478,-120.690823,40.049659
2,1984-10-03,49.0,1.387525,11402000,-121.157674,39.855478,-120.690823,40.049659
3,1984-10-04,49.0,1.387525,11402000,-121.157674,39.855478,-120.690823,40.049659
4,1984-10-05,48.0,1.359209,11402000,-121.157674,39.855478,-120.690823,40.049659


In [16]:
# Save the joined discharge + location table to DATA_DIR as CSV,
# without writing the row index as a column.
gage_data.to_csv(os.path.join(DATA_DIR,'gage_discharge_lat_lon.csv'),index=False)

In [17]:
# -----------------------------------------------------------------------------

## Join the gage discharge table with the old gage lat/lon table (`target_gages.csv`)

In [11]:
# 1. Read the older gage location table (target_gages.csv in DATA_DIR).
# NOTE(review): this rebinds `gage_loc`, which an earlier cell used for
# gage_id_location.csv; cells run out of order will see the wrong table.
gage_loc = pd.read_csv(os.path.join(DATA_DIR,'target_gages.csv'))

In [17]:
# 2. Keep only the site id and its coordinates, under shorter column names.
# NOTE(review): the saved output of this cell shows a `long` column while the
# code names it `lon`; the output looks stale, so re-run to confirm.
lat_lon = gage_loc[['site_no', 'dec_lat_va', 'dec_long_va']].rename(
    columns={'site_no': 'gage', 'dec_lat_va': 'lat', 'dec_long_va': 'lon'}
)
lat_lon.head()

Unnamed: 0,gage,lat,long
0,11185500,35.905504,-118.467586
1,11189500,35.737452,-118.173689
2,11202710,36.161336,-118.709536
3,11208000,36.52189,-118.799265
4,11266500,37.716871,-119.666279


In [29]:
# Check the dtype of the join key on the discharge side before merging.
df['gage'].dtype

dtype('int64')

In [33]:
# 3. Join Data
gage_data = df.merge(lat_lon,on = 'gage',how = 'inner')
gage_data.head()

Unnamed: 0,time,ft,m3,gage,lat,long
0,1984-10-01,54.0,1.52911,11402000,40.002947,-120.954399
1,1984-10-02,52.0,1.472476,11402000,40.002947,-120.954399
2,1984-10-03,49.0,1.387525,11402000,40.002947,-120.954399
3,1984-10-04,49.0,1.387525,11402000,40.002947,-120.954399
4,1984-10-05,48.0,1.359209,11402000,40.002947,-120.954399


In [34]:
# Save the joined discharge + lat/lon table as CSV, without the row index.
# NOTE(review): this writes to the current working directory, whereas the
# earlier save of the other join targets DATA_DIR under the same file name.
# Confirm the intended location and consider distinct file names.
gage_data.to_csv('gage_discharge_lat_lon.csv',index=False)