In [129]:
import os
import pickle

import pandas as pd
import numpy as np

from pathlib import Path

import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.express as px

from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# import matplotlib.pyplot as plt

In [128]:
# Root directory under which the trained model pickle is written at the end of the notebook.
model_base_path = "/data/models/precipitation"

In [88]:
! pip install chart-studio plotly plotly-geo
# ! pip install plotly
# ! pip uninstall -y chart-studio plotly
# !pip install plotly==3.10.0



In [89]:
!ls /data/stations

benque_prec.csv			 hawkesworth_bridge_wtlv_min.csv
central_farm_prec.csv		 santa_elena_prec.csv
chaa_creek_prec.csv		 santa_elena_wtlv1.csv
hawkesworth_bridge_prec.csv	 santa_elena_wtlv_max.csv
hawkesworth_bridge_wtlv1.csv	 santa_elena_wtlv_min.csv
hawkesworth_bridge_wtlv_max.csv


In [90]:
# Load every station's precipitation CSV into one long frame. Each row is tagged
# with the station name taken from the file name
# (e.g. "santa_elena_prec.csv" -> "santa_elena").
station_frames = []
# sorted() makes the load order (and therefore the row order) deterministic.
for f in sorted(Path('/data/stations').rglob('*_prec*.csv')):

    # benque is left out on purpose: its series has too much bad data.
    if "benque" in str(f):
        continue

    station = f.name.split('_prec')[0]
    print(station)
    _df = pd.read_csv(f, parse_dates=["datetime", "updated_at"])
    _df['station'] = station
    station_frames.append(_df)

# Concatenate once instead of re-copying the growing frame on every iteration.
# df_all stays None when no file matched, exactly as before.
df_all = pd.concat(station_frames, ignore_index=True) if station_frames else None

central_farm
chaa_creek
hawkesworth_bridge
santa_elena


In [91]:
# Drop bookkeeping columns that the analysis does not use. errors='ignore' keeps
# the cell re-runnable: a second run no longer raises KeyError for columns that
# are already gone.
df_all = df_all.drop(columns=['updated_at', 'variable_id'], errors='ignore')

In [92]:
df_all.head()

Unnamed: 0,datetime,measured,station_id,station
0,2020-04-19 02:00:00+00:00,0.0,169,central_farm
1,2020-04-19 02:05:00+00:00,0.0,169,central_farm
2,2020-04-19 02:10:00+00:00,0.0,169,central_farm
3,2020-04-19 02:15:00+00:00,0.0,169,central_farm
4,2020-04-19 02:20:00+00:00,0.0,169,central_farm


In [93]:
# Daily total of `measured` per station. The value column is selected explicitly
# so the result does not depend on which other columns the CSVs happen to carry,
# and "1D" is the canonical spelling of the daily frequency alias.
df_day = (
    df_all
    .groupby([pd.Grouper(key="datetime", freq="1D"), "station", "station_id"])[["measured"]]
    .sum()
)

In [94]:
df_day.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,measured
datetime,station,station_id,Unnamed: 3_level_1
2020-04-19 00:00:00+00:00,central_farm,169,0.0
2020-04-20 00:00:00+00:00,central_farm,169,0.0
2020-04-21 00:00:00+00:00,central_farm,169,0.0
2020-04-22 00:00:00+00:00,central_farm,169,0.0
2020-04-23 00:00:00+00:00,central_farm,169,0.0


In [95]:
# Move the group keys (datetime, station, station_id) out of the index and back
# into ordinary columns, so the plotting and pivot cells can refer to them by name.
df_day.reset_index(inplace=True)

In [96]:
# Daily totals per station as grouped bars. The title describes the data shown
# (the previous one was left over from the plotly range-slider tutorial).
fig = px.bar(
    df_day,
    x="datetime",
    y="measured",
    color="station",
    barmode="group",
    title="Daily precipitation totals by station",
)

# Range slider under the x axis plus quick-zoom buttons above it.
# update_xaxes returns the figure, so as the last expression it renders the chart.
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=[
            dict(count=7, label="7d", step="day", stepmode="backward"),
            dict(count=10, label="10d", step="day", stepmode="backward"),
            dict(count=14, label="14d", step="day", stepmode="backward"),
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all"),
        ]
    ),
)

In [97]:
# First and last day available per station. String aliases are used instead of
# np.min / np.max: passing NumPy callables to agg is deprecated in pandas, and
# the resulting column names ('amin' vs 'min') depended on the NumPy version.
df_day.groupby('station').agg({'datetime': ['min', 'max']})

Unnamed: 0_level_0,datetime,datetime
Unnamed: 0_level_1,amin,amax
station,Unnamed: 1_level_2,Unnamed: 2_level_2
central_farm,2020-04-19 00:00:00+00:00,2020-11-30 00:00:00+00:00
chaa_creek,2020-05-04 00:00:00+00:00,2020-11-30 00:00:00+00:00
hawkesworth_bridge,2020-06-24 00:00:00+00:00,2020-11-30 00:00:00+00:00
santa_elena,2020-06-26 00:00:00+00:00,2020-11-30 00:00:00+00:00


In [98]:
# Wide layout: one row per day, one column per station, cells hold `measured`.
# pivot only reads the three columns named here, so no pre-selection is needed.
data = df_day.pivot(index='datetime', columns='station', values='measured')

In [99]:
data.columns

Index(['central_farm', 'chaa_creek', 'hawkesworth_bridge', 'santa_elena'], dtype='object', name='station')

In [100]:
# benque is removed because it has so much bad data
# data = data.drop(columns=['benque'])

In [101]:
# Keep only the days on which every station has a value: the pivot leaves NaN
# wherever a station has no row for a day, and dropna() removes those days in place.

print(f"data before dropna: shape={data.shape}")
data.dropna(inplace=True)
print(f"data after  dropna: shape={data.shape}")

data before dropna: shape=(226, 4)
data after  dropna: shape=(158, 4)


In [102]:
# Order the rows chronologically (the index is the daily datetime from the pivot).
data.sort_index(inplace=True)

In [103]:
data

station,central_farm,chaa_creek,hawkesworth_bridge,santa_elena
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-26 00:00:00+00:00,2.8,0.0,0.4,0.000
2020-06-27 00:00:00+00:00,7.4,0.2,3.2,4.572
2020-06-28 00:00:00+00:00,0.0,1.4,0.0,0.000
2020-06-29 00:00:00+00:00,1.2,0.0,0.0,0.000
2020-06-30 00:00:00+00:00,5.2,2.2,1.0,6.604
...,...,...,...,...
2020-11-26 00:00:00+00:00,10.2,5.4,21.2,30.480
2020-11-27 00:00:00+00:00,0.2,0.2,0.2,0.000
2020-11-28 00:00:00+00:00,0.0,0.0,0.0,0.000
2020-11-29 00:00:00+00:00,0.0,0.0,5.0,0.508


In [111]:
# Grouped daily bars: the average of the other stations next to each station's own total.
#
# `avg` and `label` exist only after create_data_classification() has been applied
# to `data`, and that call sits further down the notebook. Reading them as
# data.avg / data.label therefore raised AttributeError on a top-to-bottom run.
# Traces are now built only for the columns that are present.
bar_order = ['avg', 'santa_elena', 'central_farm', 'hawkesworth_bridge', 'chaa_creek']

# The class label is shown as text on the santa_elena bars, when it is available.
label_text = data['label'] if 'label' in data.columns else None

fig = go.Figure(data=[
    go.Bar(
        name=col,
        x=data.index,
        y=data[col],
        text=label_text if col == 'santa_elena' else None,
    )
    for col in bar_order
    if col in data.columns
])

# Change the bar mode
fig.update_layout(barmode='group')

# Range slider under the x axis plus quick-zoom buttons above it.
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=[
            dict(count=7, label="7d", step="day", stepmode="backward"),
            dict(count=10, label="10d", step="day", stepmode="backward"),
            dict(count=14, label="14d", step="day", stepmode="backward"),
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all"),
        ]
    ),
)

fig.show()

In [121]:
def split_df(df, x_columns, y_column, day):
    """Split a frame into train and test parts around a cut-off calendar date.

    The index of ``df`` is reset first, so the ``datetime`` values may live
    either in the index or in a column. Rows dated strictly before ``day`` form
    the training part; rows dated on or after ``day`` form the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    x_columns : list of str
        Feature columns returned in ``X_train`` / ``X_test``.
    y_column : str
        Target column returned in ``y_train`` / ``y_test``.
    day : datetime.date
        Cut-off date, compared against the ``.dt.date`` of each row.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    frame = df.reset_index()

    # Calendar date of every row, computed once and reused for both masks.
    row_dates = frame.datetime.dt.date

    before_cutoff = frame[row_dates < day]
    from_cutoff = frame[row_dates >= day]

    X_train, y_train = before_cutoff[x_columns], before_cutoff[y_column]
    X_test, y_test = from_cutoff[x_columns], from_cutoff[y_column]

    print(f"X_train={X_train.shape}, X_test={X_test.shape}, y_train={y_train.shape}, y_test={y_test.shape}")

    return X_train, X_test, y_train, y_test

In [107]:
def create_data_classification(df, columns, target, threshold):
    """Label each row by comparing ``target`` with the mean of the other columns.

    For every row, ``avg`` is the mean of all ``columns`` except ``target``.
    ``label`` is 0 where ``target >= avg * (1 + threshold)`` and 1 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``columns``. It is not modified; a copy with
        the two extra columns is returned.
    columns : sequence of str
        Candidate column names (list, Index or NumPy array). ``target`` is
        removed from them before averaging.
    target : str
        Column being classified.
    threshold : float
        Relative margin above the average at which the label becomes 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a float ``avg`` column and an integer ``label`` column.

    Raises
    ------
    ValueError
        If ``target`` or the computed average is NaN in any row, because no
        label can be assigned there.
    """
    # A boolean mask only works on an array; np.asarray lets plain lists through too.
    columns = np.asarray(columns)
    columns = columns[columns != target]

    # Work on a copy so the caller's frame is never left half-labelled.
    df = df.copy()
    df['avg'] = df[columns].mean(axis=1)

    limit = df['avg'] * (1 + threshold)
    if df[target].isna().any() or limit.isna().any():
        raise ValueError(
            f"cannot label rows where '{target}' or the average of the other columns is NaN"
        )

    # np.int was removed in NumPy 1.24; the builtin int is what it aliased.
    df['label'] = np.where(df[target] >= limit, 0, 1).astype(int)

    print(df.head())
    return df
    
# Name the station columns explicitly instead of passing data.columns.values:
# once this cell has run, `data` also carries `avg` and `label`, and a re-run
# would fold those two columns into the average.
station_columns = np.array(['central_farm', 'chaa_creek', 'hawkesworth_bridge', 'santa_elena'])
data = create_data_classification(data, station_columns, 'santa_elena', 0.8)

station                    central_farm  chaa_creek  hawkesworth_bridge  \
datetime                                                                  
2020-06-26 00:00:00+00:00           2.8         0.0                 0.4   
2020-06-27 00:00:00+00:00           7.4         0.2                 3.2   
2020-06-28 00:00:00+00:00           0.0         1.4                 0.0   
2020-06-29 00:00:00+00:00           1.2         0.0                 0.0   
2020-06-30 00:00:00+00:00           5.2         2.2                 1.0   

station                    santa_elena       avg  label  
datetime                                                 
2020-06-26 00:00:00+00:00        0.000  1.066667      1  
2020-06-27 00:00:00+00:00        4.572  3.600000      1  
2020-06-28 00:00:00+00:00        0.000  0.466667      1  
2020-06-29 00:00:00+00:00        0.000  0.400000      1  
2020-06-30 00:00:00+00:00        6.604  2.800000      0  


In [112]:
data.columns

Index(['central_farm', 'chaa_creek', 'hawkesworth_bridge', 'santa_elena',
       'avg', 'label'],
      dtype='object', name='station')

In [122]:
# Chronological hold-out: days before 2020-11-01 train the model, later days test it.
# NOTE(review): santa_elena is both a feature and the column the label is derived
# from — confirm that this is intended.
X_train, X_test, y_train, y_test = split_df(data, 
                                            ['central_farm', 'chaa_creek', 'hawkesworth_bridge', 'santa_elena'],
                                            'label',
                                            datetime(2020, 11, 1).date())

X_train=(128, 4), X_test=(30, 4), y_train=(128,), y_test=(30,)


In [126]:
# A fixed random_state makes the forest (and the accuracy reported below)
# reproducible across kernel restarts.
clf = RandomForestClassifier(max_depth=10, n_estimators=300, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [127]:
# Model accuracy on the held-out rows: the share of predictions in y_pred that match y_test.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6666666666666666


In [130]:
# Persist the fitted classifier under model_base_path.
filename_model = f'{model_base_path}/rf_001.pickle'
model_dir = os.path.dirname(filename_model)
# exist_ok=True replaces the separate exists() check, which could fail if the
# directory appeared between the check and the creation.
os.makedirs(model_dir, exist_ok=True)
# Pickle files can execute code when loaded; only load this file from a trusted location.
with open(filename_model, "wb") as f:
    pickle.dump(clf, f)