In [None]:
import h5py
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:
%matplotlib inline

In [None]:
# Identifiers of the three SONYC sensor nodes used in this notebook.
node1, node3, node4 = (
    'sonycnode-b827eb86d458.sonyc',
    'sonycnode-b827ebb40450.sonyc',
    'sonycnode-b827eb73e772.sonyc',
)

# Each node's 2017 SPL recordings live in one HDF5 file named after the node.
path1, path3, path4 = (
    '../sonyc/spl/2017/{}.h5'.format(node_id)
    for node_id in (node1, node3, node4)
)

In [None]:
# Boundaries of the training and testing periods. Downstream filters treat
# them as half-open intervals: start <= timestamp < end.
train_start = pd.Timestamp('2017-04-01')
train_end = pd.Timestamp('2017-07-01')
test_start = pd.Timestamp('2017-07-01')
test_end = pd.Timestamp('2017-08-01')

In [None]:
def load_spl_data(path):
    """Load one node's minute-level SPL readings from an HDF5 file.

    Reads the 'minute_intervals' table, drops every row whose Unix
    'timestamp' is at or after 2018-01-01 00:00:00 UTC, and converts the
    remaining timestamps (seconds since the epoch) to datetimes.

    Parameters
    ----------
    path : str
        Location of the node's HDF5 file.

    Returns
    -------
    pandas.DataFrame
        Columns 'datetime[utc]' and 'dBAS_mean', one row per retained reading.
    """
    # 2018-01-01T00:00:00Z expressed as seconds since the Unix epoch.
    first_second_of_2018 = 1514764800

    raw = pd.read_hdf(path, key='minute_intervals')
    kept = raw.loc[raw['timestamp'] < first_second_of_2018]  # remove any 2018 values

    # assign() builds a new frame, so the new column is never written onto a
    # filtered slice of `raw` (which triggers pandas' chained-assignment warning).
    with_datetime = kept.assign(
        **{'datetime[utc]': pd.to_datetime(kept['timestamp'], unit='s')}
    )

    return with_datetime[['datetime[utc]', 'dBAS_mean']]

def reduce_spl_data(df, start, end, minute=51):
    """Down-sample minute-level SPL readings to one value per hour.

    Keeps only the rows whose timestamp lies in the half-open interval
    [start, end) and whose minute-of-hour equals `minute`, truncates each
    timestamp to the whole minute (HH:MM:00), and averages 'dBAS_mean' over
    rows that share the same truncated timestamp.

    The timestamp is derived from the data itself, so the result is correct
    for any year and for ranges that span several years (the year is not
    assumed to be 2017).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column 'datetime[utc]' and a numeric column
        'dBAS_mean'.
    start, end : pandas.Timestamp
        Inclusive lower and exclusive upper bound of the period to keep.
    minute : int, default 51
        Minute-of-hour to sample; the weather observations in this notebook
        are filtered to minute 51 as well.

    Returns
    -------
    pandas.DataFrame
        Columns 'datetime[utc]' and 'dBAS_mean', sorted chronologically with
        a fresh 0..n-1 index.
    """
    stamps = df['datetime[utc]']
    wanted = (start <= stamps) & (stamps < end) & (stamps.dt.minute == minute)
    selected = df.loc[wanted, ['datetime[utc]', 'dBAS_mean']]

    # Dropping seconds maps every kept reading onto YYYY-MM-DD HH:<minute>:00,
    # which is the key the readings are averaged (and later merged) on.
    keyed = selected.assign(
        **{'datetime[utc]': selected['datetime[utc]'].dt.floor('min')}
    )

    # groupby sorts its keys, so the output is in chronological order.
    hourly = keyed.groupby('datetime[utc]')['dBAS_mean'].mean().reset_index()

    return hourly[['datetime[utc]', 'dBAS_mean']]

# load spl stuff: read each node's minute-level readings ...
spl1_df, spl3_df, spl4_df = (
    load_spl_data(node_path) for node_path in (path1, path3, path4)
)

# ... then down-sample each one over the whole study period (train + test).
reduced_spl1_df, reduced_spl3_df, reduced_spl4_df = (
    reduce_spl_data(node_spl_df, train_start, test_end)
    for node_spl_df in (spl1_df, spl3_df, spl4_df)
)

In [None]:
# load weather stuff
weather_df = pd.read_csv(
    '../data/weather.csv',
    parse_dates=['datetime[utc]'],
    usecols=['datetime[utc]', 'precipitation[mm]'],
)

# Restrict to the study period (train + test) and to observations taken at
# minute 51 of each hour.
weather_stamps = weather_df['datetime[utc]']
in_study_period = (weather_stamps >= train_start) & (weather_stamps < test_end)
at_minute_51 = weather_stamps.dt.minute == 51  # only using this rolling window

reduced_weather_df = weather_df.loc[in_study_period & at_minute_51].reset_index(drop=True)

In [None]:
# TODO: verify later that each hour is also represented
def merge_dfs(weather_df, spl_df):
    """Join weather and SPL readings on timestamp, fill gaps, and label rain.

    The two frames are outer-joined on 'datetime[utc]' and sorted by it.
    Missing 'precipitation[mm]' values become 0.0; missing 'dBAS_mean'
    values become the mean of the non-missing 'dBAS_mean' values in the
    joined frame. A message is printed for each column that needed filling.
    A binary column 'rained' (1 when precipitation > 0, else 0) is added.

    NOTE(review): the 'dBAS_mean' fill value is computed over every row
    passed in; the callers in this notebook pass frames that span both the
    training and testing periods - confirm that is acceptable.

    Returns the joined frame with columns 'datetime[utc]',
    'precipitation[mm]', 'dBAS_mean' and 'rained'.
    """
    joined = pd.merge(weather_df, spl_df, how='outer', on='datetime[utc]')
    working_df = joined.sort_values(by='datetime[utc]')

    # Rows that exist only in the SPL data have no precipitation reading.
    rain_missing = working_df['precipitation[mm]'].isna()
    if rain_missing.any():
        n_rain_missing = int(rain_missing.sum())
        print(f"{n_rain_missing} NaNs in 'precipitation[mm],' replacing them with 0.0")
        working_df.loc[rain_missing, 'precipitation[mm]'] = 0.0

    # Rows that exist only in the weather data have no sound-level reading.
    spl_missing = working_df['dBAS_mean'].isna()
    if spl_missing.any():
        avg = working_df['dBAS_mean'].mean()  # NaNs are skipped by default
        n_spl_missing = int(spl_missing.sum())
        print(f"{n_spl_missing} NaNs in 'dBAS_mean,' replacing them with {avg}")
        working_df.loc[spl_missing, 'dBAS_mean'] = avg

    # add rained class
    working_df['rained'] = working_df['precipitation[mm]'].gt(0.00).astype(int)

    return working_df

In [None]:
# Pair the shared weather series with each node's hourly SPL series.
merged_df1, merged_df3, merged_df4 = (
    merge_dfs(reduced_weather_df, node_spl_df)
    for node_spl_df in (reduced_spl1_df, reduced_spl3_df, reduced_spl4_df)
)

In [None]:
# Build one frame holding all three nodes' SPL readings side by side.
#
# The node 3 / node 4 columns are joined on 'datetime[utc]' instead of being
# assigned directly. Direct column assignment aligns on the row index, and the
# per-node frames come from separate outer merges, so rows with equal index
# labels are not guaranteed to describe the same timestamp.
merged_all_df = merged_df1.rename(columns={'dBAS_mean': 'dBAS_mean_01'})

for node_col, node_df in (('dBAS_mean_03', merged_df3), ('dBAS_mean_04', merged_df4)):
    node_readings = (
        node_df[['datetime[utc]', 'dBAS_mean']]
        # one row per timestamp, so the left join cannot multiply rows
        .drop_duplicates(subset='datetime[utc]')
        .rename(columns={'dBAS_mean': node_col})
    )
    merged_all_df = merged_all_df.merge(node_readings, how='left', on='datetime[utc]')

    # Timestamps present for node 1 but absent for this node would otherwise
    # be NaN; fill them with the node's mean level, the same policy merge_dfs
    # applies to missing 'dBAS_mean' values.
    merged_all_df[node_col] = merged_all_df[node_col].fillna(node_readings[node_col].mean())

In [None]:
# assumption: time independent
def sep_train_test_data(merged_df, training_int, testing_int,
                        feature_cols=('dBAS_mean',), label_col='rained'):
    """Split a merged frame into train/test feature and label arrays by date.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain 'datetime[utc]', every column in `feature_cols`, and
        `label_col`.
    training_int, testing_int : tuple
        `(start, end)` pairs; rows with start <= 'datetime[utc]' < end are
        selected for the respective split.
    feature_cols : str or sequence of str, default ('dBAS_mean',)
        Column(s) used as model inputs.
    label_col : str, default 'rained'
        Column used as the class label.

    Returns
    -------
    tuple
        `((X_train, y_train), (X_test, y_test))` where each X is a 2-D array
        of shape (n_rows, n_features) and each y is a 1-D array.
    """
    # Accept a single column name as well as a sequence of names.
    if isinstance(feature_cols, str):
        feature_cols = [feature_cols]
    else:
        feature_cols = list(feature_cols)

    stamps = merged_df['datetime[utc]']

    def _window_xy(interval):
        """Return (X, y) for the rows whose timestamp lies in [start, end)."""
        window_start, window_end = interval
        rows = merged_df.loc[(window_start <= stamps) & (stamps < window_end)]
        # Selecting with a list keeps X two-dimensional even for one feature.
        return rows[feature_cols].to_numpy(), rows[label_col].to_numpy()

    return _window_xy(training_int), _window_xy(testing_int)

In [None]:
# Node-1 feature/label arrays for the training and testing periods.
training_data1, testing_data1 = sep_train_test_data(
    merged_df=merged_df1,
    training_int=(train_start, train_end),
    testing_int=(test_start, test_end),
)

In [None]:
# Train/test split for the multi-node model: same date windows as the
# single-node split, but with one SPL feature column per node.
node_feature_cols = ['dBAS_mean_01', 'dBAS_mean_03', 'dBAS_mean_04']
all_node_stamps = merged_all_df['datetime[utc]']

training_cla_df = merged_all_df.loc[
    (all_node_stamps >= train_start) & (all_node_stamps < train_end),
    ['rained'] + node_feature_cols,
]
testing_cla_df = merged_all_df.loc[
    (all_node_stamps >= test_start) & (all_node_stamps < test_end),
    ['rained'] + node_feature_cols,
]

# (X, y) pairs in the order the estimators' fit/score methods expect.
training_data = (
    training_cla_df[node_feature_cols].to_numpy(),
    training_cla_df['rained'].to_numpy(),
)
testing_data = (
    testing_cla_df[node_feature_cols].to_numpy(),
    testing_cla_df['rained'].to_numpy(),
)

### Logistic Regression Classification Node 1

In [None]:
# Baseline: logistic regression with default settings, trained on node 1's
# SPL reading as the only feature.
# NOTE(review): the SVC and random-forest models in this notebook pass
# class_weight='balanced' while this one does not - confirm that is intended.
clf_logistic = LogisticRegression()
clf_logistic.fit(*training_data1)

In [None]:
# Score of the node-1 logistic model on the node-1 test split.
clf_logistic.score(*testing_data1)

In [None]:
# Classification report: true test labels vs. the logistic model's predictions
# on the node-1 test features.
print(classification_report(testing_data1[1], clf_logistic.predict(testing_data1[0])))

### Logistic Regression Classification All Nodes

In [None]:
# Same default logistic regression, now trained on the three per-node SPL
# features. This rebinds `clf_logistic`, replacing the node-1 model.
clf_logistic = LogisticRegression()
clf_logistic.fit(*training_data)

In [None]:
# Score of the multi-node logistic model on the multi-node test split.
clf_logistic.score(*testing_data)

In [None]:
# Classification report: true test labels vs. the logistic model's predictions
# on the multi-node test features.
print(classification_report(testing_data[1], clf_logistic.predict(testing_data[0])))

### SVM Classification Node 1

In [None]:
# Class-weighted SVC on node 1's SPL reading; C is tuned over the integers
# 10..30 with 5-fold cross-validation.
svm = SVC(gamma='auto', class_weight='balanced')
param_grid = {
    'C': list(range(10, 31)),
}
clf_svm = GridSearchCV(svm, param_grid, cv=5)
# Fit on the TRAINING split. Fitting on testing_data1 would tune and train the
# model on the very rows it is later evaluated on.
clf_svm.fit(*training_data1)

In [None]:
# Value of C selected by the grid search.
clf_svm.best_params_

In [None]:
# Score on the node-1 TEST split, so the number is comparable with the scores
# reported for the other models in this notebook.
clf_svm.score(*testing_data1)

In [None]:
# Classification report: true test labels vs. the tuned SVC's predictions on
# the node-1 test features.
print(classification_report(testing_data1[1], clf_svm.predict(testing_data1[0])))

### SVM Classification All Nodes

In [None]:
# Class-weighted SVC on the three per-node SPL features; C is tuned over the
# integers 10..50 with 5-fold cross-validation on the training split.
# Rebinds `svm`, `param_grid` and `clf_svm` from the node-1 section.
svm = SVC(gamma='auto', class_weight='balanced')
param_grid = {
    'C': list(range(10, 51)),
}
clf_svm = GridSearchCV(svm, param_grid, cv=5)
clf_svm.fit(*training_data)

In [None]:
# Value of C selected by the grid search for the multi-node SVC.
clf_svm.best_params_

In [None]:
# Score on the multi-node TEST split, matching how the logistic-regression and
# random-forest sections report their scores.
clf_svm.score(*testing_data)

In [None]:
# Classification report: true test labels vs. the tuned SVC's predictions on
# the multi-node test features.
print(classification_report(testing_data[1], clf_svm.predict(testing_data[0])))

### Random Forest Classification Node 1

In [None]:
# Class-weighted random forest on node 1's SPL reading; tree depth and forest
# size are tuned with 5-fold cross-validation on the training split.
# random_state is fixed so that the selected parameters, the score and the
# report below are the same on every Restart & Run All.
rf = RandomForestClassifier(class_weight='balanced', random_state=0)
param_grid = {
    'max_depth': [2, 4, 8, 16, 32],
    'n_estimators': [100, 200, 300, 400]
}
clf_rf = GridSearchCV(rf, param_grid, cv=5)
clf_rf.fit(*training_data1)

In [None]:
# Score of the tuned node-1 random forest on the node-1 test split.
clf_rf.score(*testing_data1)

In [None]:
# max_depth / n_estimators combination selected by the grid search.
clf_rf.best_params_

In [None]:
# Classification report: true test labels vs. the tuned random forest's
# predictions on the node-1 test features.
print(classification_report(testing_data1[1], clf_rf.predict(testing_data1[0])))

### Random Forest Classification All Nodes

In [None]:
# Class-weighted random forest on the three per-node SPL features; tree depth
# and forest size are tuned with 5-fold cross-validation on the training split.
# random_state is fixed so that the selected parameters, the score and the
# report below are the same on every Restart & Run All.
rf = RandomForestClassifier(class_weight='balanced', random_state=0)
param_grid = {
    'max_depth': [2, 4, 8, 16],
    'n_estimators': [100, 200]
}
clf_rf = GridSearchCV(rf, param_grid, cv=5)
clf_rf.fit(*training_data)

In [None]:
# Score of the tuned multi-node random forest on the multi-node test split.
clf_rf.score(*testing_data)

In [None]:
# max_depth / n_estimators combination selected by the grid search for the
# multi-node random forest.
clf_rf.best_params_

In [None]:
# Classification report: true test labels vs. the tuned random forest's
# predictions on the multi-node test features.
print(classification_report(testing_data[1], clf_rf.predict(testing_data[0])))