### Goal: Retry the region-level estimation
This time the model is trained on all of the features available in the dataset.

In [None]:
import tensorflow as tf;
import numpy as np;
import matplotlib.pyplot as plt;
print(tf.__version__);
import pandas as pd;

from sklearn.preprocessing import StandardScaler;
from sklearn.preprocessing import MinMaxScaler;
from sklearn.metrics import mean_squared_error;

import datetime;

In [None]:
# Let pandas display up to 300 columns and 300 rows before truncating.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

# Make Keras use float64 as its default float type.
tf.keras.backend.set_floatx('float64')

In [None]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names


get_available_gpus()

In [None]:
# Number of monthly rows per series.
MONTHS = 60
# Rows [0, SPLIT) are used for training, the rest for testing
# (2015-2018: training, 2019: testing).
SPLIT = 48
# BATCH_SIZE = 19  # used in NN_v1
BATCH_SIZE = 24
# Number of consecutive months fed to the model for each prediction.
WINDOW_SIZE = 3

# Number of months held out for testing.
TEST_LENGTH = MONTHS - SPLIT

### Data preparation

In [None]:
# Load the unscaled dataset; the first CSV column becomes the index.
multi_data = pd.read_csv('../data/full_dataset_unscaled.csv', index_col=0)

# Distinct index labels (treated as zip codes by the rest of the notebook).
zip_ids = multi_data.index.unique()

# Drop these columns in place so they are not part of the model input.
multi_data.drop(
    columns=["City", "State", "Metro", "CountyName", "year", "month", "datetime"],
    inplace=True,
)

multi_data.head()

In [None]:
# from google.colab import drive 
# drive.mount('/content/gdrive')

# multi_data = pd.read_csv('gdrive/My Drive/full_dataset_unscaled.csv', index_col=0);
# zip_ids = multi_data.index.unique();

# multi_data.drop(["City", "State", "Metro", "CountyName", "year", "month", "datetime"],\
#                  axis = 1, inplace = True);

# multi_data.head()

In [None]:
# Number of input features: every remaining column except one
# (column 0 is used as the prediction target by windowed_dataset).
FEATURES = len(multi_data.columns) - 1

# Column names, in frame order.
feature_name = multi_data.columns.tolist()

In [None]:
# Grouping of state abbreviations into six named zones.
zone_dict = {
    "zone_1": [
        "MA", "CT", "RI", "DC", "NJ", "ME", "NH",
    ],
    "zone_2": [
        "NY",
    ],
    "zone_3": [
        "FL", "GA", "NC", "TN", "SC", "KY", "MO", "AR",
    ],
    "zone_4": [
        "CA", "WA", "OR", "AK", "HI",
    ],
    "zone_5": [
        "TX", "CO", "AZ", "KS", "UT", "NV", "MT", "OK", "ND", "NM",
    ],
    "zone_6": [
        "IL", "PA", "VA", "OH", "WI", "MD", "MN", "IN", "MI", "WV", "IA",
    ],
}

In [None]:
# Run on every zip code in the dataset.
# The commented-out alternative below restricts the run to index labels
# 10001-11000 (an earlier try looked only at NY zip codes, 10001-14905).
multi_zip = zip_ids
# multi_zip = list(multi_data[(multi_data.index >= 10001) & (multi_data.index <= 11000)].index.unique());
print(len(multi_zip))

### Utility functions

In [None]:
@tf.autograph.experimental.do_not_convert
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    """Turn a 2-D series into a batched dataset of (features, label) windows.

    Each sample spans ``window_size + 1`` consecutive rows of ``series``:
    the features are columns 1 onward of the first ``window_size`` rows, and
    the label is column 0 of the last row. Windows slide by one row and
    incomplete trailing windows are dropped.

    Args:
        series: 2-D array-like; column 0 is the target, the rest are inputs.
        window_size: number of rows used as model input per sample.
        batch_size: number of samples per emitted batch.
        shuffle_buffer: buffer size passed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, label)`` batches.
    """
    rows_per_sample = window_size + 1
    return (
        tf.data.Dataset.from_tensor_slices(series)
        .window(rows_per_sample, shift=1, drop_remainder=True)
        # Each window is itself a dataset; collapse it into one tensor.
        .flat_map(lambda rows: rows.batch(rows_per_sample))
        .shuffle(shuffle_buffer)
        .map(lambda rows: (rows[:-1, 1:], rows[-1][0]))
        .batch(batch_size)
    )

In [None]:
def plot_series(time, series, format="-", start=0, end=None):
    """Plot ``series`` against ``time`` on the current matplotlib axes.

    Args:
        time: x values.
        series: y values, aligned with ``time``.
        format: matplotlib format string for the line.
        start: first index (inclusive) of the slice to plot.
        end: last index (exclusive) of the slice to plot; None means to the end.
    """
    span = slice(start, end)
    plt.plot(time[span], series[span], format)
    plt.xlabel("Time Frame")
    plt.ylabel("ZRI")
    plt.grid(True)

### Neural network center

In [None]:
def NN_dataprep(multi):
    """Build the combined training and validation datasets for a group of zips.

    For every zip code in ``multi`` the rows of ``multi_data`` with that index
    label are min-max scaled, cut into a training part (first ``SPLIT`` rows)
    and a validation part (from ``SPLIT - WINDOW_SIZE`` onward, so the first
    validation label is row ``SPLIT``), windowed, and appended to the
    corresponding combined dataset.

    NOTE(review): the scaler is fitted on the full series of each zip code,
    i.e. including the rows that end up in the validation part.

    Args:
        multi: iterable of zip codes (index labels of ``multi_data``).

    Returns:
        (train_dataset, valid_dataset, stats) where ``stats`` maps each zip
        code to the ``"min"`` and ``"range"`` of column 0 before scaling.
    """
    def _empty_seed():
        # A single row is shorter than one window, so with drop_remainder this
        # yields no samples; it only serves as a starting point to concatenate onto.
        return windowed_dataset(np.zeros((1, FEATURES+1)), WINDOW_SIZE, BATCH_SIZE, 60)

    train_ds = _empty_seed()
    valid_ds = _empty_seed()
    stats = {}

    for zip_num in multi:
        rows = np.array(multi_data[multi_data.index == zip_num])

        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(rows)

        # Keep what is needed to map column 0 back to its original scale.
        stats[zip_num] = {
            "min": scaler.data_min_[0],
            "range": scaler.data_range_[0],
        }

        train_part = scaled[:SPLIT]
        # Start WINDOW_SIZE rows early so the first label is row SPLIT.
        valid_part = scaled[SPLIT-WINDOW_SIZE:]

        train_ds = train_ds.concatenate(
            windowed_dataset(train_part, WINDOW_SIZE, BATCH_SIZE, 60))
        valid_ds = valid_ds.concatenate(
            windowed_dataset(valid_part, WINDOW_SIZE, BATCH_SIZE, 60))

    return train_ds.prefetch(1), valid_ds.prefetch(1), stats

In [None]:
def NN_model(dataset, termination=0, test=None, epochs=20):
    """Build, compile and train the LSTM regression model.

    The network is LSTM(32) -> Dense(16, relu) -> Dropout(0.2) -> Dense(1),
    taking inputs of shape ``(WINDOW_SIZE, FEATURES)``. It is trained with the
    Huber loss and the Adam optimizer (Keras defaults), tracking MAE and MSE.

    Args:
        dataset: training ``tf.data.Dataset`` of (features, label) batches.
        termination: training stops early once the training MSE of an epoch
            drops below this value. The default of 0 disables early stopping.
        test: optional validation dataset evaluated after each epoch.
        epochs: maximum number of training epochs.

    Returns:
        The trained ``tf.keras`` model.
    """
    tf.keras.backend.clear_session()

    class StopBelowMSE(tf.keras.callbacks.Callback):
        """Stop training as soon as the training MSE falls below the threshold."""

        def on_epoch_end(self, epoch, logs=None):
            mse = (logs or {}).get("mse")
            # Guard against a missing metric instead of comparing None to a number.
            if mse is not None and mse < termination:
                print("\nGot an mse at {:.4f} in epoch {} and stopped training\n".format(mse, epoch))
                self.model.stop_training = True

    model = tf.keras.models.Sequential([
        tf.keras.layers.LSTM(32, return_sequences=False,
                             input_shape=[WINDOW_SIZE, FEATURES]),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1),
    ])

    model.compile(loss=tf.keras.losses.Huber(),
                  optimizer="adam",
                  metrics=["mae", "mse"])

    model.build((None, WINDOW_SIZE, FEATURES))
    model.summary()

    # validation_data=None (the default for `test`) simply skips validation,
    # so one call covers both the with- and without-validation cases.
    model.fit(dataset, epochs=epochs, validation_data=test,
              callbacks=[StopBelowMSE()], verbose=1)
    return model

In [None]:
def NN_forecast(model, series_transformed):
    """Predict column 0 for every row that has a full input window before it.

    For each start index ``t`` the model receives columns 1 onward of rows
    ``t .. t + WINDOW_SIZE - 1`` and predicts column 0 of row
    ``t + WINDOW_SIZE``. All windows are stacked and sent to the model in a
    single ``predict`` call.

    Args:
        model: object with a ``predict`` method that accepts an array of shape
            ``(n_windows, WINDOW_SIZE, n_features)`` and returns one row of
            output per window.
        series_transformed: 2-D NumPy array; column 0 is the target and the
            remaining columns are the model inputs.

    Returns:
        (results, actual): two lists of equal length holding the predicted and
        the true values of column 0 for rows ``WINDOW_SIZE`` onward.
    """
    # Derive the window count from the series itself so that `results` always
    # lines up with `actual`, whatever the series length.
    n_windows = len(series_transformed) - WINDOW_SIZE
    if n_windows <= 0:
        return [], []

    windows = np.stack([
        series_transformed[start:start + WINDOW_SIZE, 1:]
        for start in range(n_windows)
    ])

    # One batched call instead of one predict call per window.
    predictions = np.asarray(model.predict(windows))

    results = [float(value) for value in predictions[:, 0]]
    actual = list(series_transformed[WINDOW_SIZE:, 0])

    return results, actual

In [None]:
@tf.autograph.experimental.do_not_convert
def NN_group_test(multi, termination=0, plot=False):
    '''
    Train one model on a group of zip codes and score it on each of them.

    Input:
        multi: a list of zip codes (index labels of multi_data).
        termination: early-stopping MSE threshold forwarded to NN_model.
        plot: if True, plot actual vs. predicted values for every zip code.
    Output: a dict of parallel lists with the keys
        "zip":        the zip code,
        "RMSE_train": RMSE over the predictions before the last TEST_LENGTH months,
        "RMSE_test":  RMSE over the last TEST_LENGTH months,
        "forecast":   the predictions for the last TEST_LENGTH months,
    where both RMSE values and the forecast are mapped back to the original
    (unscaled) units of column 0.
    Side effects: saves the trained model as an .h5 file under 'gdrive/My Drive/',
    prints one line per zip code, and optionally shows plots.
    '''
    import os  # local import: only needed to prepare the save directory

    # The per-zip statistics returned by NN_dataprep are not needed here,
    # because every series is re-scaled in the loop below.
    dataset_train, dataset_test, _ = NN_dataprep(multi)

    model = NN_model(dataset_train, termination, dataset_test)

    # Make sure the target directory exists so that a finished training run is
    # not lost to a failing save (e.g. when Google Drive is not mounted).
    model_path = 'gdrive/My Drive/NN_model_{0:%Y-%m-%d %H-%M-%S}.h5'.format(datetime.datetime.now())
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)

    score_dict = {"zip": [], "RMSE_train": [], "RMSE_test": [], "forecast": []}

    # Month indices that have a prediction (the first WINDOW_SIZE months do not).
    time_actual = range(WINDOW_SIZE, MONTHS)

    for zip_num in multi:
        scaler = MinMaxScaler()
        series = np.array(multi_data[multi_data.index == zip_num])
        series_transformed = scaler.fit_transform(series)

        # Offset and scale of column 0, used to undo the min-max scaling.
        target_min = scaler.data_min_[0]
        target_range = scaler.data_range_[0]

        # Forecasting
        results, actual = NN_forecast(model, series_transformed)

        # Index of the first test-period prediction.
        split = len(actual) - TEST_LENGTH

        # RMSE in scaled units times the range gives RMSE in original units.
        M_train = mean_squared_error(actual[:split], results[:split])**0.5 * target_range
        M_test = mean_squared_error(actual[split:], results[split:])**0.5 * target_range

        if plot:  # If the plot option is selected, plot the graph.
            plt.figure(figsize=(10, 6))
            plot_series(time_actual, np.array(actual)*target_range + target_min)
            plot_series(time_actual, np.array(results)*target_range + target_min)
            plt.show()

        score_dict["zip"].append(zip_num)
        score_dict["RMSE_train"].append(M_train)
        score_dict["RMSE_test"].append(M_test)
        score_dict["forecast"].append(list(np.array(results[split:])*target_range + target_min))
        print(zip_num, M_train, M_test)

    return score_dict

### The script

In [None]:
# Train on all selected zip codes, stop early once the training MSE drops
# below 0.004, and plot actual vs. predicted values for every zip code.
score_dict = NN_group_test(multi_zip, termination=0.004, plot=True)

In [None]:
# Arithmetic mean of the per-zip test RMSE values.
rmse_test_values = score_dict["RMSE_test"]
print(sum(rmse_test_values)/len(rmse_test_values))

In [None]:
# Shallow copy of the scores (the lists inside are shared with score_dict).
temp = dict(score_dict)

In [None]:
from sklearn.metrics import r2_score;
# Pool the test-period values of all zip codes so that a single R^2 score can
# be computed over them.
r2 = []
actual = []
predicted = []
for zip_num, forecast in zip(temp['zip'], temp['forecast']):
    # True "zri" values of this zip code for the last TEST_LENGTH months.
    zri = np.array(multi_data[multi_data.index == zip_num]["zri"])
    actual.extend(zri[-TEST_LENGTH:])
    # The stored forecast is already in original units; no rescaling needed.
    predicted.extend(forecast)

In [None]:
# R^2 of the pooled test-period forecasts against the pooled true values.
r2_score(y_true=actual, y_pred=predicted)

In [None]:
# Root of the mean of the squared per-zip test RMSE values.
np.square(np.array(score_dict["RMSE_test"])).mean() ** 0.5

In [None]:
# Write the per-zip scores and forecasts to a CSV file, without the row index.
test = pd.DataFrame.from_dict(score_dict)
test.to_csv('ZRI_and_all.csv', index=False)