In [None]:
import sys
!{sys.executable} -m pip install tensorflow

In [None]:
!{sys.executable} -m pip install numpy --upgrade --ignore-installed

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

In [14]:
df = pd.read_csv("data/train_monthly_naturalized_flow.csv")
df.head()

Unnamed: 0,site_id,forecast_year,year,month,volume
0,hungry_horse_reservoir_inflow,1911,1911,1,
1,hungry_horse_reservoir_inflow,1911,1911,2,85.071
2,hungry_horse_reservoir_inflow,1911,1911,3,121.825
3,hungry_horse_reservoir_inflow,1911,1911,4,224.172
4,hungry_horse_reservoir_inflow,1911,1911,5,748.602


In [12]:
df_dummies = pd.get_dummies(df, columns=['site_id'])
df_dummies.head()

Unnamed: 0,forecast_year,year,month,volume,site_id_animas_r_at_durango,site_id_boise_r_nr_boise,site_id_boysen_reservoir_inflow,site_id_colville_r_at_kettle_falls,site_id_detroit_lake_inflow,site_id_dillon_reservoir_inflow,...,site_id_pueblo_reservoir_inflow,site_id_ruedi_reservoir_inflow,site_id_skagit_ross_reservoir,site_id_snake_r_nr_heise,site_id_stehekin_r_at_stehekin,site_id_sweetwater_r_nr_alcova,site_id_taylor_park_reservoir_inflow,site_id_virgin_r_at_virtin,site_id_weber_r_nr_oakley,site_id_yampa_r_nr_maybell
0,1911,1911,1,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1911,1911,2,85.071,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1911,1911,3,121.825,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1911,1911,4,224.172,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1911,1911,5,748.602,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Assuming df is your DataFrame with the necessary data

def create_features(df, issue_date_month, forecast_year):
    # Filter data for the forecast year up to the issue date
    current_year_data = df[(df['forecast_year'] == forecast_year) & (df['month'] < issue_date_month)]
    current_year_total = df[(df['forecast_year'] == forecast_year) & 
                            (df['month'] >= 4) & (df['month'] <= 7)].rename(columns = {"volume":"final_volume"})
    # Historical April to July data from previous years
    historical_data = df[(df['forecast_year'] < forecast_year) & (df['month'].between(4, 7))]
    historical_sum = historical_data.groupby(['site_id', 'forecast_year'])['volume'].sum().reset_index()

    # Combine current year data with historical data
    combined_data = current_year_data.merge(historical_sum, on='site_id', 
                                            how='left').merge(current_year_total, on='site_id', how='left')

    # More feature engineering can be done here if needed
    combined_data = pd.get_dummies(combined_data, columns=['site_id'])

    return combined_data

def create_lstm_model(input_shape):
    # LSTM model definition
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(50))
    model.add(Dense(25))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

def train_and_predict(df, issue_date_month, forecast_year):
    features = create_features(df, issue_date_month, forecast_year)

    # Splitting features into X (inputs) and y (label)
    X = features.drop('final_volume', axis=1)  # Assuming 'volume' is the label
    y = features['final_volume']
    X = X.astype('float32')
    y = y.astype('float32')
    # Data splitting
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Reshape input for LSTM
    X_train = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
    X_test = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

    # Model training
    model = create_lstm_model((X_train.shape[1], X_train.shape[2]))
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

    # Predictions
    predictions = model.predict(X_test)
    # Evaluate model performance

    return model, predictions

# Example usage
# df = pd.read_csv('your_data.csv')
# model, predictions = train_and_predict(df, issue_date_month=2, forecast_year=2023)


In [19]:
features = create_features(df, 5, 2022)
features.head()

Unnamed: 0,site_id,forecast_year_x,year_x,month_x,volume_x,forecast_year_y,volume_y,forecast_year,year_y,month_y,final_volume
0,hungry_horse_reservoir_inflow,2022,2022,1,74.073,1911,1922.539,2022,2022,4,222.303
1,hungry_horse_reservoir_inflow,2022,2022,1,74.073,1911,1922.539,2022,2022,5,674.407
2,hungry_horse_reservoir_inflow,2022,2022,1,74.073,1911,1922.539,2022,2022,6,1088.706
3,hungry_horse_reservoir_inflow,2022,2022,1,74.073,1912,1672.185,2022,2022,4,222.303
4,hungry_horse_reservoir_inflow,2022,2022,1,74.073,1912,1672.185,2022,2022,5,674.407


In [23]:
features = create_features(df, 5, 2022)

# Splitting features into X (inputs) and y (label)
X = features.drop('final_volume', axis=1)  # Assuming 'volume' is the label
y = features['final_volume']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X

Unnamed: 0,forecast_year_x,year_x,month_x,volume_x,forecast_year_y,volume_y,forecast_year,year_y,month_y,site_id_animas_r_at_durango,...,site_id_pueblo_reservoir_inflow,site_id_ruedi_reservoir_inflow,site_id_skagit_ross_reservoir,site_id_snake_r_nr_heise,site_id_stehekin_r_at_stehekin,site_id_sweetwater_r_nr_alcova,site_id_taylor_park_reservoir_inflow,site_id_virgin_r_at_virtin,site_id_weber_r_nr_oakley,site_id_yampa_r_nr_maybell
0,2022,2022,1,74.073,1911,1922.539,2022,2022,4,0,...,0,0,0,0,0,0,0,0,0,0
1,2022,2022,1,74.073,1911,1922.539,2022,2022,5,0,...,0,0,0,0,0,0,0,0,0,0
2,2022,2022,1,74.073,1911,1922.539,2022,2022,6,0,...,0,0,0,0,0,0,0,0,0,0
3,2022,2022,1,74.073,1912,1672.185,2022,2022,4,0,...,0,0,0,0,0,0,0,0,0,0
4,2022,2022,1,74.073,1912,1672.185,2022,2022,5,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23231,2022,2022,4,67.368,2018,105.365,2022,2022,5,0,...,0,0,0,0,0,0,0,0,0,0
23232,2022,2022,4,67.368,2018,105.365,2022,2022,6,0,...,0,0,0,0,0,0,0,0,0,0
23233,2022,2022,4,67.368,2020,156.622,2022,2022,4,0,...,0,0,0,0,0,0,0,0,0,0
23234,2022,2022,4,67.368,2020,156.622,2022,2022,5,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
model, predictions = train_and_predict(df, issue_date_month=5, forecast_year = 2022)  # For example, May as issue date

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [26]:
predictions

array([[204.65746],
       [204.65746],
       [204.65746],
       ...,
       [204.65746],
       [204.65746],
       [204.65746]], dtype=float32)