In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import datetime
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("../data/SnowData.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df

Unnamed: 0.1,Unnamed: 0,Date,Station Name,Snow Water Equivalent (in) Start of Day Values,Change In Snow Water Equivalent (in),Snow Depth (in) Start of Day Values,Change In Snow Depth (in),Elevation (ft),Air Temperature Average (degF),Latitude,Longitude,County Name
0,0,2000-01-01,Apishapa,4.1,0.4,12.0,0.0,10000,24.0,37.33062,-105.06749,Huerfano
1,1,2000-01-02,Apishapa,4.1,0.0,12.0,0.0,10000,18.0,37.33062,-105.06749,Huerfano
2,2,2000-01-03,Apishapa,4.1,0.0,12.0,0.0,10000,6.0,37.33062,-105.06749,Huerfano
3,3,2000-01-04,Apishapa,4.1,0.0,15.0,3.0,10000,21.0,37.33062,-105.06749,Huerfano
4,4,2000-01-05,Apishapa,4.5,0.4,14.0,-1.0,10000,24.0,37.33062,-105.06749,Huerfano
...,...,...,...,...,...,...,...,...,...,...,...,...
814973,814973,2021-05-13,Zirkel,10.3,0.2,21.0,2.0,9340,46.0,40.79488,-106.59535,Jackson
814974,814974,2021-05-14,Zirkel,8.4,-1.9,18.0,-3.0,9340,45.0,40.79488,-106.59535,Jackson
814975,814975,2021-05-15,Zirkel,7.2,-1.2,17.0,-1.0,9340,46.0,40.79488,-106.59535,Jackson
814976,814976,2021-05-16,Zirkel,6.1,-1.1,15.0,-2.0,9340,45.0,40.79488,-106.59535,Jackson


In [6]:
# convert date to numaric day of the year
dates=df['Date'].astype('datetime64[ns]')
dates=dates.dt.dayofyear
df['Date']=dates
df

Unnamed: 0.1,Unnamed: 0,Date,Station Name,Snow Water Equivalent (in) Start of Day Values,Change In Snow Water Equivalent (in),Snow Depth (in) Start of Day Values,Change In Snow Depth (in),Elevation (ft),Air Temperature Average (degF),Latitude,Longitude,County Name
0,0,1,Apishapa,4.1,0.4,12.0,0.0,10000,24.0,37.33062,-105.06749,Huerfano
1,1,2,Apishapa,4.1,0.0,12.0,0.0,10000,18.0,37.33062,-105.06749,Huerfano
2,2,3,Apishapa,4.1,0.0,12.0,0.0,10000,6.0,37.33062,-105.06749,Huerfano
3,3,4,Apishapa,4.1,0.0,15.0,3.0,10000,21.0,37.33062,-105.06749,Huerfano
4,4,5,Apishapa,4.5,0.4,14.0,-1.0,10000,24.0,37.33062,-105.06749,Huerfano
...,...,...,...,...,...,...,...,...,...,...,...,...
814973,814973,133,Zirkel,10.3,0.2,21.0,2.0,9340,46.0,40.79488,-106.59535,Jackson
814974,814974,134,Zirkel,8.4,-1.9,18.0,-3.0,9340,45.0,40.79488,-106.59535,Jackson
814975,814975,135,Zirkel,7.2,-1.2,17.0,-1.0,9340,46.0,40.79488,-106.59535,Jackson
814976,814976,136,Zirkel,6.1,-1.1,15.0,-2.0,9340,45.0,40.79488,-106.59535,Jackson


In [7]:
# df.columns

In [8]:
# df.hist(figsize=(10,10))
# plt.show

In [9]:
df.columns

Index(['Unnamed: 0', 'Date', 'Station Name',
       'Snow Water Equivalent (in) Start of Day Values',
       'Change In Snow Water Equivalent (in)',
       'Snow Depth (in) Start of Day Values', 'Change In Snow Depth (in)',
       'Elevation (ft)', 'Air Temperature Average (degF)', 'Latitude',
       'Longitude', 'County Name'],
      dtype='object')

In [11]:
one_hot = pd.get_dummies(df['Station Name'])
# Drop column as it is now encoded
data_df = df.drop(['Station Name','County Name'],axis = 1)
# Join the encoded df
data_df = data_df.join(one_hot)
data_df 

Unnamed: 0.1,Unnamed: 0,Date,Snow Water Equivalent (in) Start of Day Values,Change In Snow Water Equivalent (in),Snow Depth (in) Start of Day Values,Change In Snow Depth (in),Elevation (ft),Air Temperature Average (degF),Latitude,Longitude,...,Vail Mountain,Vallecito,Wager Gulch,Weminuche Creek,Whiskey Ck,Wild Basin,Willow Creek Pass,Willow Park,Wolf Creek Summit,Zirkel
0,0,1,4.1,0.4,12.0,0.0,10000,24.0,37.33062,-105.06749,...,0,0,0,0,0,0,0,0,0,0
1,1,2,4.1,0.0,12.0,0.0,10000,18.0,37.33062,-105.06749,...,0,0,0,0,0,0,0,0,0,0
2,2,3,4.1,0.0,12.0,0.0,10000,6.0,37.33062,-105.06749,...,0,0,0,0,0,0,0,0,0,0
3,3,4,4.1,0.0,15.0,3.0,10000,21.0,37.33062,-105.06749,...,0,0,0,0,0,0,0,0,0,0
4,4,5,4.5,0.4,14.0,-1.0,10000,24.0,37.33062,-105.06749,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814973,814973,133,10.3,0.2,21.0,2.0,9340,46.0,40.79488,-106.59535,...,0,0,0,0,0,0,0,0,0,1
814974,814974,134,8.4,-1.9,18.0,-3.0,9340,45.0,40.79488,-106.59535,...,0,0,0,0,0,0,0,0,0,1
814975,814975,135,7.2,-1.2,17.0,-1.0,9340,46.0,40.79488,-106.59535,...,0,0,0,0,0,0,0,0,0,1
814976,814976,136,6.1,-1.1,15.0,-2.0,9340,45.0,40.79488,-106.59535,...,0,0,0,0,0,0,0,0,0,1


In [12]:
samp_df=data_df.sample(frac=0.5, replace=True, random_state=1)

In [15]:
list(samp_df.columns)

['Unnamed: 0',
 'Date',
 'Snow Water Equivalent (in) Start of Day Values',
 'Change In Snow Water Equivalent (in)',
 'Snow Depth (in) Start of Day Values',
 'Change In Snow Depth (in)',
 'Elevation (ft)',
 'Air Temperature Average (degF)',
 'Latitude',
 'Longitude',
 'Apishapa',
 'Arapaho Ridge',
 'Bear Lake',
 'Bear River',
 'Beartown',
 'Beaver Ck Village',
 'Berthoud Summit',
 'Bison Lake',
 'Black Mesa',
 'Black Mountain',
 'Brumley',
 'Buckskin Joe',
 'Buffalo Park',
 'Burro Mountain',
 'Butte',
 'Cascade',
 'Cascade #2',
 'Chapman Tunnel',
 'Cochetopa Pass',
 'Columbine',
 'Columbine Pass',
 'Columbus Basin',
 'Copeland Lake',
 'Copper Mountain',
 'Crosho',
 'Culebra #2',
 'Cumbres Trestle',
 'Deadman Hill',
 'Dry Lake',
 'Echo Lake',
 'El Diente Peak',
 'Elk River',
 'Elkhead Divide',
 'Elliot Ridge',
 'Fool Creek',
 'Fremont Pass',
 'Glen Cove',
 'Grayback',
 'Grizzly Peak',
 'Hayden Pass',
 'High Lonesome',
 'Hoosier Pass',
 'Hourglass Lake',
 'Idarado',
 'Independence Pass',


# Select your features (columns)

In [7]:
# 'Change In Snow Water Equivalent (in)',

In [16]:
# Set features. This will also be used as your x values.
xdf = samp_df[['Date',
 'Snow Water Equivalent (in) Start of Day Values',
 'Change In Snow Water Equivalent (in)',
 'Snow Depth (in) Start of Day Values',
 'Elevation (ft)',
 'Air Temperature Average (degF)',
 'Latitude',
 'Longitude',
 'Apishapa',
 'Arapaho Ridge',
 'Bear Lake',
 'Bear River',
 'Beartown',
 'Beaver Ck Village',
 'Berthoud Summit',
 'Bison Lake',
 'Black Mesa',
 'Black Mountain',
 'Brumley',
 'Buckskin Joe',
 'Buffalo Park',
 'Burro Mountain',
 'Butte',
 'Cascade',
 'Cascade #2',
 'Chapman Tunnel',
 'Cochetopa Pass',
 'Columbine',
 'Columbine Pass',
 'Columbus Basin',
 'Copeland Lake',
 'Copper Mountain',
 'Crosho',
 'Culebra #2',
 'Cumbres Trestle',
 'Deadman Hill',
 'Dry Lake',
 'Echo Lake',
 'El Diente Peak',
 'Elk River',
 'Elkhead Divide',
 'Elliot Ridge',
 'Fool Creek',
 'Fremont Pass',
 'Glen Cove',
 'Grayback',
 'Grizzly Peak',
 'Hayden Pass',
 'High Lonesome',
 'Hoosier Pass',
 'Hourglass Lake',
 'Idarado',
 'Independence Pass',
 'Ivanhoe',
 'Jackwhacker Gulch',
 'Joe Wright',
 'Jones Pass',
 'Kiln',
 'Lake Eldora',
 'Lake Irene',
 'Lily Pond',
 'Lizard Head Pass',
 'Lone Cone',
 'Long Draw Resv',
 'Lost Dog',
 'Loveland Basin',
 'Lynx Pass',
 'Mancos',
 'Mc Clure Pass',
 'Mccoy Park',
 'Medano Pass',
 'Mesa Lakes',
 'Michigan Creek',
 'Middle Creek',
 'Middle Fork Camp',
 'Mineral Creek',
 'Molas Lake',
 'Moon Pass',
 'Nast Lake',
 'Never Summer',
 'Niwot',
 'North Lost Trail',
 'Overland Res.',
 'Park Cone',
 'Park Reservoir',
 'Phantom Valley',
 'Porphyry Creek',
 'Rabbit Ears',
 'Rawah',
 'Red Mountain Pass',
 'Ripple Creek',
 'Roach',
 'Rough And Tumble',
 'Saint Elmo',
 'Sargents Mesa',
 'Sawtooth',
 'Schofield Pass',
 'Scotch Creek',
 'Sharkstooth',
 'Slumgullion',
 'South Colony',
 'Spud Mountain',
 'Stillwater Creek',
 'Stump Lakes',
 'Summit Ranch',
 'Tower',
 'Trapper Lake',
 'Trinchera',
 'University Camp',
 'Upper Rio Grande',
 'Upper San Juan',
 'Upper Taylor',
 'Ute Creek',
 'Vail Mountain',
 'Vallecito',
 'Wager Gulch',
 'Weminuche Creek',
 'Whiskey Ck',
 'Wild Basin',
 'Willow Creek Pass',
 'Willow Park',
 'Wolf Creek Summit',
 'Zirkel']]
xdf

Unnamed: 0,Date,Snow Water Equivalent (in) Start of Day Values,Change In Snow Water Equivalent (in),Snow Depth (in) Start of Day Values,Elevation (ft),Air Temperature Average (degF),Latitude,Longitude,Apishapa,Arapaho Ridge,...,Vail Mountain,Vallecito,Wager Gulch,Weminuche Creek,Whiskey Ck,Wild Basin,Willow Creek Pass,Willow Park,Wolf Creek Summit,Zirkel
247379,202,0.0,0.0,9.0,10160,64.0,38.89433,-106.95300,0,0,...,0,0,0,0,0,0,0,0,0,0
640518,316,0.0,0.0,0.0,8700,36.0,39.29722,-106.60694,0,0,...,0,0,0,0,0,0,0,0,0,0
619550,181,0.0,0.0,0.0,11400,48.0,39.36127,-106.05978,0,0,...,0,0,0,0,0,0,0,0,0,0
640025,188,0.0,0.0,0.0,11140,57.0,37.96661,-106.55837,0,0,...,0,0,0,0,0,0,0,0,0,0
518737,5,5.5,0.2,30.0,10110,2.0,39.26217,-106.62931,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579958,172,0.0,0.0,0.0,10200,52.0,37.48576,-106.83535,0,0,...,0,0,0,0,0,0,0,0,0,0
376666,109,20.2,0.1,54.0,10120,38.0,40.53215,-105.88700,0,0,...,0,0,0,0,0,0,0,0,0,0
309763,160,0.0,0.0,0.0,9030,44.0,40.39937,-105.84757,0,0,...,0,0,0,0,0,0,0,0,0,0
255785,89,5.2,0.0,14.0,8600,39.0,40.20778,-105.56861,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
xdf.dtypes

Date                                                int64
Snow Water Equivalent (in) Start of Day Values    float64
Change In Snow Water Equivalent (in)              float64
Snow Depth (in) Start of Day Values               float64
Elevation (ft)                                      int64
                                                   ...   
Wild Basin                                          uint8
Willow Creek Pass                                   uint8
Willow Park                                         uint8
Wolf Creek Summit                                   uint8
Zirkel                                              uint8
Length: 123, dtype: object

# Create a Train Test Split

Use "Change In Snow Depth (in)" for the y values

In [20]:
X =xdf
y = samp_df["Change In Snow Depth (in)"].values.reshape(-1, 1)
print(X.shape, y.shape)
# y

(332310, 123) (332310, 1)


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [78]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(344529, 159) (114844, 159) (344529, 1) (114844, 1)


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [22]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [77]:
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled.shape

(249232, 123)

In [94]:
X_train_scaled_1 = X_scaler.transform(X_train)
X_test_scaled_1 = X_scaler.transform(X_test)
X_train_scaled_1.shape

(249232, 123)

# Train the Model



In [95]:
from tensorflow.keras.utils import to_categorical

In [96]:
y_scaler = MinMaxScaler().fit(y_train)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)
y_train_scaled.shape

(249232, 1)

In [97]:
y_train_categorical = to_categorical(y_train_scaled)
y_test_categorical = to_categorical(y_test_scaled)
y_train_categorical.shape

(249232, 2)

In [98]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense



In [99]:
model = Sequential()

# model.add(Dense(units=10, activation='relu', input_shape=(,122))
# model.add(Dense(units=10, activation='relu', input_dim=122))
model.add(Dense(units=123, activation='relu', batch_input_shape=(None, 123)))
model.add(Dense(units=10, activation='softmax'))


In [100]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [101]:
model.fit(
    X_train_scaled_1,
    y_train_categorical,
    epochs=10,
    shuffle=True,
    verbose=2
)

Epoch 1/10


ValueError: in user code:

    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\keras\engine\training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\keras\engine\training.py:838 run_step  **
        outputs = model.train_step(data)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\keras\engine\training.py:797 train_step
        y, y_pred, sample_weight, regularization_losses=self.losses)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:204 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\keras\losses.py:155 __call__
        losses = call_fn(y_true, y_pred)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\keras\losses.py:259 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\keras\losses.py:1644 categorical_crossentropy
        y_true, y_pred, from_logits=from_logits)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\keras\backend.py:4862 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    C:\Users\tiij8\anaconda3\envs\PythonAdv\lib\site-packages\tensorflow\python\framework\tensor_shape.py:1161 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 2) and (None, 10) are incompatible


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [65]:
# Create the GridSearchCV model

from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [66]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator <tensorflow.python.keras.engine.sequential.Sequential object at 0x000002012C9A49B0> does not.

In [None]:
# print(grid.best_params_)
# print(grid.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)