In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime
import pickle
import random

### Load in (correct) data

In [2]:
base_path = "C:/Users/Matt/Dropbox/SnowComp/"
path1 = base_path+"ModisSnowImagesT.npy"
path2 = base_path+"ModisSnowImagesA.npy"
path3 = base_path+"ModisSnowImages_subT.npy"
path4 = base_path+"ModisSnowImages_subA.npy"

train_dataT = np.load(path1)
train_dataA = np.load(path2)
# sub_dataT = np.load(path3) #be careful about memory, this is about 25-30 gigs ram
# sub_dataA = np.load(path4)

#load cell ids, note Ts are correct, As are actual daynums 
path_id = "C:/Users/Matt/Dropbox/SnowComp/cell_snow_idsT.pkl"
with open(path_id, 'rb') as handle:
    cell_ids = pickle.load( handle)
    
train_y = pd.read_csv("C:/Users/Matt/Documents/Python Scripts/SnowComp/dat/train_labels.csv")

In [3]:
#labels helpers and processing
def pivot_df(df, id_col, ignore_cols=None):
    if not ignore_cols:
        ignore_cols = []
    date_cols = [x for x in df.columns if x not in [id_col] + ignore_cols]
    dfs = []
    for day in date_cols:
        day_df = df[[id_col, day]].rename({day: 'snowpack'}, axis=1)
        day_df['date'] = day
        dfs.append(day_df)
    return pd.concat(dfs)

def daynum_gen(date_time):
    '''converts date time objects to filename'''
    date_time = datetime.fromisoformat(date_time)
    doy = date_time.timetuple().tm_yday
    year = date_time.year
    return str(year) + '{:03d}'.format(doy)

train_y = pivot_df(train_y, 'cell_id').dropna()
train_y['date']=train_y['date'].map(daynum_gen)

In [4]:
#sort train_y so it has correct order before stripping labels
train_y['idx'] = train_y['cell_id'] +"-"+train_y['date']
sorter = [idx +"-" +date for  idx, date  in cell_ids]
train_y = train_y.set_index('idx')
train_y = train_y.loc[sorter]

#combine Aqua and Terra DSs
dataset = np.concatenate((train_dataT[:,0:1,:,:],train_dataA[:,0:1,:,:]), axis = 1)
# dataset = np.concatenate((train_dataT,train_dataA), axis = 1)

# #delete problematic columns
# dataset = np.delete(dataset, 3, 1) 
# dataset = np.delete(dataset, 9, 1)

dataset= dataset/255

# Basic pytorch CNN

In [5]:
#@title Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

In [6]:
# Do categorical preds to start (change)
train_y['cat'] = train_y['snowpack'].apply(lambda x: 1 if x > 15 else 0)
# dataset.shape, train_y.shape

In [7]:
#@title Define simple CNN
# From: https://pytorch.org/tutorials/recipes/recipes/defining_a_neural_network.html
# Also used: https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html

START_D = 2
START_HW = 21

class Net(nn.Module):

    def _conv_calc(self, in_dim, pad, stride, k):
        out = int(np.floor((in_dim + 2 * pad - (k - 1) - 1) / stride + 1))
        return out

    def __init__(self, cdim1, cdim2, kernel_sz, dropout,
                 ldim):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv2d(2, cdim1, kernel_sz, 1)
        c1_dim = self._conv_calc(START_HW, 0, 1, kernel_sz)
        print('c1 dim:', c1_dim)
        
        self.conv2 = nn.Conv2d(cdim1, cdim2, kernel_sz, 1)
        c2_dim = self._conv_calc(c1_dim, 0, 1, kernel_sz)
        print('c2 dim:', c2_dim)
        
        self.maxpool1 = nn.MaxPool2d(2)
        mp1_dim = self._conv_calc(c2_dim, 0, 2, 2)
        # print('mp1 dim:', mp1_dim)
        
        # self.conv3 = nn.Conv2d(cdim2, cdim2, kernel_sz, 1)

        self.dropout1 = nn.Dropout2d(dropout)
        self.dropout2 = nn.Dropout2d(2 * dropout)

        flattened_dim = cdim2 * mp1_dim * mp1_dim
        print(flattened_dim)
        self.fc1 = nn.Linear(flattened_dim, ldim)
        # self.fc1 = nn.Linear(8192, ldim)
        self.fc2 = nn.Linear(ldim, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        
        x = self.conv2(x)
        x = F.relu(x)
        
        # x = self.dropout1(x)
        # x = self.conv3(x)
        # x = F.relu(x)

        # x = F.max_pool2d(x, 2)
        x = self.maxpool1(x)
        
        # x = self.dropout1(x)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        
        output = self.fc2(x)
        return output

    




In [8]:
#@title Helpers to get predictions and accuracy
def predict(cnn, x, as_numpy=False):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    cnn.eval()
    x =x.type(torch.FloatTensor).to(device)
    output = cnn(x)
    if as_numpy:
        output = output.flatten().cpu().detach().numpy() #detach removes gradients (bad)
        
    cnn.train()
    return output.squeeze()

def get_accuracy(cnn, x, y):
#     y = torch.from_numpy(y).to(device)
    outputs = predict(cnn, x,as_numpy = True)
    
#     print(y.shape, outputs.shape)
    loss = ((y-outputs)**2).mean()
    return round(loss.item(), 6)



In [9]:
#@title Test run
# my_nn = Net(cdim1=8, cdim2=8, kernel_sz=3, dropout=0.25, ldim=8)
# optimizer = optim.Adam(my_nn.parameters(), lr=0.1)
# optimizer.zero_grad()

# test_im = torch.from_numpy(dataset[0]).reshape(1, 14, 21, 21)
# result = my_nn(test_im.type(torch.FloatTensor))
# result.shape


In [21]:
len(dataset)

91490

In [22]:
#@title split training and testing

mask = np.random.rand(len(dataset)) < 0.9
training_data = dataset[mask]
testing_data = dataset[~mask]

In [26]:
#@title Get data loaders
# train_dataset = TensorDataset(torch.Tensor(dataset),
#                               torch.Tensor(train_y['snowpack']))
# train_loader = DataLoader(train_dataset, batch_size=64)

# mini_x, mini_y = np.array(dataset)[:2,], np.array(train_y['snowpack'])[:2,]

test_x, test_y = np.array(dataset)[~mask], np.array(train_y['snowpack'])[~mask]
test_x, test_y = torch.Tensor(test_x), torch.Tensor(test_y)


mini_x, mini_y = np.array(dataset)[mask], np.array(train_y['snowpack'])[mask]
mini_x, mini_y = torch.Tensor(mini_x), torch.Tensor(mini_y)

mini_dataset = TensorDataset(mini_x,
                              mini_y)
mini_loader = DataLoader(mini_dataset, batch_size=200)

In [11]:
#@title Setup net
my_nn = Net(cdim1=32, cdim2=24, kernel_sz=3, dropout=0, ldim=14)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
my_nn.to(device)

optimizer = optim.Adam(my_nn.parameters(), lr=0.001)
criterion = nn.MSELoss()
writer = SummaryWriter('runs/cnn_full')
write_index = 0




512


In [12]:
N_EPOCHS = 199

#@title Run net
for epoch in range(N_EPOCHS):
    running_loss = 0.0
    for i, data in enumerate(mini_loader, 0):
        optimizer.zero_grad()
        inputs, labels = data[0].to(device), data[1].to(device)

        outputs = my_nn(inputs).squeeze()
#         print(outputs.shape, labels.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
#         for name, param in my_nn.named_parameters():
#             print(name, param.grad.abs().sum())

        running_loss += loss.item()
        if (i + 1) % 100 == 0:
            writer.add_scalar('Loss/train', running_loss , write_index)
        write_index += 1

    val_acc = get_accuracy(my_nn, mini_x, mini_y)
    writer.add_scalar('Acc/val', val_acc, write_index)
    print(epoch, '/', N_EPOCHS, 'complete', val_acc, running_loss )

    #calculate test loss.
    
    
    
writer.close()
print('Finished Training')

0 / 200 complete 285.473907 114019.33647346497
1 / 200 complete 280.01358 106594.94483876228
2 / 200 complete 273.783112 103227.64334869385
3 / 200 complete 270.166626 100228.558994174
4 / 200 complete 254.996521 95978.62303519249
5 / 200 complete 253.728577 91759.81201410294
6 / 200 complete 239.712631 88855.48102980852
7 / 200 complete 240.309174 86566.37278831005
8 / 200 complete 239.058365 85393.29599505663
9 / 200 complete 238.335922 84956.74796444178
10 / 200 complete 237.549301 84663.40807497501
11 / 200 complete 233.613419 84511.97795438766
12 / 200 complete 238.518204 84447.63498139381
13 / 200 complete 232.480713 82999.59153664112
14 / 200 complete 230.564407 82575.09020650387
15 / 200 complete 230.418518 82049.74405807257
16 / 200 complete 226.537979 81850.76433259249
17 / 200 complete 230.017746 81745.8089197874
18 / 200 complete 225.154602 81389.58358746767
19 / 200 complete 227.541168 81256.26586401463
20 / 200 complete 223.170029 80931.53489857912
21 / 200 complete 224.1

174 / 200 complete 192.674652 70310.9325158596
175 / 200 complete 202.707886 70280.02042293549
176 / 200 complete 192.248337 69962.40660023689
177 / 200 complete 192.545242 70107.09295225143
178 / 200 complete 192.645004 69972.55846977234
179 / 200 complete 192.143234 70043.43641638756
180 / 200 complete 192.417969 69826.37281036377
181 / 200 complete 191.899811 69915.9555490017
182 / 200 complete 191.980209 69789.33359646797
183 / 200 complete 191.848267 69881.27257442474
184 / 200 complete 191.8694 69779.55110478401
185 / 200 complete 192.159576 69887.07304692268
186 / 200 complete 191.718948 69883.86778998375
187 / 200 complete 191.701035 69680.87061047554
188 / 200 complete 191.884537 69914.88537836075
189 / 200 complete 191.636063 69649.57746076584
190 / 200 complete 191.720642 69602.39973449707
191 / 200 complete 191.358688 69582.7741010189
192 / 200 complete 191.665588 69539.42324185371
193 / 200 complete 191.025406 69358.09476852417
194 / 200 complete 191.16188 69372.9112365245

In [13]:
torch.cuda.empty_cache()

### Checking accuracy

In [14]:
# my_nn = my_nn.to(device)

In [15]:
my_nn = my_nn.to('cpu')
vals = my_nn(mini_x)

vals

tensor([[20.7583],
        [-1.7966],
        [ 8.3834],
        ...,
        [ 9.8723],
        [10.8684],
        [11.0105]], grad_fn=<AddmmBackward0>)

In [16]:
mini_y

tensor([22.5000,  0.0000, 16.7000,  ..., 41.5000,  2.9000, 11.5000])

In [19]:
np.sqrt(((vals.detach().numpy().flatten()  - mini_y.numpy())**2).mean())

13.79288

In [18]:
vals.detach().numpy()

array([[20.758299 ],
       [-1.7966311],
       [ 8.383375 ],
       ...,
       [ 9.872345 ],
       [10.868359 ],
       [11.010481 ]], dtype=float32)

## Things to check

1. Accuracy measures are right
2. Check missing value

### TODO:
- Add test set/cv
- Batchnorm?
- CNN benchmark