In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime
import pickle
import random

# Single seeded generator: it draws the holdout mask and every shuffle permutation
# further down, so results are repeatable only when the cells run in order.
rng = np.random.default_rng(926834)

In [2]:
#labels helpers and processing
def pivot_df(df, id_col, ignore_cols=None):
    '''Reshape a wide table (one column per date) into long form.

    Every column that is neither `id_col` nor listed in `ignore_cols` is
    treated as a date column. For each of them a slice is built holding
    `id_col`, that column's values renamed to 'snowpack', and the column
    name repeated in a 'date' column. The slices are stacked in column
    order; the original row labels are kept, so they repeat once per date.
    '''
    skipped = [id_col] + (ignore_cols or [])
    long_parts = [
        df[[id_col, col]].rename(columns={col: 'snowpack'}).assign(date=col)
        for col in df.columns
        if col not in skipped
    ]
    return pd.concat(long_parts)

def daynum_gen(date_time):
    '''Convert a date to the `YYYYDDD` day-number string.

    Parameters
    ----------
    date_time : str or date-like
        Either an ISO-format string (parsed with `datetime.fromisoformat`)
        or an object that already offers `timetuple()` and `year`, such as
        `datetime.datetime` or `datetime.date`.

    Returns
    -------
    str
        The year followed by the day of the year zero-padded to three
        digits, e.g. '2020-02-01' -> '2020032'.
    '''
    # Only strings need parsing; date-like objects are used as they are.
    if isinstance(date_time, str):
        date_time = datetime.fromisoformat(date_time)
    doy = date_time.timetuple().tm_yday
    return '{}{:03d}'.format(date_time.year, doy)

### Load data

In [3]:
# Every array file below sits directly under this folder.
base_path = "C:/Users/Matt/Dropbox/SnowComp/"

# training imagery (loaded later as train_dataT / train_dataA)
path1 = f"{base_path}ModisSnowImagesT.npy"
path2 = f"{base_path}ModisSnowImagesA.npy"

# submission imagery
path3 = f"{base_path}ModisSnowImages_subT.npy"
path4 = f"{base_path}ModisSnowImages_subA.npy"

#submission small
path5 = f"{base_path}ModisSnowImages_subAT_small.npy"
path6 = f"{base_path}ModisSnowImages_AT_small.npy"

#train_features
path7 = f"{base_path}ModisSnowImages_trainfeat_T.npy"
path8 = f"{base_path}ModisSnowImages_trainfeat_A.npy"

#test_features
path9 = f"{base_path}ModisSnowImages_testfeat_T.npy"
path10 = f"{base_path}ModisSnowImages_testfeat_A.npy"

In [4]:
#load cell ids, note Ts are correct, As are actual daynums
def _load_pickle(path):
    '''Return the object stored in the pickle file at `path`.'''
    # NOTE: unpickling can execute arbitrary code; only open files produced
    # by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# The id files live in the same folder as the image arrays, so reuse base_path
# instead of repeating the absolute directory for each file.
cell_ids = _load_pickle(base_path + "cell_snow_idsT.pkl")
cell_ids_train = _load_pickle(base_path + "cell_snow_ids_trainfeat.pkl")
cell_ids_test = _load_pickle(base_path + "cell_snow_ids_testfeat.pkl")

In [5]:
# Disabled cell. When enabled it loads the two submission arrays (path3, path4),
# keeps channel 0 of each, scales by 1/255 and caches the result at path5.
# NOTE(review): the torch.Tensor line below needs `torch`, which this notebook
# only imports in a later cell.
# #Assemble submission dataset
# sub_dataT = np.load(path3) #be careful about memory, this is about 25-30 gigs ram
# sub_dataA = np.load(path4)

# sub_dataset = np.concatenate((sub_dataT[:,0:1,:,:],sub_dataA[:,0:1,:,:]), axis = 1)
# sub_dataset = sub_dataset/255
# np.save(path5,sub_dataset)

# sub_dataset = torch.Tensor(sub_dataset)
# del sub_dataT, sub_dataA

#### Read in and process `train_label` data

In [6]:
# Wide label table -> long form, missing readings dropped, dates as day numbers.
train_y = (
    pivot_df(
        pd.read_csv("C:/Users/Matt/Documents/Python Scripts/SnowComp/dat/train_labels.csv"),
        'cell_id',
    )
    .dropna()
    .assign(date=lambda frame: frame['date'].map(daynum_gen))
)

#sort train_y so it has correct order before stripping labels
sorter = [idx +"-" +date for  idx, date  in cell_ids]
# with open(base_path + "train_idorder.pkl", 'wb') as handle:
#     pickle.dump(sorter, handle)

# Key each row as "<cell_id>-<daynum>" and reorder the rows to follow `sorter`.
train_y = (
    train_y
    .assign(idx=lambda frame: frame['cell_id'] + "-" + frame['date'])
    .set_index('idx')
    .loc[sorter]
)

In [7]:
#combine Aqua and Terra DSs
train_dataT, train_dataA = np.load(path1), np.load(path2)

# Keep channel 0 of each source and stack the two along the channel axis.
# dataset = np.concatenate((train_dataT[:,(0,4),:,:],train_dataA[:,(0,4),:,:]), axis = 1)
# dataset = np.concatenate((train_dataT,train_dataA), axis = 1)
dataset = np.concatenate([train_dataT[:, 0:1, :, :], train_dataA[:, 0:1, :, :]], axis=1)

# Channel count of the stacked array; the CNN's first conv layer reads it.
START_D = dataset.shape[1]

# Rescale by 1/255.
dataset = dataset / 255
# np.save(path6,dataset)

del train_dataT, train_dataA

#### Read in and order `train_features` data 

In [8]:
# Wide ground-measure table -> long form, missing readings dropped, dates as day numbers.
train_feat = (
    pivot_df(
        pd.read_csv("C:/Users/Matt/Documents/Python Scripts/SnowComp/dat/ground_measures_train_features.csv")
        .rename(columns={"Unnamed: 0":"cell_id"}),
        'cell_id',
    )
    .dropna()
    .assign(date=lambda frame: frame['date'].map(daynum_gen))
)

#sort train_feat so it has correct order before stripping labels
sorter = [idx +"-" +date for  idx, date  in cell_ids_train]
with open(base_path + "trainfeat_idorder.pkl", 'wb') as handle:
    pickle.dump(sorter, handle)

# Key each row as "<cell_id>-<daynum>" and reorder the rows to follow `sorter`.
train_feat = (
    train_feat
    .assign(idx=lambda frame: frame['cell_id'] + "-" + frame['date'])
    .set_index('idx')
    .loc[sorter]
)

In [9]:
#combine Aqua and Terra DSs
train_feat_dataT, train_feat_dataA = np.load(path7), np.load(path8)

# Keep channel 0 of each source and stack the two along the channel axis.
# dataset_temp = np.concatenate((train_feat_dataT[:,(0,4),:,:],train_feat_dataA[:,(0,4),:,:]), axis = 1)
# dataset_temp = np.concatenate((train_feat_dataT,train_feat_dataA), axis = 1)
dataset_temp = np.concatenate(
    [train_feat_dataT[:, 0:1, :, :], train_feat_dataA[:, 0:1, :, :]], axis=1
)

# Rescale by 1/255.
dataset_temp = dataset_temp / 255
# np.save(path6,dataset_temp)

del train_feat_dataT, train_feat_dataA

#### Read in and order `test_features` data

In [10]:
# Wide ground-measure table -> long form, missing readings dropped, dates as day numbers.
y_test = (
    pivot_df(
        pd.read_csv("C:/Users/Matt/Documents/Python Scripts/SnowComp/dat/ground_measures_test_features.csv")
        .rename(columns={"Unnamed: 0":"cell_id"}),
        'cell_id',
    )
    .dropna()
    .assign(date=lambda frame: frame['date'].map(daynum_gen))
)

#sort y_test so it has correct order before stripping labels
sorter = [idx +"-" +date for  idx, date  in cell_ids_test]
with open(base_path + "testfeat_idorder.pkl", 'wb') as handle:
    pickle.dump(sorter, handle)

# Key each row as "<cell_id>-<daynum>" and reorder the rows to follow `sorter`.
y_test = (
    y_test
    .assign(idx=lambda frame: frame['cell_id'] + "-" + frame['date'])
    .set_index('idx')
    .loc[sorter]
)

In [11]:
#combine Aqua and Terra DSs
test_feat_dataT, test_feat_dataA = np.load(path9), np.load(path10)

# Keep channel 0 of each source and stack the two along the channel axis.
# dataset_temp = np.concatenate((test_feat_dataT[:,(0,4),:,:],test_feat_dataA[:,(0,4),:,:]), axis = 1)
# dataset_temp = np.concatenate((test_feat_dataT,test_feat_dataA), axis = 1)
dataset_test = np.concatenate(
    [test_feat_dataT[:, 0:1, :, :], test_feat_dataA[:, 0:1, :, :]], axis=1
)

# Rescale by 1/255.
dataset_test = dataset_test / 255
# np.save(path6,dataset_temp)

del test_feat_dataT, test_feat_dataA

Hold out roughly 15% of `train_labels` (random mask) and merge everything into the final training and test sets

In [12]:
#holdout
# Rows whose random draw falls below 0.85 stay in training; the rest are held out.
# Re-running this cell draws a new mask and splits the already-split data again.
mask = rng.random(len(dataset)) < 0.85

test_grid, dataset = dataset[~mask], dataset[mask]
test_grid_y, train_y = train_y[~mask], train_y[mask]

# One shape per line: train images, held-out images, held-out labels, train labels.
print(dataset.shape, test_grid.shape, test_grid_y.shape, train_y.shape, sep="\n")

(77718, 2, 21, 21)
(13772, 2, 21, 21)
(13772, 3)
(77718, 3)


In [13]:
#training data
# Append the train-feature images and rows after the kept train-label ones.
train_dataset = np.concatenate([dataset, dataset_temp], axis=0)
train_y = pd.concat([train_y, train_feat], axis=0).reset_index(drop=True)
del dataset_temp, train_feat

#shuffle
# One permutation reorders both the image array and the label frame, which
# keeps them row-aligned (the frame was just given a 0..n-1 index).
p = rng.permutation(len(train_dataset))
train_dataset = train_dataset[p]
train_y = train_y.loc[p]

In [14]:
#test data
# Append the held-out train-label images and rows after the test-feature ones.
dataset_test = np.concatenate([dataset_test, test_grid], axis=0)
y_test = pd.concat([y_test, test_grid_y], axis=0).reset_index(drop=True)
del test_grid_y, test_grid

#shuffle
# One permutation reorders both the image array and the label frame, which
# keeps them row-aligned (the frame was just given a 0..n-1 index).
p = rng.permutation(len(dataset_test))
dataset_test = dataset_test[p]
y_test = y_test.loc[p]

In [15]:
# Persist the shuffled train/test label frames and image arrays.
data_path = "C:/Users/Matt/Dropbox/SnowComp/FinalData/"

# index=False: the frames carry a permuted integer index that is not written out.
train_y.to_csv(data_path + "train_y.csv", index = False)
y_test.to_csv(data_path + "test_y.csv", index = False)

np.save(data_path + "train_dataset.npy", train_dataset)
# NOTE(review): "test_datast.npy" looks like a typo for "test_dataset.npy"; it is
# left as is because other code may load the file under this exact name.
np.save(data_path + "test_datast.npy", dataset_test)

## Translate to Pytorch datatypes and validation

Redefine train and testing datasets 

In [16]:
#@title Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

In [17]:
#@title split training and testing
# Aliases for the final arrays, plus their row counts; the row counts are the
# divisors used for the per-epoch losses.
training_data, testing_data = train_dataset, dataset_test
train_rows, test_rows = len(training_data), len(testing_data)

In [18]:
#@title Get data loaders
# Test split: images and 'snowpack' targets as tensors, served in batches of 256.
test_x = torch.Tensor(dataset_test)
test_y = torch.Tensor(np.array(y_test['snowpack']))
test_dataset = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=256)

# Training split, built the same way. The loader does not reshuffle, so every
# epoch sees the batches in the same order.
mini_x = torch.Tensor(train_dataset)
mini_y = torch.Tensor(np.array(train_y['snowpack']))
mini_dataset = TensorDataset(mini_x, mini_y)
mini_loader = DataLoader(mini_dataset, batch_size=256)

# CNN

In [19]:
#@title Define simple CNN
# From: https://pytorch.org/tutorials/recipes/recipes/defining_a_neural_network.html
# Also used: https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html

START_HW = 21


class Net(nn.Module):
    '''CNN regressor: three conv/pool stages followed by two linear layers.

    Each stage is a stride-1 convolution with no padding followed by a 3x3
    stride-1 average pool, so the spatial size shrinks at every step. The
    pooled output of the third stage is flattened and mapped to `ldim`
    hidden units and then to a single output per sample.

    Parameters
    ----------
    cdim1, cdim2, cdim3 : int
        Output channels of the first, second and third convolution.
    kernel_sz : int
        Kernel size shared by the three convolutions.
    dropout : float
        Dropout probability before the second convolution; twice this value
        is used before the third, so `dropout * 2` must be a valid probability.
    ldim : int
        Width of the hidden linear layer.
    print_dim : bool
        When True, print the spatial size after every conv/pool step.
    in_channels : int, optional
        Input channel count. Defaults to the notebook-level `START_D`.
    in_hw : int, optional
        Height/width of the (square) input. Defaults to `START_HW`.

    Raises
    ------
    ValueError
        If the kernel is too large for the input, i.e. the spatial size
        drops below 1 before the linear layers.
    '''

    def _conv_calc(self, in_dim, pad, stride, k):
        '''Output side length of a conv or pool layer with kernel `k` (dilation 1).'''
        out = int(np.floor((in_dim + 2 * pad - (k - 1) - 1) / stride + 1))
        return out

    def __init__(self, cdim1, cdim2, cdim3, kernel_sz, dropout,
                 ldim, print_dim=True, in_channels=None, in_hw=None):
        super(Net, self).__init__()

        # Resolve the input geometry; the globals are only consulted when the
        # caller does not pass explicit values.
        if in_channels is None:
            in_channels = START_D
        if in_hw is None:
            in_hw = START_HW

        #first layer
        self.conv1 = nn.Conv2d(in_channels, cdim1, kernel_sz, 1)
        self.avgpool = nn.AvgPool2d(3, stride=1)

        c1_dim = self._conv_calc(in_hw, 0, 1, kernel_sz)
        mp0_dim = self._conv_calc(c1_dim, 0, 1, 3)

        #second layer
        self.conv2 = nn.Conv2d(cdim1, cdim2, kernel_sz, 1)
        c2_dim = self._conv_calc(mp0_dim, 0, 1, kernel_sz)
        mp1_dim = self._conv_calc(c2_dim, 0, 1, 3)

        #third layer
        self.conv3 = nn.Conv2d(cdim2, cdim3, kernel_sz, 1)
        c3_dim = self._conv_calc(mp1_dim, 0, 1, kernel_sz)
        mp2_dim = self._conv_calc(c3_dim, 0, 1, 3)

        # Sizes only shrink from step to step, so checking the last one is enough.
        if mp2_dim < 1:
            raise ValueError(
                'kernel_sz={} is too large for a {}x{} input: spatial size '
                'after the last pooling step would be {}'.format(
                    kernel_sz, in_hw, in_hw, mp2_dim))

        #fourth layer
        flattened_dim = cdim3 * mp2_dim * mp2_dim
        self.fc1 = nn.Linear(flattened_dim, ldim)
        self.fc2 = nn.Linear(ldim, 1)

        #extras
        self.dropout1 = nn.Dropout2d(dropout)
        self.dropout2 = nn.Dropout2d(dropout*2)
        self.BatchNorm1 = nn.BatchNorm2d(cdim1)
        self.BatchNorm3 = nn.BatchNorm2d(cdim3)

        if print_dim:
            print('c1 dim:', c1_dim)
            print('mp0 dim:', mp0_dim)
            print('c2 dim:', c2_dim)
            print('mp1 dim:', mp1_dim)
            print('c3 dim:', c3_dim)
            print('mp2 dim:', mp2_dim)
            print('flattened_dim',flattened_dim)

    def forward(self, x):
        '''Map a batch of images to one prediction per sample (last dim is 1).'''
        #first layer
        x = self.conv1(x)
#         x = torch.tanh(x)
        x = F.relu(x)
        x = self.BatchNorm1(x)
        x = self.avgpool(x)

        #second layer
        x = self.dropout1(x)
        x = self.conv2(x)
#         x = torch.tanh(x)
        x = F.relu(x)
        x = self.avgpool(x)

        #third layer
        x = self.dropout2(x)
        x = self.conv3(x)
        # NOTE(review): tanh followed by relu confines activations to [0, 1);
        # the other stages have tanh commented out - confirm this is intended.
        x = torch.tanh(x)
        x = F.relu(x)
        x = self.BatchNorm3(x)
        x = self.avgpool(x)

        #fourth layer
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)

        output = self.fc2(x)
        return output

    




In [20]:
#@title Helpers to get predictions and accuracy
def predict(cnn, x, as_numpy=False):
    '''Run `cnn` on `x` in eval mode and return the squeezed predictions.

    Parameters
    ----------
    cnn : torch.nn.Module
        Model to evaluate. Its train/eval mode is restored on exit.
    x : torch.Tensor
        Input batch; it is cast to float and moved to the model's device.
    as_numpy : bool
        When True, return a detached NumPy array instead of a tensor.

    Returns
    -------
    torch.Tensor or numpy.ndarray
        Model output with size-1 dimensions squeezed out.
    '''
    # Use the device the model actually lives on, so the call also works after
    # the model has been moved (e.g. to CPU on a machine that has a GPU).
    first_param = next(cnn.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Remember the caller's mode: a model that was in eval mode must not be
    # switched to train mode as a side effect of predicting.
    was_training = cnn.training
    cnn.eval()
    try:
        x = x.type(torch.FloatTensor).to(device)
        output = cnn(x)
        if as_numpy:
            output = output.flatten().cpu().detach().numpy() #detach removes gradients (bad)
    finally:
        cnn.train(was_training)
    return output.squeeze()

def get_accuracy(cnn, x, y):
    '''Return the summed squared error of `cnn` on (`x`, `y`), rounded to 6 decimals.

    Despite the name this is an error total, not an accuracy: callers divide
    it by the number of rows to obtain a mean squared error. `y` must be a
    tensor that can be subtracted from the model output, i.e. on the same device.
    '''
    residual = y - predict(cnn, x, as_numpy=False)
    return round(residual.pow(2).sum().item(), 6)



In [21]:
#@title Setup net
# Hyperparameters; later cells also use them to name the saved plots and model.
cdim1 = 32
cdim2 = 18
cdim3 = 8
kernel_sz = 3
dropout = 0.13
ldim = 50
lrate = 0.0001

my_nn = Net(cdim1=cdim1, cdim2=cdim2, cdim3=cdim3, kernel_sz=kernel_sz,
            dropout=dropout, ldim=ldim)

# Train on the first GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
my_nn.to(device)

# Summed (not averaged) squared error, so batch losses can simply be added up.
optimizer = optim.Adam(my_nn.parameters(), lr=lrate)
criterion = nn.MSELoss(reduction='sum')

# TensorBoard log plus the global step counter shared by all scalars.
writer = SummaryWriter('runs/cnn_alldata')
write_index = 0

c1 dim: 19
mp0 dim: 17
c2 dim: 15
mp1 dim: 13
c3 dim: 11
mp2 dim: 9
flattened_dim 648


In [None]:
# Train on the training split and evaluate on the test split after every epoch.
# Per-epoch losses are stored as mean squared error; the printout shows their
# square roots (RMSE).
N_EPOCHS = 4000

test_loss = []
train_loss = []

#@title Run net
for epoch in range(N_EPOCHS):
    # Sum of squared errors over the epoch (criterion uses reduction='sum').
    running_loss = 0.0
    for i, data in enumerate(mini_loader, 0):
        optimizer.zero_grad()
        inputs, labels = data[0].to(device), data[1].to(device)

        outputs = my_nn(inputs).squeeze()
#         print(outputs.shape, labels.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
#         for name, param in my_nn.named_parameters():
#             print(name, param.grad.abs().sum())

        running_loss += loss.item()
        # Logged every 100th batch; the value is the cumulative sum since the
        # start of the epoch, not the loss of the current batch.
        if (i + 1) % 100 == 0:
            writer.add_scalar('Loss/train', running_loss , write_index)
        # Global step: counts batches across all epochs.
        write_index += 1

#     val_acc = get_accuracy(my_nn, mini_x, mini_y)
    # Mean squared error over the training rows, accumulated during the training
    # passes themselves (model in train mode, weights changing between batches).
    train_loss.append(running_loss/train_rows)
    # NOTE(review): the 'Acc/val' tag receives the training MSE, not a
    # validation accuracy.
    writer.add_scalar('Acc/val', train_loss[-1], write_index)
    
    #calculate test loss.
    with torch.no_grad():
        
        # get_accuracy returns a summed squared error per batch and evaluates
        # through predict(), i.e. with the model in eval mode.
        running_tar_loss = 0
        for data, target in test_loader:
            running_tar_loss += get_accuracy(my_nn, data, target.to(device))


        test_loss.append(running_tar_loss / test_rows)
        writer.add_scalar('Test MSE', test_loss[-1], write_index)

    print(epoch, '/', N_EPOCHS,
          'complete, train:', round(np.sqrt(train_loss[-1]), 4),
          "test:", round(np.sqrt(test_loss[-1]), 4) )
            
            
            

    
    
writer.close()
print('Finished Training')
# 10.5, 9.63 best
# 10.5, 9.63 best

0 / 4000 complete, train: 12.9626 test: 10.7748
1 / 4000 complete, train: 12.106 test: 10.6647
2 / 4000 complete, train: 12.0032 test: 10.5984
3 / 4000 complete, train: 11.9377 test: 10.5417
4 / 4000 complete, train: 11.8923 test: 10.5064
5 / 4000 complete, train: 11.8391 test: 10.4786
6 / 4000 complete, train: 11.7941 test: 10.4551
7 / 4000 complete, train: 11.758 test: 10.4299
8 / 4000 complete, train: 11.7274 test: 10.4106
9 / 4000 complete, train: 11.7085 test: 10.4117
10 / 4000 complete, train: 11.6931 test: 10.3759
11 / 4000 complete, train: 11.6645 test: 10.3713
12 / 4000 complete, train: 11.6433 test: 10.3525
13 / 4000 complete, train: 11.6373 test: 10.3407
14 / 4000 complete, train: 11.6108 test: 10.3436
15 / 4000 complete, train: 11.6044 test: 10.3306
16 / 4000 complete, train: 11.5938 test: 10.3162
17 / 4000 complete, train: 11.5794 test: 10.3188
18 / 4000 complete, train: 11.5567 test: 10.3009
19 / 4000 complete, train: 11.5557 test: 10.2849
20 / 4000 complete, train: 11.54

In [None]:
# Ask PyTorch to release GPU memory it has cached but is no longer using.
torch.cuda.empty_cache()


### Checking accuracy

In [None]:
path = "C:/Users/Matt/Dropbox/SnowComp/RunGraphs/" 

# File-name suffix encoding the hyperparameters and the last epoch index reached.
suffix = "_" + \
    str(cdim1)+ "_" + str(cdim2)+ "_" +str(cdim3)+ "_" + str(kernel_sz)+ \
    "_" + str(dropout)+ "_" + str(ldim)+ "_" + str(epoch) +"_" + str(lrate)

# Take the x positions from the loss lists themselves: if training was stopped
# part-way through an epoch the lists are shorter than epoch+1 (and may differ
# in length from each other), which would make a fixed range fail.
plt.plot(range(len(train_loss)), np.sqrt(train_loss), label ="train")
plt.plot(range(len(test_loss)), np.sqrt(test_loss), label ="test")
plt.xlabel("epoch")
plt.ylabel("RMSE")
plt.legend()
plt.savefig(path +"converge_alldata" + suffix+ ".png")

## Save Model and Predictions

### TODO: save image quality for random forest ingestion

In [None]:
# Switch for the final cell: when True it saves the model, the predictions and
# the image-quality arrays.
pred_all = True

Rerun model on full dataset

In [None]:
# Free the training-split tensors and loader before building the combined set.
del mini_x, mini_dataset, mini_loader

# Stack the train and test splits into one dataset; give the labels a fresh
# 0..n-1 index so the permutation below can address rows by label.
full_x = np.concatenate([train_dataset, dataset_test], axis=0)
full_y = pd.concat([train_y, y_test], axis=0).reset_index(drop=True)

#shuffle
p = rng.permutation(len(full_x))
full_x = full_x[p]
full_y = full_y.loc[p]

# Keep the shuffled label frame: full_y is reduced to a bare tensor afterwards.
full_y_og = full_y.copy()

In [None]:
# Convert the images and 'snowpack' targets to tensors and serve them in batches of 128.
# Not re-runnable as is: after this cell full_y is a tensor, not a frame.
full_x = torch.Tensor(full_x)
full_y = torch.Tensor(np.array(full_y['snowpack']))
full_dataset = TensorDataset(full_x, full_y)
full_loader = DataLoader(full_dataset, batch_size=128)

# del train_dataset, dataset_test, train_y, y_test

In [None]:
#@title Setup net
# Fresh, untrained network with the same hyperparameters, placed on the GPU when
# one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
my_nn = Net(cdim1=cdim1, cdim2=cdim2, cdim3=cdim3, kernel_sz=kernel_sz,
            dropout=dropout, ldim=ldim).to(device)

# New optimizer bound to the new parameters; summed squared-error loss.
optimizer = optim.Adam(my_nn.parameters(), lr=lrate)
criterion = nn.MSELoss(reduction='sum')

In [None]:
# Retrain on the combined dataset. No data is held out here, so only a training
# figure is tracked and printed.
train_loss = []

# Normalise by the number of rows this loop actually iterates over (the full
# dataset), not by the size of the earlier training split.
full_rows = len(full_loader.dataset)

#@title Run net
for epoch in range(N_EPOCHS):
    # Sum of squared errors over the epoch (criterion uses reduction='sum').
    running_loss = 0.0
    for inputs, labels in full_loader:
        optimizer.zero_grad()
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = my_nn(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean squared error for the epoch; its square root (RMSE) is printed.
    train_loss.append(running_loss/full_rows)
    print(epoch, '/', N_EPOCHS,
          'complete, train:', round(np.sqrt(train_loss[-1]), 4))

In [None]:
# Ask PyTorch to release GPU memory it has cached but is no longer using.
torch.cuda.empty_cache()

In [None]:
path = "C:/Users/Matt/Dropbox/SnowComp/RunGraphs/" 

# File-name suffix encoding the hyperparameters and the last epoch index reached.
# The final cell reuses `path` and `suffix` to name the saved model.
suffix = "_" + \
    str(cdim1)+ "_" + str(cdim2)+ "_" +str(cdim3)+ "_" + str(kernel_sz)+ \
    "_" + str(dropout)+ "_" + str(ldim)+ "_" + str(epoch) +"_" + str(lrate) 

# Take the x positions from the loss list itself: if training was stopped
# part-way through an epoch the list is shorter than epoch+1, which would make
# a fixed range fail.
plt.plot(range(len(train_loss)), np.sqrt(train_loss), label ="train")
plt.xlabel("epoch")
plt.ylabel("RMSE")
plt.legend()
plt.savefig(path +"full" + suffix+ ".png")

### Save predictions

In [None]:
if pred_all:
    # Inference happens on the CPU with dropout/batch-norm in eval mode.
    my_nn.eval()
    my_nn.to('cpu')

    torch.save(my_nn.state_dict(), path +"model"+suffix)
    
    # Predictions for the whole (shuffled) training+test set in one forward pass.
    with torch.no_grad():
        vals = my_nn(full_x).detach().numpy()
        
    path_preds = "C:/Users/Matt/Dropbox/SnowComp/preds/" 
    np.save(path_preds+"preds.npy", vals)
    # Saved alongside the predictions so that rows can be matched back to labels.
    full_y_og.to_csv(path_preds+"yvals.csv")
    
    #Predict and save on submission data
    # The only cell that builds `sub_dataset` is disabled, so on a fresh kernel
    # the name does not exist; fall back to the scaled array that cell caches
    # at path5.
    if 'sub_dataset' not in globals():
        sub_dataset = np.load(path5)

    vals_sub = []

    sub_dataset = torch.Tensor(sub_dataset)
    sub_ds = TensorDataset(sub_dataset)
    # Large batches keep the number of forward passes small while bounding memory.
    sub_loader = DataLoader(sub_ds, batch_size=50000)

    with torch.no_grad():
        for images in sub_loader:
            # TensorDataset yields 1-tuples; element 0 is the image batch.
            vals_sub.append(my_nn(images[0]).detach().numpy()) 

    vals_sub = np.concatenate(vals_sub, axis = 0)        
    np.save(path_preds+"subpred.npy", vals_sub)
    
    #calculate image quality
    # "Quality" here is the sum of all scaled pixel values of each sample.
    image_qual = full_x.detach().numpy()
    image_qual = np.sum(image_qual, axis= (1,2,3))
    np.save(path_preds +"image_qual_train.npy", image_qual)
    
    image_qual = np.sum(sub_dataset.detach().numpy(), axis= (1,2,3))
    np.save(path_preds +"image_qual_sub.npy", image_qual)    
    

## Things to check

1. Accuracy measures are right
2. Check missing values

### TODO:
- Add test set/cv
- Batchnorm?
- CNN benchmark