# Drive

In [1]:
#@title Mount drive
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# paths configured further down point into this mount.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Setup

In [2]:
#@title Installs
# Important library for many geopython libraries
!apt install gdal-bin python-gdal python3-gdal &> /dev/null
# Install rtree - Geopandas requirment
!apt install python3-rtree &> /dev/null
# Install Geopandas
!pip install git+git://github.com/geopandas/geopandas.git &> /dev/null
# Install descartes - Geopandas requirment
!pip install descartes &> /dev/null
!pip install geopandas rioxarray &> /dev/null

In [3]:
#@title Imports
import gdal
import itertools
import math
import os
import pickle
import random
import time

import numpy as np
import geopandas as gpd
import pandas as pd

from datetime import datetime, timedelta
from osgeo import gdal, gdalconst
from scipy.interpolate import UnivariateSpline
from scipy.stats import pearsonr
from sklearn import linear_model, metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold 
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression

import plotly
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; several helpers below assign columns on frames derived from others.
pd.options.mode.chained_assignment = None
# All paths are relative to the working directory.
# NOTE(review): presumably /content on Colab, where Drive is mounted - confirm.
DATA_PATH = 'drive/MyDrive/fall21/snowcast/data/'
WEATHER_DIR = DATA_PATH + 'weather/'
PRED_PATH = DATA_PATH + 'predictions/'

# Date range constants (not referenced by the cells shown in this notebook).
START_DATE = datetime(2012, 6, 30)
END_DATE = datetime(2021, 6, 29)

# Have GDAL raise Python exceptions when an operation fails.
gdal.UseExceptions()

# Get main data sources

In [4]:
#@title Read in data
# Ground-station metadata; later cells use its 'state', 'elevation_m' and
# 'name' columns after merging it onto the station measurements.
metadata = pd.read_csv(DATA_PATH + 'ground_measures_metadata.csv')
# Wide tables: one row per station (the id arrives as 'Unnamed: 0' and is
# renamed to 'station_id' later), one column per date.
train_inp = pd.read_csv(DATA_PATH + 'ground_measures_train_features.csv')
test_inp = pd.read_csv(DATA_PATH + 'ground_measures_test_features.csv')
# Wide table of labels keyed by 'cell_id'.
train_labels = pd.read_csv(DATA_PATH + 'train_labels.csv')
# Grid-cell geometries; centroids are derived from these in the next cell.
grid_cells = gpd.read_file(DATA_PATH + 'grid_cells.geojson')
# Read with geopandas rather than pandas: later cells drop a 'geometry' column
# from this frame and skip its last column when validating a submission.
submission_format = gpd.read_file(DATA_PATH + 'submission_format.csv')

In [6]:
#@title Get metadata for grid cells
# Derive one longitude/latitude pair per grid cell from the cell centroid.
# The helper columns are written onto `grid_cells` itself, so that frame keeps
# 'centroid', 'longitude' and 'latitude' after this cell has run.
# NOTE(review): centroids are computed in whatever CRS `grid_cells` carries;
# confirm that is accurate enough for these cells.
cell_centroids = grid_cells['geometry'].centroid
grid_cells['centroid'] = cell_centroids
grid_cells['longitude'] = cell_centroids.x
grid_cells['latitude'] = cell_centroids.y
# Plain lookup table: one row per cell with its centroid coordinates.
cell_metadata = grid_cells[['cell_id', 'longitude', 'latitude']]


  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
#@title Format dfs
def get_rmse(df, actual='actual_snowpack', predicted='snowpack'):
    """Root-mean-square error between two columns of ``df``.

    Args:
        df: DataFrame containing both columns.
        actual: Name of the column holding the observed values.
        predicted: Name of the column holding the predicted values.

    Returns:
        The unrounded RMSE, i.e. ``sqrt(mean((actual - predicted) ** 2))``.
    """
    residuals = df[actual] - df[predicted]
    mean_squared_error = (residuals ** 2).mean()
    return mean_squared_error ** 0.5

def pivot_df(df, id_col, ignore_cols=None):
    """Reshape a wide per-date table into long format.

    Every column other than ``id_col`` and ``ignore_cols`` is treated as one
    date. For each such column a frame with the columns ``id_col``,
    ``'snowpack'`` (the column's values) and ``'date'`` (the column's name) is
    built, and all of them are stacked in column order.

    Args:
        df: Wide DataFrame with one column per date.
        id_col: Name of the identifier column repeated in every output row.
        ignore_cols: Optional list of further columns to leave out.

    Returns:
        The concatenated long DataFrame. The original row index is repeated
        once per date column (it is not reset).
    """
    excluded = [id_col] + (ignore_cols if ignore_cols else [])
    per_date_frames = [
        df[[id_col, col]].rename(columns={col: 'snowpack'}).assign(date=col)
        for col in df.columns
        if col not in excluded
    ]
    return pd.concat(per_date_frames)

def get_day_of_season(doy, season_start_doy=335, days_in_year=365):
    """Convert a day of year into the number of days since the season start.

    The defaults reproduce the previously hard-coded behaviour: the season
    starts on day 335 (1 December in a non-leap year) and a year is taken to
    have 365 days. Leap years are not special-cased; the same offsets are
    applied to every year.

    Args:
        doy: Day of year (1-based).
        season_start_doy: Day of year on which a season starts.
        days_in_year: Year length used to wrap days that fall before the start.

    Returns:
        0 on the start day, counting upwards through the following year until
        the day before the next start day.
    """
    if doy < season_start_doy:
        # Before the start day: the season began in the previous calendar year.
        return doy + days_in_year - season_start_doy
    return doy - season_start_doy

def add_time_cols(df):
    """Add calendar and season columns derived from ``df['date']``.

    The frame is modified in place and also returned. Columns written:

    * ``date``   - parsed with ``pd.to_datetime``
    * ``doy``    - day of year
    * ``dos``    - day of season, via ``get_day_of_season``
    * ``year``   - calendar year
    * ``season`` - the calendar year, minus one for rows with ``doy < 335``

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    df['doy'] = parsed_dates.dt.dayofyear
    df['dos'] = df['doy'].apply(get_day_of_season)
    df['year'] = parsed_dates.dt.year

    # Days before day 335 are counted towards the season that started in the
    # previous calendar year.
    df['season'] = df['year']
    before_season_start = df['doy'] < 335
    df.loc[before_season_start, 'season'] -= 1
    return df

def clean_train_test(df, id_col='station_id', metadata_df=None):
    """Turn a wide per-date table into a long frame with time columns.

    Args:
        df: Wide DataFrame; every column except ``id_col`` is treated as a date.
        id_col: Name of the identifier column.
        metadata_df: Optional DataFrame merged onto the long frame. No keys
            are passed to ``merge``, so pandas joins on the column names the
            two frames share.

    Returns:
        The long DataFrame produced by ``pivot_df``, optionally merged with
        ``metadata_df``, after ``add_time_cols`` has been applied.
    """
    long_df = pivot_df(df, id_col)
    if metadata_df is None:
        return add_time_cols(long_df)
    return add_time_cols(long_df.merge(metadata_df))


# Ground-station training measurements in long format, with station metadata.
train = clean_train_test(
    train_inp.rename(columns={'Unnamed: 0': 'station_id'}),
    metadata_df=metadata,
)
# Grid-cell labels in long format with centroid coordinates; rows with any
# missing value are dropped.
train2 = clean_train_test(train_labels, 'cell_id', cell_metadata).dropna()
# Labels and ground measurements stacked into one table; 'datatype' records
# which source each row came from.
train_full = pd.concat([
    train2.rename(columns={'cell_id': 'station_id'}).assign(datatype='labels'),
    train.drop(columns=['elevation_m', 'name']).assign(datatype='ground'),
])

# Ground-station test measurements; the measured value is exposed as
# 'actual_snowpack'. The final merge keeps only station/state pairs that also
# occur in `train`.
test = (
    clean_train_test(
        test_inp.rename(columns={'Unnamed: 0': 'station_id'}),
        metadata_df=metadata,
    )
    .rename(columns={'snowpack': 'actual_snowpack'})
    .dropna()
    .merge(train[['station_id', 'state']].drop_duplicates())
)

# One row per (cell, date) pair of the submission template.
to_predict = clean_train_test(
    submission_format.drop(columns='geometry'), 'cell_id', cell_metadata)

# round(test['actual_snowpack'].std(), 3)

In [8]:
#@title Misc helpers
def rmse(a, p):
    """Root-mean-square error between ``a`` and ``p``, rounded to 4 decimals.

    Args:
        a: Actual values (anything supporting element-wise arithmetic and
            ``.mean()``, e.g. a pandas Series or NumPy array).
        p: Predicted values, aligned with ``a``.

    Returns:
        ``sqrt(mean((a - p) ** 2))`` rounded to four decimal places.
    """
    squared_errors = (a - p) ** 2
    root_mean_square = squared_errors.mean() ** 0.5
    return round(root_mean_square, 4)

def quick_predict(train_df, test_df, cols, get_preds=False):
    """Fit a ``LinearRegression`` on ``train_df`` and evaluate it on ``test_df``.

    Args:
        train_df: Training frame with the feature columns and a ``snowpack``
            target column.
        test_df: Frame with the same feature columns; it must also contain
            ``actual_snowpack`` unless ``get_preds`` is true.
        cols: List of feature column names.
        get_preds: When true, return the raw test predictions instead of
            test metrics.

    Returns:
        ``(train_rmse, test_preds)`` if ``get_preds`` is true, otherwise
        ``(train_rmse, test_rmse, corr)``, where ``corr`` is the Pearson
        correlation between actual and predicted test values rounded to four
        decimals.
    """
    model = LinearRegression().fit(train_df[cols], train_df['snowpack'])
    fitted_train = model.predict(train_df[cols])
    train_rmse = rmse(train_df['snowpack'], fitted_train)
    test_preds = model.predict(test_df[cols])

    if get_preds:
        return train_rmse, test_preds

    actual = test_df['actual_snowpack']
    test_rmse = rmse(actual, test_preds)
    corr = round(pearsonr(actual, test_preds)[0], 4)
    return train_rmse, test_rmse, corr


def interact(df, cont_cols, dummy_col):
    keep_cols = list(set(df.columns) - set([dummy_col] + cont_cols))
    dummies = pd.get_dummies(df, columns=[dummy_col])
    dummy_cols = [x for x in dummies.columns if x.startswith(dummy_col)]
    int_df = dummies[keep_cols]

    for cc in cont_cols:
        for dc in dummy_cols:
            int_df[cc + '_' + dc] = dummies[cc] * dummies[dc]

    return int_df

def get_tt_cols(df):
    """Add a squared day-of-season term and interact both terms with state.

    A ``dos_2`` column (``dos`` squared) is written onto ``df`` itself before
    the interaction features are built.

    Args:
        df: DataFrame with ``dos`` and ``state`` columns.

    Returns:
        The frame returned by ``interact`` for the continuous columns ``dos``
        and ``dos_2`` and the categorical column ``state``.
    """
    dos_squared = df['dos'] ** 2
    df['dos_2'] = dos_squared
    return interact(df, cont_cols=['dos', 'dos_2'], dummy_col='state')

def write_formatted_preds(preds_df, outpath):
    """Write long-format predictions as a wide submission CSV.

    The predictions are pivoted to one row per ``cell_id`` and one column per
    date (formatted ``YYYY-MM-DD``). Before writing, the column names and the
    cell ids are asserted to match ``submission_format`` (ignoring that
    frame's last column).

    Args:
        preds_df: DataFrame with ``cell_id``, ``date`` and ``snowpack``
            columns; the ``date`` values must provide ``strftime``.
        outpath: File name without extension; the file is written to
            ``PRED_PATH`` with ``.csv`` appended.

    Raises:
        AssertionError: If the columns or cell ids differ from the template.
    """
    day_labels = preds_df['date'].apply(lambda day: day.strftime('%Y-%m-%d'))
    long_preds = preds_df[['cell_id', 'date', 'snowpack']].assign(date=day_labels)

    wide_preds = long_preds.pivot(index='cell_id', columns='date', values='snowpack')
    submission = wide_preds.reset_index().sort_values('cell_id')

    # Sanity checks against the template before anything is written.
    assert sorted(submission.columns) == sorted(submission_format.columns[:-1])
    assert sorted(submission['cell_id']) == sorted(submission_format['cell_id'])

    target_file = PRED_PATH + '%s.csv' % outpath
    submission.to_csv(target_file, index=False)

def format_state(orig_df):
    """Drop incomplete rows, then build the state interaction features.

    Args:
        orig_df: DataFrame with ``dos`` and ``state`` columns. It is left
            untouched because all work happens on the frame returned by
            ``dropna``.

    Returns:
        The result of ``get_tt_cols`` applied to the rows of ``orig_df`` that
        have no missing values.
    """
    complete_rows = orig_df.dropna()
    # get_tt_cols adds `dos_2` to the frame it receives and interacts
    # `dos`/`dos_2` with `state`.
    return get_tt_cols(complete_rows)






# Workspace

In [9]:
#@title Get MODIS CNN predictions
def _merge_modis_preds(df, modis_yvals):
    """Join MODIS CNN predictions onto ``df``.

    The ``date`` column is temporarily replaced by an integer key made of the
    year followed by the zero-padded day of year (``year * 1000 + doy``), the
    frame is merged with ``modis_yvals`` on the columns the two share, and the
    original dates are then restored.
    NOTE(review): this presumes ``modis_yvals`` stores its ``date`` in that
    same integer form - confirm against the file that produced it.

    Args:
        df: DataFrame with ``station_id``, ``date``, ``year`` and ``doy``
            columns; it is not modified.
        modis_yvals: Prediction metadata with a ``cell_id`` column, which is
            renamed to ``station_id`` for the join.

    Returns:
        A new DataFrame holding the matching rows with the prediction columns
        attached and the original ``date`` values.
    """
    date_key = df['year'].astype('int64') * 1000 + df['doy'].astype('int64')
    keyed = df.assign(date_temp=df['date'], date=date_key)
    return keyed.merge(modis_yvals.rename({'cell_id': 'station_id'}, axis=1))\
        .drop('date', axis=1).rename({'date_temp': 'date'}, axis=1)


yvals = pd.read_csv(PRED_PATH + 'nnet/preds_0204/yvals.csv').drop('Unnamed: 0', axis=1)
preds = np.load(PRED_PATH + 'nnet/preds_0204/preds.npy')

# Attach the predictions to the metadata rows they were generated for.
yvals['modis_pred'] = preds

modis_train_preds = _merge_modis_preds(train_full, yvals)

# Give the test frame the same layout as train_full before merging.
modis_test_preds = _merge_modis_preds(
    test.drop(['elevation_m', 'name'], axis=1)
        .rename({'actual_snowpack': 'snowpack'}, axis=1),
    yvals)

modis_all_preds = pd.concat([modis_train_preds, modis_test_preds])


In [10]:
#@title Get sentinel CNN predictions
folder = 'preds_0214'

# `preds` and `yvals` are rebound here to the Sentinel outputs; the MODIS
# versions loaded earlier are no longer reachable under these names.
preds = np.load(PRED_PATH + 'nnet/%s/sentpreds.npy' % folder)
yvals = pd.read_csv(PRED_PATH + 'nnet/%s/sent_ymeta.csv' % folder).drop(columns='Unnamed: 0')
yvals = yvals.assign(sat_pred=preds, date=pd.to_datetime(yvals['date']))

# Training rows that have both a MODIS and a Sentinel prediction; both merges
# join on the columns the frames share.
sat_train_preds = (
    train_full
    .merge(modis_train_preds[['station_id', 'date', 'modis_pred']])
    .merge(yvals.rename(columns={'cell_id': 'station_id'}))
)

# Same for the test rows, after giving them the training column layout.
sat_test_preds = (
    test.drop(columns=['elevation_m', 'name'])
    .rename(columns={'actual_snowpack': 'snowpack'})
    .merge(yvals.rename(columns={'cell_id': 'station_id'}))
    .merge(modis_test_preds[['station_id', 'date', 'modis_pred']])
)
sat_all_preds = pd.concat([sat_train_preds, sat_test_preds])



In [None]:
#@title Get MODIS submission CNN predictions
# MODIS CNN predictions for the submission rows; they are attached to the
# submission frame at the end of this cell.
nn_sub_preds = np.load(PRED_PATH + 'nnet/preds_0204/subpred.npy')

def format_df(df):
    """Add the squared day-of-season column ``dos_2`` to ``df``.

    Args:
        df: DataFrame with a ``dos`` column; it is modified in place.

    Returns:
        The same DataFrame object, now including ``dos_2``.
    """
    df['dos_2'] = df['dos'].pow(2)
    return df


path_id = DATA_PATH + 'modis/cell_ids_sub.pkl'
# SECURITY: unpickling can execute arbitrary code. Only load this file if it
# was produced by a trusted run of this project.
with open(path_id, 'rb') as handle:
    cell_ids = pickle.load(handle)

# format_df returns the object it was given, so `modis_pred_df` is `to_predict`
# until the merge below; the 'dos_2' and 'identifier' columns are therefore
# added to `to_predict` as well.
modis_pred_df = format_df(to_predict)
# Position of every identifier in the pickled list, used to sort the rows
# before the prediction array is attached.
sub_order = pd.DataFrame({'identifier': ['-'.join(x) for x in cell_ids],
                          'order': list(range(len(cell_ids)))})
# Vectorised "<cell_id>-<year><doy zero-padded to 3 digits>"; the same strings
# as the former row-wise apply, without a Python call per row.
modis_pred_df['identifier'] = (
    modis_pred_df['cell_id'] + '-'
    + modis_pred_df['year'].astype(str)
    + modis_pred_df['doy'].astype(str).str.zfill(3))
modis_pred_df = modis_pred_df.merge(sub_order).sort_values('order')\
    .assign(modis_pred=nn_sub_preds)


In [16]:
#@title Get Sentinel submission CNN predictions
# Rebinds `nn_sub_preds` to the Sentinel predictions for the submission rows.
nn_sub_preds = np.load(PRED_PATH + 'nnet/%s/sent_subpred.npy' % folder)
# The metadata comes in two files, stacked in this order before the
# prediction array is attached.
sub_yvals = pd.concat([
    pd.read_csv(PRED_PATH + meta_template % folder)
    for meta_template in ('nnet/%s/sent_sub1_meta.csv',
                          'nnet/%s/sent_sub2_meta.csv')
])
sub_yvals = sub_yvals.assign(sat_pred=nn_sub_preds,
                             date=pd.to_datetime(sub_yvals['date']))

# Submission rows that have both a Sentinel and a MODIS prediction.
pred_df = (
    format_df(to_predict)
    .merge(sub_yvals[['cell_id', 'date', 'sat_pred']])
    .merge(modis_pred_df[['cell_id', 'date', 'modis_pred']])
)



In [17]:
#@title Write predictions
cols = ['dos', 'dos_2', 'modis_pred', 'sat_pred']

# Each frame needs its own squared day-of-season term: `sat_all_preds` was
# built by concatenation and does not share columns with the other two.
for df in (sat_train_preds, sat_test_preds, sat_all_preds):
    df['dos_2'] = df['dos'].pow(2)

# Final model: linear regression on the seasonal terms and both CNN
# predictions, fitted on the train and test rows combined.
lm = LinearRegression()
lm.fit(sat_all_preds[cols], sat_all_preds['snowpack'])
pred_df['snowpack'] = lm.predict(pred_df[cols])

write_formatted_preds(pred_df[['cell_id', 'snowpack', 'date']], outpath='preds')