In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import geopandas as gpd
import time

from datetime import datetime
import pickle
import random
import json

tqdm.pandas() 

rng = np.random.default_rng(342834)

In [2]:
from sklearn.linear_model import LinearRegression

In [3]:
#labels helpers and processing
def pivot_df(df, id_col, ignore_cols=None):
    if not ignore_cols:
        ignore_cols = []
    date_cols = [x for x in df.columns if x not in [id_col] + ignore_cols]
    dfs = []
    for day in date_cols:
        day_df = df[[id_col, day]].rename({day: 'snowpack'}, axis=1)
        day_df['date'] = day
        dfs.append(day_df)
    return pd.concat(dfs)

def daynum_gen(date_time):
    '''converts date time objects to filename'''
    date_time = datetime.fromisoformat(date_time)
    doy = date_time.timetuple().tm_yday
    year = date_time.year
    return str(year) + '{:03d}'.format(doy)

# Get ordered elevation training data
def add_elevation(order, modis):
    order = pd.DataFrame({'modis_idx': order, 'order': [x for x in range(len(order))]})
    order['station_id'] = order['modis_idx'].apply(lambda x: '-'.join(x.split('-')[:-1]))
    order = order.merge(elev_order).sort_values('order')
    ordered_elev = elevation[order['DEM_order'].to_list(), :, :]
    dim = ordered_elev.shape

    return np.concatenate([modis, ordered_elev.reshape(dim[0], 1, dim[1], dim[2])], axis=1)



In [4]:
## Sentinel Helpers
def y_merger(x, y):
    '''reattach y labels to sentinel'''
    y = y.rename(columns={"Unnamed: 0":"cell_id"})
    y = pivot_df(y, 'cell_id').dropna()
    y['date']=y['date'].map(daynum_gen)

    y['idx'] = y['cell_id'] + "-" + y['date']
    y = y.set_index('idx')

    x['idx'] = x['cell_id'] +\
         "-" +x['date_long'].astype(str)
    x = x.set_index('idx')

    return x.join(y['snowpack'])

#preprocessing helpers
def masker(x,y):
#     return x , y
    mask = np.all(x > -99, axis = (1,2))
    print(mask.sum(), "of", len(mask))
    
    return x[mask], y[mask]

def minmaxscaler(x):
    print("min", round(x.min(),3), "max", round(x.max(),3))
    x = (x - x.min())/(x.max() - x.min())
                   
    return x

def reshaper(ds):
    #readjust dimensions
    dim0 = ds.shape[0]
    dim1 = ds.shape[1]
    dim2 = ds.shape[2]

    return ds.reshape((dim0, 1, dim1, dim2))

In [5]:
DATA_PATH = "C:/Users/Matt/Documents/Python Scripts/SnowComp/dat/"

#@title Original data
metadata = pd.read_csv(DATA_PATH + 'ground_measures_metadata.csv')
train_inp = pd.read_csv(DATA_PATH + 'ground_measures_train_features.csv')
test_inp = pd.read_csv(DATA_PATH + 'ground_measures_test_features.csv')
train_labels = pd.read_csv(DATA_PATH + 'train_labels.csv')
grid_cells = gpd.read_file(DATA_PATH + 'grid_cells.geojson')
submission_format = gpd.read_file(DATA_PATH + 'submission_format.csv')

In [6]:
#@title Get metadata for grid cells
states = gpd.read_file('C:/Users/Matt/Documents/Python Scripts/SnowComp/dat/states/')\
    .rename({'NAME': 'state'}, axis=1)
states = states.to_crs('EPSG:4326')

cell_metadata = gpd.sjoin(grid_cells, states[['geometry', 'state']])\
    .drop_duplicates(subset='cell_id')\
    .drop(['index_right'], axis=1)
cell_metadata['centroid'] = cell_metadata['geometry'].centroid
cell_metadata['longitude'] = cell_metadata['centroid'].x
cell_metadata['latitude'] = cell_metadata['centroid'].y
cell_metadata = cell_metadata[['cell_id', 'state', 'longitude', 'latitude']]


  cell_metadata['centroid'] = cell_metadata['geometry'].centroid


In [7]:
def get_rmse(df, actual='actual_snowpack', predicted='snowpack'):
    return ((df[actual] - df[predicted]) ** 2).mean() ** 0.5

def pivot_df(df, id_col, ignore_cols=None):
    if not ignore_cols:
        ignore_cols = []
    date_cols = [x for x in df.columns if x not in [id_col] + ignore_cols]
    dfs = []
    for day in date_cols:
        day_df = df[[id_col, day]].rename({day: 'snowpack'}, axis=1)
        day_df['date'] = day
        dfs.append(day_df)
    return pd.concat(dfs)

def get_day_of_season(doy):
    return doy + 365 - 335 if doy < 335 else doy - 335

def add_time_cols(df):
    df['date'] = pd.to_datetime(df['date'])
    df['doy'] = df['date'].dt.dayofyear
    df['dos'] = df['doy'].apply(get_day_of_season)
    df['year'] = df['date'].dt.year
    df['season'] = df['year']
    df.loc[df['doy'] < 335, 'season'] -= 1
    return df

def clean_train_test(df, id_col='station_id', metadata_df=None):
    df = pivot_df(df, id_col)
    if metadata_df is not None:
        df = df.merge(metadata_df)
    return add_time_cols(df)


train = clean_train_test(train_inp.rename({'Unnamed: 0': 'station_id'}, axis=1),
                         metadata_df=metadata)
train2 = clean_train_test(train_labels, 'cell_id', cell_metadata).dropna()
train_full = pd.concat([train2.rename({'cell_id': 'station_id'}, axis=1).assign(datatype='labels'),
                        train.drop(['elevation_m', 'name'], axis=1).assign(datatype='ground')])

test = clean_train_test(
    test_inp.rename({'Unnamed: 0': 'station_id'}, axis=1), metadata_df=metadata)\
    .rename({'snowpack': 'actual_snowpack'}, axis=1).dropna()\
    .merge(train[['station_id', 'state']].drop_duplicates())

to_predict = clean_train_test(submission_format.drop('geometry', axis=1), 'cell_id', cell_metadata)

In [8]:
sent_path = "C:/Users/Matt/Dropbox/SnowComp/SentinelHelper/"

sentinel_trainfeat = np.load(sent_path + "sent_pp_trainfeat.npy")
sentinel_testfeat = np.load(sent_path + "sent_pp_testfeat.npy")
sentinel_ylabs = np.load(sent_path + "sent_pp_ylabs.npy")

trainfeat_meta = pd.read_csv(sent_path + "sent_trainfeat_meta.csv")
testfeat_meta = pd.read_csv(sent_path + "sent_testfeat_meta.csv")
ylabs_meta = pd.read_csv(sent_path + "sent_ylabs_meta.csv")

In [9]:
sentinel_ylabs, ylabs_meta = masker(sentinel_ylabs, ylabs_meta)
sentinel_trainfeat, trainfeat_meta = masker(sentinel_trainfeat, trainfeat_meta)
sentinel_testfeat, testfeat_meta = masker(sentinel_testfeat, testfeat_meta)

sentinel_ylabs = minmaxscaler(sentinel_ylabs)
sentinel_trainfeat = minmaxscaler(sentinel_trainfeat)
sentinel_testfeat = minmaxscaler(sentinel_testfeat)

76159 of 76410
105571 of 106760
38618 of 38628
min -57.906 max 18.57
min -50.229 max 19.536
min -27.488 max 14.445


In [10]:
dataset = np.concatenate([sentinel_trainfeat, sentinel_ylabs])
train_y = pd.concat([trainfeat_meta, ylabs_meta])
train_y['date'] = pd.to_datetime(train_y['date'])
train_y = train_y.merge(train_full.rename({'station_id': 'cell_id'}, axis=1)\
                                    [['cell_id', 'snowpack', 'date']])

testfeat_meta['date'] = pd.to_datetime(testfeat_meta['date'])
sentinel_ylab_test = testfeat_meta.merge(
    test.rename({'station_id': 'cell_id', 'actual_snowpack': 'snowpack'}, axis=1)\
    [['cell_id', 'snowpack', 'date']])

In [11]:
# # sentinel_testfeat, sentinel_ylab_test
# del sentinel_ylabs
# del ylabs_meta
# del sentinel_trainfeat
# del trainfeat_meta

In [28]:
train_y['snowpack'] .shape
# ds_mean .shape

(181730,)

In [20]:
ds_mean = dataset.mean(axis = (1,2)).reshape(-1, 1)
s_testfeat_mean = sentinel_testfeat.mean(axis = (1,2)).reshape(-1, 1)


In [23]:
reg = LinearRegression().fit(ds_mean, train_y['snowpack'])
# reg.score(ds_mean, train_y['snowpack'])
y_pred = reg.predict(ds_mean)
((y_pred - train_y['snowpack'])**2).mean()**.5



14.3759932733922

In [29]:
sentinel_ylab_test

Unnamed: 0,cell_id,date,date_long,.geo,snowpack
0,CDEC:SCT,2020-01-07,2020007,"{""type"":""Point"",""coordinates"":[-122.7194819386...",3.24
1,SNOTEL:873_OR_SNTL,2020-01-07,2020007,"{""type"":""Point"",""coordinates"":[-118.1519177111...",4.90
2,SNOTEL:327_CO_SNTL,2020-01-07,2020007,"{""type"":""Point"",""coordinates"":[-107.5121235740...",11.90
3,SNOTEL:1058_CO_SNTL,2020-01-07,2020007,"{""type"":""Point"",""coordinates"":[-106.5378278302...",4.20
4,SNOTEL:936_CO_SNTL,2020-01-07,2020007,"{""type"":""Point"",""coordinates"":[-105.5934526545...",3.60
...,...,...,...,...,...
38613,SNOTEL:523_OR_SNTL,2021-06-29,2021180,"{""type"":""Point"",""coordinates"":[-118.1065686534...",0.00
38614,SNOTEL:1287_MT_SNTL,2021-06-29,2021180,"{""type"":""Point"",""coordinates"":[-113.1224530419...",0.00
38615,SNOTEL:341_OR_SNTL,2021-06-29,2021180,"{""type"":""Point"",""coordinates"":[-122.8548736239...",0.00
38616,SNOTEL:771_CA_SNTL,2021-06-29,2021180,"{""type"":""Point"",""coordinates"":[-119.6003050789...",0.00


In [None]:
# #@title arrange data
# mini_x = np.array(dataset.reshape(dataset.shape[0], 1, 41, 41))
# mini_y = np.array(train_y['snowpack'])

# test_x = np.array(sentinel_testfeat.reshape(sentinel_testfeat.shape[0], 1, 41, 41))
# test_y = np.array(sentinel_ylab_test['snowpack'])

# train_rows = len(mini_x)
# test_rows = len(test_x)