# Pre-preprocessing Sentinel 1

Builds lists of `cell_id, date, lat, lon` rows that are used for downloading the Sentinel-1 data.

In [14]:
import pandas as pd
import geojson as gsn
from pyproj import Proj
from osgeo import gdal
from osgeo import gdalconst

import tempfile
import wget
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import os
import pickle
from collections import defaultdict
from datetime import datetime, timedelta

In [15]:
def daynum_gen(date_time):
    '''Convert a date to a "YYYYDDD" string (year + zero-padded day of year).

    Parameters
    ----------
    date_time : str, datetime.date or datetime.datetime
        Either an ISO-format date string (e.g. "2014-11-01") or an
        already-parsed date/datetime object.

    Returns
    -------
    str
        The year followed by the 3-digit day of year, e.g. "2014305".

    Raises
    ------
    ValueError
        If a string is given that ``datetime.fromisoformat`` cannot parse.
    '''
    # Only strings need parsing; date/datetime objects already expose timetuple().
    if isinstance(date_time, str):
        date_time = datetime.fromisoformat(date_time)
    doy = date_time.timetuple().tm_yday
    return '{}{:03d}'.format(date_time.year, doy)



In [40]:
# All input CSVs sit in the same directory; name it once so the location
# only has to be changed in one place.
DATA_DIR = "C:/Users/Matt/Documents/Python Scripts/SnowComp/dat/"

train_feat = pd.read_csv(DATA_DIR + "ground_measures_train_features.csv")
test = pd.read_csv(DATA_DIR + "ground_measures_test_features.csv")
submission = pd.read_csv(DATA_DIR + "submission_format.csv")
train_y = pd.read_csv(DATA_DIR + "train_labels.csv")
metadata = pd.read_csv(DATA_DIR + "ground_measures_metadata.csv")

In [41]:
# Read the grid-cell GeoJSON and report how many features it holds.
path = "C:/Users/Matt/Documents/Python Scripts/SnowComp/dat/grid_cells.geojson"
with open(path) as geojson_file:
    gj = gsn.load(geojson_file)

n_features = len(gj['features'])
print(n_features)

18130


In [42]:
# cell_id -> centroid, stored as the per-axis mean of the cell's corner points
centroids = {}

for feature in gj['features']:
    ring = feature['geometry']['coordinates'][0]
    # every ring is expected to list 5 points (the fifth is a repeat);
    # stop immediately if any cell breaks that assumption
    assert len(ring) == 5

    cell_id = feature['properties']['cell_id']
    corners = ring[0:4]
    # lazy centroid: plain average of the four distinct corners
    centroid = list(np.mean(corners, axis=0))
    centroids[cell_id] = centroid

In [43]:
def _read_pickle(pickle_path):
    '''Open pickle_path in binary mode and return the unpickled object.'''
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


# load cell ids (original note: "Ts are correct, As are actual daynums")
path_id = "C:/Users/Matt/Dropbox/SnowComp/cell_snow_idsT.pkl"
cell_ids = _read_pickle(path_id)

path_id = "C:/Users/Matt/Dropbox/SnowComp/cell_snow_ids_trainfeat.pkl"
cell_ids_train = _read_pickle(path_id)

path_id = "C:/Users/Matt/Dropbox/SnowComp/cell_snow_ids_testfeat.pkl"
cell_ids_test = _read_pickle(path_id)

## Train labels

In [49]:
def date_loc_gen(dataframe, cell_ids = None, metadata= None, centroids=centroids):
    '''Expand a wide id-by-date table into one row per non-null (id, date) pair.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Wide table whose first column holds the cell/station id and whose
        remaining column labels are dates in a form ``daynum_gen`` accepts.
    cell_ids : optional
        Not used by this function; kept so existing calls keep working.
    metadata : pandas.DataFrame, optional
        When given, ``dataframe`` has ``'Unnamed: 0'`` renamed to
        ``'station_id'`` and is merged with ``metadata``; the merged
        ``latitude`` / ``longitude`` columns then replace ``centroids`` as
        the coordinate source.
    centroids : dict
        Maps id -> ``[lon, lat]``. The default is the module-level
        ``centroids`` object as bound when this function was defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``cell_id, date, lat, lon, date_long`` where ``date_long``
        is ``daynum_gen(date)``.

    Raises
    ------
    KeyError
        If an id with at least one non-null date has no coordinate entry.
    '''
    # id -> list of date column labels holding a value in that row.
    # The mask is computed on the date columns only, so it lines up with
    # the labels it selects from.
    date_columns = dataframe.columns[1:]
    has_value = dataframe.iloc[:, 1:].notnull().to_numpy()
    dates = {
        row_id: list(date_columns[mask])
        for row_id, mask in zip(dataframe.iloc[:, 0], has_value)
    }

    if metadata is not None:
        dataframe = dataframe.rename({'Unnamed: 0': 'station_id'}, axis=1)
        dataframe = dataframe.merge(metadata)

        # Select coordinates by column label; integer keys such as row[1]
        # on a label-indexed Series are deprecated positional access.
        centroids = {
            station: [lon, lat]
            for station, lat, lon in zip(
                dataframe['station_id'],
                dataframe['latitude'],
                dataframe['longitude'],
            )
        }

    # rows of (cell_id, date, lat, lon)
    date_locs = []
    for cell, date_list in tqdm(dates.items()):
        if not date_list:
            # nothing to emit, so no coordinate lookup is attempted
            continue
        lon, lat = centroids[cell][0], centroids[cell][1]
        date_locs.extend([cell, date, lat, lon] for date in date_list)
    print("total squares:", len(date_locs))

    date_locs = pd.DataFrame(date_locs, columns = ['cell_id', 'date', 'lat', 'lon'])
    date_locs['date_long'] = date_locs['date'].map(daynum_gen)

    return date_locs

In [21]:
# Long-format date/location table for the training labels, ordered by day number.
date_locs = date_loc_gen(train_y, cell_ids).sort_values("date_long")
print(date_locs.shape)

# April 3, 2014 is 2014093, sentinel's launch date
# But November 1st, 2014 is 2014305, first day of data (-30d)
first_day = 2014305
# date_long values are strings, so the cutoff is compared in string form
on_or_after_first_day = date_locs["date_long"] >= str(first_day)
date_locs = date_locs[on_or_after_first_day]

print("after", date_locs.shape)

  0%|          | 0/10878 [00:00<?, ?it/s]

total squares: 91490
(91490, 5)
after (76410, 5)


In [22]:
# Write the label date/location table into the SentinelHelper folder.
sent_path = "C:/Users/Matt/Dropbox/SnowComp/SentinelHelper/"
ylabs_csv = sent_path + "ylabs_dateloc.csv"
date_locs.to_csv(ylabs_csv, index=False)

In [23]:
# Save every distinct date present in date_locs as a one-column CSV.
unique_dates = date_locs["date"].unique()
ylabs_dates = pd.Series(unique_dates, name="date")
ylabs_dates.to_csv(sent_path + "date_list_ylabs.csv", index=False)

In [52]:
# date_locs
# pd.Series(unique_dates, name = "date")

## Train_features

In [51]:
# Passing metadata makes date_loc_gen take lat/lon from the metadata table
# (keyed by station_id) instead of the grid-cell centroids.
date_loc_tf = date_loc_gen(train_feat, metadata=metadata)

  0%|          | 0/700 [00:00<?, ?it/s]

total squares: 144015


In [53]:
# Order the train-feature rows by day number before applying the date cutoff.
date_loc_tf = date_loc_tf.sort_values("date_long")
print(date_loc_tf.shape)

# April 3, 2014 is 2014093, sentinel's launch date
# But November 1st, 2014 is 2014305, first day of data (-30d)
first_day = 2014305
# date_long values are strings, so the cutoff is compared in string form
on_or_after_first_day = date_loc_tf["date_long"] >= str(first_day)
date_loc_tf = date_loc_tf[on_or_after_first_day]

print("after", date_loc_tf.shape)

(144015, 5)
after (106760, 5)


In [54]:
# Write the train-feature date/location table and its list of distinct dates.
sent_path = "C:/Users/Matt/Dropbox/SnowComp/SentinelHelper/"
trainfeat_csv = sent_path + "trainfeat_dateloc.csv"
date_loc_tf.to_csv(trainfeat_csv, index=False)

unique_dates = date_loc_tf["date"].unique()
trainfeat_dates = pd.Series(unique_dates, name="date")
trainfeat_dates.to_csv(sent_path + "date_list_trainfeat.csv", index=False)

## Test_features

In [57]:
# Build the test-feature date/location table (coordinates come from metadata),
# order it by day number, then write it out with its list of distinct dates.
date_loc_test = date_loc_gen(test, metadata=metadata).sort_values("date_long")
print(date_loc_test.shape)

#note no filtering is needed because these are later dates

sent_path = "C:/Users/Matt/Dropbox/SnowComp/SentinelHelper/"
testfeat_csv = sent_path + "testfeat_dateloc.csv"
date_loc_test.to_csv(testfeat_csv, index=False)

unique_dates = date_loc_test["date"].unique()
testfeat_dates = pd.Series(unique_dates, name="date")
testfeat_dates.to_csv(sent_path + "date_list_testfeat.csv", index=False)

  0%|          | 0/700 [00:00<?, ?it/s]

total squares: 38628
(38628, 5)
after (38628, 5)


## Submission dataset

In [69]:
# Cut the submission table into two row-wise pieces at the midpoint and
# print the shapes of the whole table and of each piece.
half = submission.shape[0] // 2
first_part = submission.iloc[0:half]
second_part = submission.iloc[half:]

print(submission.shape)
print(first_part.shape)
print(second_part.shape)
sub_all = [first_part, second_part]

(9066, 58)
(4533, 58)
(4533, 58)


In [71]:
# Output folder is the same for every part, so it is set once up front.
sent_path = "C:/Users/Matt/Dropbox/SnowComp/SentinelHelper/"
n_parts = len(sub_all)

for part_idx, sub_part in enumerate(sub_all):
    # no metadata here, so coordinates come from the grid-cell centroids
    date_loc_sub = date_loc_gen(sub_part).sort_values("date_long")
    print(date_loc_sub.shape)

    #note no filtering is needed because these are later dates

    part_name = "sub_dateloc" + "{}of{}".format(part_idx + 1, n_parts) + ".csv"
    date_loc_sub.to_csv(sent_path + part_name, index=False)


# Taken from the last part only; per the original note the dates are the
# same for all data points.
unique_dates = date_loc_sub["date"].unique()
sub_dates = pd.Series(unique_dates, name="date")
sub_dates.to_csv(sent_path + "date_list_sub.csv", index=False)

  0%|          | 0/4533 [00:00<?, ?it/s]

total squares: 258381
(258381, 5)


  0%|          | 0/4533 [00:00<?, ?it/s]

total squares: 258381
(258381, 5)
