In [None]:
# Install the GDAL and PROJ development libraries
# (presumably needed to build the geospatial packages below — confirm).
!apt-get install -qq libgdal-dev libproj-dev
# Reinstall shapely built from source (--no-binary) instead of using a prebuilt wheel.
!pip install --no-binary shapely shapely --force
# Mapping and region-masking packages imported by the next cell.
!pip install cartopy
!pip install regionmask

In [None]:
#import required packages
import os
import warnings
import time
import regionmask
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import geopandas as gpd
import cartopy.crs as ccrs
import shapely
from datetime import datetime as dt
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from scipy.interpolate import interp1d
from dateutil.relativedelta import relativedelta
from google.colab import drive
from google.colab import files

In [None]:
# Mount Google Drive inside the Colab filesystem, then point the notebook at
# the project folder that every later path is built from.
mount_point = '/content/drive'
drive.mount(mount_point)
working_directory = mount_point + '/My Drive/COS Seesaw Research'

Parallel site arrays for constructing regions, and time deltas for building feature time series. In addition, this cell contains the radius used for defining map regions.

In [None]:
# Site code -> centre coordinate pair (x, y) of that site's region.
# NOTE(review): pairs look like (longitude, latitude) in degrees — confirm.
site_center_by_code = {
  'alt': (-62.3, 82.5),
  'brw': (-156.6, 71.3),
  'cgo': (144.7, -40.7),
  'hfm': (-72.2, 42.5),
  'kum': (-154.8, 19.5),
  'lef': (-90.3, 45.6),
  'mhd': (-9.9, 53.3),
  'mlo': (-155.6, 19.5),
  'nwr': (-105.5, 40.1),
  'psa': (-64.0, -64.6),
  'smo': (-170.6, -14.2),
  'spo': (0, -90),
  'sum': (-38.4, 72.6),
  'thd': (-124.1, 41.0),
}

# Parallel lists in matching order: cos_sites[i] is centred on cos_site_centers[i].
cos_sites = list(site_center_by_code.keys())
cos_site_centers = list(site_center_by_code.values())

# (label, offset) pairs describing the time lags used when building feature time series.
time_delta_general = [
  ('-15d', relativedelta(days=-15)),
  ('-1m', relativedelta(months=-1)),
  ('-1m15d', relativedelta(months=-1, days=-15)),
  ('-2m', relativedelta(months=-2)),
]

# Buffer radius applied around each site centre, in the same units as the coordinates.
region_size = 30

# First and last calendar year (both inclusive) of observations to keep.
year_start = 2000
year_end = 2018

# Horizontal rule for separating sections of printed output.
divider = '-------------------------------------------------------------------------------------------------------'

Build regions

In [None]:
# Each site is both the name and the abbreviation of its own region.
names = list(cos_sites)
abbrevs = list(cos_sites)

# One circular polygon per site: the site centre buffered by region_size.
# NOTE(review): the buffer is taken in raw coordinate units, so circles around
# high-latitude sites can extend past +/-90 — confirm this is acceptable.
region_list = [
  Point(x_center, y_center).buffer(region_size)
  for _, (x_center, y_center) in zip(cos_sites, cos_site_centers)
]

# overlap=True because circles of neighbouring sites can intersect.
regions = regionmask.Regions(
  region_list,
  names=names,
  abbrevs=abbrevs,
  name='Ocean Regions',
  overlap=True,
)

# Draw all regions on one wide map, labelled by abbreviation.
plt.figure(figsize=(24,12))
regions.plot(label='abbrev')
plt.show()

In [None]:
# Read the flask record once, then build one time-indexed xarray Dataset per
# site and store it in site_frames[site].
site_frames = {}
file_name = working_directory + '/Data/OCS/OCS__GCMS_flask.txt'
time_column_name = 'yyyymmdd'
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
cos_data_full = pd.read_csv(file_name, sep=r'\s+', header=1, parse_dates=[time_column_name])

# Half-open observation window: [Jan 1 of year_start, Jan 1 of year_end + 1).
window_start = dt(year=year_start, month=1, day=1)
window_end = dt(year=year_end+1, month=1, day=1)

for site in cos_sites:
  print('Initializing data frame for ', site)
  # Explicit copy so the column assignment below never targets a view of cos_data_full.
  cos_data = cos_data_full.loc[cos_data_full['site'] == site].copy()

  # Collapse multiple samples taken on the same day into their mean OCS_ value.
  # Grouping on the date column averages every sample of that day; the previous
  # where()/dropna() approach discarded any sample that had a NaN in an
  # unrelated column before averaging. Days with a single sample are unchanged.
  cos_data['OCS_'] = cos_data.groupby(time_column_name)['OCS_'].transform('mean')
  # Keep the first row of each day; it carries the averaged OCS_ and its own OCS__sd.
  cos_data = cos_data.drop_duplicates(subset=[time_column_name])

  # Restrict to the configured observation window.
  in_window = (cos_data[time_column_name] >= window_start) & (cos_data[time_column_name] < window_end)
  cos_data = cos_data[in_window]

  # Reduce to the three columns used downstream, with a site-specific COS column name.
  cos_column_name = 'COS_' + site
  cos_data = pd.DataFrame({'time':cos_data[time_column_name], cos_column_name : cos_data['OCS_'], 'OCS_stddev' : cos_data['OCS__sd']})
  cos_data = cos_data.reset_index(drop=True)

  # Convert to xarray and make 'time' the dimension so the data can be
  # interpolated along time later.
  site_frames[site] = cos_data.to_xarray().swap_dims({'index': 'time'})

In [None]:
# Resample every site's COS record onto one shared, regular time grid.
interpolated_frames = {}

# Regular grid of dates at pandas' 'SM' frequency between the two endpoints.
grid_start = dt(year=2005, month=1, day=1)
grid_end = dt(year=2017, month=1, day=1)
days = pd.date_range(start=grid_start, end=grid_end, freq='SM')
print(days)

# Same dates as a one-column frame; its 'time' column supplies the target coordinates.
date_frame = pd.DataFrame(data=days, columns=['time'])
print(date_frame)

for site, site_frame in site_frames.items():
  cos_name = 'COS_' + site
  print(site_frame)
  # Cubic interpolation of the site's COS variable onto the regular grid.
  interp = site_frame[cos_name].interp(time=date_frame['time'], method='cubic')
  print(interp)
  interpolated_frames[site] = interp

In [None]:
# Visual check: overlay each site's observed record with its interpolated series.
for site, site_frame in site_frames.items():
  print(site_frame)
  ocs_name = 'COS_' + site
  interpolated = interpolated_frames[site]

  fig, ax = plt.subplots(figsize=(12,6))
  ax.plot(site_frame['time'], site_frame[ocs_name], label = 'True OCS')
  ax.plot(interpolated['time'], interpolated, label = 'Interpolated OCS')
  ax.legend()
  plt.show()

In [None]:
# Load the previously saved feature table that the per-site COS columns are added to.
# NOTE: read_pickle can execute arbitrary code; only load pickles from a trusted source.
pickle_dir = working_directory + '/Data/Pickles'
pickle_name = pickle_dir + '/correlation_frame.pkl'
feature_pickle = pd.read_pickle(pickle_name)
display(feature_pickle)

In [None]:
# Write one pickle per site: a copy of the shared feature table plus that
# site's interpolated COS series as an extra column.
site_pickle_dir = working_directory + '/Data/Pickles/correlation_pickles'
# to_pickle does not create missing folders, so make sure the target exists first.
os.makedirs(site_pickle_dir, exist_ok=True)

for site, ocs_frame in interpolated_frames.items():
  working_df = feature_pickle.copy()
  cos_name = 'COS_' + site
  # Positional assignment of the raw values: this relies on feature_pickle
  # having one row per interpolation date, in the same order.
  # NOTE(review): confirm the feature table uses the same date grid.
  working_df[cos_name] = ocs_frame.data
  display(working_df)
  pickle_name = site_pickle_dir + '/' + site + '_dataframe.pkl'
  working_df.to_pickle(pickle_name)


In [None]:
# Flush pending writes and unmount the Google Drive mounted earlier in the notebook.
drive.flush_and_unmount()