In [1]:
import numpy as np
import cartopy.crs as ccrs
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

## Open dataset and create dataframe of raw data

In [2]:
# Open the SUMup density NetCDF file (path is relative to the working directory)
sumup_path = 'sumup_density_2020_v060121.nc'
sumup = xr.open_dataset(sumup_path)

In [63]:
# Pull the full elevation and latitude arrays first; they define the validity mask
elev_all = sumup['Elevation'].values
lat_all = sumup['Latitude'].values

# Ignore any sea ice or erroneous data: keep only records with positive
# elevation and a latitude strictly between -91 and 91
lat_in_range = (lat_all > -91) & (lat_all < 91)
condition = (elev_all > 0) & lat_in_range

# Extract every variable of interest using the same mask
su_elev = elev_all[condition]
su_lat = lat_all[condition]
(su_lon, su_depth0, su_depth1, su_midpoint,
 su_density, su_citation, su_date, su_method) = (
    sumup[var_name].values[condition]
    for var_name in ('Longitude', 'Start_Depth', 'Stop_Depth', 'Midpoint',
                     'Density', 'Citation', 'Date', 'Method')
)

# Column name -> filtered array; insertion order fixes the DataFrame column order
su_data = dict(
    Citation=su_citation,
    Timestamp=su_date,
    Latitude=su_lat,
    Longitude=su_lon,
    Elevation=su_elev,
    Midpoint=su_midpoint,
    StartDepth=su_depth0,
    StopDepth=su_depth1,
    Density=su_density,
    Method=su_method,
)

df = pd.DataFrame(su_data)
df

Unnamed: 0,Citation,Timestamp,Latitude,Longitude,Elevation,Midpoint,StartDepth,StopDepth,Density,Method
0,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.050,0.00,0.10,0.3680,1.0
1,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.150,0.10,0.20,0.3810,1.0
2,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.250,0.20,0.30,0.3680,1.0
3,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.350,0.30,0.40,0.3630,1.0
4,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.450,0.40,0.50,0.3890,1.0
...,...,...,...,...,...,...,...,...,...,...
2105887,187.0,20130604.0,72.579781,-38.458630,3210.0,0.765,0.75,0.78,0.2740,3.0
2105888,187.0,20130604.0,72.579781,-38.458630,3210.0,0.795,0.78,0.81,0.2977,3.0
2105889,187.0,20130604.0,72.579781,-38.458630,3210.0,0.825,0.81,0.84,0.3080,3.0
2105890,187.0,20130604.0,72.579781,-38.458630,3210.0,0.855,0.84,0.87,0.3056,3.0


## Fix date issues

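Some dates are just years (e.g., '19990000'), some list a day of '32', and a few are plainly incorrect. Below, a month or day of '00' is replaced with '01', a day of '32' is changed to '31', and the two known bad values are corrected by hand.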

In [64]:
# These particular dates appear to be very incorrect; map each to its replacement
_DATE_TYPOS = {'19999000.0': '19990000.0', '20089620.0': '20080620.0'}

# A day of '00' becomes the 1st, and a day of '32' is pulled back to the 31st
_DAY_FIXES = {'00': '01', '32': '31'}


def _repair_date(raw_date):
    """Return one raw date value as a cleaned YYYYMMDD float.

    The value is converted to text with ``str`` and sliced into year
    (characters 0-3), month (4-5) and day (6-7). Known typos are replaced
    first, then a month of '00' is set to '01' and the day is adjusted
    according to ``_DAY_FIXES``.
    """
    text = str(raw_date)
    text = _DATE_TYPOS.get(text, text)

    year, month, day = text[0:4], text[4:6], text[6:8]

    # Year-only dates carry a month of '00'; default them to January
    if month == '00':
        month = '01'
    day = _DAY_FIXES.get(day, day)

    return float(year + month + day)


su_timestamp = np.array([_repair_date(raw_date) for raw_date in su_date])

df['Timestamp'] = su_timestamp
df

Unnamed: 0,Citation,Timestamp,Latitude,Longitude,Elevation,Midpoint,StartDepth,StopDepth,Density,Method
0,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.050,0.00,0.10,0.3680,1.0
1,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.150,0.10,0.20,0.3810,1.0
2,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.250,0.20,0.30,0.3680,1.0
3,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.350,0.30,0.40,0.3630,1.0
4,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.450,0.40,0.50,0.3890,1.0
...,...,...,...,...,...,...,...,...,...,...
2105887,187.0,20130604.0,72.579781,-38.458630,3210.0,0.765,0.75,0.78,0.2740,3.0
2105888,187.0,20130604.0,72.579781,-38.458630,3210.0,0.795,0.78,0.81,0.2977,3.0
2105889,187.0,20130604.0,72.579781,-38.458630,3210.0,0.825,0.81,0.84,0.3080,3.0
2105890,187.0,20130604.0,72.579781,-38.458630,3210.0,0.855,0.84,0.87,0.3056,3.0


## Add core IDs

In [65]:
# Create a unique index for each core.
# A row opens a new core when it is the first row, or when any identifying field
# (citation, latitude, longitude, method, timestamp) differs from the row before it.
# Every row, including the last, is compared with its true predecessor only, so
# the first row is never matched against the final row of the dataset.
# Note: a NaN in any identifying field never equals its neighbour, so such rows
# each start their own core.
core_keys = (su_citation, su_lat, su_lon, su_method, su_timestamp)
starts_core = np.zeros(len(su_citation), dtype=bool)
starts_core[:1] = True  # slice assignment is a no-op when there are no rows
for key in core_keys:
    key = np.asarray(key)
    starts_core[1:] |= key[1:] != key[:-1]

# Running count of core starts gives zero-based, contiguous core IDs
id0 = (np.cumsum(starts_core) - 1).tolist()

# Give each datapoint within a core its own index (0, 1, 2, ... restarting per core):
# row number minus the row number at which the current core began
row_number = np.arange(len(starts_core))
core_first_row = np.maximum.accumulate(np.where(starts_core, row_number, 0))
id1 = (row_number - core_first_row).tolist()

# Set indices in dataframe
df['CoreID'] = id0
df['CoreIdx'] = id1

df

Unnamed: 0,Citation,Timestamp,Latitude,Longitude,Elevation,Midpoint,StartDepth,StopDepth,Density,Method,CoreID,CoreIdx
0,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.050,0.00,0.10,0.3680,1.0,0,0
1,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.150,0.10,0.20,0.3810,1.0,0,1
2,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.250,0.20,0.30,0.3680,1.0,0,2
3,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.350,0.30,0.40,0.3630,1.0,0,3
4,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.450,0.40,0.50,0.3890,1.0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
2105887,187.0,20130604.0,72.579781,-38.458630,3210.0,0.765,0.75,0.78,0.2740,3.0,1575,5
2105888,187.0,20130604.0,72.579781,-38.458630,3210.0,0.795,0.78,0.81,0.2977,3.0,1575,6
2105889,187.0,20130604.0,72.579781,-38.458630,3210.0,0.825,0.81,0.84,0.3080,3.0,1575,7
2105890,187.0,20130604.0,72.579781,-38.458630,3210.0,0.855,0.84,0.87,0.3056,3.0,1575,8


In [66]:
# Split dataframe into list of dataframes, one per core, ordered by ascending CoreID.
# A single groupby pass replaces one full-frame boolean scan per core and does not
# require the CoreID values to be exactly 0..k-1.
# Each piece gets a fresh 0-based row index and drops the per-core running index.
dfs = [
    core_rows.reset_index(drop=True).drop(columns=['CoreIdx'])
    for _, core_rows in df.groupby('CoreID', sort=True)
]

In [67]:
# Add a 'Thickness' column to every per-core dataframe (modified in place)
for core in dfs:

    # Thickness taken directly from the recorded depth interval
    depth_span = core['StopDepth'].to_numpy() - core['StartDepth'].to_numpy()

    # Alternative: spacing between successive midpoints, with the first entry
    # set to the first midpoint value itself
    mid = core['Midpoint'].to_numpy()
    midpoint_spacing = np.insert(np.diff(mid), 0, mid[0])

    # If the first depth interval has zero thickness, use the midpoint-based
    # values for the whole core; otherwise use the depth-interval values
    core['Thickness'] = midpoint_spacing if depth_span[0] == 0 else depth_span

## Remove individual core errors

In [73]:
# Final column layout for every per-core dataframe
ordered_columns = ['CoreID', 'Citation', 'Method', 'Timestamp', 'Latitude', 'Longitude',
                   'Elevation', 'Midpoint', 'StartDepth', 'StopDepth', 'Thickness', 'Density']

for position, core in enumerate(dfs):

    # Keep only rows with strictly positive thickness, arrange the columns,
    # and renumber the rows from zero
    has_thickness = core.Thickness > 0
    dfs[position] = core.loc[has_thickness, ordered_columns].reset_index(drop=True)

In [69]:
### TOO TEDIOUS -- disabled, kept for reference: collects the rows with negative Thickness from each core ###

# # Find errors
# errors = []
# for i in range(len(dfs)):
    
#     dftemp = dfs[i]
    
#     dferror = dftemp[dftemp.Thickness<0]
    
#     if len(dferror) > 0:
#         errors.append(dferror)

In [55]:
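### TOO TEDIOUS -- disabled, kept for reference: hand-corrections of depths for individual cores (by position in dfs) ###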

# for i in range(len(dfs)):
    
#     dftemp = dfs[i]
    
#     start = np.array(dftemp.StartDepth)
#     stop = np.array(dftemp.StopDepth)
#     mid = np.array(dftemp.Midpoint)
    
#     if i == 137:
#         start[1],start[8],start[25] = 0.05,0.46,1.53
#         stop[0] = 0.05
#         mid[0],mid[1],mid[8],mid[25] = 0.275,0.07,0.49,1.56
        
#     if i == 142:
#         start[15] = 1.05
#         mid[15] = 1.1
    
#     if i == 155:
#         dfs[155] = dftemp[dftemp.Thickness>0]
#         dfs[155] = dfs[155].reset_index(drop=True)
    
#     if i == 185:
#         stop[7] = 6.638
#         mid[7] = 6.569
    
#     if i == 194:
#         dfs[194] = dftemp[dftemp.Thickness>0]
#         dfs[194] = dfs[194].reset_index(drop=True)
    
#     if i == 201:
#         stop[10],stop[51] = 8.232,13.457
#         mid[10],mid[51] = 8.1985,13.3535
    
#     if i == 204:
#         start[61] = 3.57
#         mid[61] = 3.6
        
#     if i == 207:
#         stop[46],stop[52] = 12.379,13.819
#         mid[46],mid[52] = 12.3025,13.7085
#         dftemp.StartDepth = start
#         dftemp.StopDepth = stop
#         dftemp.Midpoint = mid
#         dftemp.Thickness = dftemp.StopDepth - dftemp.StartDepth        
#         dfs[207] = dftemp[dftemp.Thickness>0]
#         dfs[207] = dfs[207].reset_index(drop=True)
    
#     dftemp.StartDepth = start
#     dftemp.StopDepth = stop
#     dftemp.Midpoint = mid
#     dftemp.Thickness = dftemp.StopDepth - dftemp.StartDepth