In [43]:
## first time series test

# import external packages
import numpy as np
import pandas as pd
import numba
from numba import vectorize
import glob # for file search
import copy
import os # operating system stuff
import re # regex
import fastparquet # fast read/write for large data structures
import sklearn.preprocessing as pre # for data normalisation
from sklearn.metrics import pairwise_distances

import geopandas as gpd
import rasterio as rio
import rasterio.mask
from rasterio.plot import plotting_extent
from shapely.geometry import Polygon
from shapely.geometry.point import Point
import pyproj
from pyproj import CRS
from inpoly import inpoly2 # for fast inpolygon checks
import utm

import matplotlib.pyplot as plt 
import matplotlib.dates as mdates
from matplotlib import cm as mpl_cm
from matplotlib import colors as mcolors 
import matplotlib.image as mplimg

from mpl_toolkits.axes_grid1 import make_axes_locatable # for colorbar scaling
from mpl_toolkits.axes_grid1 import ImageGrid
from matplotlib_scalebar.scalebar import ScaleBar
from matplotlib.gridspec import GridSpec
from matplotlib.ticker import FormatStrFormatter

import seaborn as sns
from matplotlib import rc_file_defaults
rc_file_defaults()
# sns.set(style=None, color_codes=True)

from shapely.geometry import Polygon
from shapely.geometry.point import Point
import datetime

import configparser

from cmcrameri import cm # for scientific colourmaps

###########################
# import main local package
import SPOTSAR_main as sm


In [55]:
import h5py

def extract_date_from_filename(filename):
    """Return the pair tag embedded in a file name.

    Searches ``filename`` for the first substring of the form
    ``c<8 digits>_c<8 digits>`` (for example ``c20200910_c20200919``).

    Parameters
    ----------
    filename : str
        File name (or any other string) to search.

    Returns
    -------
    str or None
        The matched substring, or ``None`` when the pattern does not occur.
    """
    found = re.search(r'c\d{8}_c\d{8}', filename)
    return found.group() if found else None

def process_hdf5_file(file_path,pairname):
    """Load the ``win_13_*`` datasets of one HDF5 file into a flat table.

    Parameters
    ----------
    file_path : str
        Path of the HDF5 file, opened read-only.
    pairname : str
        Tag appended to the offset column names.

    Returns
    -------
    pandas.DataFrame
        One row per array element, with columns ``Longitude``, ``Latitude``,
        ``Range_offset_<pairname>`` and ``Azimuth_offset_<pairname>``.
    """
    # DataFrame column -> dataset name inside the file, in output column order.
    dataset_by_column = {
        'Longitude': 'win_13_Lon',
        'Latitude': 'win_13_lat',
        f'Range_offset_{pairname}': 'win_13_R_off',
        f'Azimuth_offset_{pairname}': 'win_13_A_off',
    }

    # Read every dataset fully into memory while the file is still open.
    with h5py.File(file_path, 'r') as h5_file:
        grids = {
            column: h5_file[dataset][:]
            for column, dataset in dataset_by_column.items()
        }

    # Flatten each array (presumably a 2D grid - TODO confirm) to 1D so that
    # it can be used as a DataFrame column.
    return pd.DataFrame({column: grid.ravel() for column, grid in grids.items()})


input_folder = '/Users/markbemelmans/Documents/PhD/projects/Merapi2021/CSK/dsc1/h5_files/'
output_csv = '/Users/markbemelmans/Documents/PhD/projects/Merapi2021/CSK/dsc1/h5_file_ts_win13_period1.csv'

# Keep only HDF5 files whose name carries a cYYYYMMDD_cYYYYMMDD pair tag.
# Any other entry in the folder (e.g. a .DS_Store) would give a sort key of
# None, which cannot be ordered against strings, and would later be handed to
# h5py. Sorting on (pair tag, file name) yields the same order as an
# alphabetical sort followed by a stable sort on the pair tag.
file_list = sorted(
    (
        name for name in os.listdir(input_folder)
        if name.lower().endswith(('.h5', '.hdf5'))
        and extract_date_from_filename(name) is not None
    ),
    key=lambda name: (extract_date_from_filename(name), name),
)
if not file_list:
    raise FileNotFoundError(
        f'no cYYYYMMDD_cYYYYMMDD HDF5 files found in {input_folder}')

# One cleaned DataFrame per file, in file_list order (dfs[i] <-> file_list[i]).
dfs = []
columns = ['Longitude', 'Latitude']
for file_name in file_list:
    file_path = os.path.join(input_folder, file_name)
    pairname = extract_date_from_filename(file_name)

    df = process_hdf5_file(file_path,pairname)
    # Drop rows with a missing value in any column.
    df_no_na = df.dropna()
    # Make coordinates unique so they can serve as merge keys.
    # NOTE(review): keep='first' keeps whichever offsets happen to come first
    # for a repeated coordinate - confirm this is the intended choice.
    df_no_dupl = df_no_na.drop_duplicates(subset=columns, keep='first')
    # Rows kept vs. rows read, as (rows, columns) tuples.
    print(df_no_dupl.shape, df.shape)
    dfs.append(df_no_dupl)



(316409, 4) (343791, 4)
(309340, 4) (338548, 4)
(309720, 4) (338548, 4)
(304656, 4) (336301, 4)
(304406, 4) (336301, 4)
(303716, 4) (336301, 4)
(298662, 4) (336301, 4)
(299485, 4) (336301, 4)
(296570, 4) (336301, 4)
(293934, 4) (336301, 4)
(292274, 4) (336301, 4)
(292322, 4) (335552, 4)
(278735, 4) (324317, 4)
(287382, 4) (336301, 4)
(288846, 4) (336301, 4)
(287646, 4) (336301, 4)
(283236, 4) (336301, 4)
(269121, 4) (325066, 4)
(262359, 4) (324317, 4)
(276508, 4) (336301, 4)
(281852, 4) (336301, 4)
(277802, 4) (332556, 4)
(283222, 4) (337050, 4)
(280928, 4) (336301, 4)
(280237, 4) (336301, 4)
(281738, 4) (336301, 4)
(278983, 4) (336301, 4)
(280094, 4) (336301, 4)
(279413, 4) (336301, 4)
(279276, 4) (336301, 4)
(267286, 4) (325066, 4)
(279065, 4) (336301, 4)
(278261, 4) (336301, 4)
(279138, 4) (336301, 4)
(279291, 4) (336301, 4)
(277501, 4) (335552, 4)
(310200, 4) (337799, 4)
(312716, 4) (338548, 4)
(307507, 4) (337050, 4)
(305636, 4) (337050, 4)
(319037, 4) (343791, 4)
(307906, 4) (336

In [56]:
%matplotlib osx
plt.close('all')

idx = 8

for idx in range(1):
    idx = idx+55
    df1 = dfs[idx]
    fig, ax = plt.subplots(1,1)

    ax.hexbin(df1.Longitude,df1.Latitude,df1.iloc[:,2],gridsize=1000,cmap=cm.vik,vmin=-0.15, vmax=0.15)
    ax.set_title(file_list[idx])


In [33]:
# Print every file name next to its position in file_list.
for i in range(len(file_list)):
    file = file_list[i]
    print(i,file)

0 c20200910_c20200919.h5
1 c20200910_c20200926.h5
2 c20200910_c20200927.h5
3 c20200910_c20201005.h5
4 c20200910_c20201012.h5
5 c20200910_c20201013.h5
6 c20200910_c20201113.h5
7 c20200910_c20201114.h5
8 c20200910_c20201122.h5
9 c20200910_c20201224.h5
10 c20200910_c20201231.h5
11 c20200910_c20210101.h5
12 c20200910_c20210109.h5
13 c20200910_c20210116.h5
14 c20200910_c20210117.h5
15 c20200910_c20210125.h5
16 c20200910_c20210201.h5
17 c20200910_c20210202.h5
18 c20200910_c20210217.h5
19 c20200910_c20210218.h5
20 c20200910_c20210302.h5
21 c20200910_c20210305.h5
22 c20200910_c20210306.h5
23 c20200910_c20210314.h5
24 c20200910_c20210321.h5
25 c20200910_c20210322.h5
26 c20200910_c20210330.h5
27 c20200910_c20210406.h5
28 c20200910_c20210419.h5
29 c20200910_c20210422.h5
30 c20200910_c20210423.h5
31 c20200910_c20210501.h5
32 c20200910_c20210508.h5
33 c20200910_c20210509.h5
34 c20200910_c20210521.h5
35 c20200910_c20210609.h5
36 c20200919_c20200926.h5
37 c20200919_c20200927.h5
38 c20200919_c20201005

In [57]:
# select dfs for time series

# NOTE(review): the bounds 0:3 and 36: are positions in dfs / file_list,
# presumably chosen from the printed file listing - re-check them whenever the
# contents of the input folder change.
sel_dfs = dfs[0:3]
sel_dfs2 = dfs[36:]
# Use len() rather than np.shape(): np.shape() first converts the list of
# unequal-length DataFrames to an array, which triggers a ragged-array
# deprecation warning on older NumPy and a ValueError on NumPy >= 1.24.
# The 1-tuple keeps the printed format unchanged.
print((len(sel_dfs2),))

# Selected frames in order: the first slice followed by the second.
sel_df = sel_dfs + sel_dfs2

print((len(sel_df),))


(157,)
(160,)


  result = asarray(a).shape


In [58]:

# test how many matches there are between dfs
%matplotlib osx

print(np.shape(pd.unique(sel_df[0]['Longitude'])))
print(np.shape(pd.unique(sel_df[0]['Latitude'])))
fig, ax = plt.subplots(1,1)


comb_df = pd.merge(sel_df[0], sel_df[1], on=['Longitude','Latitude'],how='outer')
comb_df = comb_df.dropna()
for i in range(2,160):
    print(i)
    comb_df = pd.merge(comb_df, sel_df[i], on=['Longitude','Latitude'],how='outer')
    # comb_df = comb_df.dropna()
    print(np.shape(comb_df))
    ax.scatter(i,np.shape(comb_df)[0])
    # print(np.sum(comb_df.isna()))
print(np.shape(comb_df))

# small_comb_df = comb_df[::25]
comb_df.to_csv(output_csv, index=False)

(8719,)
(117585,)
2
(310812, 8)
3
(312716, 10)
4
(313837, 12)
5
(313969, 14)
6
(314017, 16)
7
(320003, 18)
8
(320047, 20)
9
(320064, 22)
10
(320114, 24)
11
(320135, 26)
12
(320156, 28)
13
(320193, 30)
14
(320209, 32)
15
(320218, 34)
16
(320328, 36)
17
(320331, 38)
18
(320336, 40)
19
(320339, 42)
20
(320359, 44)
21
(320366, 46)
22
(320571, 48)
23
(320586, 50)
24
(320595, 52)
25
(320633, 54)
26
(320657, 56)
27
(320661, 58)
28
(320690, 60)
29
(320701, 62)
30
(320711, 64)
31
(320782, 66)
32
(320833, 68)
33
(320858, 70)
34
(321529, 72)
35
(321547, 74)
36
(321572, 76)
37
(321587, 78)
38
(321620, 80)
39
(321629, 82)
40
(321696, 84)
41
(321746, 86)
42
(321749, 88)
43
(321996, 90)
44
(321998, 92)
45
(322000, 94)
46
(322015, 96)
47
(322026, 98)
48
(322042, 100)
49
(322047, 102)
50
(322057, 104)
51
(322059, 106)
52
(322081, 108)
53
(322081, 110)
54
(322082, 112)
55
(322091, 114)
56
(322093, 116)
57
(322096, 118)
58
(322103, 120)
59
(322108, 122)
60
(322135, 124)
61
(322140, 126)
62
(322140, 128)


: 

In [249]:
# Overlay the coordinates of two individual frames and of the merged table.
fig, ax = plt.subplots(nrows=1, ncols=1)
for frame in (dfs[0], dfs[5]):
    ax.scatter(frame['Longitude'], frame['Latitude'])
ax.scatter(comb_df['Longitude'], comb_df['Latitude'])

<matplotlib.collections.PathCollection at 0x7fd16e2fa3a0>

In [146]:
# Show rows of the first frame whose (Longitude, Latitude) pair occurs more
# than once; keep=False flags every member of a duplicate group.
# NOTE(review): the load cell already drops duplicate coordinates from each
# frame stored in dfs, so on a top-to-bottom run this selects no rows.
test_df = dfs[0]
is_repeated = test_df.duplicated(subset=['Longitude','Latitude'], keep=False)
test_df = test_df[is_repeated]
print(test_df.shape)
print(test_df[10:20])

(50, 4)
       Longitude  Latitude  Range_offset_x20201116_x20201127  \
1479  110.468430 -7.558755                           -0.0440   
1572  110.461197 -7.542143                            0.0560   
1576  110.461601 -7.542523                           -0.0170   
1776  110.468430 -7.558755                           -0.0140   
1869  110.461197 -7.542143                            0.0850   
1873  110.461601 -7.542523                           -0.0360   
3217  110.464180 -7.554617                            0.0795   
3811  110.464180 -7.554617                            0.0940   
4071  110.463280 -7.551690                            0.0215   
4665  110.463280 -7.551690                           -0.0280   

      Azimuth_offset_x20201116_x20201127  
1479                             -0.0150  
1572                             -0.1165  
1576                             -0.0025  
1776                              0.3030  
1869                             -0.0690  
1873                         

In [149]:
# Shape of the merged table restricted to rows without any missing value.
print(comb_df.dropna().shape)

(113019, 6)


      Longitude  Latitude  Range_offset_x20201116_x20201127  \
100  110.462173 -7.543308                            0.0390   
101  110.462311 -7.543409                            0.0445   
102  110.462349 -7.543492                            0.0180   
103  110.462341 -7.543565                            0.0020   
104  110.462357 -7.543643                           -0.0190   
105  110.462372 -7.543719                           -0.0030   
106  110.462410 -7.543802                            0.0055   
107  110.462357 -7.543865                           -0.0030   
108  110.462082 -7.543883                            0.0280   
109  110.462029 -7.543946                            0.0240   
110  110.461983 -7.544010                            0.0410   
111  110.461739 -7.544034                            0.0660   
112  110.461754 -7.544110                            0.0310   
113  110.461853 -7.544205                           -0.0125   
114  110.462036 -7.544317                           -0.

In [54]:
%matplotlib osx
import pandas as pd

# Sample dataframes
df1 = dfs[0][0:4000]
df2 = dfs[1][0:4000]
# df2 = pd.DataFrame({'A': [1, 2], 'B': [3, 5], 'C': [11, 12]})
# df3 = pd.DataFrame({'A': [13, 14], 'B': [15, 16], 'C': [17, 18]})

fig, ax = plt.subplots(1,1)

ax.scatter(df1['Longitude'],df1['Latitude'],s=20)
# ax.scatter(df2['Longitude'],df2['Latitude'],s=5)

# Merge dataframes on columns 'A' and 'B'
# result_df = pd.merge(df1, df2, on=['A', 'B']).merge(df3, on=['A', 'B'])

result_df = df1.merge(df2,on=['Longitude','Latitude'], how='outer')

# Rename columns to C1, C2, C3, etc.
result_df.columns = ['A', 'B', 'r_off1', 'a_off1','r_off2','a_off2']


# print(df1)
# print(df2)
print(result_df[5540:5560])

               A         B  r_off1  a_off1  r_off2  a_off2
5540  110.465759 -7.557611 -0.0255 -0.0540 -0.1090  0.0540
5541  110.465828 -7.557700 -0.0510 -0.1075 -0.0610  0.0480
5542  110.466003 -7.557811 -0.0350 -0.1070 -0.0545  0.1180
5543  110.466080 -7.557901 -0.0760 -0.1920 -0.0470  0.1450
5544  110.466095 -7.557976 -0.0470 -0.2210 -0.0070  0.1425
5545  110.466118 -7.558057 -0.0660 -0.2320  0.0070  0.1585
5546  110.466232 -7.558154 -0.0800 -0.2570  0.0150  0.1685
5547  110.466225 -7.558228 -0.0210 -0.2675  0.0190  0.1740
5548  110.466293 -7.558314  0.0320 -0.0060  0.0280  0.2140
5549  110.466301 -7.558390  0.0420 -0.0170  0.0560  0.2250
5550  110.466377 -7.558480  0.0415 -0.0230 -0.0080  0.2235
5551  110.466492 -7.558579 -0.0010  0.2600 -0.0370  0.2520
5552  110.466583 -7.558671  0.0025  0.3315 -0.0365  0.2480
5553  110.466728 -7.558775 -0.0130  0.2560 -0.0350  0.2360
5554  110.463493 -7.536173 -0.0040 -0.0430  0.0420 -0.0370
5555  110.463379 -7.536225  0.0260 -0.0420  0.0480 -0.05