# Filtering Based on Signal, Mass and Distance

We begin with a set of points which are common to all 3 datasets. These are found in ".../Cleaned_Real/..." style file paths. To these, we now hope to:
1. Work out the separation of the counterfactual from the real AIS point.
2. Append both the signal and mass measurements

These will then be written back to this same location. To these points, we can now play around with applying filters on albedo, signal, mass and distance to CFs
   

## Adding Distance From AIS Location

In [1]:
from math import radians, sin, cos, sqrt, atan2, acos
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from geopy.distance import distance
from pyproj import Geod
from scipy import stats
import os
import matplotlib.pyplot as plt
def pyproj_distance(point1, point2):
    geod = Geod(ellps="WGS84")  # Define the ellipsoid (e.g., WGS84)
    _, _, distance = geod.inv(point1[1], point1[0], point2[1], point2[0])
    return distance / 1000.0  # Convert meters to kilometers    

In [None]:
# We want to load in all the points, grouped by particle number. This should be the same for all of them.
#Selected_Real_Sets = []
#Selected_CF_Sets   = []

StartDate = datetime(2019, 1, 1, 0, 0)
EndDate   = datetime(2020, 1, 1, 0, 0)
while StartDate < EndDate:
    NumberOfSubsets = set()
    New_Upper = []
    New_Lower = []
    try:
        Link_Real  = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_Real/{}_{}_{}_{}:{}'.format(str(StartDate.year), str(StartDate.month).zfill(2), str(StartDate.day).zfill(2),str(StartDate.hour).zfill(2), str(StartDate.minute).zfill(2))
        Link_Upper = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_UCF/{}_{}_{}_{}:{}'.format(str(StartDate.year), str(StartDate.month).zfill(2), str(StartDate.day).zfill(2),str(StartDate.hour).zfill(2), str(StartDate.minute).zfill(2))
        Link_Lower = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_LCF/{}_{}_{}_{}:{}'.format(str(StartDate.year), str(StartDate.month).zfill(2), str(StartDate.day).zfill(2),str(StartDate.hour).zfill(2), str(StartDate.minute).zfill(2))
        
        Real      = pd.read_csv(Link_Real)
        UCF_file  = pd.read_csv(Link_Upper)
        LCF_file  = pd.read_csv(Link_Lower) 
        
        Grouped_Real_Data = Real.groupby('particle')
        Grouped_UCounterFac_Data = UCF_file.groupby('particle')
        Grouped_LCounterFac_Data = LCF_file.groupby('particle')
        
        Real_subsets = [group for _, group in Grouped_Real_Data]
        UCounterfac_subsets = [group for _, group in Grouped_UCounterFac_Data]
        LCounterfac_subsets = [group for _, group in Grouped_LCounterFac_Data]
        
        for _, row in Real.iterrows():
            NumberOfSubsets.add(row['particle'])
        if "Visible_Albedo" in Real.columns:
            for i in range(len(NumberOfSubsets)):
                
                Real_subsets[i].sort_values(by = 'jday', inplace = True)
                UCounterfac_subsets[i].sort_values(by = 'jday', inplace = True)
                UCounterfac_subsets[i].sort_values(by = 'jday', inplace = True)
                
                subset_i_distances_U = []
                subset_i_distances_L = []
                for j in range(len(Real_subsets[i])):
                    TupleU = (UCounterfac_subsets[i]['lat'].iloc[j], UCounterfac_subsets[i]['lon'].iloc[j])
                    TupleL = (LCounterfac_subsets[i]['lat'].iloc[j], LCounterfac_subsets[i]['lon'].iloc[j])
                    TupleR = (Real_subsets[i]['lat'].iloc[j], Real_subsets[i]['lon'].iloc[j])
                    
                    CFU_dist = pyproj_distance(TupleU, TupleR)
                    CFL_dist = pyproj_distance(TupleL, TupleR)
                   
                    subset_i_distances_U.append(CFU_dist)
                    subset_i_distances_L.append(CFL_dist)
               # print(f'Upper length = {len(subset_i_distances_U)}')
               # print(f'Upper length = {len(subset_i_distances_L)}')
               # print(f'Subset length = {len(Real_subsets[i])}')
                try:
                    UCounterfac_subsets[i]['Distance_to_AIS'] =  subset_i_distances_U
                    LCounterfac_subsets[i]['Distance_to_AIS'] =  subset_i_distances_L 
                    New_Upper.append(UCounterfac_subsets[i])
                    New_Lower.append(LCounterfac_subsets[i])
                except Exception as err:
                    print(f'Apparently the problem is {err}, [Subset length differences = {len(Real_subsets[i]) - len(UCounterfac_subsets[i])} & {len(Real_subsets[i]) - len(LCounterfac_subsets[i])}], [Whereas our distance vectors are length {len(subset_i_distances_U)} and {len(subset_i_distances_L)}], [This occured for particle number {i} on date {StartDate}]')
                  #  print(f'Subset length differences = {len(Real_subsets[i]) - UCounterfac_subsets[i]} & {len(Real_subsets[i]) - LCounterfac_subsets[i]}')
                   # print(f'Whereas our distance vectors are length {len(subset_i_distances_U)} and {len(subset_i_distances_L)}')
                    print(f'This occured for particle number {i} on date {StartDate}')
                    null_vec = np.full(len(UCounterfac_subsets[i]), np.nan)
                    UCounterfac_subsets[i]['Distance_to_AIS'] =  null_vec
                    LCounterfac_subsets[i]['Distance_to_AIS'] =  null_vec
                    New_Upper.append(UCounterfac_subsets[i])
                    New_Lower.append(LCounterfac_subsets[i])
            Final_Upper = pd.concat(New_Upper, ignore_index = True)
            Final_Lower = pd.concat(New_Lower, ignore_index = True)
            Final_Upper.to_csv('/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_UCF/{}_{}_{}_{}:{}'.format(str(StartDate.year), str(StartDate.month).zfill(2), str(StartDate.day).zfill(2),str(StartDate.hour).zfill(2), str(StartDate.minute).zfill(2)))
            Final_Lower.to_csv('/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_LCF/{}_{}_{}_{}:{}'.format(str(StartDate.year), str(StartDate.month).zfill(2), str(StartDate.day).zfill(2),str(StartDate.hour).zfill(2), str(StartDate.minute).zfill(2)))
            StartDate += timedelta(hours = 1)
        else:
            print(f'not added albedo for {StartDate}')
            StartDate += timedelta(hours = 1)
    except Exception as e:
        print(f'Issue is {e}')
        StartDate += timedelta(hours = 1)

## Adding Signal and Mass From Original Files

The original files exist at "/gws/nopw/j04/eo_shared_data_vol2/scratch/pete_nut/emissions_tracked" links. We want to take these files, and do the following:
1. load in and match 'ais' location with each of its advected locations, or equivalently find the original location and find corresponding points to this
2. Append both 'signal' and 'mass' to each of these advected points

In [None]:
files = os.listdir("/gws/nopw/j04/eo_shared_data_vol2/scratch/pete_nut/emissions_tracked/2019")
for file in files[:10]:
    print(file)

In [None]:
relevant_particle = First_points[1][First_points[1]['particle'] == 339]
relevant_particle

In [None]:
advected_points[0]

In [5]:
date = datetime(2019, 1, 1, 0, 0)
while date < datetime(2020, 1, 1, 0, 0):
    try:    
        Link_original = "/gws/nopw/j04/eo_shared_data_vol2/scratch/pete_nut/emissions_tracked/{}/{}_{}".format(str(date.year), str(date.month).zfill(2), str(date.day).zfill(2))
        Link_advected  = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_Real/{}_{}_{}_{}:{}'.format(str(date.year), str(date.month).zfill(2), str(date.day).zfill(2),str(date.hour).zfill(2), str(date.minute).zfill(2))
        UCF_LINK = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_UCF/{}_{}_{}_{}:{}'.format(str(date.year), str(date.month).zfill(2), str(date.day).zfill(2),str(date.hour).zfill(2), str(date.minute).zfill(2))
        LCF_LINK = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_LCF/{}_{}_{}_{}:{}'.format(str(date.year), str(date.month).zfill(2), str(date.day).zfill(2),str(date.hour).zfill(2), str(date.minute).zfill(2))        
      
       
        File_original = pd.read_csv(Link_original)
        File_advected = pd.read_csv(Link_advected)
        UCF = pd.read_csv(UCF_LINK)
        LCF = pd.read_csv(LCF_LINK)
        
        Grouped_by_time     = File_original.groupby('hour')
        Grouped_by_particle = File_advected.groupby('particle')
        Grouped_ucf         = UCF.groupby('particle')
        Grouped_lcf         = LCF.groupby('particle')
        
        First_points    = [group for _, group in Grouped_by_time]
        advected_points = [group for _, group in Grouped_by_particle]
        UCF_group       = [group for _, group in Grouped_ucf]
        LCF_group       = [group for _, group in Grouped_lcf]
        
        if 'Visible_Albedo' in File_advected.columns:
            NumberOfParticles = set()
            for _, row in File_advected.iterrows():
                NumberOfParticles.add(row['particle'])
            New_List = []
            UCF_list = []
            LCF_list = []
            for i in range(len(NumberOfParticles)):
                particle = advected_points[i]['particle'].iloc[0] 
                relevant_particle   = First_points[date.hour][First_points[date.hour]['particle'] == particle]
                try:
                    massVec   = np.full(len(advected_points[i]), relevant_particle['mass'].iloc[0])
                    signalVec = np.full(len(advected_points[i]), relevant_particle['signal'].iloc[0])
                    
                    advected_points[i]['mass']   = massVec
                    advected_points[i]['signal'] = signalVec
                    UCF_group[i]['mass']         = massVec
                    UCF_group[i]['signal']       = signalVec
                    LCF_group[i]['mass']         = massVec
                    LCF_group[i]['signal']       = signalVec
                    
                    UCF_list.append(UCF_group[i])
                    LCF_list.append(LCF_group[i])
                    New_List.append(advected_points[i])
                except IndexError:
                    print('index error')
                    continue
            if New_List:
                Advected_with_emissions = pd.concat(New_List, ignore_index = True)
                UCF_new = pd.concat(UCF_list, ignore_index = True)
                LCF_new = pd.concat(LCF_list, ignore_index = True)
                Advected_with_emissions.to_csv('/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Signal&Mass/Real/{}_{}_{}_{}:{}'.format(str(date.year), str(date.month).zfill(2), 
                                                                                                                     str(date.day).zfill(2),str(date.hour).zfill(2), 
                                                                                                                     str(date.minute).zfill(2)))
                UCF_new.to_csv('/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Signal&Mass/UCF/{}_{}_{}_{}:{}'.format(str(date.year), str(date.month).zfill(2), 
                                                                                                                     str(date.day).zfill(2),str(date.hour).zfill(2), 
                                                                                                                     str(date.minute).zfill(2)))
                LCF_new.to_csv('/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Signal&Mass/LCF/{}_{}_{}_{}:{}'.format(str(date.year), str(date.month).zfill(2), 
                                                                                                                     str(date.day).zfill(2),str(date.hour).zfill(2), 
                                                                                                                     str(date.minute).zfill(2)))
            date += timedelta(hours = 1)
            
        else:
            date += timedelta(hours = 1)
            print(f'Albedo missing for {date}')
    except Exception as e:
        date += timedelta(hours = 1)
        print(f'Skipped {date}, since {e}')


index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
index error
inde

In [4]:
linky = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_UCF/2019_02_02_00:00'
item = pd.read_csv(linky)
item.sort_values(by = 'illum', inplace = True)
item

Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,particle,jday,lat,...,cer_uncertainty,cwp,cwp_uncertainty,cth,cth_uncertainty,illum,cloud_type,Visible_Albedo,Albedo_Uncertainty,Distance_to_AIS
0,0,0,2896,2896,2896,86712,86712,325,2019-02-02 11:30:00,-0.784,...,1.78,15.0,4.0,1.84,0.18,1.0,3.0,0.1373,0.0106,24.141332
67,67,67,2370,2370,2370,69913,69913,428,2019-02-02 09:15:00,-17.507,...,1.57,22.0,4.0,0.91,0.09,1.0,3.0,0.2593,0.0197,22.862113
66,66,66,1925,1925,1925,56767,56767,428,2019-02-02 07:30:00,-17.816,...,0.81,27.0,8.0,1.37,0.12,1.0,3.0,0.6266,0.0338,23.440252
65,65,65,3351,3351,3351,99945,99945,412,2019-02-02 13:15:00,-15.081,...,0.77,47.0,6.0,1.73,0.18,1.0,3.0,0.3653,0.0246,24.719600
64,64,64,3231,3231,3231,96189,96189,412,2019-02-02 12:45:00,-15.150,...,0.71,53.0,7.0,1.65,0.18,1.0,3.0,0.3686,0.0230,24.912589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,7,7,1019,1019,1019,30398,30398,351,2019-02-02 04:00:00,-7.229,...,5.40,4.0,3.0,1.21,0.66,3.0,3.0,,,30.015319
37,37,37,4632,4632,4632,141223,141223,374,2019-02-02 18:45:00,-8.903,...,1.23,17.0,4.0,1.78,0.13,3.0,3.0,,,41.903711
70,70,70,487,487,487,13584,13584,439,2019-02-02 01:45:00,-20.061,...,1.76,10.0,5.0,1.29,0.20,3.0,3.0,,,27.058811
6,6,6,957,957,957,28520,28520,351,2019-02-02 03:45:00,-7.277,...,1.52,5.0,2.0,1.61,0.26,3.0,3.0,,,29.767830


In [30]:
linky = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Signal&Mass/UCF/2019_02_02_00:00'
item = pd.read_csv(linky)
item.sort_values(by = 'particle', inplace = True)
item

Unnamed: 0.8,Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,particle,jday,...,cwp_uncertainty,cth,cth_uncertainty,illum,cloud_type,Visible_Albedo,Albedo_Uncertainty,Distance_to_AIS,mass,signal
0,0,0,0,2896,2896,2896,86712,86712,325,2019-02-02 11:30:00,...,4.0,1.84,0.18,1.0,3.0,0.1373,0.0106,24.141332,14.032698,2.505839
1,1,1,1,2596,2596,2596,77342,77342,345,2019-02-02 10:15:00,...,4.0,1.76,0.15,1.0,3.0,0.2126,0.0184,43.840223,14.032698,2.171727
2,2,2,2,2722,2722,2722,81098,81098,345,2019-02-02 10:45:00,...,2.0,1.78,0.19,1.0,3.0,0.2240,0.0186,44.041311,14.032698,2.171727
3,3,3,3,4227,4227,4227,128050,128050,347,2019-02-02 17:00:00,...,4.0,2.25,0.14,1.0,3.0,0.6022,0.0338,46.378059,13.698586,2.004671
4,4,4,4,4284,4284,4284,129928,129928,347,2019-02-02 17:15:00,...,7.0,2.29,0.17,1.0,3.0,0.7089,0.0391,46.551267,13.698586,2.004671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,79,85,85,2007,2007,2007,58675,58675,458,2019-02-02 07:45:00,...,4.0,1.03,0.18,1.0,3.0,0.3210,0.0185,62.743241,18.376152,2.672895
76,76,82,82,1813,1813,1813,53041,53041,458,2019-02-02 07:00:00,...,3.0,0.60,0.16,1.0,3.0,0.3647,0.0320,62.281532,18.376152,2.672895
77,77,83,83,1878,1878,1878,54919,54919,458,2019-02-02 07:15:00,...,5.0,0.65,0.12,1.0,3.0,0.3316,0.0197,62.355766,18.376152,2.672895
78,78,84,84,1943,1943,1943,56797,56797,458,2019-02-02 07:30:00,...,6.0,0.81,0.14,1.0,3.0,0.2798,0.0171,62.629802,18.376152,2.672895


In [24]:
items = os.listdir('/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Signal&Mass/Real')
for item in items:
    print(item)


2019_02_02_00:00
2019_02_02_01:00
2019_02_02_02:00
2019_02_02_03:00
2019_02_02_04:00
2019_02_02_05:00
2019_02_02_06:00
2019_02_02_07:00
2019_02_02_08:00
2019_02_02_09:00
2019_02_02_10:00
2019_02_02_11:00
2019_02_02_12:00
2019_02_02_13:00
2019_02_02_14:00
2019_02_02_15:00
2019_02_02_16:00
2019_02_02_17:00
2019_02_02_18:00
2019_02_02_19:00
2019_02_02_20:00
2019_02_02_21:00
2019_02_02_22:00
2019_02_02_23:00
2019_02_03_00:00
2019_02_03_01:00
2019_02_03_02:00
2019_02_03_03:00
2019_02_03_04:00
2019_02_03_05:00
2019_02_03_06:00
2019_02_03_07:00
2019_02_03_08:00
2019_02_03_09:00
2019_02_03_10:00
2019_02_03_11:00
2019_02_03_12:00
2019_02_03_13:00
2019_02_03_14:00
2019_02_03_15:00
2019_02_03_16:00
2019_02_03_17:00
2019_02_03_18:00
2019_02_03_19:00
2019_02_03_20:00
2019_02_03_21:00
2019_02_03_22:00
2019_02_03_23:00


In [92]:
Link_Real  = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_Real/2019_02_07_01:00'
old = pd.read_csv(Link_Real)
old.sort_values(by = 'particle', inplace = True)
old= old[old['particle'] == 339]
old

Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,particle,jday,lat,lon,alt,...,cer,cer_uncertainty,cwp,cwp_uncertainty,cth,cth_uncertainty,illum,cloud_type,Visible_Albedo,Albedo_Uncertainty
38,1963,1963,1963,67379,67379,339,2019-02-07 10:45:00,-13.707,-15.334,11.9,...,10.349999,1.18,49.0,7.0,1.5,0.15,1.0,3.0,0.3938,0.0312
13,1405,1405,1405,48470,48470,339,2019-02-07 08:00:00,-14.128,-14.84,15.5,...,2.4,0.29,27.0,8.0,1.59,0.13,1.0,3.0,0.813,0.0426


In [121]:
StartDate += timedelta(hours = 18)
StartDate.hour

2

## Misc

In [126]:
StartDate = datetime(2019, 2, 4, 1, 0)
Link_Real  = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_Real/{}_{}_{}_{}:{}'.format(
                    str(StartDate.year), str(StartDate.month).zfill(2), str(StartDate.day).zfill(2),
                    str(StartDate.hour).zfill(2), str(StartDate.minute).zfill(2))
Link_Upper = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_UCF/{}_{}_{}_{}:{}'.format(
                    str(StartDate.year), str(StartDate.month).zfill(2), str(StartDate.day).zfill(2),
                    str(StartDate.hour).zfill(2), str(StartDate.minute).zfill(2))
Link_Lower = '/gws/nopw/j04/eo_shared_data_vol2/scratch/AO12/Cleaned_LCF/{}_{}_{}_{}:{}'.format(
                    str(StartDate.year), str(StartDate.month).zfill(2), str(StartDate.day).zfill(2),
                    str(StartDate.hour).zfill(2), str(StartDate.minute).zfill(2))
Real      = pd.read_csv(Link_Real)
UCF_file  = pd.read_csv(Link_Upper)
LCF_file  = pd.read_csv(Link_Lower) 

Real.sort_values(by = 'particle')
UCF_file.sort_values(by = 'particle')
LCF_file.sort_values(by = 'particle')

Grouped_Real_Data = Real.groupby('particle')
Grouped_UCounterFac_Data = UCF_file.groupby('particle')
Grouped_LCounterFac_Data = LCF_file.groupby('particle')

Real_subsets = [group for _, group in Grouped_Real_Data]
UCounterfac_subsets = [group for _, group in Grouped_UCounterFac_Data]
LCounterfac_subsets = [group for _, group in Grouped_LCounterFac_Data]

In [None]:
Real_subsets[13]

In [None]:
UCounterfac_subsets[10]

In [None]:
LCounterfac_subsets[9]

In [134]:
NumberOfSubsetsR = set()
NumberOfSubsetsU = set()
NumberOfSubsetsL = set()

for _, row in Real.iterrows():
    NumberOfSubsetsR.add(row['particle'])
    
for _, row in UCF_file.iterrows():
    NumberOfSubsetsR.add(row['particle'])
    
for _, row in LCF_file.iterrows():
    NumberOfSubsetsL.add(row['particle'])
    

In [135]:
len(NumberOfSubsetsR)

38

In [136]:
len(NumberOfSubsetsR)

38

In [137]:
len(NumberOfSubsetsL)

31

In [139]:
print(len(Real), len(UCF_file), len(LCF_file))

263 164 156


In [None]:
len(UCF_file)