In [1]:
import pandas as pd
import math
import numpy as np

STATIONS_BANDS = ["SO2","C6H6","NO2","O3","PM10","PM25","CO"]
R = 6373.0

def create_data(data_path: str, legend_path: str):
    data = {}
    data_path_df = pd.read_csv(data_path)
    legend_path_df = pd.read_csv(legend_path, sep=";")
    legend_path_df['Location'] = legend_path_df['Location'].str.split(', ')
    legend_dict = dict(zip(legend_path_df['id_amat'], legend_path_df['Location']))
    
    for _, row in data_path_df.iterrows():
        date = row['date']
        if date not in data:
            data[date] = {}
        latlon = legend_dict.get(row['station_id'])
        if latlon:
            latlon = latlon[1][:-1] + ' ' + latlon[0][1:]
            data[date][latlon] = dict(row[4:])
            
    return data
    
def get_closest_dist_per_band(data, date, latlon):
    """
    Given latlon of a pixel find for each pollutant the distance of the closest GoldenStation
    NPArray aligned with STATION_BANDS
    """
    data_single_date = data[date]
    distances = {}
    for band in STATIONS_BANDS:
        distances[band] = np.inf
    for i, band in enumerate(STATIONS_BANDS):
        for j, latlon_data in enumerate(list(data_single_date.keys())):
            latlon_data_list = latlon_data.split()
            if not np.isnan(data_single_date[latlon_data][band]):
                lat1 = math.radians(latlon[1])
                lat2 = math.radians(float(latlon_data_list[1]))
                lon1 = math.radians(latlon[0])
                lon2 = math.radians(float(latlon_data_list[0]))
                diff_lon = lon2 - lon1
                diff_lat = lat2 -lat1 
                a = (math.sin(diff_lat/2))**2 + math.cos(lon1) * math.cos(float(lat2)) * (math.sin(diff_lon/2))**2
                c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
                dist = R * c
                if band in distances.keys():
                    if dist < distances[band]:
                        distances[band] = dist
                else:
                    distances[band] = dist
    return list(distances.values())

In [2]:
data = create_data("/Users/luca/Downloads/weak_labels_merge/stations_data_test.csv", "/Users/luca/Downloads/weak_labels_merge/qaria_stazione.csv")

In [3]:
print(data['2023-01-02'])

{'9.18218994140625 45.432300567627': {'SO2': nan, 'C6H6': nan, 'NO2': nan, 'O3': nan, 'PM10': nan, 'PM25': nan, 'CO': nan}, '9.23478031158447 45.4740982055664': {'SO2': 2.5, 'C6H6': 2.7, 'NO2': 26.0, 'O3': 2.0, 'PM10': 57.0, 'PM25': 48.0, 'CO': nan}, '9.16944026947021 45.4441986083984': {'SO2': nan, 'C6H6': nan, 'NO2': 32.0, 'O3': nan, 'PM10': nan, 'PM25': nan, 'CO': 0.9}, '9.19083976745605 45.4962997436523': {'SO2': nan, 'C6H6': 2.3, 'NO2': 41.0, 'O3': nan, 'PM10': 66.0, 'PM25': 62.0, 'CO': 1.2}, '9.24730014801025 45.4995994567871': {'SO2': nan, 'C6H6': nan, 'NO2': nan, 'O3': nan, 'PM10': nan, 'PM25': nan, 'CO': nan}, '9.19791984558105 45.4705009460449': {'SO2': nan, 'C6H6': 1.9, 'NO2': 39.0, 'O3': nan, 'PM10': 58.0, 'PM25': 53.0, 'CO': 1.0}, '9.19534015655518 45.4635009765625': {'SO2': nan, 'C6H6': nan, 'NO2': 38.0, 'O3': 6.0, 'PM10': 47.0, 'PM25': nan, 'CO': nan}, '9.141770362854 45.4761009216309': {'SO2': nan, 'C6H6': nan, 'NO2': nan, 'O3': nan, 'PM10': nan, 'PM25': nan, 'CO': nan}

In [1]:
golden_stations_test_path = "/Users/luca/Downloads/weak_labels_merge/stations_data_test.csv" 
stations_test_path = "/Users/luca/Downloads/weak_labels_merge/weak_labels_test" 
stations_path = "/Users/luca/Downloads/weak_labels_merge/weak_labels_train" 
golden_stations_path = "/Users/luca/Downloads/weak_labels_merge/stations_data_train.csv" 
stations_legend_path = "/Users/luca/Downloads/weak_labels_merge/channels.json" 
golden_stations_legend_path = "/Users/luca/Downloads/weak_labels_merge/qaria_stazione.csv"
num_timesteps = 7
input_path = "full_day_data_fill_file_train.pt"

In [2]:
from utils import get_city_grids
from datasets.Stations import Stations
from utils import get_city_grids
from datasets.Stations import Stations
from PixelTimeseriesLabeled import PixelTimeSeriesLabeled
import parser

st = Stations(dataset_folder = stations_path, legend_folder = stations_legend_path, gold_data_path = golden_stations_path, gold_legend_path = golden_stations_legend_path)
grid = get_city_grids((9.13545, 45.50661, 9.25363, 45.42661))
test_dataset = PixelTimeSeriesLabeled(stations = st, num_timesteps=num_timesteps, input_data_path = input_path)

In [3]:
row_ix = 3
col_ix = 5
latlon = test_dataset.data[2][0, row_ix, col_ix, :]
date = test_dataset.data[3][7, row_ix, col_ix]
label , loss_factor = test_dataset.stations.get_item_temporal_aligned(7, row_ix , col_ix, date, latlon)

** [inf, 6.2825975073766385, 0.2320748693749214, 3.2780410320687183, 3.2780410320687183, 4.0198305794786595, 0.2320748693749214] [nan, 1.2, 72.0, 110.0, 30.0, 17.0, 1.1]
7 7 2 7
7 7 6 7


In [4]:
loss_factor

array([0.3       , 0.5697693 , 1.        , 0.77552057, 0.77552057,
       0.72472301, 1.        ])

In [6]:
get_closest_dist_per_band(data, '2023-01-02', [9.18218994140625, 45.432300567627])

[6.73066413923671,
 4.491485185426833,
 1.7732369746935146,
 3.677650659290339,
 3.677650659290339,
 4.491485185426833,
 1.7732369746935146]

In [7]:
# def get_loss_factor(date, latlon):
#     closest_dist_per_band = get_closest_dist_per_band(data, date, latlon)
#     loss_factors = np.ndarray(len(STATIONS_BANDS))
#     for i in range(len(closest_dist_per_band)):
#         if np.isclose( closest_dist_per_band[i], 0, atol=0.4 ):
#             loss_factors[i] = 1
#         else:
#             loss_factors[i] = 0.3

#     return loss_factors

# get_loss_factor('2023-01-02', [9.18218994140625, 45.432300567627])

In [8]:
grid = get_city_grids((9.13545, 45.50661, 9.25363, 45.42661))

In [28]:
distances = []
loss_factors = []
for i in range(grid.shape[0]):
    for j in range(grid.shape[1]):
        distances.append(get_closest_dist_per_band(data, '2023-01-12', grid[i][j]))
        loss_factors.append(st.get_loss_factor('2023-01-12', grid[i][j]))

In [29]:
max(distances)

[10.221349515331132,
 7.1796353457237565,
 3.3220920152597535,
 6.510086111181979,
 6.510086111181979,
 7.1796353457237565,
 3.3220920152597535]

In [20]:
test_dataset[11]

[]


(tensor([[ 3.0095e+02,  2.6934e+02,  3.0401e+02,  2.6910e+02,  2.6281e+02,
           1.9034e+03,  3.6172e-02,  1.3991e-04,  1.1471e-04,  1.6723e-01,
           6.5082e-04,  8.5031e+04,  9.5556e+04, -1.0853e+00, -1.0066e+00,
          -4.9897e-01,  5.5356e-01, -2.2704e-01,  1.2184e-01,  2.7934e+02,
           2.8486e+02,  1.3217e+03, -5.5229e-05,  4.7308e-01,  1.0192e+05,
           4.3945e-01,  2.8389e+02,  8.8000e-01,  0.0000e+00, -2.4556e-10,
           0.0000e+00,  4.0880e+05, -1.9536e+05,  9.9989e+04,  2.8285e-06,
           1.3625e+02,  6.8507e-01,  1.4286e-01,  7.3229e-02,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  3.2013e-02,  6.6827e-02,  0.0000e+00,
           0.0000e+00],
         [ 3.0194e+02,  2.8657e+02,  3.0017e+02,  2.8620e+02,  2.8445e+02,
           1.9034e+03,  3.4189e-02,  1.3991e-04,  1.1471e-04,  1.4619e-01,
           1.4737e-04,  6.5146e+04,  7.3367e+04, -4.8434e-01, -3.9062e-01,
          -2.0149e+00,  8.3277e-01, -9.5966e-01, -2.2210e-03,  2.8054e+02,
 

In [11]:
s = {}
for i in range(len(loss_factors)):
    print(loss_factors[i])
    for _ in loss_factors[i][0]:
        if _ in s.keys():
            s[_] += 1
        else:
            s[_] = 1

(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0, 47.0, 53.0, 0.9])
(array([0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]), [2.5, 1.9, 32.0, 6.0

In [12]:
s

{0.3: 2242, 1.0: 26}

In [34]:
def scale_value(distance):
    # Ensure distance is within the range [0, 10]
    distance = min(max(distance, 0), 10)
    
    # Scale the distance to the range [0.3, 1]
    scaled_value = 1 - (distance / 10) * 0.7  # Linear scaling
    
    return scaled_value

# Test with different distances
distance_near_0 = 0.38769159659895186

distance_near_10 =  3.677650659290339

scaled_near_0 = scale_value(distance_near_0)
scaled_near_10 = scale_value(distance_near_10)

print(f"Scaled value for distance near 0: {scaled_near_0}")
print(f"Scaled value for distance near 10: {scaled_near_10}")


Scaled value for distance near 0: 0.9728615882380733
Scaled value for distance near 10: 0.7425644538496763
