In [39]:
# Prepare input data
import ast
import pandas as pd
import numpy as np
from math import isnan
from collections import Counter

# set print options
np.set_printoptions(linewidth=110)
pd.set_option('display.width', 200)

# Import only first 1000 lines for testing
df = pd.read_csv('./data/sensor_readings_timeseries_part2.csv', nrows=500, parse_dates=["Timestamp"])
df = df.drop(
    ["Metadata.Location.type", "Metadata.SensorCommunitySensorType", "Metadata.SensorType", "_id"], axis=1)

df["Metadata.Location.coordinates"] = df["Metadata.Location.coordinates"].apply(lambda x: ast.literal_eval(x))

# no preprocessing needed
sensor_community_df = df.loc[df["Metadata.Provider"] == "sensor.community"].drop(["Metadata.Provider", "Metadata.NetatmoSensorId"], axis=1)

# need to combine multiple entries for the same sensor id and time
netatmo_df = df.loc[df["Metadata.Provider"] == "netatmo"].drop(
    ["Metadata.Provider"], axis=1)

# get unique groups
grouped = netatmo_df.groupby(["Metadata.NetatmoSensorId", "Timestamp"]).aggregate(list)

combined_rows = []
for label, group in grouped.iterrows():
    humidity = next((x for x in group.values[0] if not isnan(x)), np.NaN)
    pressure = next((x for x in group.values[2] if not isnan(x)), np.NaN)
    temperature = next((x for x in group.values[3] if not isnan(x)), np.NaN)


    # Humidity, Metadata.Location.coordinates, Pressure, Temperature, Timestamp
    entry = [humidity, group.values[1][0], pressure, temperature, label[1]]
    combined_rows.append(entry)

netatmo_df = pd.DataFrame(combined_rows, columns=["Humidity", "Metadata.Location.coordinates", "Pressure", "Temperature", "Timestamp"])

# Combine prepared data
prep = pd.concat([sensor_community_df,netatmo_df], ignore_index=True, axis=0)
prep = prep[prep["Temperature"].notna()]

print(prep.tail(5))



     Humidity Metadata.Location.coordinates  Pressure  Temperature                 Timestamp
421      87.0    [10.172421, 53.523059, 31]       NaN          4.1 2023-02-02 19:50:20+00:00
422      87.0    [10.172421, 53.523059, 31]       NaN          4.1 2023-02-02 19:55:19+00:00
423      88.0    [10.172421, 53.523059, 31]       NaN          4.0 2023-02-02 20:00:20+00:00
424      88.0    [10.172421, 53.523059, 31]       NaN          4.0 2023-02-02 20:05:19+00:00
425      88.0    [10.172421, 53.523059, 31]       NaN          4.1 2023-02-02 20:10:20+00:00


In [40]:
# Prepare training and test data
import itertools
from scipy.spatial.distance import squareform,pdist

# Extract unique locations.


def get_unique_lists_of_lists(list_of_lists):
    # Create an empty set to store unique lists
    unique_lists = set()

    # Iterate through the list of lists
    for lst in list_of_lists:
        # Convert each list to a tuple to make it hashable
        # and add it to the set
        unique_lists.add(tuple(lst))

    # Convert the set of tuples back to lists
    unique_lists = [list(lst) for lst in unique_lists]
    return unique_lists


def order_points_by_longitude_latitude(points):
    # Sort points by longitude and latitude using a lambda function
    ordered_points = sorted(points, key=lambda point: (point[1], point[0]))
    return ordered_points


# List of 3d points with long, lat, alt.
# Unique and ordered by longitude ascending
unique_points = order_points_by_longitude_latitude(get_unique_lists_of_lists(
    prep["Metadata.Location.coordinates"]))

# Convert list of points to numpy array
points = np.array(unique_points)

# Calculate pairwise Euclidean distance using pdist
distance_vector = pdist(points, 'euclidean')

# Convert distance vector to square distance matrix
distance_matrix = squareform(distance_vector)

# # Convert readings to distance matrix.
distance_set = pd.DataFrame(distance_matrix)

print(distance_set)


           0          1          2          3          4          5          6          7          8          9   ...         58         59         60         61         62         63         64  \
0    0.000000   3.707612   3.906633  15.303031  12.505876  39.301194  19.301144  36.400832  24.002069  26.903176  ...   6.417175   9.711389   9.014043   3.038729   0.555762   4.332609  24.210503   
1    3.707612   0.000000   0.200290  19.007641  16.211709  43.003318  15.605319  32.700765  27.700828  23.208235  ...   2.755946   6.022539   5.336482   0.903586   3.947793   0.875281  20.519419   
2    3.906633   0.200290   0.000000  19.207274  16.411177  43.203173  15.405107  32.500716  27.900790  23.008020  ...   2.558428   5.822605   5.136749   1.061305   4.144170   0.744580  20.319152   
3   15.303031  19.007641  19.207274   0.000000   2.801138  24.000051  34.600494  51.702039   8.717318  42.200431  ...  21.704979  25.005210  24.303808  18.305979  15.107708  19.605663  39.503075   
4   12.505

In [None]:
# Create test dataset
# For each entry find nearest neighbors in radius r (with distances?) and target variable of air temp
# [ [Features], ... ] [List of Distances to the target point] [List of target points]

In [None]:
from sklearn import neighbors

reg = neighbors.RadiusNeighborsRegressor(radius=5.0, weights='distance')

# Needs to be an array of arrays of each entry.
# X
# Needs to be a distance matrix
train_input = []

# Needs to be array of length of train_input with the target air temperature.
# y
train_target = []


In [43]:
# Create regression dataset
from sklearn import datasets

X, y = datasets.make_regression(n_samples=100,n_features=10)

print(X.shape, y.shape)
print(X)

print(y)

(100, 10) (100,)
[[ 3.40726535e-02  6.18531299e-01 -3.91792915e-01  2.85264370e-01 -6.55176691e-01 -6.78953988e-01
  -2.73389892e+00  8.42214098e-01 -4.02062771e-02 -9.68379073e-02]
 [ 3.27460990e-01  4.40478370e-03  4.51137786e-01  2.14096180e-01 -2.95745237e-01  2.02018252e-01
   2.52807747e-01  6.42494411e-01  2.29713648e-01  9.81233318e-01]
 [-4.28913266e-01  1.05097497e+00 -1.17691818e+00 -7.11861381e-01 -6.50641654e-01 -6.07068856e-01
   1.33914272e+00  6.08968785e-01 -5.31932437e-01  9.66851100e-02]
 [-1.27162297e+00 -1.54143924e+00 -4.87063731e-01 -3.50490730e-01  7.70692730e-01 -1.94194140e+00
  -2.35612213e-01 -7.01460361e-01 -3.08609613e-01  1.31733385e+00]
 [ 7.56004657e-01 -7.71256034e-02  3.18600850e-01 -3.38865500e-01  3.73473874e-01  1.46273997e-01
  -2.09387748e-01 -7.06406397e-02 -1.18358141e+00 -7.44605653e-02]
 [ 6.33032570e-01 -1.12803118e-01 -7.66955289e-01 -1.58219073e+00  1.71353532e+00 -4.85385517e-01
  -1.74641403e+00 -7.00589756e-01 -5.09377613e-01  7.2432718