In [1]:
from geopy.distance import geodesic
import numpy as np
import pandas as pd
import pyowm
import utils
import time
from urllib3.response import ReadTimeoutError

## Import sensor statistics data

In [2]:
sensor_types = ["bme280", "bmp180", "dht22", "ds18b20", "hpm", "htu21d", "pms3003", "pms5003", "pms7003", "ppd42ns", "sds011"]

total_sensors = pd.DataFrame(columns=["sensorId", "location", "lat", "lon"])
for sensor_type in sensor_types:
    current_df = pd.read_csv(f"../data/processed/statistics/{sensor_type}.csv", sep=";", names=["sensorId", "sensorType", "location", "lat", "lon", "minTimestamp", "maxTimestamp", "readingCount"])
    total_sensors = total_sensors.append(current_df[["sensorId", "sensorType", "location", "lat", "lon"]])
total_sensors.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(18524, 5)

In [3]:
total_sensors  = total_sensors.dropna()

In [4]:
total_sensors.head()

Unnamed: 0,lat,location,lon,sensorId,sensorType
0,-33.513,7505,-70.764,14828,BME280
1,-33.421,8877,-70.607,17519,BME280
2,-33.389,9929,-70.617,19565,BME280
3,19.103,9861,72.846,19445,BME280
4,40.294,10051,-3.754,19783,BME280


## Filter out sensors far from Berlin

In [5]:
MAX_DISTANCE = 50
def filter_sensors(sensor):
    sensor_tuple = (sensor["lat"], sensor["lon"]) #(lat, lon)
    berlin_tuple = (52.520008,  13.404954)
    return geodesic(sensor_tuple, berlin_tuple).km < MAX_DISTANCE
total_sensors = total_sensors[total_sensors.apply(filter_sensors, axis=1)]

In [6]:
total_sensors.shape

(684, 5)

## Find closest weather station

In [7]:
credentials_1 = pyowm.OWM('48dae982f9e685eee268e90dafba5041')

In [8]:
closest_weather_stations=[]
for index, sensor in total_sensors.iterrows():
    time.sleep(1)
    for attempt in range(10):
        try:
            stations = utils.get_closest_weather_station(sensor["lat"],sensor["lon"],credentials_1)
            break
        except Exception:
            print(f"ReadTimeoutError, retry #{attempt + 1}")
            time.sleep(30)
    if stations:
        location = stations[0].get_location()
        closest_weather_stations.append([sensor["sensorId"], sensor["sensorType"],sensor["lon"],sensor["lat"],sensor["location"],location.get_name(),location.get_lon(),location.get_lat()])

ReadTimeoutError, retry #1
ReadTimeoutError, retry #1
ReadTimeoutError, retry #1
ReadTimeoutError, retry #1


In [9]:
len(closest_weather_stations)

684

In [10]:
unique_closest_weather_stations=np.unique(np.asarray(closest_weather_stations),axis=0)

In [11]:
unique_closest_weather_stations

array([['10063', 'SDS011', '13.135', ..., 'Bergholz', '13.1', '52.35'],
       ['10064', 'BME280', '13.135', ..., 'Bergholz', '13.1', '52.35'],
       ['10156', 'SDS011', '13.622', ..., 'Börnicke', '13.638',
        '52.6629'],
       ...,
       ['9810', 'DHT22', '13.193', ..., 'Karolinenhöhe', '13.1667',
        '52.5167'],
       ['9876', 'SDS011', '12.985', ..., 'Golm', '12.967', '52.4062'],
       ['9877', 'DHT22', '12.985', ..., 'Golm', '12.967', '52.4062']],
      dtype='<U26')

In [12]:
berlin_enrichable_sensors = pd.DataFrame(unique_closest_weather_stations, columns=["sensorId", "sensorType", "lon", "lat", "location", "stationName", "stationLon", "stationLat"])

In [13]:
np.savetxt("../data/unique_closest_weather_stations.csv", unique_closest_weather_stations, delimiter=",",fmt='%s')

In [14]:
weather = pd.read_csv("../data/raw/weather_data.csv", sep=";")

In [15]:
unique_locations = weather["location"].unique()

In [16]:
def filter_for_weather_stations(sensor):
    return sensor["stationName"] in unique_locations

In [17]:
berlin_enrichable_sensors.shape

(684, 8)

In [18]:
berlin_enrichable_sensors.head(20)

Unnamed: 0,sensorId,sensorType,lon,lat,location,stationName,stationLon,stationLat
0,10063,SDS011,13.135,52.359,5071,Bergholz,13.1,52.35
1,10064,BME280,13.135,52.359,5071,Bergholz,13.1,52.35
2,10156,SDS011,13.622,52.67100000000001,5119,Börnicke,13.638,52.6629
3,10162,SDS011,13.384,52.556,5123,Berlin Pankow,13.4019,52.5693
4,10163,DHT22,13.384,52.556,5123,Berlin Pankow,13.4019,52.5693
5,10168,SDS011,13.447,52.528,5126,Berlin,13.4105,52.5244
6,10169,DHT22,13.447,52.528,5126,Berlin,13.4105,52.5244
7,10345,SDS011,13.42,52.513000000000005,5219,Land Berlin,13.4167,52.5
8,10385,SDS011,13.359000000000002,52.482,5239,Berlin Schoeneberg,13.3484,52.4746
9,10386,DHT22,13.359000000000002,52.482,5239,Berlin Schoeneberg,13.3484,52.4746


In [19]:
berlin_enrichable_sensors = berlin_enrichable_sensors[berlin_enrichable_sensors.apply(filter_for_weather_stations, axis=1)]
berlin_enrichable_sensors.to_csv("../data/intermediate/berlin_enrichable_sensors.csv", sep=",", index=False)