# Weather data processing

In [None]:
# import libraries
## system libraries
import os
import re
import sys
import json
import time
import math
import bisect
from glob import glob
from pprint import pprint
from datetime import date, datetime
from collections import OrderedDict

## numerical libraries
import numpy as np
import pandas as pd
import pandas_profiling
from pandas.io.json import json_normalize

## plotting libraries
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams

# geo libraries
import geopandas as gpd

## module to read GeoTIF files
import rasterio
import rasterio.features
import rasterio.warp

## special libraries
from bson import json_util

# reports
from docx import Document
from docx.shared import Inches

import warnings

warnings.filterwarnings("ignore")

In [None]:
# helper functions

def flatten(l):
    """Return one flat list with the items of every sub-iterable of ``l``, in order."""
    flat_items = []
    for sublist in l:
        flat_items.extend(sublist)
    return flat_items

In [None]:
# notebook options
%matplotlib inline

pd.set_option("display.max_columns", 500)
rcParams["axes.titlepad"] = 45
rcParams["font.size"] = 16

In [None]:
# global variables
CUTTING_DATE = "2019-05-01"  # remove trips and data published before this date

data_campaigns_path = os.path.join("../..", "data-campaigns/")
meta_data_path = os.path.join(data_campaigns_path, "meta-data/")
shape_data = os.path.join(data_campaigns_path, "shapefiles/")
gps_data = os.path.join(data_campaigns_path, "2020-01-15.GPS/")
net_radition_data_path = os.path.join(data_campaigns_path, "net-radiation/")

# preprocessed data
input_path = os.path.join("../..", "2019-12-16.out/")
out_path = os.path.join("../..", "2019-12-16.out/")

# raw input data
raw_data_path = os.path.join(data_campaigns_path, "2019-10-30.all/")
raw_data_update_path = os.path.join(data_campaigns_path, "2019-12-16.update/")

# weather data
weather_dataset_path1 = os.path.join("../../data-campaigns/2019-10-25.weather")
weather_dataset_path2 = os.path.join("../../data-campaigns/2019-12-18.weather")

In [None]:
# input files
legs = "all_legs_merged_no_outlier_0.01.pkl"
trips_users = "trips_users_df.pkl"
trips = "trips_df.pkl"

# read datasets
legs_df = pd.read_pickle(input_path + legs)
trips_users_df = pd.read_pickle(input_path + trips_users)
trips_df = pd.read_pickle(input_path + trips)

In [None]:
legs_df.head(3)

In [None]:
trips_users_df.head(3)

In [None]:
trips_df.head(3)

## Read weather data

In [None]:
weather_data_bson_filename1 = "weather.bson"
weather_data_filename1 = "weather.json"
weather_data_bson_path1 = os.path.join(
    weather_dataset_path1, weather_data_bson_filename1
)
weather_data_path1 = os.path.join(weather_dataset_path1, weather_data_filename1)

print(weather_data_path1)

In [None]:
weather_data_bson_filename2 = "weather.repr.bson"
weather_data_filename2 = "weather.json"
weather_data_bson_path2 = os.path.join(
    weather_dataset_path2, weather_data_bson_filename2
)
weather_data_path2 = os.path.join(weather_dataset_path2, weather_data_filename2)

print(weather_data_bson_path2)
print(weather_data_path2)

In [None]:
def read_bson_file(filename):
    """Parse a text file of MongoDB Extended JSON documents into plain Python data.

    The whole file content is wrapped in square brackets before parsing, so
    the file is expected to hold a comma-separated sequence of documents
    (presumably one export per file -- verify against the data producer).

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list
        The documents decoded with ``json_util.object_hook`` and then
        round-tripped through ``json_util.dumps`` / ``json.loads``, so that
        only objects produced by ``json.loads`` are returned.
    """
    with open(filename, "r") as bson_fp:
        raw_text = bson_fp.read()

    # wrap the content so that it parses as a single JSON array
    wrapped_text = "[" + raw_text + "]"
    decoded = json.loads(wrapped_text, object_hook=json_util.object_hook)

    # serialise the rich objects back to JSON text and parse them again
    return json.loads(json_util.dumps(decoded))

In [None]:
# adapted from:
# How can I use Python to transform MongoDB's bsondump into JSON?
# https://stackoverflow.com/a/11886476/2377454
def read_bson_repr_file(filename):
    """Parse a file written in MongoDB shell notation into plain Python data.

    ``ObjectId(...)``, ``Date(...)`` and ``NumberInt(...)`` constructors are
    rewritten to their Extended JSON form (``$oid``, ``$date``,
    ``$numberInt``) before the text is parsed. Other shell-only structures
    are not converted.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list
        The documents decoded with ``json_util.object_hook`` and then
        round-tripped through ``json_util.dumps`` / ``json.loads``.
    """
    # (pattern, replacement) pairs, applied in this order
    shell_to_strict = (
        (r"ObjectId\s*\(\s*\"(\S+)\"\s*\)", r'{"$oid": "\1"}'),
        (r"Date\s*\(\s*(\S+)\s*\)", r'{"$date": \1}'),
        (r"NumberInt\s*\(\s*(\S+)\s*\)", r'{"$numberInt": "\1"}'),
    )

    with open(filename, "r") as repr_fp:
        # the entire input is read at once and wrapped so that it parses
        # as a single JSON array
        jsondata = "[" + repr_fp.read() + "]"

    for pattern, replacement in shell_to_strict:
        jsondata = re.sub(pattern, replacement, jsondata)

    # parse with MongoDB's object_hook, then normalise back to plain JSON data
    parsed = json.loads(jsondata, object_hook=json_util.object_hook)
    return json.loads(json_util.dumps(parsed))

In [None]:
SAVE_CLEAN_DATASET = False

In [None]:
if SAVE_CLEAN_DATASET:
    weather_data1 = read_bson_file(weather_data_bson_path1)
    weather_clean_data1 = "weather_clean.json"
    weather_clean_data_path1 = os.path.join(weather_dataset_path1, weather_clean_data1)

    with open(weather_clean_data_path1, "w+") as outfp:
        json.dump(weather_data1, outfp)

In [None]:
if SAVE_CLEAN_DATASET:
    weather_data2 = read_bson_repr_file(weather_data_bson_path2)

    weather_clean_data2 = "weather_clean.json"
    weather_clean_data_path2 = os.path.join(weather_dataset_path2, weather_clean_data2)

    with open(weather_clean_data_path2, "w+") as outfp:
        json.dump(weather_data2, outfp)

In [None]:
print("weather data 1:", weather_data_path1)
print("weather data 2:", weather_data_path2)

In [None]:
weather_df1 = pd.read_json(weather_data_path1, orient="records")
weather_df1.head(10)

print("weather_df1: {}".format(weather_df1.shape[0]))

In [None]:
weather_df2 = pd.read_json(weather_data_path2, orient="records")
weather_df2.head(10)

print("weather_df2: {}".format(weather_df2.shape[0]))

In [None]:
weather_all_df = pd.concat([weather_df1, weather_df2])
print("weather_all_df: {}".format(weather_all_df.shape[0]))

In [None]:
weather_all_df.head(3)

In [None]:
pprint(list(weather_all_df.columns))

In [None]:
# Flatten the nested weather records into one column per field.
weather_parsed_df = weather_all_df[["city", "requestTimestamp"]].copy()

# create column id
weather_parsed_df["id"] = weather_all_df["_id"].apply(
    lambda oid: oid.get("$oid", None)
)

# every remaining column is extracted from the nested "weather" record
weather_records = weather_all_df["weather"]

weather_parsed_df["weather.dt"] = weather_records.apply(
    lambda record: record.get("dt", None)
)

# weather main
for main_field in (
    "temp",
    "temp_min",
    "temp_max",
    "pressure",
    "sea_level",
    "grnd_level",
    "humidity",
    "temp_kf",
):
    # `field=main_field` binds the current name; a plain closure would see
    # only the last value of the loop variable
    weather_parsed_df["weather.main." + main_field] = weather_records.apply(
        lambda record, field=main_field: record["main"].get(field, None)
    )

# weather weather (only the first entry of the "weather" list is read)
# - weather_weather_id
# - weather_weather_main
# - weather_weather_description
# - weather_weather_icon
for condition_field in ("id", "main", "description", "icon"):
    weather_parsed_df["weather.weather." + condition_field] = weather_records.apply(
        lambda record, field=condition_field: record["weather"][0].get(field, None)
    )

# weather clouds
weather_parsed_df["weather.clouds.all"] = weather_records.apply(
    lambda record: record["clouds"].get("all", None)
)

# weather wind
for wind_field in ("speed", "deg"):
    weather_parsed_df["weather.wind." + wind_field] = weather_records.apply(
        lambda record, field=wind_field: record["wind"].get(field, None)
    )

# weather sys
weather_parsed_df["weather.sys.pod"] = weather_records.apply(
    lambda record: record["sys"].get("pod", None)
)

# weather dt_tx (read from the "dt_txt" key of the record)
weather_parsed_df["weather.dt_tx"] = weather_records.apply(
    lambda record: record.get("dt_txt", None)
)

In [None]:
weather_parsed_df.head(3)

In [None]:
print(
    "Number of weather records (w/ duplicates): {}".format(weather_parsed_df.shape[0])
)
weather_parsed_df.drop_duplicates(subset=["id"], keep="first", inplace=True)
print(
    "Number of weather records (no duplicates): {}".format(weather_parsed_df.shape[0])
)

In [None]:
weather_parsed_df.columns

In [None]:
weather_parsed_df.head(3)

In [None]:
new_weather_parsed_df = weather_parsed_df[
    [
        "id",
        "requestTimestamp",
        "city",
        "weather.dt",
        "weather.main.temp",
        "weather.main.temp_min",
        "weather.main.temp_max",
        "weather.main.pressure",
        "weather.main.sea_level",
        "weather.main.grnd_level",
        "weather.main.humidity",
        "weather.main.temp_kf",
        "weather.weather.id",
        "weather.weather.main",
        "weather.weather.description",
        "weather.weather.icon",
        "weather.clouds.all",
        "weather.wind.speed",
        "weather.wind.deg",
        "weather.sys.pod",
        "weather.dt_tx",
    ]
]

In [None]:
weather_parsed_df.to_pickle(out_path + "weather_parsed_df.pkl")

In [None]:
new_weather_parsed_filename = "weather_raw.csv"
new_weather_parsed_filename_path = out_path + new_weather_parsed_filename

In [None]:
new_weather_parsed_df.to_csv(new_weather_parsed_filename_path, index=False)

### Weather scenario data

1. neutral/good
     - clouds: none or clear sky
     - rain: none or light
     - wind: light breeze
     - temperature: comfortable temperature
2. cold
    - clouds: any
    - rain: any
    - wind: any
    - temperature: cool

3. warm
    - clouds: any
    - rain: any
    - wind: any
    - temperature: warm

4. uncomfortable temperature
    - clouds: any
    - rain: any
    - wind: any
    - temperature: uncomfortably cold or uncomfortably hot

5. rainy/snowy
    - clouds: any
    - rain: moderate or heavy
    - wind: any
    - temperature: any

6. cloudy
    - clouds: partially cloudy or completely cloudy
    - rain: any
    - wind: any
    - temperature: any

7. windy
    - clouds: any
    - rain: any
    - wind: strong breeze or gale
    - temperature: any


This is the structure of weather scenario metadata:

```python
weather_scenarios = \
{'neutral/good':
  {'clouds': ['none', 'clear sky'],
   'precipitation': ['none', 'light'],
   'wind': ['light breeze'],
   'temperature': ['comfortable']
   },
 'cold':
  {'clouds': ['any'],
   'precipitation': ['any'],
   'wind': ['any'],
   'temperature': ['cool']
   },
 'warm':
  {'clouds': ['any'],
   'precipitation': ['any'],
   'wind': ['any'],
   'temperature': ['warm']
   },
 'uncomfortable temperature':
  {'clouds': ['any'],
   'precipitation': ['any'],
   'wind': ['any'],
   'temperature': ['uncomfortably cold', 'uncomfortably hot']
   },
 'rainy/snowy':
  {'clouds': ['any'],
   'precipitation': ['moderate', 'heavy'],
   'wind': ['any'],
   'temperature': ['any']
   },
 'cloudy':
  {'clouds': ['partially cloudy', 'completely cloudy'],
   'precipitation': ['any'],
   'wind': ['any'],
   'temperature': ['any']
   },
 'windy':
  {'clouds': ['any'],
   'precipitation': ['any'],
   'wind': ['strong breeze', 'gale'],
   'temperature': ['any']
   }
}
```

Save to `weather_scenarios.json` in the `meta-data` folder:

```python
weather_scenarios_metadata_filename = 'weather_scenarios.json'
weather_scenarios_metadata_path = os.path.join(meta_data_path, weather_scenarios_metadata_filename)
with open(weather_scenarios_metadata_path, 'w+') as outfp:
    json.dump(weather_scenarios, outfp)
```

In [None]:
weather_scenarios = {
    "neutral/good": {
        "clouds": ["none", "clear sky"],
        "precipitation": ["none", "light"],
        "wind": ["light breeze"],
        "temperature": ["comfortable"],
    },
    "cold": {
        "clouds": ["any"],
        "precipitation": ["any"],
        "wind": ["any"],
        "temperature": ["cool"],
    },
    "warm": {
        "clouds": ["any"],
        "precipitation": ["any"],
        "wind": ["any"],
        "temperature": ["warm"],
    },
    "uncomfortable temperature": {
        "clouds": ["any"],
        "precipitation": ["any"],
        "wind": ["any"],
        "temperature": ["uncomfortably cold", "uncomfortably hot"],
    },
    "rainy/snowy": {
        "clouds": ["any"],
        "precipitation": ["moderate", "heavy"],
        "wind": ["any"],
        "temperature": ["any"],
    },
    "cloudy": {
        "clouds": ["partially cloudy", "completely cloudy"],
        "precipitation": ["any"],
        "wind": ["any"],
        "temperature": ["any"],
    },
    "windy": {
        "clouds": ["any"],
        "precipitation": ["any"],
        "wind": ["strong breeze", "gale"],
        "temperature": ["any"],
    },
}

weather_scenarios_metadata_filename = "weather_scenarios.json"
weather_scenarios_metadata_path = os.path.join(
    meta_data_path, weather_scenarios_metadata_filename
)
with open(weather_scenarios_metadata_path, "w+") as outfp:
    json.dump(weather_scenarios, outfp)

#### Clouds

This is the structure of cloud metadata:

```python
weather_scenarios_clouds = \
{'clear sky': {'category': 'clear sky',
               'main': 'clear'
               },
 'few clouds': {'category': 'partially cloudy',
                'main': 'clear'
                },
 'scattered clouds': {'category': 'partially cloudy',
                      'main': 'clouds'
                      },
 'broken clouds': {'category': 'partially cloudy',
                   'main': 'clouds'
                   },
 'overcast clouds': {'category': 'completely cloudy',
                     'main': 'clouds'
                     }
}
```

Save to `weather_scenarios_clouds.json` in the `meta-data` folder:

```python
weather_scenarios_clouds_filename = 'weather_scenarios_clouds.json'
weather_scenarios_clouds_path = os.path.join(meta_data_path, weather_scenarios_clouds_filename )
with open(weather_scenarios_clouds_path, 'w+') as outfp:
    json.dump(weather_scenarios_clouds, outfp)
```

#### Precipitation

This is the structure of precipitation metadata:

```python
weather_scenarios_precipitation = \
{'light rain': {'category': 'light',
                'main': 'rain'
                },
 'moderate rain': {'category': 'moderate',
                   'main': 'rain'
                   },
 'heavy intensity rain': {'category': 'heavy',
                          'main': 'rain'
                          },
 'light snow': {'category': 'light',
                'main': 'snow'
                },
 'snow': {'category': 'heavy',
          'main': 'snow'
          }
 }
```

Save to `weather_scenarios_precipitation.json` in the `meta-data` folder:

```python
weather_scenarios_precipitation_filename = 'weather_scenarios_precipitation.json'
weather_scenarios_precipitation_path = os.path.join(meta_data_path, weather_scenarios_precipitation_filename )
with open(weather_scenarios_precipitation_path, 'w+') as outfp:
    json.dump(weather_scenarios_precipitation, outfp)
```

#### Wind

This is the structure of wind metadata:

```python
weather_scenarios_wind = \
{'calm': {'category': 'light breeze',
          'speed': [0, 0.5],
          'beaufort number': 0,
          },
 'light air': {'category': 'light breeze',
               'speed': [0.5, 1.5],
               'beaufort number': 1,
               },
 'light breeze': {'category': 'light breeze',
                  'speed': [1.5, 3.3],
                  'beaufort number': 2,
                  },
 'gentle breeze': {'category': 'light breeze',
                   'speed': [3.3, 5.5],
                   'beaufort number': 3,
                   },
 'moderate breeze': {'category': 'strong breeze',
                     'speed': [5.5, 7.9],
                     'beaufort number': 4,
                     },
 'fresh breeze': {'category': 'strong breeze',
                  'speed': [7.9, 10.7],
                  'beaufort number': 5,
                  },
 'strong breeze': {'category': 'strong breeze',
                   'speed': [10.7, 13.8],
                   'beaufort number': 6,
                   },
 'high wind': {'category': 'gale',
               'speed': [13.8, 17.1],
               'beaufort number': 7,
               },
 'gale': {'category': 'gale',
          'speed': [17.1, 20.7],
          'beaufort number': 8,
          },
 'strong/severe gale': {'category': 'gale',
                        'speed': [20.7, 24.4],
                        'beaufort number': 9,
                        },
 'storm': {'category': 'gale',
           'speed': [24.4, 28.4],
           'beaufort number': 10,
           },
 'violent storm': {'category': 'gale',
                   'speed': [28.4, 32.6],
                   'beaufort number': 11,
                   },
 'hurricane force': {'category': 'gale',
                     'speed': [32.6, 50],
                     'beaufort number': 12,
                     }
 }
```

Save to `weather_scenarios_wind.json` in the `meta-data` folder:

```python
weather_scenarios_wind_filename = 'weather_scenarios_wind.json'
weather_scenarios_wind_path = os.path.join(meta_data_path, weather_scenarios_wind_filename )
with open(weather_scenarios_wind_path, 'w+') as outfp:
    json.dump(weather_scenarios_wind, outfp)
```

#### Temperature and Apparent Temperature

Apparent temperature (AT) is the temperature equivalent perceived by humans, caused by the combined effects of air temperature, relative humidity and wind speed. Apparent Temperature was invented by Robert Steadman and it was published in 1984. It takes into consideration four environmental factors: wind, temperature, humidity and radiation from the sun.

The formula for the AT is:
$$AT = T_a + 0.348 \cdot e - 0.70 \cdot ws + 0.70\frac{Q}{ws+10}- 4.25$$

where:
- $T_a =$ Dry bulb temperature (°C)
- $e =$ Water vapour pressure (hPa) (humidity)
- $ws =$ Wind speed (m/s) at an elevation of 10 meters
- $Q =$ Net radiation absorbed per unit area of body surface (W/m2)

The water vapour pressure $e$ is computed as:
$$e=\frac{rh}{100} \cdot 6.105 \cdot \exp\left(\frac{17.27 \cdot T_a}{237.7 + T_a}\right)$$

where:
- $rh =$ Relative Humidity (%)


In [None]:
# Water vapour pressure
# e=\frac{rh}{100}*6.105*e^{\frac{17.27*T_a}{237.7+T_a}}
def water_vapour_pressure(rh, Ta):
    """Return the water vapour pressure for the given humidity and temperature.

    Parameters
    ----------
    rh : float
        Relative humidity, in percent.
    Ta : float
        Dry bulb temperature; the formula is stated for degrees Celsius.
    """
    humidity_fraction = rh / 100.0
    exponential_term = math.exp((17.27 * Ta) / (237.7 + Ta))
    return humidity_fraction * 6.105 * exponential_term


# apparent temperature
# AT=T_a+0.348*e-0.70*ws+0.70\frac{Q}{ws+10}-4.25
def apparent_temperature(rh, Ta, ws, Q):
    """Return the apparent temperature computed with the formula above.

    Parameters
    ----------
    rh : float
        Relative humidity, in percent.
    Ta : float
        Dry bulb temperature.
        NOTE(review): the formula is stated for degrees Celsius -- confirm
        the unit of the temperature values passed in.
    ws : float
        Wind speed.
    Q : float
        Net radiation.
    """
    vapour_pressure = water_vapour_pressure(rh, Ta)

    humidity_term = 0.348 * vapour_pressure
    wind_term = 0.70 * ws
    radiation_term = 0.70 * (Q / (ws + 10.0))

    return Ta + humidity_term - wind_term + radiation_term - 4.25


def apparent_temperature_wrap(row):
    """Row adapter: read ``rh``, ``Ta``, ``ws`` and ``Q`` from ``row``."""
    return apparent_temperature(row.rh, row.Ta, row.ws, row.Q)

This is the structure of apparent temperature metadata:

```python
weather_scenarios_apparent_temperature = \
{
  "uncomfortably cold": {
    "main": "uncomfortable temperature",
    "range": [-273.15, 0.0],
  },
  "cool": {
    "main": "comfortable temperature",
    "range": [0.0, 15.0],
  },
  "comfortable": {
    "main": "comfortable temperature",
    "range": [15.0, 25.0],
  },
  "warm": {
    "main": "comfortable temperature",
    "range": [25.0, 32.0],
  },
  "uncomfortably hot": {
    "main": "uncomfortable temperature",
    "range": [32.0, 100.0],
  }
}
```

Save to `weather_scenarios_apparent_temperature.json` in the `meta-data` folder:

```python
weather_scenarios_apparent_temperature_filename = 'weather_scenarios_apparent_temperature.json'
weather_scenarios_apparent_temperature_path = os.path.join(meta_data_path, weather_scenarios_apparent_temperature_filename )
with open(weather_scenarios_apparent_temperature_path, 'w+') as outfp:
    json.dump(weather_scenarios_apparent_temperature, outfp)
```

### Read metadata

#### Weather

In [None]:
weather_scenarios_metadata_filename = "weather_scenarios.json"
weather_scenarios_metadata_path = os.path.join(
    meta_data_path, weather_scenarios_metadata_filename
)
with open(weather_scenarios_metadata_path, "r") as infp:
    weather_scenarios = json.load(infp)

In [None]:
pprint(weather_scenarios)

weather_parsed_df.columns

#### Clouds

In [None]:
weather_scenarios_clouds_filename = "weather_scenarios_clouds.json"
weather_scenarios_clouds_path = os.path.join(
    meta_data_path, weather_scenarios_clouds_filename
)

with open(weather_scenarios_clouds_path, "r") as infp:
    weather_scenarios_clouds = json.load(infp)

In [None]:
pprint(weather_scenarios_clouds)

#### Precipitation

In [None]:
weather_scenarios_precipitation_filename = "weather_scenarios_precipitation.json"
weather_scenarios_precipitation_path = os.path.join(
    meta_data_path, weather_scenarios_precipitation_filename
)

with open(weather_scenarios_precipitation_path, "r") as infp:
    weather_scenarios_precipitation = json.load(infp)

In [None]:
pprint(weather_scenarios_precipitation)

#### Wind

In [None]:
weather_scenarios_wind_filename = "weather_scenarios_wind.json"
weather_scenarios_wind_path = os.path.join(
    meta_data_path, weather_scenarios_wind_filename
)

with open(weather_scenarios_wind_path, "r") as infp:
    weather_scenarios_wind = json.load(infp)

In [None]:
pprint(weather_scenarios_wind)

#### Temperature and Apparent Temperature

In [None]:
weather_scenarios_apparent_temperature_filename = (
    "weather_scenarios_apparent_temperature.json"
)
weather_scenarios_apparent_temperature_path = os.path.join(
    meta_data_path, weather_scenarios_apparent_temperature_filename
)

with open(weather_scenarios_apparent_temperature_path, "r") as infp:
    weather_scenarios_apparent_temperature = json.load(infp)

In [None]:
pprint(weather_scenarios_apparent_temperature)

#### Weather

In [None]:
weather_parsed_df[["weather.weather.main", "id"]].groupby(
    "weather.weather.main", as_index=False
).count()

In [None]:
weather_parsed_df[["weather.weather.description", "id"]].groupby(
    "weather.weather.description", as_index=False
).count()

#### Clouds

In [None]:
weather_parsed_df["weather.clouds.all"].hist(bins=10)

#### Wind

In [None]:
weather_parsed_df["weather.wind.speed"].hist(bins=20)

In [None]:
weather_parsed_df["weather.wind.deg"].hist(bins=360)

#### Temperature

In [None]:
weather_parsed_df["weather.main.temp"].hist(bins=50)

In [None]:
weather_parsed_df["weather.main.temp_min"].hist(bins=50)

In [None]:
weather_parsed_df["weather.main.temp_max"].hist(bins=60)

In [None]:
weather_parsed_df["weather.main.temp_kf"].hist(bins=50)

In [None]:
weather_parsed_df["weather.main.humidity"].hist(bins=20)

### Compute Apparent Temperature

#### Read shapefiles

In [None]:
shapes_df_all = pd.read_pickle(input_path + "shapes_df_all.pkl")

In [None]:
shapes_df_all.head(3)

In [None]:
gps_cities = pd.read_pickle(input_path + "gps_cities.pkl")

In [None]:
gps_cities.head(3)

#### Read net radiation data

**NASA - NEO: NASA Earth Observatory**

* [NASA NEO](https://neo.sci.gsfc.nasa.gov/)
  * Dataset description: [Net radiation data](https://neo.sci.gsfc.nasa.gov/view.php?datasetId=CERES_NETFLUX_D&date=2019-12-01),
  * Data-like File Formats: [CSV and floating point GeoTIFFs description](https://neo.sci.gsfc.nasa.gov/blog/2013/12/23/csv-and-floating-point-geotiffs/)
  * [Bulk data download](https://neo.sci.gsfc.nasa.gov/about/bulk.php)
    * [FTP](https://neo.sci.gsfc.nasa.gov/archive/geotiff.float/CERES_NETFLUX_D/) ([README.txt](https://neo.sci.gsfc.nasa.gov/archive/geotiff.float/README.txt))

In [None]:
net_radiation_filename = "CERES_NETFLUX_D_2019-04-08.FLOAT.TIFF"
net_radition_tiff_path = os.path.join(net_radition_data_path, net_radiation_filename)

print(net_radition_tiff_path)

In [None]:
# captures the date part of a CERES net-flux GeoTIFF file name
DATE_REGEX = re.compile(r"CERES_NETFLUX_D_(.+)\.FLOAT\.TIFF")


def tiff_get_date(tiff_filename):
    """Return the date encoded in a net-radiation GeoTIFF file name.

    Only the base name of ``tiff_filename`` is inspected; the part captured
    by ``DATE_REGEX`` is parsed as ``YYYY-MM-DD``.

    Returns
    -------
    datetime
        The parsed date (time set to midnight).

    Raises
    ------
    AttributeError
        If the base name does not match ``DATE_REGEX``.
    ValueError
        If the captured text is not a ``YYYY-MM-DD`` date.
    """
    basename = os.path.basename(tiff_filename)
    date_match = DATE_REGEX.match(basename)
    return datetime.strptime(date_match.group(1), "%Y-%m-%d")

In [None]:
def _read_net_radiation_tiff(tiff_path):
    """Read band 1 and the coordinate-to-pixel lookup of one GeoTIFF file.

    Returns a dict with:
    - "band1": the array returned by ``dataset.read(1)``;
    - "index": the dataset's ``index`` method, called later as
      ``index(x, y)`` to obtain a ``(row, col)`` pair.

    NOTE(review): "index" is a bound method of a dataset that is closed when
    the ``with`` block exits (as in the previous version of this cell) --
    confirm it stays usable with the installed rasterio version.
    """
    with rasterio.open(tiff_path) as dataset:
        return {
            "band1": dataset.read(1),
            "index": dataset.index,
        }


# Load every daily net-radiation GeoTIFF found in the data folder, keyed by
# the date (YYYY-MM-DD) encoded in its file name.
tiff_files = glob(net_radition_data_path + "CERES_NETFLUX_D_*.FLOAT.TIFF")

parsed_tiff_files = {}
for tiff_file in sorted(tiff_files):
    tiff_date = tiff_get_date(tiff_file)
    tiff_date_str = tiff_date.strftime("%Y-%m-%d")

    # Open the file of THIS iteration. The previous version opened the fixed
    # `net_radition_tiff_path` for every date, so all entries held the same
    # raster. The mask/shape extraction was dropped: its results were
    # computed and then discarded.
    parsed_tiff_files[tiff_date_str] = _read_net_radiation_tiff(tiff_file)

In [None]:
import bisect

# Find the closest Key in dictionary
# https://www.geeksforgeeks.org/python-find-the-closest-key-in-dictionary/


def get_closest_key(mydict, key):
    """Return the value stored under ``key`` or, failing that, under the closest date.

    Parameters
    ----------
    mydict : dict
        Mapping whose keys are date strings in ``YYYY-MM-DD`` format.
    key : str
        Date string in ``YYYY-MM-DD`` format.

    Returns
    -------
    object
        ``mydict[key]`` when ``key`` is present; otherwise the value of the
        key whose date is nearest to ``key``. Dates before the first key map
        to the first key, dates after the last key map to the last key, and
        a tie between two neighbours is resolved in favour of the later one.

    Raises
    ------
    KeyError
        If ``mydict`` is empty.
    ValueError
        If ``key`` (when absent from ``mydict``) or any key of ``mydict`` is
        not a ``YYYY-MM-DD`` date.
    """
    if key in mydict:
        return mydict[key]

    if not mydict:
        raise KeyError("no closest key for {!r}: the dictionary is empty".format(key))

    target = datetime.strptime(key, "%Y-%m-%d")

    # remember the original spelling of each key so that the final lookup
    # does not depend on how strftime would format the date
    key_by_date = {datetime.strptime(k, "%Y-%m-%d"): k for k in mydict}
    dates = sorted(key_by_date)

    # position of the first date that is >= target
    pos = bisect.bisect_left(dates, target)

    if pos == 0:
        closest = dates[0]
    elif pos == len(dates):
        closest = dates[-1]
    else:
        before, after = dates[pos - 1], dates[pos]
        closest = before if (target - before) < (after - target) else after

    return mydict[key_by_date[closest]]

##### Example

In [None]:
print("* '2018-12-31' is None:", parsed_tiff_files.get("2018-12-31", None) is None)
print("* '2019-12-17' is None:", parsed_tiff_files.get("2019-12-17", None) is None)
print("* '2019-12-18' is None:", parsed_tiff_files.get("2019-12-18", None) is None)
print("* '2019-12-29' is None:", parsed_tiff_files.get("2019-12-29", None) is None)
print("* '2019-12-30' is None:", parsed_tiff_files.get("2019-12-30", None) is None)

In [None]:
# For each (queried date, expected date) pair, check that the lookup for the
# queried date returns the same band as the entry stored under the expected
# date. The third pair used to test "2019-12-18" instead of "2019-12-30" in
# its not-None check.
for queried_date, expected_date in (
    ("2018-12-31", "2019-01-01"),
    ("2019-12-18", "2019-12-17"),
    ("2019-12-30", "2019-12-17"),
):
    closest_band = get_closest_key(parsed_tiff_files, queried_date).get("band1", None)
    expected_band = parsed_tiff_files.get(expected_date, None).get("band1", None)

    print(
        "* '{}' == '{}': ".format(queried_date, expected_date),
        (closest_band is not None) and (closest_band == expected_band).all(),
    )

In [None]:
# Example data
example_tiff_dataset = parsed_tiff_files["2019-01-01"]

# x, y = -64.6861800, 10.1362500
y, x = 0, 0
row, col = example_tiff_dataset["index"](x, y)
example_tiff_dataset["band1"][row, col]

In [None]:
weather_parsed_df.head(3)

In [None]:
weather_cities = list(weather_parsed_df["city"].unique())
print("Number of distinct cities:", len(weather_cities))

Mail from André "MoTiV - base de dados sobre temperatura ambiente" on Thu, 19 Mar 2020 14:51:23

In [None]:
citiesPRT = ["Lisbon", "Porto"]
citiesSVK = [
    "Žilina",
    "Bratislava",
    "Trnava",
    "Nitra",
    "Trenčín",
    "Banská Bystrica",
    "Košice",
    "Prešov",
]
citiesFIN = ["Helsinki", "Tampere", "Turku", "Oulu", "Etelä-Suomi"]
citiesESP = ["Barcelona", "Girona", "Tarragona", "Lleida"]
citiesBEL = ["Antwerp", "Brugge", "Brussels", "Gent", "Leuven"]
citiesCHE = [
    "Lausanne",
    "Genève",
    "Montreux",
    "Fribourg",
    "Bern",
    "Basel",
    "Zurich",
    "Neuchâtel",
    "Yverdon-les-Bains",
]
citiesITA = ["Milan"]
citiesFRA = [
    "Paris",
    "Lyon",
    "Grenoble",
    "Nevers",
    "Nantes",
    "Bordeaux",
    "Toulouse",
    "Strasbourg",
    "Amiens",
    "Angers",
    "Lille",
    "Brest",
    "Marseille",
    "Saint Brieuc",
    "Montpellier",
]
citiesNOR = [
    "Oslo",
    "Bergen",
    "Trondheim",
    "Stavager",
    "Drammen",
    "Fredrikstad",
    "Porsgrunn",
    "Skien",
    "Kristiansand",
    "Ålesund",
    "Tønsberg",
]
citiesHRV = [
    "Zagreb",
    "Velika Gorica",
    "Samobor",
    "Zaprešić",
    "Dugo selo",
    "Zagrebačka županija",
    "Split",
    "Rijeka",
    "Osijek",
    "Varaždin",
    "Zadar",
]

mail_cities_lists = [
    citiesPRT,
    citiesSVK,
    citiesFIN,
    citiesESP,
    citiesBEL,
    citiesCHE,
    citiesITA,
    citiesFRA,
    citiesNOR,
    citiesHRV,
]

In [None]:
mail_all_cities = flatten(mail_cities_lists)
print("Number of distinct cities:", len(mail_all_cities))

In [None]:
print("weather_cities-mail_all_cities:", set(weather_cities) - set(mail_all_cities))
print("mail_all_cities-weather_cities:", set(mail_all_cities) - set(weather_cities))

In [None]:
weather_parsed_df.head(3)

In [None]:
weather_city_date = weather_parsed_df[["id", "city", "weather.dt_tx"]].copy()

weather_city_date["date"] = weather_city_date["weather.dt_tx"].apply(lambda x: x[:10])
weather_city_date.drop(["weather.dt_tx"], axis=1, inplace=True)
weather_city_date.drop_duplicates(inplace=True)

print("weather_city_date.shape:", weather_city_date.shape)

In [None]:
weather_city_date.head(3)

In [None]:
weather_city_date.groupby("city").agg({"date": [np.min, np.max]})

In [None]:
shapes_df_all.head(3)

In [None]:
shapes_city_centroid = shapes_df_all[
    ["fuaname", "fuaname_en", "iso3", "centroid_x", "centroid_y"]
].copy()
shapes_city_centroid.columns = [
    "city",
    "city_en",
    "country",
    "centroid_x",
    "centroid_y",
]

print("shapes_city_centroid.shape:", shapes_city_centroid.shape)

In [None]:
# missing_cities = (
#     set(weather_cities)
#     - set(shapes_city_centroid.city.values)
#     - set(shapes_city_centroid.city_en.values)
# )
# print("Number of missing cities:", len(missing_cities))

In [None]:
# missing_cities

In [None]:
# Attach country and centroid coordinates to every weather record.
# City names are matched twice: once against the "city" column of the
# shapes table and once against its "city_en" column. Both are left joins,
# so records without a match keep NaN in the shape columns.

# 1) match on the "city" column of the shapes table
tmp1_df = pd.merge(
    weather_city_date,
    shapes_city_centroid,
    left_on="city",
    right_on="city",
    how="left",
)
tmp1_df.drop(["city_en"], axis=1, inplace=True)

# 2) match on the "city_en" column of the shapes table
tmp2_df = pd.merge(
    weather_city_date,
    shapes_city_centroid,
    left_on="city",
    right_on="city_en",
    how="left",
)
# both inputs have a "city" column, so the merge result carries a "city_y"
# column coming from the shapes table; it is dropped together with "city_en"
tmp2_df.drop(["city_y", "city_en"], axis=1, inplace=True)

# rename columns in tmp2_df
# (positional rename: relies on tmp2_df having the same column order as tmp1_df)
tmp2_df.columns = tmp1_df.columns

In [None]:
tmp1_df.head(3)

In [None]:
tmp2_df.head(3)

In [None]:
weather_city_date_centroid = pd.concat([tmp1_df, tmp2_df])

In [None]:
weather_city_date_centroid.head(5)

In [None]:
# How to drop duplicates but keep the rows if a particular other column is not null (Pandas)
# See:
# https://stackoverflow.com/a/56852739/2377454
weather_city_date_centroid = weather_city_date_centroid.sort_values(
    by=["id", "country"], na_position="last"
).drop_duplicates("id", keep="first")

In [None]:
weather_city_date_centroid.head(5)

In [None]:
weather_city_date_centroid[
    weather_city_date_centroid["id"] == "5d23820c5c15650738bafbfa"
]

In [None]:
print("weather_city_date_centroid.shape:", weather_city_date_centroid.shape)

In [None]:
weather_city_date_centroid.head(3)

For some cities we don't have coordinates: their `country`, `centroid_x` and `centroid_y` columns are set to NaN.

In [None]:
weather_city_date_centroid[weather_city_date_centroid["country"].isnull()].head(3)

In [None]:
def get_net_radiation(date, x, y):
    """Look up the net radiation at point (x, y) for the given date.

    The raster entry is chosen with ``get_closest_key(parsed_tiff_files,
    date)``; its ``"index"`` callable turns the coordinates into a
    ``(row, col)`` pair which is then read from its ``"band1"`` array.

    Returns 0 when either coordinate is null; the rasters are not consulted
    in that case.
    """
    if pd.isnull(x) or pd.isnull(y):
        return 0

    raster = get_closest_key(parsed_tiff_files, date)
    row, col = raster["index"](x, y)
    return raster["band1"][row, col]


def get_net_radiation_wrap(row):
    """Row adapter for ``DataFrame.apply(..., axis=1)``."""
    return get_net_radiation(row.date, row.centroid_x, row.centroid_y)

In [None]:
weather_city_date_centroid[
    weather_city_date_centroid["id"] == "5d2382c05c15650738bafc0e"
]

In [None]:
get_net_radiation("2019-07-08", 2.094905, 41.439964)

In [None]:
weather_city_date_centroid[
    weather_city_date_centroid["id"] == "5d2382c05c15650738bafc0e"
][["date", "centroid_x", "centroid_y"]].apply(get_net_radiation_wrap, axis=1)

In [None]:
# Compute the net radiation row by row from (date, centroid_x, centroid_y)
radiation_inputs = weather_city_date_centroid[["date", "centroid_x", "centroid_y"]]
weather_city_date_centroid["net_radiation"] = radiation_inputs.apply(
    get_net_radiation_wrap, axis=1
)

In [None]:
weather_city_date_centroid.head(3)

In [None]:
weather_city_date_centroid[weather_city_date_centroid["country"].isnull()].head(3)

In [None]:
# Attach the city centroid and net radiation to the parsed weather records
# (pandas' default inner join on the shared id / city pair).
weather_netradiation_df = pd.merge(
    weather_parsed_df, weather_city_date_centroid, on=["id", "city"]
)

In [None]:
weather_netradiation_df.head(3)

In [None]:
# Inputs of the apparent-temperature computation under short names:
# rh = humidity, Ta = temperature, ws = wind speed, Q = net radiation.
at_data_df = weather_netradiation_df[
    [
        "id",
        "weather.main.humidity",
        "weather.main.temp",
        "weather.wind.speed",
        "net_radiation",
    ]
].rename(
    columns={
        "weather.main.humidity": "rh",
        "weather.main.temp": "Ta",
        "weather.wind.speed": "ws",
        "net_radiation": "Q",
    }
)
at_data_df.head(3)

In [None]:
# apparent_temperature_wrap is applied to one row of (id, rh, Ta, ws, Q) at a time
at_inputs = at_data_df[["id", "rh", "Ta", "ws", "Q"]]
at_data_df["apparent_temperature"] = at_inputs.apply(apparent_temperature_wrap, axis=1)

at_data_df.head(3)

In [None]:
# Put the computed apparent temperature back next to the full weather record
tmp_final = pd.merge(
    at_data_df[["id", "apparent_temperature"]],
    weather_netradiation_df,
    on=["id"],
)

tmp_final.head(3)

In [None]:
tmp_final.columns

In [None]:
# (column in tmp_final, short name used from here on), in output order
weather_final_column_pairs = [
    ("id", "id"),
    ("requestTimestamp", "request_timestamp"),
    ("city", "city"),
    ("centroid_x", "centroid_x"),
    ("centroid_y", "centroid_y"),
    ("country", "country"),
    ("weather.dt", "timestamp_seconds"),
    ("weather.dt_tx", "timestamp_string"),
    ("date", "date"),
    ("weather.main.temp", "temp"),
    ("weather.main.temp_min", "temp_min"),
    ("weather.main.temp_max", "temp_max"),
    ("weather.main.temp_kf", "temp_kf"),
    ("weather.main.pressure", "pressure"),
    ("weather.main.sea_level", "sea_level"),
    ("weather.main.grnd_level", "grnd_level"),
    ("weather.main.humidity", "humidity"),
    ("weather.weather.id", "weather_id"),
    ("weather.weather.main", "weather_main"),
    ("weather.weather.description", "weather_description"),
    ("weather.weather.icon", "weather_icon"),
    ("weather.clouds.all", "clouds_all"),
    ("weather.wind.speed", "wind_speed"),
    ("weather.wind.deg", "wind_deg"),
    ("weather.sys.pod", "sys_pod"),
    ("apparent_temperature", "apparent_temperature"),
    ("net_radiation", "net_radiation"),
]

# Select the raw columns, then relabel them positionally with the short names
weather_final_df = tmp_final[[raw for raw, _ in weather_final_column_pairs]].copy()
weather_final_df.columns = [short for _, short in weather_final_column_pairs]

weather_final_df.head(3)

In [None]:
weather_final_df.id.nunique()

## Detect weather scenarios

#### Clouds

In [None]:
pprint(weather_scenarios_clouds)

In [None]:
def map_cloud_category(description):
    """Translate a weather description into its cloud ``(category, main)`` pair.

    The description is lower-cased and looked up in
    ``weather_scenarios_clouds``. Descriptions without an entry (or whose
    entry is ``None``) yield ``(None, None)``.
    """
    entry = weather_scenarios_clouds.get(description.lower())
    if entry is None:
        return None, None

    return entry["category"], entry["main"]

In [None]:
# Map every description once, then split the (category, main) pairs into two
# columns. Recipe: https://stackoverflow.com/a/43009150/2377454
cloud_pairs = weather_final_df["weather_description"].apply(map_cloud_category)
cloud_categories, cloud_mains = zip(*cloud_pairs)
weather_final_df["cloud_category"] = cloud_categories
weather_final_df["cloud_main"] = cloud_mains

In [None]:
weather_final_df[["weather_description", "cloud_category", "cloud_main"]].head(10)

#### Precipitation

In [None]:
pprint(weather_scenarios_precipitation)

In [None]:
def map_precipitation_category(description):
    """Translate a weather description into its precipitation ``(category, main)`` pair.

    The description is lower-cased and looked up in
    ``weather_scenarios_precipitation``. Descriptions without an entry (or
    whose entry is ``None``) yield ``(None, None)``.
    """
    entry = weather_scenarios_precipitation.get(description.lower())
    if entry is None:
        return None, None

    return entry["category"], entry["main"]

In [None]:
# Map every description once, then split the (category, main) pairs into two
# columns. Recipe: https://stackoverflow.com/a/43009150/2377454
precipitation_pairs = weather_final_df["weather_description"].apply(
    map_precipitation_category
)
precipitation_categories, precipitation_mains = zip(*precipitation_pairs)
weather_final_df["precipitation_category"] = precipitation_categories
weather_final_df["precipitation_main"] = precipitation_mains

In [None]:
weather_final_df[
    ["weather_description", "precipitation_category", "precipitation_main"]
].head(10)

#### Wind

In [None]:
pprint(weather_scenarios_wind)

In [None]:
# Lower bound of every wind scenario's "speed" interval, in ascending order.
# map_wind_category() runs bisect over this list, and bisect is only valid on
# sorted input, so sort explicitly instead of relying on the insertion order
# of weather_scenarios_wind.
wind_speeds = sorted(spec["speed"][0] for spec in weather_scenarios_wind.values())

print("wind_speeds:", wind_speeds)

In [None]:
# Re-key the wind scenarios by the lower bound of their "speed" interval;
# the scenario name moves into the "description" field.
tmp_wind_ranges = {
    spec["speed"][0]: {
        "beaufort number": spec["beaufort number"],
        "description": name,
        "category": spec["category"],
    }
    for name, spec in weather_scenarios_wind.items()
}

# Same entries, inserted in the order given by wind_speeds
weather_scenarios_wind_ranges = OrderedDict(
    (speed, tmp_wind_ranges[speed]) for speed in wind_speeds
)

In [None]:
pprint(weather_scenarios_wind_ranges)

In [None]:
print("wind_speeds:", wind_speeds)

In [None]:
# Given boundaries, find interval
# See:
# https://stackoverflow.com/a/13942715/2377454
def map_wind_category(wind_speed):
    """Classify a wind speed into the wind scenario whose interval contains it.

    ``wind_speeds`` holds the lower bound of every interval and must be
    sorted in ascending order (bisect precondition). Each bound is lowered by
    0.001 before the search, so a speed up to 0.001 below a bound already
    counts as belonging to that bound's interval.

    Parameters
    ----------
    wind_speed : number
        Measured wind speed. Values <= 0 are treated as 0.

    Returns
    -------
    tuple
        ``(category, description, beaufort number)`` taken from
        ``weather_scenarios_wind_ranges``, or ``(None, None, None)`` when the
        speed is missing (None / NaN) or the selected bound has no entry.
    """
    # A missing measurement has no category. Without this guard NaN compares
    # False against every bound and would be filed under the lowest interval.
    if pd.isnull(wind_speed):
        return None, None, None

    if wind_speed <= 0:
        wind_speed = 0

    lower_bounds = [float(v - 0.001) for v in wind_speeds]

    # bisect_left counts the bounds strictly below the speed; the interval we
    # want is the last of those, hence the shift by one. The result can never
    # exceed the last index, so only the lower end needs clamping.
    pos = max(bisect.bisect_left(lower_bounds, wind_speed) - 1, 0)

    entry = weather_scenarios_wind_ranges.get(wind_speeds[pos])
    if entry is None:
        return None, None, None

    return entry["category"], entry["description"], entry["beaufort number"]

In [None]:
# Smoke test: print the wind classification over np.arange(0, 33, 0.1)
wind_report = "map_wind_category({}) -> (cat: {}, desc: {}, num: {})"
for v in np.arange(0, 33, 0.1):
    cat, desc, num = map_wind_category(v)
    print(wind_report.format(v, cat, desc, num))

In [None]:
weather_final_df["wind_speed"].head(3)

In [None]:
# Classify every wind speed once, then split the
# (category, description, beaufort number) triples into three columns.
wind_triples = weather_final_df["wind_speed"].apply(map_wind_category)
wind_categories, wind_descriptions, wind_beaufort_numbers = zip(*wind_triples)
weather_final_df["wind_category"] = wind_categories
weather_final_df["wind_description"] = wind_descriptions
weather_final_df["wind_beaufort_number"] = wind_beaufort_numbers

In [None]:
weather_final_df[
    ["wind_speed", "wind_category", "wind_description", "wind_beaufort_number"]
].head(5)

#### Temperature

In [None]:
pprint(weather_scenarios_apparent_temperature)

In [None]:
# Lower bound of every apparent-temperature scenario's "range", in ascending
# order. map_temperature_category() runs bisect over this list, and bisect is
# only valid on sorted input, so sort explicitly instead of relying on the
# insertion order of weather_scenarios_apparent_temperature.
temperature_ranges = sorted(
    spec["range"][0] for spec in weather_scenarios_apparent_temperature.values()
)

print("temperature_ranges:", temperature_ranges)

In [None]:
# Re-key the apparent-temperature scenarios by the lower bound of their
# "range"; the scenario name moves into the "category" field.
tmp_temperature_ranges = {
    spec["range"][0]: {"category": name, "main": spec["main"]}
    for name, spec in weather_scenarios_apparent_temperature.items()
}

# Same entries, inserted in the order given by temperature_ranges
weather_scenarios_apparent_temperature_ranges = OrderedDict(
    (bound, tmp_temperature_ranges[bound]) for bound in temperature_ranges
)

In [None]:
pprint(weather_scenarios_apparent_temperature_ranges)

In [None]:
def map_temperature_category(temperature):
    """Classify a temperature into the scenario whose range contains it.

    ``temperature_ranges`` holds the lower bound of every range and must be
    sorted in ascending order (bisect precondition). Each bound is lowered by
    0.001 before the search, so a value up to 0.001 below a bound already
    counts as belonging to that bound's range. Values below the first bound
    are assigned to the first range.

    Parameters
    ----------
    temperature : number
        Temperature to classify (the notebook passes the apparent temperature).

    Returns
    -------
    tuple
        ``(category, main)`` taken from
        ``weather_scenarios_apparent_temperature_ranges``, or ``(None, None)``
        when the temperature is missing (None / NaN) or the selected bound has
        no entry.
    """
    # A missing value has no category. Without this guard NaN compares False
    # against every bound and would be filed under the lowest range.
    if pd.isnull(temperature):
        return None, None

    lower_bounds = [float(t - 0.001) for t in temperature_ranges]

    # bisect_left counts the bounds strictly below the value; the range we
    # want is the last of those, hence the shift by one. The result can never
    # exceed the last index, so only the lower end needs clamping.
    pos = max(bisect.bisect_left(lower_bounds, temperature) - 1, 0)

    entry = weather_scenarios_apparent_temperature_ranges.get(temperature_ranges[pos])
    if entry is None:
        return None, None

    return entry["category"], entry["main"]

In [None]:
# Smoke test: print the temperature classification over np.arange(-10, 40, 2)
temperature_report = "map_temperature_category({}) -> (cat: {}, desc: {})"
for t in np.arange(-10, 40, 2):
    cat, main = map_temperature_category(t)
    print(temperature_report.format(t, cat, main))

In [None]:
weather_final_df.columns

In [None]:
weather_final_df["apparent_temperature"].head(3)

In [None]:
# Classify every apparent temperature once, then split the (category, main)
# pairs into the temperature_category / temperature_description columns.
temperature_pairs = weather_final_df["apparent_temperature"].apply(
    map_temperature_category
)
temperature_categories, temperature_descriptions = zip(*temperature_pairs)
weather_final_df["temperature_category"] = temperature_categories
weather_final_df["temperature_description"] = temperature_descriptions

In [None]:
weather_final_df[
    ["apparent_temperature", "temperature_category", "temperature_description"]
].head(5)

### Weather Scenarios

In [None]:
weather_final_df.columns

In [None]:
weather_final_df[
    [
        "cloud_category",
        "precipitation_category",
        "temperature_category",
        "temperature_description",
    ]
]

In [None]:
pprint(weather_scenarios)

In [None]:
# match_scenarios(weather_characteristic, weather_condition)
#   * weather_characteristic: key of a scenario entry; the notebook uses
#     "clouds", "precipitation", "wind" and "temperature"
#   * weather_condition: the observed condition for that characteristic
#     (None is treated as the string "none")
# Returns the set of scenario names compatible with that condition.
def match_scenarios(weather_characteristic, weather_condition):
    """Return the scenarios in ``weather_scenarios`` that accept a condition.

    A scenario accepts the condition when the values it lists under
    ``weather_characteristic`` contain either ``"any"`` or the condition
    itself; the comparison is case-insensitive.
    """
    wanted = ("none" if weather_condition is None else weather_condition).lower()

    matching_scenarios = set()
    for scenario, weather_dict in weather_scenarios.items():
        accepted = map(str.lower, weather_dict[weather_characteristic])
        if any(value == "any" or value == wanted for value in accepted):
            matching_scenarios.add(scenario)

    return matching_scenarios

In [None]:
# Smoke test: scenarios compatible with each cloud condition
for args in [
    ("clouds", "clear sky"),
    ("clouds", None),
    ("clouds", "completely cloudy"),
    ("clouds", "partially cloudy"),
]:
    print("{} is {} -> {}".format(*args, match_scenarios(*args)))

In [None]:
# Smoke test: scenarios compatible with each precipitation condition
for args in [
    ("precipitation", "light"),
    ("precipitation", None),
    ("precipitation", "moderate"),
    ("precipitation", "heavy"),
]:
    print("{} is {} -> {}".format(*args, match_scenarios(*args)))

In [None]:
# Smoke test: scenarios compatible with each wind condition
for args in [
    ("wind", None),
    ("wind", "light breeze"),
    ("wind", "strong breeze"),
    ("wind", "gale"),
]:
    print("{} is {} -> {}".format(*args, match_scenarios(*args)))

In [None]:
# Smoke test: scenarios compatible with each temperature condition.
# The third case previously read "warmºa" (stray keystrokes), which can only
# ever match scenarios that accept "any".
# NOTE(review): assumes "warm" is the intended category name - confirm
# against weather_scenarios.
for args in [
    ("temperature", "comfortable"),
    ("temperature", "cool"),
    ("temperature", "warm"),
    ("temperature", "uncomfortably cold"),
]:
    print("{} is {} -> {}".format(*args, match_scenarios(*args)))

In [None]:
def map_weather_scenarios(clouds, precipitation, wind, temperature):
    """Return the sorted list of scenarios compatible with all four conditions.

    Each condition is matched on its own with ``match_scenarios``; a scenario
    is kept only when it appears in all four result sets. The list is empty
    when no scenario qualifies.
    """
    per_characteristic = [
        match_scenarios("clouds", clouds),
        match_scenarios("precipitation", precipitation),
        match_scenarios("wind", wind),
        match_scenarios("temperature", temperature),
    ]

    # sorted() on an empty intersection already gives the empty list
    return sorted(set.intersection(*per_characteristic))


def map_weather_scenarios_wrap(row):
    """Row adapter for ``DataFrame.apply(..., axis=1)``."""
    return map_weather_scenarios(
        row.cloud_category,
        row.precipitation_category,
        row.wind_category,
        row.temperature_category,
    )

In [None]:
# Smoke test: scenarios for a few (clouds, precipitation, wind, temperature) mixes
for args in [
    ("clear sky", None, "light breeze", "comfortable"),
    ("clear sky", None, "gale", "comfortable"),
    ("partially cloudy", None, "light breeze", "comfortable"),
]:
    print(map_weather_scenarios(*args))

In [None]:
# One list of matching scenario names per row, from the four category columns
scenario_inputs = weather_final_df[
    [
        "cloud_category",
        "precipitation_category",
        "wind_category",
        "temperature_category",
    ]
]
weather_final_df["weather_scenario"] = scenario_inputs.apply(
    map_weather_scenarios_wrap, axis=1
)

In [None]:
weather_final_df[
    [
        "cloud_category",
        "precipitation_category",
        "wind_category",
        "temperature_category",
        "weather_scenario",
    ]
].head(10)

In [None]:
# Working copy used to inspect how many scenarios each weather record matched
scenario_check_columns = [
    "id",
    "cloud_category",
    "precipitation_category",
    "wind_category",
    "temperature_category",
    "weather_scenario",
]
tmp = weather_final_df[scenario_check_columns].copy()

# weather_scenario holds a list per row; its length is the number of matches
tmp["weather_scenario_count"] = tmp["weather_scenario"].apply(len)

In [None]:
tmp.head(3)

In [None]:
tmp.groupby("weather_scenario_count")["id"].count()

In [None]:
tmp.loc[tmp["weather_scenario_count"] == 1].head(20)

### Save to file

In [None]:
# Persist the enriched weather table next to the other preprocessed outputs
weather_final_df.to_pickle(os.path.join(out_path, "weather_final_df.pkl"))

## Add Weather to Legs

In [None]:
legs_df[["legid", "city", "startDate"]].head(3)

In [None]:
weather_final_df[["id", "city", "timestamp_seconds"]].head(3)

In [None]:
# Pair every leg with every weather record of the same city: "city" is the
# only column the two selections share, so it is the (inner) join key.
weather_id_legid = pd.merge(
    legs_df[["legid", "city", "startDate"]],
    weather_final_df[["id", "city"]],
    on="city",
)[["id", "legid", "city", "startDate"]].rename(columns={"startDate": "start_leg_ms"})

In [None]:
weather_id_legid.head(3)

In [None]:
weather_id_legid.groupby("city").count()[["id"]]

In [None]:
# Divide the leg start (kept in the *_ms column) by 1000 and round to an integer
weather_id_legid["start_leg_seconds"] = weather_id_legid["start_leg_ms"].apply(
    lambda ms: int(round(int(ms) / 1000.0))
)

In [None]:
weather_id_legid.head(3)

In [None]:
# Sorted array of the distinct weather timestamps across all cities
unique_timestamps = set(weather_final_df["timestamp_seconds"].to_list())
all_timestamps_array = np.asarray(sorted(unique_timestamps))
print("len(all_timestamps_array):", len(all_timestamps_array))

In [None]:
all_weather_cities = sorted(set(weather_final_df["city"].values))

# city -> sorted array of the distinct weather timestamps recorded for it
timestamps_per_city_array = dict()

for city in all_weather_cities:
    in_city = weather_final_df["city"] == city
    city_timestamps = set(weather_final_df.loc[in_city, "timestamp_seconds"].to_list())
    timestamps_per_city_array[city] = np.asarray(sorted(city_timestamps))

In [None]:
DAY_IN_SECONDS = 86400
DIFFLIMIT = 1 * DAY_IN_SECONDS


# Find nearest value in numpy array
# https://stackoverflow.com/a/2566508/2377454
def find_city_nearest_time(city, ts):
    """Return the weather timestamp of ``city`` that is closest to ``ts``.

    Parameters
    ----------
    city : hashable
        Key into ``timestamps_per_city_array``; a missing key raises KeyError.
    ts : number
        Reference timestamp, in the same unit as the stored timestamps
        (the limit below is expressed in seconds).

    Returns
    -------
    The element of the city's timestamp array with the smallest absolute
    difference to ``ts``, or ``np.nan`` when the city has no timestamps or
    the closest one is more than ``DIFFLIMIT`` away.
    """
    tsarray = np.asarray(timestamps_per_city_array[city])
    if tsarray.size == 0:
        return np.nan

    diffarray = np.abs(tsarray - ts)

    # Take the argmin on the unfiltered differences so the index refers to
    # tsarray itself. Filtering first and then indexing tsarray with the
    # position found in the filtered array returns an unrelated timestamp.
    idx = diffarray.argmin()
    if diffarray[idx] > DIFFLIMIT:
        return np.nan

    return tsarray[idx]


def find_city_nearest_time_wrap(row):
    """Row adapter for ``DataFrame.apply(..., axis=1)``."""
    return find_city_nearest_time(row.city, row.start_leg_seconds)

In [None]:
weather_id_legid[["city", "start_leg_seconds"]].head(3)

In [None]:
find_city_nearest_time("Brussels", 1557239471)

In [None]:
# 1000-row sample (row positions 1000..1999) used by the timing cell below
sample_rows = weather_id_legid.iloc[1000:2000]
test = sample_rows[["city", "start_leg_seconds"]].reset_index(drop=True)
test.head(3)

In [None]:
%%timeit
test.apply(find_city_nearest_time_wrap, axis=1)

In [None]:
# For every (city, leg start) pair, look up the closest weather timestamp
leg_lookup = weather_id_legid[["city", "start_leg_seconds"]]
weather_id_legid["start_leg_closest_seconds"] = leg_lookup.apply(
    find_city_nearest_time_wrap, axis=1
)

In [None]:
# Drop the millisecond column if it is still there; errors="ignore" makes
# the cell safe to re-run.
weather_id_legid.drop(columns=["start_leg_ms"], errors="ignore", inplace=True)

In [None]:
(
    weather_id_legid[["id", "start_leg_seconds", "start_leg_closest_seconds"]]
    .groupby(["start_leg_seconds", "start_leg_closest_seconds"])
    .count()
).head(5)

In [None]:
weather_id_legid.head(3)

In [None]:
weather_final_df.head(3)

In [None]:
# Keep, for every leg, only the weather record whose timestamp equals the
# closest timestamp found above. The key columns are named differently on the
# two sides, hence left_on / right_on
# (see https://stackoverflow.com/a/40570281/2377454).
tmp_weather_with_legs_df = weather_id_legid.merge(
    weather_final_df,
    how="inner",
    left_on=["id", "city", "start_leg_closest_seconds"],
    right_on=["id", "city", "timestamp_seconds"],
)

In [None]:
# NOTE(review): DataFrame.rename() with a positional mapping and no `axis` /
# `columns=` argument is applied to the index labels, so this statement leaves
# the "id" column untouched. Switching to columns={"id": "weather_id"} is not
# a drop-in fix: the frame already has a "weather_id" column (built from
# weather.weather.id) and the column selection further down still reads "id".
tmp_weather_with_legs_df = tmp_weather_with_legs_df.rename({"id": "weather_id"})

In [None]:
tmp_weather_with_legs_df.head(5)

In [None]:
# Legs that found a weather record in the merge above
matched_legids = set(tmp_weather_with_legs_df.legid.values)

missing_legids = set(weather_id_legid.legid.values).difference(matched_legids)
print("# of missing legids:", len(missing_legids))

missing_legids = set(legs_df.legid.values).difference(matched_legids)
print("# of missing legids with respect to legs_df:", len(missing_legids))

In [None]:
# Column-name differences between the merged table and the weather table
merged_columns = set(tmp_weather_with_legs_df.columns)
weather_columns = set(weather_final_df.columns)

print("tmp_weather_with_legs_df-weather_final_df:", merged_columns - weather_columns)
print("weather_final_df-tmp_weather_with_legs_df:", weather_columns - merged_columns)

In [None]:
# legid first, followed by every weather column
new_columns = ["legid"]
new_columns.extend(weather_final_df.columns)

tmp_weather_with_legs_df[new_columns].head(3)

In [None]:
# Final column order, assembled from thematic groups
new_columns = (
    # identifiers, place and time
    [
        "id",
        "legid",
        "request_timestamp",
        "city",
        "centroid_x",
        "centroid_y",
        "country",
        "timestamp_seconds",
        "timestamp_string",
        "date",
    ]
    # weather condition and matched scenarios
    + [
        "weather_id",
        "weather_scenario",
        "weather_main",
        "weather_description",
        "weather_icon",
    ]
    # temperature
    + [
        "temp",
        "temp_min",
        "temp_max",
        "temp_kf",
        "apparent_temperature",
        "net_radiation",
        "temperature_category",
        "temperature_description",
    ]
    # pressure and humidity
    + ["pressure", "sea_level", "grnd_level", "humidity"]
    # clouds and precipitation
    + [
        "clouds_all",
        "cloud_category",
        "cloud_main",
        "precipitation_category",
        "precipitation_main",
    ]
    # wind
    + [
        "wind_speed",
        "wind_deg",
        "wind_beaufort_number",
        "wind_category",
        "wind_description",
    ]
    + ["sys_pod"]
)


weather_final_with_legs_df = tmp_weather_with_legs_df.loc[:, new_columns]

### Save to file

In [None]:
weather_final_with_legs_df.head(3)

In [None]:
# Persist the per-leg weather table next to the other preprocessed outputs
weather_final_with_legs_df.to_pickle(
    os.path.join(out_path, "weather_final_with_legs_df.pkl")
)