# Setup Python


In [4]:
from urllib.request import Request, urlopen
import json
from openmeteo_requests import Client
from requests_cache import CachedSession
from retry_requests import retry

from datetime import datetime, date, timedelta
import pandas as pd
import os
from shutil import rmtree


# CIM Data


In [2]:
def retrieve_CIM_data(date):
    """Fetch the public CIM TV daily-views results for one day.

    Queries the ``cim_tv_public_results_daily_views`` endpoint with
    ``reportType=north`` and flattens the ``hydra:member`` entries of the
    JSON response into a table.

    Args:
        date: Object exposing ``strftime`` (the callers in this file pass
            pandas timestamps). It is formatted as ``YYYY-MM-DD`` and sent as
            the ``dateDiff`` query parameter. Inside this function the name
            shadows the ``date`` class imported at the top of the file.

    Returns:
        pandas.DataFrame: Result of ``pd.json_normalize`` applied to the
        ``hydra:member`` value of the response.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        KeyError: If the response has no ``hydra:member`` key.
    """
    # Format the day up front: reusing the enclosing quote character inside an
    # f-string replacement field is only valid syntax from Python 3.12 on.
    day = date.strftime('%Y-%m-%d')
    request = Request(
        f'https://api.cim.be/api/cim_tv_public_results_daily_views?dateDiff={day}&reportType=north',
        headers={'User-Agent': 'Mozilla/5.0'}
    )

    # Close the HTTP response deterministically; this function is called once
    # per day over a multi-year range, so leaked connections would pile up.
    with urlopen(request) as response:
        data = json.loads(response.read().decode('utf-8'))

    return pd.json_normalize(data['hydra:member'])


## Retrieve all available CIM data


In [3]:
# Full download: rebuild ../data/cim.csv from scratch with one request per day.
start_date = date(2016, 10, 1)
# The most recent 31 days are skipped.
# NOTE(review): presumably results are only published with a delay - confirm.
end_date = datetime.today() - timedelta(days=31)

# Columns kept from the API response, in the order they are written.
cim_columns = ['id', 'reportType', 'dateResult', 'ranking', 'description', 'category', 'channel', 'startTime', 'rLength', 'ratePerc', 'rateInK', 'shr', 'rateInKAll', 'live']

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding (consistent with how cat.csv is written).
with open('../data/cim.csv', 'w', encoding='utf-8') as file:
    # The loop variable must not be called `date`: at notebook scope that would
    # rebind the imported `date` class and break `date(2016, 10, 1)` on re-run.
    for day in pd.date_range(start=start_date, end=end_date):
        data = retrieve_CIM_data(day)

        # Days without any published result yield an empty frame; skip them.
        if data.empty:
            continue

        data = data[cim_columns]
        # Emit the header only while the file is still empty.
        data.to_csv(file, header=(file.tell() == 0), index=False, lineterminator='\n')


## Retrieve all available CIM data since last record


In [4]:
# Incremental download: resume the day after the latest `dateResult` already
# stored in ../data/cim.csv and append the new rows.
start_date = pd.to_datetime(pd.read_csv('../data/cim.csv', usecols=['dateResult'])['dateResult'].max()) + timedelta(days=1)
# The most recent 31 days are skipped.
# NOTE(review): presumably results are only published with a delay - confirm.
end_date = datetime.today() - timedelta(days=31)

# Columns kept from the API response, in the order they are written.
cim_columns = ['id', 'reportType', 'dateResult', 'ranking', 'description', 'category', 'channel', 'startTime', 'rLength', 'ratePerc', 'rateInK', 'shr', 'rateInKAll', 'live']

# Explicit UTF-8 so the appended rows do not depend on the platform's default
# encoding (consistent with how cat.csv is written).
with open('../data/cim.csv', 'a', encoding='utf-8') as file:
    # The loop variable must not be called `date`: at notebook scope that would
    # rebind the imported `date` class for every cell executed afterwards.
    # An empty range (file already up to date) simply performs no requests.
    for day in pd.date_range(start=start_date, end=end_date):
        data = retrieve_CIM_data(day)

        # Days without any published result yield an empty frame; skip them.
        if data.empty:
            continue

        data = data[cim_columns]
        # The existing file already carries the header row.
        data.to_csv(file, header=False, index=False, lineterminator='\n')


# Weather Data


In [3]:
def retrieve_weather_data(date):
    """Request daily weather history from the Open-Meteo archive API.

    The request covers the fixed coordinates 50.85045, 4.34878 from ``date``
    up to and including yesterday, with timezone ``Europe/Berlin``.

    Args:
        date: First day to request; the callers in this file pass a
            ``'YYYY-MM-DD'`` string. Inside this function the name shadows
            the ``date`` class imported at the top of the file.

    Returns:
        Whatever ``Client.weather_api`` returns; the callers iterate over it
        and call ``.Daily()`` on each element.
    """
    cache_session = CachedSession('.cache', expire_after=3600)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = Client(session=retry_session)

    # Computed outside the dict literal: nesting the same quote character
    # inside an f-string is only valid syntax from Python 3.12 on, and the
    # f-string wrapper added nothing to the strftime() result.
    yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')

    try:
        # The order of the "daily" list matters: callers read the variables
        # back by position via daily.Variables(i).
        return openmeteo.weather_api(
            "https://archive-api.open-meteo.com/v1/archive",
            params={
                "latitude": 50.85045,
                "longitude": 4.34878,
                "start_date": date,
                "end_date": yesterday,
                "daily": ["temperature_2m_max", "apparent_temperature_min", "sunset", "daylight_duration", "sunshine_duration", "wind_speed_10m_max", "wind_gusts_10m_max", "wind_direction_10m_dominant", "shortwave_radiation_sum", "et0_fao_evapotranspiration", "sunrise", "weather_code", "apparent_temperature_max", "precipitation_sum", "rain_sum", "snowfall_sum", "precipitation_hours", "apparent_temperature_mean", "temperature_2m_min", "temperature_2m_mean"],
                "timezone": "Europe/Berlin",
            }
        )
    finally:
        # Remove the on-disk cache even when the request fails.
        # NOTE(review): this only deletes a directory named '.cache'; confirm
        # the cache backend really stores its data there, otherwise this is a
        # no-op.
        rmtree('.cache', ignore_errors=True)


## Retrieve all available weather data


In [5]:
# Download the daily weather history starting at 2016-10-01 and store it in
# ../data/weather.csv (appending when the file already exists).
responses = retrieve_weather_data("2016-10-01")

# Output column names, listed in the positional order in which the variables
# are read from each response (index 0 first).
daily_variables = (
    "temperature_2m_max",
    "apparent_temperature_min",
    "sunset",
    "daylight_duration",
    "sunshine_duration",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "wind_direction_10m_dominant",
    "shortwave_radiation_sum",
    "et0_fao_evapotranspiration",
    "sunrise",
    "weather_code",
    "apparent_temperature_max",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "precipitation_hours",
    "apparent_temperature_mean",
    "temperature_2m_min",
    "temperature_2m_mean",
)

for response in responses:
    daily = response.Daily()

    # One UTC timestamp per interval in [Time(), TimeEnd()).
    timestamps = pd.date_range(
        start=pd.to_datetime(daily.Time(), unit="s", utc=True),
        end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=daily.Interval()),
        inclusive="left",
    )

    daily_data = {"date": timestamps}
    for position, variable in enumerate(daily_variables):
        daily_data[variable] = daily.Variables(position).ValuesAsNumpy()

    # Append to an existing file; otherwise create it and write the header.
    file_exists = os.path.exists('../data/weather.csv')
    if file_exists:
        write_mode, write_header = 'a', False
    else:
        write_mode, write_header = 'w', True

    frame = pd.DataFrame(daily_data)
    frame.to_csv(
        '../data/weather.csv',
        mode=write_mode,
        header=write_header,
        index=False,
        lineterminator='\n',
    )


## Retrieve all available weather data since last record


In [7]:
# Incremental download: fetch the days that follow the latest `date` stored in
# ../data/weather.csv and append them to that file.
# NOTE(review): dates are written as UTC timestamps (utc=True below) while the
# request uses timezone Europe/Berlin; presumably each local day is stored on
# the previous UTC calendar day, which is why 2 days are added rather than 1 -
# confirm.
last_date = pd.to_datetime(pd.read_csv('../data/weather.csv', usecols=['date'])['date'].max()) + timedelta(days=2)
first_missing_day = last_date.strftime('%Y-%m-%d')

# retrieve_weather_data() always requests data up to yesterday. When the file
# is already current the start day would lie after that end day, so skip the
# request instead of asking the API for an invalid range.
# ISO 'YYYY-MM-DD' strings order the same way as the dates they represent.
last_fetchable_day = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
if first_missing_day <= last_fetchable_day:
    responses = retrieve_weather_data(first_missing_day)
else:
    responses = []

# Output column names, listed in the positional order in which the variables
# are read from each response (index 0 first).
daily_variables = (
    "temperature_2m_max",
    "apparent_temperature_min",
    "sunset",
    "daylight_duration",
    "sunshine_duration",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "wind_direction_10m_dominant",
    "shortwave_radiation_sum",
    "et0_fao_evapotranspiration",
    "sunrise",
    "weather_code",
    "apparent_temperature_max",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "precipitation_hours",
    "apparent_temperature_mean",
    "temperature_2m_min",
    "temperature_2m_mean",
)

for response in responses:
    daily = response.Daily()

    # One UTC timestamp per interval in [Time(), TimeEnd()).
    daily_data = {
        "date": pd.date_range(
            start=pd.to_datetime(daily.Time(), unit="s", utc=True),
            end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=daily.Interval()),
            inclusive="left"
        )
    }
    for position, variable in enumerate(daily_variables):
        daily_data[variable] = daily.Variables(position).ValuesAsNumpy()

    # The file was read at the top of this cell, so it exists and already has
    # its header row: always append without a header.
    pd.DataFrame(daily_data).to_csv(
        '../data/weather.csv',
        mode='a',
        header=False,
        index=False,
        lineterminator='\n'
    )


# CAT Data


In [6]:
def retrieve_CAT_data(url='/api/tv_public_results?page=1'):
    """Fetch one page of the public CIM TV results and return the parsed JSON.

    Args:
        url: Path (including query string) appended to ``https://api.cim.be``.
            Defaults to the first page of ``tv_public_results``; the caller in
            this file passes the ``hydra:next`` value of the previous page to
            walk through the collection.

    Returns:
        The decoded JSON document of the response.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
    """
    request = Request(
        f'https://api.cim.be{url}',
        headers={'User-Agent': 'Mozilla/5.0'}
    )

    # Close the HTTP response deterministically; this function is called once
    # per page while paging through the whole collection.
    with urlopen(request) as response:
        return json.loads(response.read().decode('utf-8'))


## Retrieve all available CAT data


In [7]:
# Full download: walk through every page of the CAT results and rebuild
# ../data/cat.csv from scratch.
data = retrieve_CAT_data()

# Columns kept from the API response, in the order they are written.
cat_columns = ['id', 'period', 'reportType', 'dateDiff', 'ranking', 'description', 'category', 'channel', 'startTime', 'rLength', 'ratePerc', 'rateInK', 'shr', 'rateInKAll', 'description2', 'live']

with open('../data/cat.csv', 'w', encoding='utf-8') as file:
    while True:
        # Write the current page BEFORE deciding whether to continue, so the
        # final page (the one without a 'hydra:next' link) is saved too.
        page_data = pd.json_normalize(data['hydra:member'])
        if not page_data.empty:
            page_data = page_data[cat_columns]
            # Emit the header only while the file is still empty.
            page_data.to_csv(file, header=(file.tell() == 0), index=False, lineterminator='\n')

        # Tolerate a response without a 'hydra:view' object: treat it the same
        # as a page that has no successor.
        next_url = data.get('hydra:view', {}).get('hydra:next')
        has_next = next_url is not None
        if not has_next:
            break

        data = retrieve_CAT_data(next_url)
