In [None]:
import requests
import datetime as dt
import pytz
import pandas as pd
import pytz
from dateutil import rrule

In [None]:
def fetch_ediel(date):
    """
    Download the monthly EDIEL file with gas caloric values per GOS from Indexis.

    Parameters
    ----------
    date
        Any object exposing ``year`` and ``month`` attributes; they select
        which monthly file is requested.

    Returns
    -------
    str
        The body of the HTTP response, as text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a status code other than 200.
    requests.RequestException
        If the request itself fails (connection error, timeout, ...).
    """

    url = 'https://www.indexis.be/allocationWeb/cbw/download.action?year={}&month={}'.format(date.year, date.month)
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept as-is because the server's certificate setup cannot be checked from
    # here, but it should be removed if the certificate chain is valid.
    # The timeout prevents a stalled server from blocking the caller forever.
    r = requests.get(url, verify=False, timeout=30)
    if r.status_code != 200:
        raise requests.HTTPError(
            'Unexpected status code {} for {}'.format(r.status_code, url),
            response=r,
        )
    return r.text

In [None]:
class Ediel(object):
    """
    Parsed form of a semicolon-separated EDIEL message.

    Header lines look like ``[Some Name];value;`` and are exposed as
    attributes with a cleaned-up name (``some_name``). The lines following the
    ``[Body Start]`` marker are parsed into ``self.df``, a DataFrame indexed by
    month (localized to Europe/Brussels) with columns ``EAN``, ``GOS``, ``CBW``.

    Parameters
    ----------
    text : str
        Full content of the EDIEL message.

    Raises
    ------
    ValueError
        If the message has no body-start marker or no usable
        ``number_of_lines_in_body`` header.
    """

    def __init__(self, text):
        # keep every line as the list of its ';'-separated fields
        self.raw = [line.split(';') for line in text.splitlines()]

        # sets the header attributes and self.body_raw; raises ValueError
        # when the message is not in the expected format
        self._parse_raw()

        data = [self._clean_content(line) for line in self.body_raw]
        df = pd.DataFrame(data, columns=['date', 'EAN', 'GOS', 'CBW'])
        # the date field is read as two month digits followed by four year
        # digits; every row is stamped on the first day of that month
        df['date'] = [dt.datetime(year=int(d[2:6]), month=int(d[0:2]), day=1) for d in df['date']]
        df = df.set_index('date')
        df = df.tz_localize('Europe/Brussels')
        # values use a decimal comma
        df['CBW'] = [float(s.replace(',', '.')) for s in df['CBW']]
        self.df = df

    def get_attributes(self):
        """
        Return the names of all public members of the object.

        Returns
        -------
        list
            Every name in ``dir(self)`` that does not start with '_'.
        """
        return [attr for attr in dir(self) if not attr.startswith('_')]

    def _parse_raw(self):
        """
        Parse self.raw into attributes and extract the raw body lines.

        Raises
        ------
        ValueError
            If the body-start marker or a valid body line count is missing.
        """
        body_start_line = None

        for line_number, line in enumerate(self.raw):
            # only lines whose first field is a '[...]' tag carry metadata
            if not line[0].startswith('['):
                continue

            property_name = self._clean_name(line[0])

            if property_name == 'body_start':
                # the body begins on the line after the marker
                body_start_line = line_number + 1
                continue
            elif property_name == 'body_end':
                continue

            content = self._clean_content(line[1:])
            if content is None:
                continue

            setattr(self, property_name, content)

        try:
            if body_start_line is None:
                raise ValueError('no body start marker')
            body_length = int(getattr(self, 'number_of_lines_in_body', None))
        except (TypeError, ValueError):
            # expected markers/headers are absent or malformed
            raise ValueError('Ediel message was not in expected format')

        self.body_raw = self.raw[body_start_line: body_start_line + body_length]

    @staticmethod
    def _clean_name(name):
        """
        Make attribute names more pythonic: drop the first and last character
        (the square brackets), lowercase, replace spaces by underscores and
        remove periods.

        Parameters
        ----------
        name : String

        Returns
        -------
        String
        """
        return name[1:-1].lower().replace(" ", "_").replace(".", "")

    @staticmethod
    def _clean_content(content):
        """
        Normalize the fields of one line.

        Parameters
        ----------
        content : list of String

        Returns
        -------
        None if nothing (or only an empty string) is left, the single string
        if exactly one field is left, otherwise the list of fields.
        """
        # get rid of the trailing empty string (lines end with a ';');
        # the emptiness check avoids an IndexError on lines without fields
        if content and content[-1] == '':
            res = content[:-1]
        else:
            res = content

        if len(res) == 1:
            res = res[0] if res[0] != '' else None
        elif len(res) == 0:
            res = None
        return res

In [None]:
def monthset(start, end):
    """
    Takes a start and end date and returns a set containing the first day
    (at midnight) of every month from the month of start up to the month of end

    The recurrence is anchored on the first of the month: a monthly rule
    anchored on e.g. the 31st would silently skip every month that has no
    31st, and a mid-month anchor could miss the month of end.

    Parameters
    ----------
    start : datetime-like object
    end : datetime-like object

    Returns
    -------
    set of datetime objects
    """

    if isinstance(start, dt.datetime):
        # keeps tzinfo (if any) so start and end stay comparable
        first = start.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
    else:
        # plain dates carry no time component
        first = dt.datetime(start.year, start.month, 1)

    return set(rrule.rrule(rrule.MONTHLY, dtstart=first, until=end))

In [None]:
def fetch_cbw(start, end):
    """
    Fetches, caches and returns CBW data as a Pandas DataFrame
    Since it is historical data, we cache and append, so we don't have to do too much external calls

    The cache is the pickle file 'cbw_cache.pkl' in the working directory. It
    is rewritten only when at least one new month was fetched successfully.

    Parameters
    ---------
    start : dt.date
    end : dt.date

    Returns
    -------
    Pandas DataFrame
        Cached and newly fetched rows whose index lies between start and end.
    """
    tz = 'Europe/Brussels'

    # we create a set of all months that we want
    months = monthset(start=start, end=end)
    months = set(pd.DatetimeIndex(list(months)).tz_localize(tz))

    # we try to load the cached version of the cbw data
    # NOTE: unpickling is only safe because this file is written by this
    # function itself; never point it at a file from an untrusted source.
    cache_name = 'cbw_cache.pkl'
    try:
        cached = pd.read_pickle(cache_name)
    except Exception:
        # missing or unreadable cache: start from scratch (best effort),
        # without swallowing KeyboardInterrupt/SystemExit
        cached = pd.DataFrame()

    # months already present in the cache; anything cached outside the
    # requested range is simply never part of `months`
    cached_months = set(cached.index)

    # we take the difference of both sets, now we know if there is new data we need to fetch
    to_fetch = months.difference(cached_months)

    # we fetch new data, month by month in chronological order
    new_frames = []
    for date in sorted(to_fetch):
        try:
            ediel = Ediel(text=fetch_ediel(date=date))
        except ValueError: # No valid Ediel message found
            print('No valid CBW data fetched for date {}'.format(date))
        else:
            new_frames.append(ediel.df)

    if new_frames:
        # DataFrame.append no longer exists in recent pandas; concat instead.
        # An empty cache is left out so the index keeps its datetime dtype.
        frames = new_frames if cached.empty else [cached] + new_frames
        cached = pd.concat(frames).sort_index()
        # save the updated file
        cached.to_pickle(cache_name)

    if cached.empty:
        # nothing cached and nothing fetched: nothing to slice
        return cached

    # the index is timezone-aware, so naive bounds are interpreted as
    # Europe/Brussels wall time before slicing
    bounds = []
    for bound in (start, end):
        bound = pd.Timestamp(bound)
        if bound.tzinfo is None:
            bound = bound.tz_localize(tz)
        bounds.append(bound)

    return cached.truncate(before=bounds[0], after=bounds[1])

In [None]:
# period for which CBW data is requested: January 2015 up to April 2016
start, end = dt.datetime(2015, 1, 1), dt.datetime(2016, 4, 1)

In [None]:
# get the CBW data between start and end (fetch_cbw uses its local cache
# and only downloads the months that are missing from it)
df = fetch_cbw(start=start, end=end)