# Libraries

In [1]:
import sys
import os
import inspect
import pandas as pd
# Resolve the parent directory to an absolute path and make it importable,
# so the project packages one level up can be imported below.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

***

# Data_cleaning.py

In [2]:
import functions_laura.Data_cleaning as dc

We list all functions that are defined in the script itself:

In [None]:
# Collect the names of the functions that Data_cleaning defines itself;
# the __module__ comparison filters out functions it merely imported.
function_names = [
    member_name
    for member_name, member in inspect.getmembers(dc, predicate=inspect.isfunction)
    if member.__module__ == dc.__name__
]
function_names

['clean_stations_file',
 'concatenate_station_info_city',
 'concatenate_station_info_zipcode',
 'data_cleaning_datetime',
 'datetime_format',
 'extract_brand',
 'get_clean_brand']

`clean_stations_file()` cleans the stations file. It is used as a part of `concatenate_station_info_city` and `concatenate_station_info_zipcode`, but it can also be used on its own.

In [None]:
# Clean the stations file on its own (outside the concatenate_* helpers).
# NOTE(review): this path is relative to the working directory ('data/...'),
# while the parquet load further down uses '../data/...' — confirm which is right.
df = dc.clean_stations_file('data/stations.csv')

`concatenate_station_info_city` and `concatenate_station_info_zipcode`

are used to create the raw data per city

In [None]:
# Build the raw data from the stations file and the 'data/prices' location,
# once selected via the city helper and once via the zip-code helper.
# NOTE(review): 'city_name' and 'zipcode' look like placeholders for a real
# city name / zip code — TODO confirm before running.
# NOTE(review): paths here start with 'data/' whereas the parquet load further
# down uses '../data/' — confirm the intended working directory.
df_city = dc.concatenate_station_info_city('data/stations.csv', 'data/prices', 'city_name')
df_zip = dc.concatenate_station_info_zipcode('data/stations.csv', 'data/prices', 'zipcode')

`get_clean_brand` extracts a clean brand name from the brand or the name column.
The recognized brands are: 

aral, shell, esso, total, avia, jet, star, agip, raiffeisen, bft, oil!, sb

When the brand does not fit one of the brands in the list, then it is categorized as "other".

In [3]:
# Load fuerstenwalde_raw.parquet from the data directory one level up.
# From here on `df` refers to this frame (it replaces any earlier `df`).
df = pd.read_parquet('../data/fuerstenwalde_raw.parquet')

In [4]:
# Row-wise apply: dc.extract_brand receives each row and its result is stored
# in the new 'brand_clean' column.
# NOTE(review): the markdown above describes `get_clean_brand`, but this cell
# calls `extract_brand` — confirm which one is intended.
df['brand_clean'] = df.apply(dc.extract_brand, axis='columns')

In [4]:
# Keep only the rows that have an e5 price.
df = df.dropna(subset=['e5'])

`datetime_format` creates a datetime column from the date column without the timezone info.

`data_cleaning_datetime` combines the datetime formatting with some basic cleaning:
* removes entries with missing values
* makes a datetime column

When `drop=True`, it additionally:
* drops the columns: 'dieselchange', 'e5change', 'e10change', 'date', 'openingtimes_json'

In [5]:
# Run the combined datetime conversion and basic cleaning. Per the description
# above, drop=True also asks for the 'dieselchange', 'e5change', 'e10change',
# 'date' and 'openingtimes_json' columns to be removed.
df = dc.data_cleaning_datetime(df, drop=True)

In [6]:
# Preview the first rows of the cleaned frame.
# NOTE(review): the saved output below still lists 'date', 'openingtimes_json'
# and the *change columns and has no 'brand_clean' column, so it appears to
# predate the current cells — re-run the notebook to refresh it.
df.head()

Unnamed: 0,station_uuid,name,brand,street,house_number,post_code,city,latitude,longitude,first_active,openingtimes_json,date,diesel,e5,e10,dieselchange,e5change,e10change,datetime
0,f97bcef8-619c-4ac4-bf9d-35995dea884e,Aral Tankstelle,ARAL,Triftstraße,36,15517,Fürstenwalde,52.378353,14.073492,2014-03-18 16:45:31+01,{},2023-01-01 06:31:07+01,1.999,1.879,1.819,1.0,0.0,0.0,2023-01-01 06:31:07
1,f97bcef8-619c-4ac4-bf9d-35995dea884e,Aral Tankstelle,ARAL,Triftstraße,36,15517,Fürstenwalde,52.378353,14.073492,2014-03-18 16:45:31+01,{},2023-01-01 08:01:08+01,1.859,1.849,1.789,1.0,1.0,1.0,2023-01-01 08:01:08
2,f97bcef8-619c-4ac4-bf9d-35995dea884e,Aral Tankstelle,ARAL,Triftstraße,36,15517,Fürstenwalde,52.378353,14.073492,2014-03-18 16:45:31+01,{},2023-01-01 08:05:08+01,1.839,1.829,1.769,1.0,1.0,1.0,2023-01-01 08:05:08
3,f97bcef8-619c-4ac4-bf9d-35995dea884e,Aral Tankstelle,ARAL,Triftstraße,36,15517,Fürstenwalde,52.378353,14.073492,2014-03-18 16:45:31+01,{},2023-01-01 08:40:13+01,1.839,1.819,1.759,0.0,1.0,1.0,2023-01-01 08:40:13
4,f97bcef8-619c-4ac4-bf9d-35995dea884e,Aral Tankstelle,ARAL,Triftstraße,36,15517,Fürstenwalde,52.378353,14.073492,2014-03-18 16:45:31+01,{},2023-01-01 09:48:09+01,1.899,1.879,1.819,1.0,1.0,1.0,2023-01-01 09:48:09


***
# Plotting

In [7]:
import functions_laura.Plotting as plot

# Names of the functions defined in the Plotting module itself
# (functions it only imports are excluded by the __module__ comparison).
function_names = [
    member_name
    for member_name, member in inspect.getmembers(plot, predicate=inspect.isfunction)
    if member.__module__ == plot.__name__
]
function_names

['api_map',
 'average_prices',
 'daily_pattern',
 'decompose_and_plot',
 'one_station',
 'print_scattermap']

`print_scattermap` creates a Plotly scatter map of the station locations. Markers can optionally be distinguished by brand (`brand=True`).

In [None]:
# Map without branding. Show it right away: previously this figure was
# overwritten by the next call before it was ever displayed.
fig_plain = plot.print_scattermap(df)
fig_plain.show()

# Map with branding; `fig` still ends up bound to this figure, as before.
fig = plot.print_scattermap(df, brand=True)
fig.show()

In [9]:
# Daily pattern plot with the `brand` option left at its default;
# the layout is styled in the next cell.
fig = plot.daily_pattern(df)

In [11]:
# Style the axes and legend, then mark four times of day with vertical lines.
fig.update_layout(
    xaxis={
        "title": {"text": "Time", "font": {"size": 20}},
        "tickfont": {"size": 18},
    },
    yaxis={
        "title": {"text": "Price in €/l", "font": {"size": 20}},
        "tickfont": {"size": 18},
    },
    legend_title="Brand",
    legend_title_font_size=16,
    legend_font_size=14,
    template="plotly_white",
    # One vertical line per marked time. With yref="paper", y0=0 and y1=1 are
    # fractions of the plot height, so each line runs from bottom to top.
    shapes=[
        {
            "type": "line",
            "x0": time_of_day,
            "x1": time_of_day,
            "y0": 0,
            "y1": 1,
            "yref": "paper",
            "line": {"color": "#ff8019", "width": 2},
        }
        for time_of_day in ("07:20", "09:50", "12:50", "21:50")
    ],
)
fig.show()

In [10]:
# Daily pattern plot again, this time with brand=True;
# the layout is styled in the next cell.
fig = plot.daily_pattern(df, brand=True)

In [11]:
# Apply the axis/legend styling to the brand plot and mark four times of day
# with vertical lines.
fig.update_layout(
    xaxis={
        "title": {"text": "Time", "font": {"size": 20}},
        "tickfont": {"size": 18},
    },
    yaxis={
        "title": {"text": "Price in €/l", "font": {"size": 20}},
        "tickfont": {"size": 18},
    },
    legend_title="Brand",
    legend_title_font_size=16,
    legend_font_size=14,
    template="plotly_white",
    # One vertical line per marked time. With yref="paper", y0=0 and y1=1 are
    # fractions of the plot height, so each line runs from bottom to top.
    shapes=[
        {
            "type": "line",
            "x0": time_of_day,
            "x1": time_of_day,
            "y0": 0,
            "y1": 1,
            "yref": "paper",
            "line": {"color": "#ff8019", "width": 2},
        }
        for time_of_day in ("07:20", "09:50", "12:50", "21:50")
    ],
)
fig.show()

# Resampling

In [9]:
import functions_laura.Data_processing as dp

# Names of the functions defined in the Data_processing module itself
# (functions it only imports are excluded by the __module__ comparison).
function_names = [
    member_name
    for member_name, member in inspect.getmembers(dp, predicate=inspect.isfunction)
    if member.__module__ == dp.__name__
]
function_names

['add_seasonal', 'resample', 'resample_with_seasonality', 'train_test_split']

In [10]:
# Resample the cleaned frame using dp.resample with its default arguments.
df_resampled = dp.resample(df)

In [15]:
# Resample and add a seasonal component in one call; per the info() output
# below the result has the columns 'datetime', 'seasonal_component' and 'e5'.
df_seasonal = dp.resample_with_seasonality(df)

In [16]:
# Inspect dtypes and null counts of the seasonal frame.
# NOTE(review): the saved output reports one missing 'seasonal_component'
# value (252264 of 252265 non-null) — confirm this is expected.
df_seasonal.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252265 entries, 1 to 252265
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   datetime            252265 non-null  datetime64[ns]
 1   seasonal_component  252264 non-null  float64       
 2   e5                  252265 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 7.7 MB
