In [1]:
import os
import io
import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client
import seaborn as sns
import coiled
import dask
import glob

In [2]:
client = Client(n_workers=4)
client

0,1
Client  Scheduler: tcp://127.0.0.1:35947  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 36  Memory: 21.47 GB


In [3]:
os.chdir("/data/leuven/351/vsc35102/scratch/Full_Data_Set/")

In [18]:
def process_data_by_month():
    os.chdir("/data/leuven/351/vsc35102/scratch/Full_Data_Set/")

    months = ['Jan', 'Feb', 'March', 'April', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    base_folder = '/data/leuven/351/vsc35102/scratch/Full_Data_Set/'
    output_folder = '/data/leuven/351/vsc35102/scratch/Full_Data_Set/output/'
    os.makedirs(output_folder, exist_ok=True)  # Create output folder if it doesn't exist

    dfs = []  # List to store the individual Pandas dataframes

    for month in months:
        flat_list = glob.glob(os.path.join(base_folder, month, '*_parquet/*.parquet'))
        ddf = dd.read_parquet(flat_list, columns=["#object_id", "description", "result_timestamp", "laeq"])

        # Convert to the desired data types
        ddf = ddf.astype({'description': 'string', 'laeq': 'float'})

        # Set the timestamp into the correct form and set the index to result_timestamp
        ddf["result_timestamp"] = dd.to_datetime(ddf["result_timestamp"], dayfirst=True, format="%d/%m/%Y %H:%M:%S.%f")
        ddf_sorted = ddf.set_index('result_timestamp')

        pandas_df = ddf_sorted.compute().reset_index()  # Convert to Pandas DataFrame

        pandas3 = pandas_df.groupby("description").resample("10T", on="result_timestamp").agg({'laeq': 'mean'}).reset_index()

        dfs.append(pandas3)

    combined_df = pd.concat(dfs)  # Combine all individual Pandas dataframes into a single dataframe
    return combined_df


In [19]:
# Call the function to process the data for each month
table = process_data_by_month()


In [23]:
def add_time_features(df):
    # Create new variables with the date and time
    df['date'] = df['result_timestamp'].dt.date
    df['time'] = df['result_timestamp'].dt.time

    # Add the hour
    df['hour'] = df['result_timestamp'].dt.hour

    # Add the day of the week
    df['weekday'] = df['result_timestamp'].dt.day_name()

    # Add the month
    df['month'] = df['result_timestamp'].dt.month

    # Add whether it's the weekend or not
    df['weekend'] = df['weekday'].isin(['Saturday', 'Sunday'])
    
    return df
                                                         

In [26]:
time_features = add_time_features(table)

In [29]:
holiday_ranges = []

# Define the start and end dates for each holiday
holiday_dates = [
    ('2022-01-01', '2022-01-13'),
    ('2022-02-02', '2022-02-02'),
    ('2022-02-06', '2022-02-13'),
    ('2022-04-02', '2022-04-18'),
    ('2022-05-01', '2022-05-01'),
    ('2022-05-26', '2022-05-26'),
    ('2022-05-28', '2022-06-12'),
    ('2022-07-03', '2022-09-25'),
    ('2022-11-01', '2022-11-02'),
    ('2022-11-11', '2022-11-11'),
    ('2022-12-24', '2022-12-31')
]

# Generate the date ranges for each holiday
for start_date, end_date in holiday_dates:
    holiday_ranges.append(pd.date_range(start=start_date, end=end_date))

holidays = pd.concat([pd.DataFrame(date_range) for date_range in holiday_ranges], ignore_index=True)
holidays.columns = ['dates']

time_features['holiday'] = time_features['date'].astype(str).isin(holidays['dates'].astype(str))



In [34]:
exam_ranges = [
    ('2022-01-14', '2022-02-05'),
    ('2022-06-13', '2022-07-02'),
    ('2022-08-22', '2022-09-10')
]

exam_periods = pd.concat([pd.DataFrame(pd.date_range(start, end)) for start, end in exam_ranges], ignore_index=True)
exam_periods.columns = ['dates']

time_features['exam_period'] = time_features['date'].astype(str).isin(exam_periods['dates'].astype(str))



In [35]:
time_features

Unnamed: 0,description,result_timestamp,laeq,date,time,hour,weekday,month,weekend,holiday,exam_period
0,MP 03: Naamsestraat 62 Taste,2022-01-01 00:00:00,63.267554,2022-01-01,00:00:00,0,Saturday,1,True,True,False
1,MP 03: Naamsestraat 62 Taste,2022-01-01 00:10:00,60.254000,2022-01-01,00:10:00,0,Saturday,1,True,True,False
2,MP 03: Naamsestraat 62 Taste,2022-01-01 00:20:00,55.143907,2022-01-01,00:20:00,0,Saturday,1,True,True,False
3,MP 03: Naamsestraat 62 Taste,2022-01-01 00:30:00,58.230167,2022-01-01,00:30:00,0,Saturday,1,True,True,False
4,MP 03: Naamsestraat 62 Taste,2022-01-01 00:40:00,54.220466,2022-01-01,00:40:00,0,Saturday,1,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...
33022,MP08bis - Vrijthof,2022-12-31 23:10:00,56.973500,2022-12-31,23:10:00,23,Saturday,12,True,True,False
33023,MP08bis - Vrijthof,2022-12-31 23:20:00,58.590000,2022-12-31,23:20:00,23,Saturday,12,True,True,False
33024,MP08bis - Vrijthof,2022-12-31 23:30:00,58.432500,2022-12-31,23:30:00,23,Saturday,12,True,True,False
33025,MP08bis - Vrijthof,2022-12-31 23:40:00,59.046000,2022-12-31,23:40:00,23,Saturday,12,True,True,False
