In [1]:
import os
import io
import pandas as pd

In [2]:
# Make the scratch area the working directory so that relative paths resolve there.
# (A bare os.getcwd() before this line would have no effect: only a cell's last
# expression is displayed, and the value was not stored.)
os.chdir("/data/leuven/351/vsc35102/scratch")

In [3]:

def read_csv_files(directory):
    """
    Load the 'description', 'result_timestamp', and 'laeq' columns from every CSV file in a directory.

    Files are selected by a case-sensitive '.csv' suffix and read in sorted path order, so the
    row order of the result is the same on every run (os.listdir itself returns entries in
    arbitrary order). Each file is parsed as ';'-separated text in chunks of 100,000 rows.

    Parameters:
        directory (str): The directory path where the CSV files are located.

    Returns:
        A Pandas DataFrame with a fresh 0..n-1 index containing only the 'description',
        'result_timestamp', and 'laeq' columns from all CSV files in the directory.

    Raises:
        ValueError: If the directory contains no '.csv' files.
    """
    wanted_columns = ['description', 'result_timestamp', 'laeq']

    # Sort so the concatenation order does not depend on the filesystem's listing order.
    csv_files = sorted(
        os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')
    )
    if not csv_files:
        # pd.concat([]) would also raise ValueError, but with a message that does not
        # mention the directory; fail early with something actionable instead.
        raise ValueError(f"No CSV files found in {directory!r}")

    # Read each file chunk by chunk, keeping only the desired columns.
    dfs = []
    for f in csv_files:
        dfs.extend(pd.read_csv(f, sep=';', usecols=wanted_columns, chunksize=100000))

    # Concatenate all chunks into a single DataFrame.
    return pd.concat(dfs, ignore_index=True)


In [4]:
# Folder holding the CSV files to load; all of them end up in the single frame `ddf`.
directory = "/data/leuven/351/vsc35102/scratch/Full Data Set/"
ddf = read_csv_files(directory=directory)

In [9]:
# Convert the 'result_timestamp' column to datetime values, using an explicit
# day/month/year layout with fractional seconds.
ddf["result_timestamp"] = pd.to_datetime(
    ddf["result_timestamp"], dayfirst=True, format="%d/%m/%Y %H:%M:%S.%f"
)


In [22]:
# Per 'description' group, average the remaining columns over 1-minute bins of
# 'result_timestamp', then flatten the (description, result_timestamp) index back
# into ordinary columns.
# "1min" is used instead of the "1T" alias, which newer pandas releases deprecate.
grouped_df = (
    ddf.groupby(["description"])
    .resample("1min", on="result_timestamp")
    .mean()
    .reset_index()
)



In [25]:
# Label each row with the weekday name (e.g. "Monday") of its timestamp.
grouped_df["day_of_week"] = grouped_df["result_timestamp"].dt.day_name()


In [28]:
# Helper that expands named date ranges (exam seasons, holidays) into one row per day

def create_exam_season(start_dates, end_dates, period_names):
    """
    Build a per-day lookup table for a set of named date ranges.

    Parameters:
        start_dates: Sequence of first days, one per period (anything pd.date_range accepts).
        end_dates: Sequence of last days (inclusive), aligned with start_dates.
        period_names: Sequence of labels, aligned with start_dates.

    Returns:
        A Pandas DataFrame with one row per calendar day of every period and two columns:
        'date' (the day) and 'exam_period' (the label of the period that day belongs to).
        Empty input yields an empty frame with the same two columns.

    Raises:
        ValueError: If the three sequences do not all have the same length.
    """
    # Mismatched lengths would otherwise either raise an IndexError or silently drop
    # the surplus entries, so reject them up front.
    if not (len(start_dates) == len(end_dates) == len(period_names)):
        raise ValueError(
            "start_dates, end_dates and period_names must have the same length "
            f"(got {len(start_dates)}, {len(end_dates)}, {len(period_names)})"
        )

    # pd.concat refuses an empty list; return a correctly typed empty table instead.
    if len(start_dates) == 0:
        return pd.DataFrame({
            'date': pd.Series(dtype='datetime64[ns]'),
            'exam_period': pd.Series(dtype='object'),
        })

    # One small frame per period: every day in [start, end], tagged with the period name.
    periods = [
        pd.DataFrame({'date': pd.date_range(start, end), 'exam_period': name})
        for start, end, name in zip(start_dates, end_dates, period_names)
    ]

    # Stack the per-period frames into a single table.
    return pd.concat(periods, ignore_index=True)


In [29]:

# Exam periods as parallel lists: the i-th start, end and name describe one period.
start_dates = ["2022-01-13", "2022-06-09", "2022-08-18"]
end_dates = ["2022-02-05", "2022-07-04", "2022-09-10"]
period_names = ["first", "second", "third"]

exam_season = create_exam_season(
    start_dates=start_dates,
    end_dates=end_dates,
    period_names=period_names,
)


In [35]:
# Holiday periods, built with the same helper as the exam periods, so the label
# column of `holiday_season` is also called 'exam_period'.
# Note: this rebinds start_dates / end_dates / period_names.
start_dates = ["2022-01-01", "2022-04-02", "2022-07-05"]
end_dates = ["2022-01-09", "2022-04-18", "2022-10-26"]
period_names = ["winter", "easter", "summer"]

holiday_season = create_exam_season(
    start_dates=start_dates,
    end_dates=end_dates,
    period_names=period_names,
)

In [36]:
# Add 'exams' and 'holidays' columns: 1 when the row's calendar day lies inside an
# exam / holiday period, 0 otherwise.
# The lookup tables contain one midnight timestamp per day, whereas 'result_timestamp'
# has minute resolution, so the time of day is stripped with .dt.normalize() before the
# membership test. Comparing the raw timestamps would only match rows stamped 00:00:00.
grouped_df['exams'] = (
    grouped_df['result_timestamp'].dt.normalize().isin(exam_season['date']).astype(int)
)
grouped_df['holidays'] = (
    grouped_df['result_timestamp'].dt.normalize().isin(holiday_season['date']).astype(int)
)

In [37]:
# Display the per-minute frame with the weekday, exam and holiday columns added.
grouped_df

Unnamed: 0,description,result_timestamp,laeq,day_of_week,exams,holidays
0,MP 01: Naamsestraat 35 Maxim,2022-02-28 08:23:00,61.132353,Monday,0,0
1,MP 01: Naamsestraat 35 Maxim,2022-02-28 08:24:00,51.596667,Monday,0,0
2,MP 01: Naamsestraat 35 Maxim,2022-02-28 08:25:00,52.070000,Monday,0,0
3,MP 01: Naamsestraat 35 Maxim,2022-02-28 08:26:00,58.123333,Monday,0,0
4,MP 01: Naamsestraat 35 Maxim,2022-02-28 08:27:00,71.656667,Monday,0,0
...,...,...,...,...,...,...
3287651,MP08bis - Vrijthof,2022-12-31 23:55:00,58.951667,Saturday,0,0
3287652,MP08bis - Vrijthof,2022-12-31 23:56:00,59.753333,Saturday,0,0
3287653,MP08bis - Vrijthof,2022-12-31 23:57:00,60.853333,Saturday,0,0
3287654,MP08bis - Vrijthof,2022-12-31 23:58:00,61.958333,Saturday,0,0
