#### Imports

In [None]:
import os
import pandas as pd
import date_range_data_extractor

## Setup environment

In [None]:
# First and last date of the period to process, written as 'YYYY-MM-DD' strings.
# Both are passed to the data extractor and later used as the bounds of the
# per-second time range the data is reindexed against.
from_date = '2023-07-01' # ex: '2021-01-01'
to_date = '2023-09-30' # ex: '2021-03-31'
# Name of the CSV file written to the 'processed_files' folder by the last cell.
output_filename = '2023-Q3_seconds.csv' # ex: '2021-Q1_seconds.csv'

## Read data

In [None]:
# Run the project's date-range extractor over the CSV folder for the configured
# period, then pick up whatever it collected on its `data` attribute.
# (presumably a collection of DataFrames, since it is handed to pd.concat next — verify
# against DateRangeDataExtractor)
source_dir = r'./files/'
data_extractor = date_range_data_extractor.DateRangeDataExtractor()
data_extractor.extract_data(source_dir, from_date, to_date)
data = data_extractor.data

Concatenate the data, show its info, and check for null or NaN values.

In [None]:
# Stack the extracted frames into one frame with a fresh 0..n-1 index.
# join='inner' keeps only the columns that every input frame has in common.
df = pd.concat(data, ignore_index=True, join='inner')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the summary.
df.info()
print()

# isna() and isnull() are aliases in pandas, so a single count per column
# covers every kind of missing value (NaN / None / NaT).
missing_per_column = df.isna().sum()
print(f'Missing values (NaN / NULL) per column: \n{missing_per_column}', end='\n\n\n')

## Scale data
Convert the `Time` column to datetime format, then keep only the rows whose timestamp falls on a whole second and drop all other rows.

In [None]:
# Parse the Time column so the .dt accessor can be used on it.
df['Time'] = pd.to_datetime(df['Time'])

# A row is kept only when its timestamp has a zero microsecond component,
# i.e. it sits on a whole second; the index is then renumbered from 0.
on_whole_second = df['Time'].dt.microsecond.eq(0)
df = df.loc[on_whole_second].reset_index(drop=True)
print(df.head(20))

Check for fully duplicated rows and drop them, if there are any.

In [None]:
# Boolean mask marking every row that repeats an earlier row in all columns
# (the first occurrence of each row is not marked).
duplicated_rows = df.duplicated(keep='first')
print(f'Duplicates: \n{df.loc[duplicated_rows]}', end='\n\n\n')

In [None]:
# Remove rows that are identical in every column, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

Find rows that share the same `Time` value.

In [None]:
# keep=False marks every member of a group of rows sharing one Time value,
# so all conflicting rows are shown side by side, not just the later ones.
duplicated_rows_time = df.duplicated(subset=["Time"], keep=False)
print(f'Duplicates in Time column: \n{df.loc[duplicated_rows_time]}', end='\n\n\n')

In [None]:
# Keep only the first row for each Time value so that Time is unique
# (a unique index is required by the reindex step that follows).
df = df.drop_duplicates(subset=['Time'], keep='first')

Find any timestamps missing from the one-second series between `from_date` and `to_date`, insert them, and fill their `Value` with 0.

In [None]:
# Build the complete one-second timeline for the requested period.
# A date-only `to_date` parses to midnight (00:00:00), which would cut the range
# off at the very start of the last day and make reindex() drop the rest of it.
# `to_date` is therefore treated as inclusive and extended to 23:59:59.
# NOTE(review): inclusive end is assumed from the quarter-end examples in the
# setup cell — confirm it matches how DateRangeDataExtractor treats `to_date`.
range_start = pd.Timestamp(from_date)
range_end = pd.Timestamp(to_date)
if range_end == range_end.normalize():
    range_end = range_end + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)
# Lowercase 's' is the seconds alias; uppercase 'S' is deprecated since pandas 2.2.
date_range = pd.date_range(start=range_start, end=range_end, freq='s')

df = df.set_index('Time')

# Count the gaps BEFORE reindexing: afterwards the index is exactly `date_range`,
# which never contains NaT, so a check on the new index would always report 0.
missing_count = int((~date_range.isin(df.index)).sum())
print(f'Number of missing dates: {missing_count}', end='\n\n\n')

# reindex() inserts an all-NaN row for every missing timestamp and drops any
# row whose timestamp lies outside the range; the index then becomes the Time column again.
df = df.reindex(date_range).reset_index(names="Time")

# Assign the filled column back instead of a chained inplace fillna, which
# warns in pandas 2.2 and does not modify `df` under Copy-on-Write.
df['Value'] = df['Value'].fillna(0)
print(df.head(20))

##### Optional: the cell below keeps only the rows within a given time range

In [None]:
# Optional spot-check: uncomment to print the rows of `df` whose Time lies inside a given window.
# NOTE(review): this snippet originally referenced `filtered_df`, which is not defined
# in any visible cell; it now uses `df` so that it runs when uncommented.
# start_time = '2021-12-12 06:00:00'
# end_time = '2021-12-12 07:00:00'
# filtered_data_time = df[(df['Time'] >= start_time) & (df['Time'] <= end_time)]
# print(filtered_data_time.head(20))

##### Save the data to a new csv

In [None]:
# Write the processed frame to processed_files/<output_filename>.
folder_name = 'processed_files'
file_name = output_filename

# exist_ok=True creates the folder when needed and is a no-op when it already
# exists, avoiding the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, file_name)
# index=False: the row index is a plain 0..n-1 counter, so it is not written out.
df.to_csv(file_path, index=False)