#### Imports

In [None]:
import os
import pandas as pd
import date_range_data_extractor

## Setup environment

In [None]:
# Reporting period to process: calendar year (as a string) and quarter number (1-4).
Y = '2020'
quoter = 2
# NOTE(review): `last` is not referenced by any cell visible here — confirm it is still needed.
last = False

# Quarter number -> [label, first day (MM-DD), last day (MM-DD)].
QUARTER_BOUNDS = {
    1: ['Q1', '01-01', '03-31'],
    2: ['Q2', '04-01', '06-30'],
    3: ['Q3', '07-01', '09-30'],
    4: ['Q4', '10-01', '12-31'],
}

# Fail here with a clear message instead of a NameError on `Q` further down
# when `quoter` is set to something other than 1-4.
if quoter not in QUARTER_BOUNDS:
    raise ValueError(f'quoter must be 1, 2, 3 or 4, got {quoter!r}')
Q = QUARTER_BOUNDS[quoter]

from_date = f'{Y}-{Q[1]}' # ex: '2021-01-01'
to_date = f'{Y}-{Q[2]}' # ex: '2021-03-31'
output_filename = f'{Y}-{Q[0]}_seconds.csv' # ex: '2021-Q1_seconds.csv'

## Read data

In [None]:
# Extract data from csv files under ./files/ for the configured date range.
# NOTE(review): DateRangeDataExtractor is project-local and not visible here;
# presumably from_date/to_date are inclusive bounds and `.data` is a list of
# DataFrames (it is handed to pd.concat in the next cell) — confirm against the module.
data_extractor = date_range_data_extractor.DateRangeDataExtractor()
data_extractor.extract_data(r'./files/', from_date, to_date)
data = data_extractor.data

Concatenate data and show info

In [None]:
# Combine the extracted pieces into one frame with a fresh 0..n-1 index;
# join='inner' keeps only the columns common to all of them.
df = pd.concat(data, ignore_index=True, join='inner')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() also emitted a stray "None" line.
df.info()
print()

## Scale data
Convert the Time column to datetime format, then keep only the rows that fall exactly on a whole second and discard all other rows.

In [None]:
# Parse the Time column into datetimes.
df['Time'] = pd.to_datetime(df['Time'])

# Keep only the rows whose timestamp has no sub-second (microsecond) part,
# then renumber the rows from zero.
on_whole_second = df['Time'].dt.microsecond == 0
df = df[on_whole_second].reset_index(drop=True)

print(df.head(5))

Add timezone to data

In [None]:
# Attach the Europe/Helsinki zone to the naive timestamps; ambiguous='infer'
# lets pandas resolve the repeated autumn DST hour from the order of the rows.
naive_time = pd.to_datetime(df['Time'])
df['Time'] = naive_time.dt.tz_localize('Europe/Helsinki', ambiguous='infer')
print(df.head(5))

Add extra hour (Q4 only): append the data up to 01:00:00 on 1 January of the following year.

In [None]:
# Q4 only: pull 1 January of the following year through the same pipeline as the
# main data and append everything up to 01:00:00 (Helsinki time) to df.
if quoter == 4:
    next_new_year = f'{int(Y) + 1}-01-01'

    data_extractor_extra = date_range_data_extractor.DateRangeDataExtractor()
    data_extractor_extra.extract_data(r'./files/', next_new_year, next_new_year)
    extra_data = data_extractor_extra.data

    edf = pd.concat(extra_data, ignore_index=True, join='inner')
    edf['Time'] = pd.to_datetime(edf['Time'])
    # Whole seconds only, renumbered from zero.
    edf = edf[edf['Time'].dt.microsecond == 0].reset_index(drop=True)
    edf['Time'] = edf['Time'].dt.tz_localize('Europe/Helsinki', ambiguous='infer')
    # Keep just the first hour of the new year (inclusive of 01:00:00).
    edf = edf[edf['Time'] <= f'{next_new_year} 01:00:00']

    df = pd.concat([df, edf]).reset_index(drop=True)
else:
    print('Skipping!')

In [None]:
# Summarise the frame (columns, dtypes, non-null counts) after the optional extra hour.
df.info()

## Analyze integrity of data

#### Check and drop duplicates if any

In [None]:
# Rows that repeat an earlier row in every column (the first occurrence is not flagged).
is_repeat = df.duplicated()
duplicated_rows = df.loc[is_repeat]
num_duplicated = duplicated_rows.shape[0]
print(f'Duplicates: \n{num_duplicated}\n{duplicated_rows}', end='\n\n\n')

Remove duplicates

In [None]:
# Drop rows that are identical in every column, keeping the first occurrence of each.
df = df.drop_duplicates()

##### Find duplicates in the Time column

In [None]:
# keep=False flags every row that shares its Time with another row, so the
# count includes the first occurrence of each repeated timestamp as well.
shares_time = df.duplicated(subset="Time", keep=False)
duplicated_rows_time = df.loc[shares_time]
num_duplicated_time = duplicated_rows_time.shape[0]
print(f'Duplicates in Time column: \n{num_duplicated_time}\n{duplicated_rows_time}', end='\n')

Remove duplicates

In [None]:
# Keep only the first row for each Time value so the timestamps become unique.
df = df.drop_duplicates(subset='Time')

#### Count NaN, null, 0 and "" entries in the Value column

In [None]:
# Collect the suspicious entries of the Value column and count each kind.
# Note: Series.isnull is an alias of Series.isna, so the NaN and NULL figures
# always match; both are kept because both are written to the log.
value_col = df['Value']

dfNaN = df[value_col.isna()]
dfNull = df[value_col.isnull()]
dfZeroValues = df[value_col == 0]
dfWhiteSpaces = df[value_col == ""]

nrNaN = len(dfNaN)
nrNull = len(dfNull)
zeroValues = len(dfZeroValues)
whiteSpaces = len(dfWhiteSpaces)

print(f'Is not a number in value column:: \n{nrNaN}', end='\n')
print(f'Is a NULL in value column:: \n{nrNull}', end='\n')
print(f'Zero values in value column: \n{zeroValues}', end='\n')
print(f'White spaces in value column: \n{whiteSpaces}', end='\n')

#### Find missing timestamps in the series of dates

In [None]:
# Index the frame by Time so it can be aligned against the expected timestamps.
df = df.set_index('Time')

In [None]:
# First and last expected timestamps of the period, in Helsinki time.
from_date_filter = pd.to_datetime(from_date).tz_localize('Europe/Helsinki')

if quoter != 4:
    # Last second of the quarter's final day.
    to_date_filter = (
        pd.to_datetime(to_date)
        .tz_localize('Europe/Helsinki')
        .replace(hour=23, minute=59, second=59)
    )
else:
    # Q4 runs one hour into the following year (01:00:00 at UTC+02:00).
    q4_end = f'{int(Y) + 1}-01-01 01:00:00+02:00'
    to_date_filter = pd.to_datetime(q4_end).tz_convert('Europe/Helsinki')

print(from_date_filter,'\n',to_date_filter)

In [None]:
# Every second the period is expected to contain, first to last timestamp inclusive.
# 's' is the current alias for seconds; the upper-case 'S' spelling is deprecated.
date_range = pd.date_range(start=from_date_filter, end=to_date_filter, freq='s')
# Number of expected timestamps that have no row in the data.
missingDates = (~date_range.isin(df.index)).sum()

# Align the data to the expected timestamps; absent ones become all-NaN rows.
df = df.reindex(date_range)
print(f'Number of missing dates: {missingDates}', end='\n\n\n')
df.reset_index(inplace=True, names="Time")
# Mark absent values with -1. The result is assigned back to the column because
# an in-place fillna on df['Value'] acts on a temporary object: pandas warns about
# it and, with Copy-on-Write enabled, leaves df unchanged.
df['Value'] = df['Value'].fillna(-1)
print(df[df['Value']==-1].head(5))

In [None]:
# Sanity check: the number of -1 placeholders must equal the number of missing dates.
# Count on the Value column itself: DataFrame.value_counts() leaves out rows that
# contain NaN, and rows inserted by reindexing are NaN in every column other than
# Value, so the frame-level count is wrong as soon as the data has an extra column.
placeholder_count = (df['Value'] == -1).sum()
if missingDates == placeholder_count:
    print('Test Ok')
else:
    print('Test Failed')

## Convert to Swedish time

In [None]:
# Re-express the same instants in Swedish local time.
df['Time'] = df['Time'].dt.tz_convert('Europe/Stockholm')

# Show both ends of the converted series.
for preview in (df.head(5), df.tail(5)):
    print(preview)

In [None]:
# Period bounds expressed in Stockholm time: midnight of the first day and
# 23:59:59 of the last day.
from_date_filter = pd.to_datetime(from_date).tz_localize('Europe/Stockholm')
to_date_filter = (
    pd.to_datetime(to_date)
    .tz_localize('Europe/Stockholm')
    .replace(hour=23, minute=59, second=59)
)

# Every quarter is capped at the upper bound; only Q1 is also cut at the lower bound.
keep_rows = df['Time'] <= to_date_filter
if quoter == 1:
    keep_rows = (df['Time'] >= from_date_filter) & keep_rows
df = df[keep_rows]

for preview in (df.head(5), df.tail(5)):
    print(preview)

### Test

In [None]:
# Compare the number of non-null values left after the Stockholm filter with the
# number of timestamps expected for the period.
row_difference = df['Value'].count() - len(date_range)
if row_difference != 0:
    print(row_difference, '\nOne hour will always be removed if your data contains January (3600 seconds).')
else:
    print('Test passed')

## Output

#### Save the data to a new csv

In [None]:
# Write the processed series to processed_files/<output_filename>.
folder_name = 'processed_files'
file_name = output_filename
file_path = os.path.join(folder_name, file_name)

# Create the output folder on first use.
folder_missing = not os.path.exists(folder_name)
if folder_missing:
    os.makedirs(folder_name)

df.to_csv(file_path, index=False)

#### Save to logfile

In [None]:
# Record this run's integrity figures in log/log.csv, one row per output file.
folder_name = 'log'
file_name = 'log.csv'
file_path = os.path.join(folder_name, file_name)

columns = [
    'Index', 'DateFrom', 'DateTo', 'NULL', 'NaN', 'Exact duplicates',
    'Time duplicates', 'Zero Values', 'White Space', 'Added missing dates',
]

new_data = {
    'Index': output_filename,
    'DateFrom': from_date,
    'DateTo': to_date,
    'NULL': nrNull,
    'NaN': nrNaN,
    'Exact duplicates': num_duplicated,
    'Time duplicates': num_duplicated_time,
    'Zero Values': zeroValues,
    'White Space': whiteSpaces,
    'Added missing dates': missingDates,
}

# Load the existing log, or start an empty one (creating the folder if needed).
if os.path.exists(file_path):
    log_df = pd.read_csv(file_path)
else:
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    log_df = pd.DataFrame(columns=columns)

# Rows already logged for this output file.
same_index = log_df['Index'] == new_data['Index']
index_exists = same_index.any()

if index_exists:
    # Re-running a period overwrites its existing row instead of adding another.
    log_df.loc[same_index] = [new_data[col] for col in columns]
else:
    new_row = pd.DataFrame([new_data], columns=columns)
    log_df = pd.concat([log_df, new_row], ignore_index=True)

log_df.to_csv(file_path, index=False)

In [None]:
# Render the full log table, including the row just written for this run.
display(log_df)