#### Imports

In [86]:
import os
import pandas as pd
import date_range_data_extractor

## Setup environment

In [87]:
# Run configuration: the year and the quarter (1-4) to process.
Y = '2023'
quoter = 3

# Quarter number -> [label, first day (MM-DD), last day (MM-DD)].
QUARTER_BOUNDS = {
    1: ['Q1', '01-01', '03-31'],
    2: ['Q2', '04-01', '06-30'],
    3: ['Q3', '07-01', '09-30'],
    4: ['Q4', '10-01', '12-31'],
}

# Fail fast on an invalid quarter. Without this check Q would stay undefined
# (or keep a stale value from an earlier run of this cell in the same kernel).
if quoter not in QUARTER_BOUNDS:
    raise ValueError(f'quoter must be 1, 2, 3 or 4, got {quoter!r}')
Q = QUARTER_BOUNDS[quoter]

from_date = f'{Y}-{Q[1]}' # ex: '2021-01-01'
to_date = f'{Y}-{Q[2]}' # ex: '2021-03-31'
output_filename = f'{Y}-{Q[0]}_seconds.csv' # ex: '2021-Q1_seconds.csv'

## Read data

In [88]:
# Extract data from csv files.
# Loads the CSV files under ./files/ for the configured from_date..to_date range.
# NOTE(review): DateRangeDataExtractor is a project-local helper; judging by the
# "Loading file YYYY-MM-DD.csv..." output it reads one file per day - confirm
# whether to_date is treated as inclusive.
data_extractor = date_range_data_extractor.DateRangeDataExtractor()
data_extractor.extract_data(r'./files/', from_date, to_date)
# `data` is handed to pd.concat in the next step, so it is presumably a
# list of DataFrames (one per loaded file) - verify against the extractor.
data = data_extractor.data

Loading file 2023-07-01.csv...
Loading file 2023-07-02.csv...
Loading file 2023-07-03.csv...
Loading file 2023-07-04.csv...
Loading file 2023-07-05.csv...
Loading file 2023-07-06.csv...
Loading file 2023-07-07.csv...
Loading file 2023-07-08.csv...
Loading file 2023-07-09.csv...
Loading file 2023-07-10.csv...
Loading file 2023-07-11.csv...
Loading file 2023-07-12.csv...
Loading file 2023-07-13.csv...
Loading file 2023-07-14.csv...
Loading file 2023-07-15.csv...
Loading file 2023-07-16.csv...
Loading file 2023-07-17.csv...
Loading file 2023-07-18.csv...
Loading file 2023-07-19.csv...
Loading file 2023-07-20.csv...
Loading file 2023-07-21.csv...
Loading file 2023-07-22.csv...
Loading file 2023-07-23.csv...
Loading file 2023-07-24.csv...
Loading file 2023-07-25.csv...
Loading file 2023-07-26.csv...
Loading file 2023-07-27.csv...
Loading file 2023-07-28.csv...
Loading file 2023-07-29.csv...
Loading file 2023-07-30.csv...
Loading file 2023-07-31.csv...
Loading file 2023-08-01.csv...
Loading 

Concatenate data and show info

In [89]:
# Stack the extracted frames into one frame with a fresh 0..n-1 index.
# join='inner' keeps only the columns that every input frame has in common.
df = pd.concat(data, ignore_index=True, join='inner')

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only added a stray "None" line to the output.
df.info()
print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52389144 entries, 0 to 52389143
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   Time    object 
 1   Value   float64
dtypes: float64(1), object(1)
memory usage: 799.4+ MB
None


## Scale data
Convert the Time column to datetime format, then keep only the rows whose timestamp falls on a whole second and drop all other rows.

In [90]:
# Parse the Time column into datetime values.
parsed_time = pd.to_datetime(df['Time'])
df['Time'] = parsed_time

# Keep only samples whose microsecond component is zero (whole seconds),
# then renumber the surviving rows from 0.
on_whole_second = df['Time'].dt.microsecond == 0
df = df[on_whole_second].reset_index(drop=True)
print(df.head(20))

                  Time     Value
0  2023-07-01 00:00:00  50.07763
1  2023-07-01 00:00:01  50.07575
2  2023-07-01 00:00:02  50.07244
3  2023-07-01 00:00:03  50.07040
4  2023-07-01 00:00:04  50.07193
5  2023-07-01 00:00:05  50.06990
6  2023-07-01 00:00:06  50.07419
7  2023-07-01 00:00:07  50.07456
8  2023-07-01 00:00:08  50.07545
9  2023-07-01 00:00:09  50.08005
10 2023-07-01 00:00:10  50.08031
11 2023-07-01 00:00:11  50.08071
12 2023-07-01 00:00:12  50.08263
13 2023-07-01 00:00:13  50.08107
14 2023-07-01 00:00:14  50.07819
15 2023-07-01 00:00:15  50.07428
16 2023-07-01 00:00:16  50.06272
17 2023-07-01 00:00:17  50.05571
18 2023-07-01 00:00:18  50.05046
19 2023-07-01 00:00:19  50.04753


Add timezone to data

In [91]:
# Attach the Europe/Helsinki timezone to the naive timestamps.
# ambiguous='infer' lets pandas resolve the repeated hour at the autumn DST
# change from the order of the timestamps.
naive_time = pd.to_datetime(df['Time'])
df['Time'] = naive_time.dt.tz_localize('Europe/Helsinki', ambiguous='infer')

## Analyze integrity of data

#### Check and drop duplicates if any

In [92]:
# Flag every row that repeats an earlier row in all columns (first occurrence is not flagged).
is_exact_duplicate = df.duplicated()
duplicated_rows = df[is_exact_duplicate]
num_duplicated = len(duplicated_rows)  # reported in the log later on
print(f'Duplicates: \n{num_duplicated}\n{duplicated_rows}', end='\n\n\n')

Duplicates: 
8
                             Time     Value
3601    2023-07-01 01:00:00+03:00  49.92183
7202    2023-07-01 02:00:00+03:00  50.04779
10803   2023-07-01 03:00:00+03:00  50.07014
14404   2023-07-01 04:00:00+03:00  49.94890
2574519 2023-08-01 01:00:00+03:00  50.10892
2578120 2023-08-01 02:00:00+03:00  50.05931
2581721 2023-08-01 03:00:00+03:00  50.01276
2585322 2023-08-01 04:00:00+03:00  49.90219



Remove duplicates

In [93]:
# Drop rows that are identical in every column, keeping the first occurrence.
df = df.drop_duplicates(keep='first')

##### Find duplicates in the Time column

In [94]:
# keep=False flags every row that shares its timestamp with another row,
# so all conflicting readings are listed, not only the later ones.
shares_timestamp = df.duplicated(subset="Time", keep=False)
duplicated_rows_time = df[shares_timestamp]
num_duplicated_time = len(duplicated_rows_time)  # reported in the log later on
print(f'Duplicates in Time column: \n{num_duplicated_time}\n{duplicated_rows_time}', end='\n')

Duplicates in Time column: 
0
Empty DataFrame
Columns: [Time, Value]
Index: []


Remove duplicates

In [95]:
# Where several rows share a timestamp, keep only the first of them.
df = df.drop_duplicates(subset='Time', keep='first')

#### Check for NaN, null, zero or empty-string ("") entries in the Value column

In [96]:
# Select the rows of each suspicious kind from the Value column.
value_col = df['Value']
dfNaN = df[value_col.isna()]
dfNull = df[value_col.isnull()]  # isnull() is pandas' alias for isna()
dfZeroValues = df[value_col == 0]
dfWhiteSpaces = df[value_col == ""]

# Row counts per kind; these names are read again when the log row is built.
nrNaN, nrNull = len(dfNaN), len(dfNull)
zeroValues, whiteSpaces = len(dfZeroValues), len(dfWhiteSpaces)

print(f'Is not a number in value column:: \n{nrNaN}')
print(f'Is a NULL in value column:: \n{nrNull}')
print(f'Zero values in value column: \n{zeroValues}')
print(f'White spaces in value column: \n{whiteSpaces}')

Is not a number in value column:: 
0
Is a NULL in value column:: 
0
Zero values in value column: 
0
White spaces in value column: 
0


#### Find missing timestamps in the series of dates

In [97]:
helsinki = 'Europe/Helsinki'

# Keep the naive calendar date of the last day so the end of the range can be
# computed in local wall-clock time before a timezone is attached.
last_day = pd.to_datetime(to_date)
from_date = pd.to_datetime(from_date).tz_localize(helsinki)
to_date = last_day.tz_localize(helsinki)  # still midnight; logged as DateTo later

# to_date is midnight at the START of the last day, so ending the range there
# made the reindex below throw away every later second of that day. End the
# range at 23:59:59 of the last day instead. The day is added on the naive
# timestamp so a DST change on that day cannot shift the boundary.
# NOTE(review): assumes the extractor loads the to_date file as well - confirm.
range_end = (last_day + pd.Timedelta(days=1)).tz_localize(helsinki) - pd.Timedelta(seconds=1)
date_range = pd.date_range(start=from_date, end=range_end, freq='S')

# Reindex onto the full per-second range: seconds without a reading become NaN
# rows, and rows outside the range are dropped.
df = df.set_index('Time').reindex(date_range)
missingDates = int(df['Value'].isna().sum())
print(f'Number of missing dates: {missingDates}', end='\n\n\n')
df = df.reset_index(names="Time")

# Mark the inserted seconds with the sentinel -1. Assign the result back
# instead of calling fillna(inplace=True) on the column selection, which is
# deprecated and has no effect once pandas copy-on-write is active.
df['Value'] = df['Value'].fillna(-1)
print(df.head(20))

Number of missing dates: 2623493


                        Time     Value
0  2023-07-01 00:00:00+03:00  50.07763
1  2023-07-01 00:00:01+03:00  50.07575
2  2023-07-01 00:00:02+03:00  50.07244
3  2023-07-01 00:00:03+03:00  50.07040
4  2023-07-01 00:00:04+03:00  50.07193
5  2023-07-01 00:00:05+03:00  50.06990
6  2023-07-01 00:00:06+03:00  50.07419
7  2023-07-01 00:00:07+03:00  50.07456
8  2023-07-01 00:00:08+03:00  50.07545
9  2023-07-01 00:00:09+03:00  50.08005
10 2023-07-01 00:00:10+03:00  50.08031
11 2023-07-01 00:00:11+03:00  50.08071
12 2023-07-01 00:00:12+03:00  50.08263
13 2023-07-01 00:00:13+03:00  50.08107
14 2023-07-01 00:00:14+03:00  50.07819
15 2023-07-01 00:00:15+03:00  50.07428
16 2023-07-01 00:00:16+03:00  50.06272
17 2023-07-01 00:00:17+03:00  50.05571
18 2023-07-01 00:00:18+03:00  50.05046
19 2023-07-01 00:00:19+03:00  50.04753


The cell below can be used to select the data within a given time range.

In [98]:
# Optional, disabled inspection snippet: uncomment to print the rows between
# start_time and end_time (both localized to Europe/Helsinki).
# NOTE(review): `filtered_df` is not defined anywhere in the visible notebook;
# presumably it should be `df` - confirm before enabling.
# start_time = '2021-12-12 06:00:00'
# end_time = '2021-12-12 07:00:00'
# start_time = pd.to_datetime(start_time).tz_localize('Europe/Helsinki')
# end_time = pd.to_datetime(end_time).tz_localize('Europe/Helsinki')
# filtered_data_time = filtered_df[(filtered_df['Time'] >= start_time) & (filtered_df['Time'] <= end_time)]
# print(filtered_data_time.head(20))

## Convert to Swedish time

In [99]:
# Re-express the same instants in Swedish local time (Europe/Stockholm).
stockholm_time = df['Time'].dt.tz_convert('Europe/Stockholm')
df['Time'] = stockholm_time
print(df.head(20))

                        Time     Value
0  2023-06-30 23:00:00+02:00  50.07763
1  2023-06-30 23:00:01+02:00  50.07575
2  2023-06-30 23:00:02+02:00  50.07244
3  2023-06-30 23:00:03+02:00  50.07040
4  2023-06-30 23:00:04+02:00  50.07193
5  2023-06-30 23:00:05+02:00  50.06990
6  2023-06-30 23:00:06+02:00  50.07419
7  2023-06-30 23:00:07+02:00  50.07456
8  2023-06-30 23:00:08+02:00  50.07545
9  2023-06-30 23:00:09+02:00  50.08005
10 2023-06-30 23:00:10+02:00  50.08031
11 2023-06-30 23:00:11+02:00  50.08071
12 2023-06-30 23:00:12+02:00  50.08263
13 2023-06-30 23:00:13+02:00  50.08107
14 2023-06-30 23:00:14+02:00  50.07819
15 2023-06-30 23:00:15+02:00  50.07428
16 2023-06-30 23:00:16+02:00  50.06272
17 2023-06-30 23:00:17+02:00  50.05571
18 2023-06-30 23:00:18+02:00  50.05046
19 2023-06-30 23:00:19+02:00  50.04753


## Output

#### Save the data to a new csv

In [100]:
# Write the processed per-second data to processed_files/<output_filename>.
folder_name = 'processed_files'
file_name = output_filename

# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, file_name)
df.to_csv(file_path, index=False)

#### Save to logfile

In [101]:
# Record this run's integrity statistics in log/log.csv: one row per output
# file, overwritten when the same output file is processed again.
folder_name = 'log'
file_name = 'log.csv'
file_path = os.path.join(folder_name, file_name)

columns = ['Index', 'DateFrom', 'DateTo', 'NULL', 'NaN', 'Exact duplicates', 'Time duplicates', 'Zero Values', 'White Space', 'Added missing dates']

new_data = {'Index': output_filename, 'DateFrom': from_date, 'DateTo': to_date, 'NULL': nrNull, 'NaN': nrNaN, 'Exact duplicates': num_duplicated, 'Time duplicates': num_duplicated_time, 'Zero Values': zeroValues, 'White Space': whiteSpaces, 'Added missing dates': missingDates}

# NOTE: from here on `df` refers to the log table, not the measurement data;
# the following display cell relies on that.
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
else:
    # exist_ok=True avoids a separate existence check for the folder.
    os.makedirs(folder_name, exist_ok=True)
    df = pd.DataFrame(columns=columns)

row_mask = df['Index'] == new_data['Index']
index_exists = row_mask.any()

if index_exists:
    # Address the target columns by name so the values land in the right
    # columns even if the CSV on disk stores them in a different order.
    df.loc[row_mask, columns] = [new_data[col] for col in columns]
else:
    new_row = pd.DataFrame([new_data], columns=columns)
    df = pd.concat([df, new_row], ignore_index=True)

df.to_csv(file_path, index=False)

In [102]:
# Show the log table; the previous cell rebound `df` to the log DataFrame.
display(df)

Unnamed: 0,Index,DateFrom,DateTo,NULL,NaN,Exact duplicates,Time duplicates,Zero Values,White Space,Added missing dates
0,2021-Q1_seconds.csv,2021-01-01 00:00:00+02:00,2021-03-31 00:00:00+03:00,0,0,12,0,0,0,8299
1,2021-Q2_seconds.csv,2021-04-01 00:00:00+03:00,2021-06-30 00:00:00+03:00,0,0,11,2,0,0,9500
2,2021-Q3_seconds.csv,2021-07-01 00:00:00+03:00,2021-09-30 00:00:00+03:00,0,0,9,6,0,0,3206
3,2021-Q4_seconds.csv,2021-10-01 00:00:00+03:00,2021-12-31 00:00:00+02:00,0,0,12,0,0,0,1762
4,2022-Q1_seconds.csv,2022-01-01 00:00:00+02:00,2022-03-31 00:00:00+03:00,0,0,10,4,0,0,128083
5,2022-Q2_seconds.csv,2022-04-01 00:00:00+03:00,2022-06-30 00:00:00+03:00,0,0,5,14,0,0,2847
6,2022-Q3_seconds.csv,2022-07-01 00:00:00+03:00,2022-09-30 00:00:00+03:00,0,0,8,8,0,0,3164
7,2022-Q4_seconds.csv,2022-10-01 00:00:00+03:00,2022-12-31 00:00:00+02:00,0,0,10,4,0,0,1808
8,2023-Q1_seconds.csv,2023-01-01 00:00:00+02:00,2023-03-31 00:00:00+03:00,0,0,12,0,0,0,1364
9,2023-Q2_seconds.csv,2023-04-01 00:00:00+03:00,2023-06-30 00:00:00+03:00,0,0,12,0,0,0,3049
