# In [9]:
from pathlib import Path
from datetime import datetime, timedelta
from dask import dataframe as dd
import pandas as pd
import functools
import shutil
import os


def _csv_start_datetime(file_name):
    """Return the start datetime encoded in a CSV file name.

    The name is split on '_'; field 2 is read as a DD-MM-YYYY date and
    field 3 as an HHMMSS time.

    Raises:
        IndexError: if the name has fewer than four '_'-separated fields.
        ValueError: if the date/time fields do not match the expected format.
    """
    fields = file_name.split('_')
    return datetime.strptime(fields[2] + ' ' + fields[3], '%d-%m-%Y %H%M%S')


def csv_name_comparator(file1, file2):
    """Three-way compare two CSV file names by their encoded start time.

    Intended for use with ``functools.cmp_to_key``.

    Args:
        file1: First file name.
        file2: Second file name.

    Returns:
        -1 if ``file1`` starts before ``file2``, 1 if it starts after,
        and 0 if both start at the same instant.
    """
    start1 = _csv_start_datetime(file1)
    start2 = _csv_start_datetime(file2)

    if start1 < start2:
        return -1
    if start1 > start2:
        return 1
    # Equal start times must compare as equal; returning 1 here would make
    # cmp(a, b) and cmp(b, a) both claim "greater", which is inconsistent.
    return 0

# Output schema: column name -> empty pandas Series carrying that column's
# dtype. Insertion order defines the column order of the parsed records.
columns = {
    name: pd.Series(dtype=dtype)
    for name, dtype in [
        ('timestamp', 'datetime64[ns]'),
        ('voltage', 'float64'),
        ('input_current', 'float64'),
        ('output_current', 'float64'),
        ('soc', 'float64'),
    ]
}

# Filesystem locations used by the migration.
# NOTE(review): the '[...]' prefix looks like an elided/redacted directory --
# confirm the real base path before running.
OLD_DB_PATH, NEW_DB_PATH, NEW_TEMP_DB_PATH = (
    Path('[...]/solarpanel_db'),           # directory scanned for source .csv files
    Path('[...]/solarpanel.parquet'),      # parquet dataset that is (re)built
    Path('[...]/tmp_solarpanel.parquet'),  # scratch dataset used while repartitioning
)


# Names of every .csv entry in the old database directory, ordered by
# csv_name_comparator.
csv_files = sorted(
    (entry.name for entry in OLD_DB_PATH.iterdir() if entry.suffix == '.csv'),
    key=functools.cmp_to_key(csv_name_comparator),
)

# Start from an empty output directory: drop any dataset left by a previous
# run, then recreate the directory.
try:
    shutil.rmtree(NEW_DB_PATH)
except FileNotFoundError:
    # Nothing to remove on a first run. Any other failure (e.g. permissions)
    # is allowed to propagate, because continuing would append new rows onto
    # stale data.
    pass
NEW_DB_PATH.mkdir(parents=True, exist_ok=True)

# Replay the whole CSV archive ten times, shifting pass ``i`` forward by ``i``
# days, and append every file to the parquet dataset as its own partition.
# NOTE(review): presumably this inflates the dataset for testing -- confirm.
for i in range(10):
    day_offset = timedelta(days=i)

    for csv_file in csv_files:
        records = []

        # Join with pathlib rather than manual "/" string concatenation.
        with open(OLD_DB_PATH / csv_file, 'r') as f:
            next(f, None)  # skip the header row (no-op for an empty file)
            for line in f:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline at EOF) have no
                    # fields and would otherwise crash on line_info[4].
                    continue
                line_info = line.split(',')
                records.append([
                    # Field 4 is an integer epoch timestamp; fromtimestamp()
                    # yields a naive datetime in the machine's local time.
                    datetime.fromtimestamp(int(line_info[4])) + day_offset,
                    float(line_info[0]),  # voltage
                    float(line_info[1]),  # input_current
                    float(line_info[2]),  # output_current
                    0.0,                  # soc: not read from the CSV, stored as zero
                ])

        if not records:
            # Header-only or empty file: there are no rows to index or append.
            continue

        # Only the keys of ``columns`` are needed here, as the column names.
        df = dd.from_pandas(pd.DataFrame(records, columns=list(columns)), npartitions=1)
        # sorted=True: assumes rows within a CSV are already in timestamp
        # order -- TODO confirm against the data source.
        df = df.set_index("timestamp", sorted=True)
        dd.to_parquet(df, NEW_DB_PATH, append=True)


# Compact the per-file partitions written above into partitions of roughly
# 20MB, writing to a scratch dataset first and then swapping it into place.

# Clear any scratch dataset left behind by an interrupted earlier run so its
# part files cannot end up mixed into the new output.
try:
    shutil.rmtree(NEW_TEMP_DB_PATH)
except FileNotFoundError:
    pass

df = dd.read_parquet(NEW_DB_PATH)
df = df.repartition(partition_size="20MB")
dd.to_parquet(df, NEW_TEMP_DB_PATH)

# Replace the original dataset with the repartitioned copy.
shutil.rmtree(NEW_DB_PATH)
shutil.move(NEW_TEMP_DB_PATH, NEW_DB_PATH)

# Out[9]: PosixPath('/home/lenny/projects/SolarPanel/SolarPanel/src/solar_module/cache/solarpanel.parquet')