# **DataMergeScript.ipynb**

Author: Zhixian Yang

Email: [yangzhx28@mail2.sysu.edu.cn](mailto:yangzhx28@mail2.sysu.edu.cn) or [yimu01439@gmail.com](mailto:yimu01439@gmail.com)

GitHub: [https://github.com/koar-create](https://github.com/koar-create)

Date created: August 4th, 2023

Last modified: August 10th, 2023

<br><br>

---

<br><br>

## **Description**
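Merges every raw CSV file under `data/` with the most recent merged result in `data/result/`, removes duplicate records (same `number`, `year`, `month`, `day`, `hour`), saves the merged table as a new timestamped CSV in `data/result/`, and finally cleans up old raw files and old result files.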

In [10]:
import os, sys, time, glob
import pytz
import platform
import numpy as np, pandas as pd
from datetime import datetime


def DataMergeScript(dirname=None):
    """Merge all raw CSV files with the most recent saved result.

    Reads every ``*.csv`` directly under ``<dirname>/data``, appends the
    newest snapshot found in ``<dirname>/data/result`` (if there is one),
    drops rows that share the same ``number``/``year``/``month``/``day``/
    ``hour`` key, and returns the result sorted by that key.

    Args:
        dirname: Project root directory; it must contain a ``data``
            sub-directory. Required despite the ``None`` default.

    Returns:
        A tuple ``(df, abspaths, result_abspaths)``:
        ``df`` is the merged, de-duplicated, sorted DataFrame with a fresh
        0..n-1 index; ``abspaths`` are the raw CSV paths that were read
        (sorted by name); ``result_abspaths`` are the snapshot paths found
        in the result directory (empty on a first run).

    Raises:
        FileNotFoundError: If there is neither a raw CSV nor a previous
            result to read.
        ValueError: If a result file name does not match
            ``%y-%m-%d_%H%M%S.csv``.
    """
    key_columns = ['number', 'year', 'month', 'day', 'hour']
    data_dirname = os.path.join(dirname, "data")
    result_data_dirname = os.path.join(data_dirname, "result")

    # Sorted so that "keep the first duplicate" does not depend on the
    # arbitrary order in which glob lists the directory.
    abspaths = sorted(glob.glob(os.path.join(data_dirname, '*.csv')))
    result_abspaths = glob.glob(os.path.join(result_data_dirname, "*.csv"))
    if not abspaths and not result_abspaths:
        raise FileNotFoundError(f"No csv files found under {data_dirname}.")

    # Read every raw file exactly once and concatenate in a single call.
    if abspaths:
        df = pd.concat([pd.read_csv(abspath) for abspath in abspaths],
                       axis=0, ignore_index=True)
    else:
        df = pd.DataFrame()
    print(f"shape of df is {df.shape}.")
    previous_length = df.shape[0]

    # Append the newest snapshot, if any; on a first run there is none.
    snapshot_length = 0
    if result_abspaths:
        latest_result = max(
            result_abspaths,
            key=lambda path: datetime.strptime(
                os.path.basename(path).split('.')[0], '%y-%m-%d_%H%M%S'),
        )
        df0 = pd.read_csv(latest_result)
        snapshot_length = df0.shape[0]
        # Raw rows come first so they win over snapshot rows on duplicates.
        df = pd.concat([df, df0], axis=0, ignore_index=True) if abspaths else df0

    # Normalise the key columns to int64 *before* de-duplicating so that
    # values such as "1" and 1 are recognised as the same key.
    df[key_columns] = df[key_columns].astype(np.int64)

    # remove duplicate rows
    df = df.drop_duplicates(subset=key_columns, keep='first')
    print(f"{previous_length + snapshot_length - df.shape[0]} records are remove. After removing duplicate row, the shape of df is {df.shape}.")

    # Sort by key and rebuild a clean 0..n-1 index without adding a column.
    df = df.sort_values(by=key_columns).reset_index(drop=True)

    return df, abspaths, result_abspaths

def savemergedDataFrame(df=None, dirname=None):
    """Write *df* as a new timestamped CSV under ``<dirname>/data/result``.

    The file is named after the current Asia/Shanghai time using the
    pattern ``%y-%m-%d_%H%M00.csv``. The seconds field is the literal
    ``00``, so names have minute resolution and a second save within the
    same minute overwrites the first.

    Args:
        df: DataFrame to save; written without its index.
        dirname: Project root directory. The ``data/result`` directory
            below it is created if it does not exist yet.
    """
    result_data_dirname = os.path.join(dirname, 'data', 'result')
    # exist_ok=True replaces the check-then-create sequence, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(result_data_dirname, exist_ok=True)

    record_date = datetime.now(pytz.timezone('Asia/Shanghai'))
    save_filename = f"{record_date.strftime('%y-%m-%d_%H%M00')}.csv"
    df.to_csv(os.path.join(result_data_dirname, save_filename), index=False)
    print(f"Successfully save as {save_filename}.")

def removeDuplicates(abspaths=None, intervalday=3, result_abspaths=None, mode='preview'):
    """Clean up old raw CSV files and all but the two newest result files.

    Two independent clean-ups are performed:

    1. Every path in *abspaths* whose file-name timestamp is at least
       *intervalday* days old (relative to the current Asia/Shanghai time)
       is counted and, in ``'delete'`` mode, removed from disk. The
       timestamp is the last ``-``-separated field of the file stem,
       parsed as ``%y%m%d%H``.
    2. Of the paths in *result_abspaths* (stems parsed as
       ``%y-%m-%d_%H%M%S``) only the two newest are kept. In ``'delete'``
       mode the older ones are removed from disk, oldest first.

    Args:
        abspaths: Paths of raw CSV files; ``None`` means no files.
        intervalday: Minimum age in days for a raw file to be cleaned.
        result_abspaths: Paths of result snapshots; ``None`` means no
            files. In ``'delete'`` mode the deleted paths are also removed
            from this list in place, so the caller's list reflects what is
            left on disk.
        mode: ``'preview'`` only reports what would be removed;
            ``'delete'`` actually removes the files.

    Raises:
        ValueError: If a file name does not match the expected timestamp
            format.
    """
    abspaths = [] if abspaths is None else abspaths
    result_abspaths = [] if result_abspaths is None else result_abspaths

    # --- raw data files older than `intervalday` days ---
    count = 0
    if abspaths:
        # The timezone, "now" and the age threshold are the same for every file.
        china_timezone = pytz.timezone('Asia/Shanghai')
        datetime_now = datetime.now(china_timezone)
        max_age_seconds = 86400 * intervalday
        for abspath in abspaths:
            stamp = os.path.basename(abspath).split('.')[0].split('-')[-1]
            record_date = china_timezone.localize(datetime.strptime(stamp, '%y%m%d%H'))
            if (datetime_now - record_date).total_seconds() >= max_age_seconds:
                count += 1
                if mode == 'delete':
                    os.remove(abspath)
    if mode == 'preview':
        print(f"Done. {count} csv files will be cleaned. ")
    elif mode == 'delete':
        print(f"Done. {count} csv files are cleaned. ")

    # --- result snapshots: keep only the two newest ---
    dated_results = [
        (datetime.strptime(os.path.basename(abspath).split('.')[0], '%y-%m-%d_%H%M%S'), abspath)
        for abspath in result_abspaths
    ]
    # Stable sort, oldest first; the clamp keeps the count from going
    # negative when fewer than two snapshots exist.
    surplus_count = max(0, len(dated_results) - 2)
    surplus = sorted(dated_results, key=lambda pair: pair[0])[:surplus_count]
    if mode == 'delete':
        for _, abspath in surplus:
            os.remove(abspath)
            print(f"{abspath} is removed.")
            result_abspaths.remove(abspath)
    elif mode == 'preview':
        print(f"{surplus_count} records will be removed. ")
    

# main program
# Resolve the project root: a fixed folder on the Windows workstation, the
# current working directory everywhere else (Linux, macOS, ...). Previously
# `dirname` was left undefined on any platform other than Linux or Windows.
if platform.system() == 'Windows':
    dirname = os.path.join("D:\\Documents", "A-threads", "less important ones", "thread2308-4_try_to_purchase_chinese_station_api")
else:
    dirname = os.getcwd()
data_dirname = os.path.join(dirname, "data")

# Merge raw files with the newest snapshot, then save a new snapshot.
df, abspaths, result_abspaths = DataMergeScript(dirname=dirname)
savemergedDataFrame(df=df, dirname=dirname)
# NOTE(review): `result_abspaths` was collected before the snapshot above was
# written, so the clean-up below keeps the two newest *previous* snapshots in
# addition to the new one. Re-enable the two lines below to include the new
# snapshot in the "keep two" rule -- confirm which behaviour is intended.
# abspaths = glob.glob(os.path.join(data_dirname, '*.csv')) # grasp files
# result_abspaths = glob.glob(os.path.join(data_dirname, 'result', '*.csv')) # grasp files
removeDuplicates(abspaths=abspaths, intervalday=1, result_abspaths=result_abspaths, mode='delete')


Done. 0 csv files are cleaned. 
/home/jovyan/1/data/result/23-08-09_162000.csv is removed.
