# Database Aggregation
- So far, the data we have stored is laid out like this:

```bash
|-- data
|   |-- tag1
|   |   |-- date1
|   |   |   |-- tag1_date1.csv
|   |   |   |-- tag1_date1_location.csv
|   |   |   |-- images
|   |   |   |-- thumbnails
|   |   |-- date2
|   |   |-- date3
|   |-- tag2
|   |-- ..
```

Let's collect them into one database:

```bash
|-- data
|   |-- data.csv
|   |-- images
|   |-- thumbnails
```

In [1]:
import shutil
import os, sys, time, glob, pathlib
import numpy as np
import pandas as pd

# Project root as a relative path; it resolves against the current working directory
PROJECT_ROOT = '../../../project-TT'
# Extend the module search path before importing `core`; the order matters here
sys.path.append(PROJECT_ROOT)
sys.path.append(os.path.join(PROJECT_ROOT, 'backend'))  # backend root

# NOTE(review): `core` presumably lives under one of the two paths added above - confirm
import core
from core.envs import DATA_DIR
# Rebind DATA_DIR to a path under PROJECT_ROOT
# (os.path.join returns DATA_DIR unchanged if it is already absolute)
DATA_DIR = os.path.join(PROJECT_ROOT, DATA_DIR)

In [12]:
def _copy_flat(data_path, pattern, dest):
    """ copy every file under `data_path` matching `pattern` straight into `dest` """
    # Materialise and sort before copying: the tree is not walked while files are
    # being added to it, and basename collisions resolve the same way on every run
    # (the last path in sorted order wins).
    for src in sorted(data_path.rglob(pattern)):
        # Files already in `dest` match the pattern too; copying a file onto itself
        # raises shutil.SameFileError. Compare the parent directory exactly rather
        # than by substring, so sources like `<data_dir>/images_old/...` are kept.
        if src.parent == dest:
            continue
        shutil.copy2(src, dest / src.name)


def create_dataset(data_dir):
    """ combine csvs from hierarchical directories into one under `data_dir`

    Every `*_location.csv` found recursively under `data_dir` is concatenated into
    `<data_dir>/data.csv`, and every `images/*.jpeg` / `thumbnails/*.jpeg` found
    recursively is copied into `<data_dir>/images` / `<data_dir>/thumbnails`.

    Args:
        data_dir: root directory of the hierarchical data (str or path-like).

    Returns:
        (dataset, data_path): the combined DataFrame and `data_dir` as a pathlib.Path.

    Raises:
        ValueError: if no `*_location.csv` file exists under `data_dir`.
    """

    # create csv
    data_path = pathlib.Path(data_dir)
    # sorted so that the row order of data.csv is reproducible across runs
    csv_files = sorted(data_path.rglob('*_location.csv'))
    if not csv_files:
        # pd.concat would raise ValueError anyway; say which directory was empty
        raise ValueError(f"no '*_location.csv' files found under {data_dir}")
    dataset = pd.concat(
        [pd.read_csv(csv_file, quotechar="'") for csv_file in csv_files],
        ignore_index=True,
    )
    dataset.to_csv(os.path.join(data_dir, 'data.csv'), quotechar="'", index=False)

    # create directories
    image_path = data_path / 'images'
    thumbnail_path = data_path / 'thumbnails'
    image_path.mkdir(exist_ok=True)
    thumbnail_path.mkdir(exist_ok=True)

    # copy images and thumbnails
    _copy_flat(data_path, 'images/*.jpeg', image_path)
    _copy_flat(data_path, 'thumbnails/*.jpeg', thumbnail_path)

    return dataset, data_path

In [13]:
# Build the flat database (data.csv, images/, thumbnails/) directly under DATA_DIR
dataset, data_path = create_dataset(DATA_DIR)