# Database Aggregation
- So far, the data has been stored in a hierarchical layout that looks like this:

```bash
|-- data
|   |-- tag1
|   |   |-- date1
|   |   |   |-- tag1_date1.csv
|   |   |   |-- tag1_date1_location.csv
|   |   |   |-- images
|   |   |   |-- thumbnails
|   |   |-- date2
|   |   |-- date3
|   |-- tag2
|   |-- ..
```

Let's collect them into one database! The data attributes should already contain all the information, so we shouldn't need any special hierarchical directories. The new directory should simply look like:

```bash
|-- data
|   |-- data.csv
|   |-- images
|   |-- thumbnails
```

In [None]:
import shutil
import os, sys, time, glob, pathlib
import numpy as np
import pandas as pd

# Location of the project checkout, given relative to the directory the
# notebook is run from.
PROJECT_ROOT = '../../../project-TT'
# Extend the import search path so the project's `core` package resolves.
sys.path.append(PROJECT_ROOT)
sys.path.append(os.path.join(PROJECT_ROOT, 'backend'))  # backend root

import core
from core.envs import DATA_DIR
# Rebind DATA_DIR so it is anchored at PROJECT_ROOT.
# NOTE(review): presumably core.envs.DATA_DIR is a path relative to the
# project root -- confirm; os.path.join would discard PROJECT_ROOT if it
# were absolute.
DATA_DIR = os.path.join(PROJECT_ROOT, DATA_DIR)

In [None]:
def _copy_new_files(data_path, subdir, target_dir, skip_ids):
    """Copy every `<subdir>/*.jpeg` found under `data_path` into `target_dir`.

    Files that already live directly in `target_dir`, and files whose id
    (file name up to the first '.') is in `skip_ids`, are left alone.
    """
    # Snapshot the matches first: the loop writes into the tree being scanned.
    for src in list(data_path.rglob(f'{subdir}/*.jpeg')):
        # Compare directories rather than substrings so that a nested source
        # directory whose path merely contains the target path is not skipped.
        if src.parent == target_dir:
            continue
        if src.name.split('.')[0] in skip_ids:
            continue
        shutil.copy2(src, target_dir / src.name)


def aggregate_dataset(data_dir):
    """ combine csvs from hierarchical directories into one under `data_dir`

    Every `*_location.csv` found below `data_dir` is concatenated. Rows whose
    `media_id` is not yet present in `<data_dir>/data.csv` are appended to it
    (the file is created on the first run). Images and thumbnails found in
    any `images/` or `thumbnails/` sub-directory are then copied into
    `<data_dir>/images` and `<data_dir>/thumbnails`, skipping files whose id
    was already listed in `data.csv` before this call.

    Args:
      data_dir (str): the directory for data. This should be
      `/path/to/project/project_TT/data`

    Returns:
      tuple: `(dataset, data_path)` where `dataset` is the DataFrame built
      from all `*_location.csv` files (not the content of `data.csv`) and
      `data_path` is `data_dir` as a `pathlib.Path`.

    Raises:
      ValueError: if no `*_location.csv` file exists under `data_dir`.
    """

    # create dataframe from all csvs (sorted so the row order is reproducible)
    data_path = pathlib.Path(data_dir)
    frames = [
        pd.read_csv(csv, quotechar="'")
        for csv in sorted(data_path.rglob('*_location.csv'))
    ]
    if not frames:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f'No *_location.csv files found under {data_path}')
    dataset = pd.concat(frames, ignore_index=True)

    target_csv = data_path / 'data.csv'

    # if data.csv already exists, update the table
    if target_csv.exists():
        existing_ids = pd.read_csv(target_csv, quotechar="'").media_id
        new_data = dataset[~dataset.media_id.isin(existing_ids)]
        new_data.to_csv(
            target_csv, quotechar="'",
            mode='a', header=False, index=False)
        print(f'Appended {len(new_data)} elements to data.csv')
        # File names are strings while pandas may parse media_id as integers,
        # so normalise to str; a set keeps each lookup O(1).
        skip_ids = set(existing_ids.astype(str))
    else:
        dataset.to_csv(target_csv, quotechar="'", index=False)
        print('Created data.csv')
        skip_ids = set()

    # create directories
    image_path = data_path / 'images'
    thumbnail_path = data_path / 'thumbnails'
    image_path.mkdir(exist_ok=True)
    thumbnail_path.mkdir(exist_ok=True)

    # copy images and thumbnails
    _copy_new_files(data_path, 'images', image_path, skip_ids)
    _copy_new_files(data_path, 'thumbnails', thumbnail_path, skip_ids)

    return dataset, data_path
