# What's in this Notebook?

Use this notebook to merge another dataset into the existing one.  
Important: the new dataset must be structured the same way as the existing one, and its CSV files must have the same columns.

The zip should contain the directories `dataset` and `dataset_metadata`.

Important:  
The random state is not merged; the current random state is kept.  
If you want to use the new dataset's random state instead, copy it over manually.

# Imports

In [None]:
from ml_project.utils import files,paths,data

In [None]:
import shutil

In [None]:
import zipfile

In [None]:
import pandas as pd
import geopandas as gpd

# Definitions

In [None]:
# Zip archive holding the dataset that should be merged in.
dataset_zip_path = paths.DATA / 'data_to_merge.zip'
# Scratch directory the archive is extracted into; it is removed again
# once the files have been copied.
temp_data_dir = paths.BASE / 'temp_data'

In [None]:
def get_temp_path(original_path, temp_data_dir):
    """Map a path below ``paths.DATA`` to the same location below ``temp_data_dir``.

    This lets the constants from ``ml_project.utils.paths`` be reused for the
    extracted dataset, with ``temp_data_dir`` standing in for ``paths.DATA``.
    """
    relative_part = original_path.relative_to(paths.DATA)
    return temp_data_dir / relative_part

# Extract the dataset zip

In [None]:
# Create the scratch directory for the extracted archive.
# NOTE(review): presumably temp_data_dir is a pathlib.Path; mkdir() without
# exist_ok then raises FileExistsError if an earlier run left the directory behind.
temp_data_dir.mkdir()

In [None]:
# Unpack the whole archive into the scratch directory.
with zipfile.ZipFile(dataset_zip_path) as archive:
    archive.extractall(path=temp_data_dir)

# Load dataframes

In [None]:
# Load the points and samples tables through the project helpers
# (these are the tables the new data gets merged into).
points_df = files.load_points_df()
samples_df = files.load_samples_df()

In [None]:
# Locations of the points/samples CSV files inside the extracted dataset:
# same relative paths as the existing files, but rooted at temp_data_dir.
new_points_path = get_temp_path(paths.FILE_POINTS,temp_data_dir)
new_samples_path = get_temp_path(paths.FILE_SAMPLES,temp_data_dir)

In [None]:
# Read the extracted CSV files as geodataframes, passing data.CRS_GEODETIC
# as the coordinate reference system.
new_points_df = files.read_csv_as_geodataframe(new_points_path,crs=data.CRS_GEODETIC)
new_samples_df = files.read_csv_as_geodataframe(new_samples_path,crs=data.CRS_GEODETIC)

# Merge dataframes

In [None]:
# Highest batch id and id currently present in the existing points table;
# they serve as the base for re-numbering the incoming rows.
max_batch_id = points_df['batch_id'].max()
max_id = points_df['id'].max()
# Bare expression: shown as the cell output for a quick sanity check.
max_batch_id, max_id

In [None]:
# Shift the incoming ids past the existing maxima so they do not collide with
# ids already in use (assumes the incoming ids are non-negative).
# Run this cell only once: every execution applies the shift again.
batch_id_offset = max_batch_id + 1
id_offset = max_id + 1

new_points_df['batch_id'] = new_points_df['batch_id'] + batch_id_offset
new_points_df['id'] = new_points_df['id'] + id_offset
# The sample ids receive the same shift as the point ids.
new_samples_df['id'] = new_samples_df['id'] + id_offset

In [None]:
# Columns that decide whether two rows count as duplicates when merging:
# samples are compared on their tile columns, points on their geometry.
columns_to_check_samples = ['tile_x','tile_y','tile_z']
columns_to_check_points = ['geometry']

In [None]:
# Stack the existing rows and the new rows, then drop every row that repeats an
# earlier one in the checked columns. The existing rows come first and
# drop_duplicates keeps the first occurrence, so on a clash the existing row wins.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
merged_samples_df = pd.concat([samples_df, new_samples_df]).drop_duplicates(subset=columns_to_check_samples)
merged_points_df = pd.concat([points_df, new_points_df]).drop_duplicates(subset=columns_to_check_points)

In [None]:
# Persist the merged samples table (override=True presumably replaces the
# existing samples file — confirm against files.save_samples_df).
files.save_samples_df(merged_samples_df,override=True)
# NOTE(review): saving the merged points table is commented out, so
# merged_points_df is never written by this notebook — confirm this is intentional.
#files.save_points_df(merged_points_df,override=True)

# Copy the Images and the Vector Tiles

In [None]:
# Destination (existing dataset) and source (extracted dataset) image
# directories, as plain strings for interpolation into the shell command.
old_images_path = str(paths.IMAGES)
new_images_path = str(get_temp_path(paths.IMAGES,temp_data_dir))

In [None]:
# Copy the contents of the extracted image directory into the existing one.
# cp -a copies recursively and preserves file attributes; -n never overwrites
# a file that already exists at the destination.
# NOTE(review): the paths are interpolated unquoted, so the command breaks if
# either path contains spaces.
!cp -n -a {new_images_path}/. {old_images_path}/

In [None]:
# Destination (existing dataset) and source (extracted dataset) vector tile
# directories, as plain strings for interpolation into the shell command.
old_vector_tiles_path = str(paths.VECTOR_TILES)
new_vector_tiles_path = str(get_temp_path(paths.VECTOR_TILES,temp_data_dir))

In [None]:
# Copy the contents of the extracted vector tile directory into the existing one.
# cp -a copies recursively and preserves file attributes; -n never overwrites
# a file that already exists at the destination.
# NOTE(review): the paths are interpolated unquoted, so the command breaks if
# either path contains spaces.
!cp -n -a {new_vector_tiles_path}/. {old_vector_tiles_path}/

# Remove the temporary directory

In [None]:
# Delete the scratch directory together with everything extracted into it.
shutil.rmtree(temp_data_dir)

# How many new samples?

In [None]:
# Row counts: existing samples, merged samples, and samples in the incoming dataset.
before = len(samples_df)
after = len(merged_samples_df)
in_new_dataset = len(new_samples_df)

In [None]:
# Rows gained by the merge, and rows of the combined input that were discarded
# by the duplicate check.
added_count = after - before
dropped_count = (before + in_new_dataset) - after

print(f'Added Samples: {added_count}')
print(f'Dropped New Samples (because duplicate): {dropped_count}')