## Why
As part of my project to de-Google myself, I had to [export](https://takeout.google.com/settings/takeout?pli=1) my memories from Google Photos and load them back onto my Synology. However, the exports have:
1. metadata that I might want to drop
1. inconsistent names (my library spans many smartphone generations with Android and iOS)
1. directories with weird names (a consequence of file segmentation and a 2GB download limit)

Basically, I want to automate this with a script.

In [None]:
import os
import glob
import shutil
import pandas as pd
from shutil import copyfile
# Notebook display settings: show up to 1000 rows and up to 100 characters per column
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 100)

    |-- basedir
    |   |-- 2016-01-01
    |   |   |-- filename-1.jpg
    |   |   |-- filename-2.jpg.json
    |   |   |-- filename-3.jpg
    |   |   |-- filename-3.jpg.json
    |   |-- 2016-01-02


In [None]:
# Directory containing the export (extracted archive)
base_path = '/Users/lorismarini/Desktop/google-photos-elisa-takeout'

# Directory for the output (files in folders)
output_path = '/Users/lorismarini/Desktop/google-photos-elisa-to-synology'

# Search recursively for every path whose last component contains a dot
files_all = glob.glob(f"{base_path}/**/*.*", recursive=True)

# Keep regular files only: the pattern above also matches directories whose
# name contains a dot, and those must not be treated as photos.
# All json files are filtered out as well.
files_keep = [
    name for name in files_all
    if os.path.isfile(name) and not name.endswith(".json")
]

# Extract basenames of files to keep
basenames_keep = [os.path.basename(name) for name in files_keep]

In [None]:
# One row per kept file: its absolute source path and its basename
df = pd.DataFrame(
    list(zip(files_keep, basenames_keep)),
    columns=["abspath_src", "basenames"],
)

# Flag the basenames whose first two characters are both letters
df["is_alpha"] = df["basenames"].apply(lambda basename: basename[:2].isalpha())

# --------------- PARSE TIME --------------------

def timestring_starts_number(n):
    """Return the part of a filename that precedes its first underscore.

    Used for filenames starting with numbers. The whole name is returned
    unchanged when it contains no underscore.
    """
    head, _, _ = n.partition("_")
    return head

def timestring_starts_alpha(n):
    """Return the second field of a filename starting with letters.

    The name is split on "-" when it contains one, otherwise on "_", and
    the field that follows the first separator is returned. A name with
    neither separator yields an empty string.
    """
    for separator in ("-", "_"):
        if separator in n:
            # A separator that is present always produces at least two fields
            return n.split(separator)[1]
    return ""
    
# Start from an all-NaT datetime column so that "time" has a datetime dtype
# (needed by the .dt accessor below) regardless of how many rows each of the
# two groups contains.
df["time"] = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")

# Parse for is_alpha = True
# Each group is converted on its own; strings that cannot be parsed as a
# date become NaT (errors="coerce").
alpha = df["is_alpha"].astype(bool)
df.loc[alpha, "time"] = pd.to_datetime(
    df.loc[alpha, "basenames"].apply(timestring_starts_alpha), errors="coerce"
)

# Parse for is_alpha = False
where = ~alpha
df.loc[where, "time"] = pd.to_datetime(
    df.loc[where, "basenames"].apply(timestring_starts_number), errors="coerce"
)

# Folder name from datetime (rows without a parsed time get the string "NaT"
# here; they are counted as ignored below)
df["basedir_dest"] = df["time"].dt.date.astype(str)

# Dirname destination path
df["dirname_dest"] = df["basedir_dest"].apply(lambda x: os.path.join(output_path, x))

# Abspath destination
df["abspath_dest"] = df["dirname_dest"] + "/" + df["basenames"]

# Rows with a parsed time will be moved, all the others are ignored
files_to_move = int(df["time"].notna().sum())
files_ignored = len(df) - files_to_move

print(f"{files_to_move} files to move...")
print(f"{files_ignored} files ignored...")

In [None]:
# Display the rows selected for moving. df_tomove is assigned in the cell
# below, so that cell has to be executed before this one.
df_tomove

In [None]:
# Only rows with a successfully parsed time are moved
df_tomove = df[df["time"].notna()]

# Create destination directories if they don't exist.
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError for a directory that is already there.
dirname_dest_unique = df_tomove["dirname_dest"].unique()
for dirname in dirname_dest_unique:
    os.makedirs(dirname, exist_ok=True)
print(f"{len(dirname_dest_unique)} directories consolidated...")

In [None]:
# Move every selected file to its destination and keep a tally of the outcomes
moved = 0
already_moved = 0
skipped = 0
lost = 0
for src, dst in zip(df_tomove["abspath_src"], df_tomove["abspath_dest"]):

    if os.path.exists(src):
        if os.path.exists(dst):
            # A different source with the same basename and date already
            # landed here. shutil.move could overwrite it, so leave this
            # source in place and report it instead.
            skipped += 1
            print(f"NAME COLLISION, {dst} already exists, {src} left in place!")
            continue
        # Move file
        shutil.move(src, dst)
        moved += 1
    elif os.path.exists(dst):
        # File already moved (e.g. by a previous run), skip
        already_moved += 1
    else:
        # Neither the source nor the destination exists
        lost += 1
        print(f"DATA INTEGRITY ERROR, File not found in src {src} or dst {dst}!")

print(f"{moved} moved")
print(f"{already_moved} already moved")
print(f"{skipped} skipped (destination already exists)")
print(f"{lost} lost")