# Jellyfin Migrator - Adjusts your Jellyfin database to run on a new system.
# Copyright (C) 2022  Max Zuidberg
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


import pathlib
import sqlite3
import json
import hashlib
import binascii
import xml.etree.ElementTree as ET
from pathlib import Path
from shutil import copy
from time import time
from jellyfin_id_scanner import *
import datetime
from string import ascii_letters
import os


# TODO BEFORE YOU START:
# * Create a copy of the jellyfin database you want to migrate.
# * Delete the following temp/cache folders (resp. the matching
#   folders of your installation):
#     * C:/ProgramData/Jellyfin/Server/cache
#     * C:/ProgramData/Jellyfin/Server/log
#     * C:/ProgramData/Jellyfin/Server/data/subtitles
#       Note: this folder only contains _cached_ subtitles that have been
#       extracted on-the-fly from files streamed to clients.
# * RTFM (read the README.md) and you're ready to go.
# * Be careful when replacing everything in your new installation;
#   you might want to _not_ copy your old network settings
#   (C:/ProgramData/Jellyfin/Server/config/networking.xml).


# Please specify a log file. The script is rather verbose and important
# errors might get lost in the process. You should definitely check the
# log file after running the script to see if there are any warnings or
# other important messages! Use f.ex. Notepad++ (npp) to quickly
# highlight and remove bunches of uninteresting log messages:
# * Open the log file in npp.
# * Go to "Search -> Mark... (CTRL + M)".
# * Tick "Bookmark Line".
# * Search for strings that (only!) occur in the lines you want to
#   remove. All those lines should get a marking next to the line number.
# * Go to "Search -> Bookmark -> Remove Bookmarked Lines".
# * Repeat as needed.
# Text encoding is UTF-8 (in npp selectable under "Encoding -> UTF-8").
log_file = "C:/Users/Stanislas/jf-migrator.log"


# These paths will be processed in the order they're listed here.
# This can be very important! F.ex. if specific subfolders go to a different
# place than stuff in the root dir of a given path, the subfolders must be
# processed first. Otherwise, they'll be moved to the same place as the other
# stuff in the root folder.
# Note: all the strings below will be converted to Path objects, so it doesn't
# matter whether you write / or \\ or include a trailing / . After the path
# replacement, each path is converted back to a string with slashes as specified
# by target_path_slash.
# Note 2: The %AppDataPath% and %MetadataPath% entries are only there to make
# sure the script recognizes them as actual paths. This is necessary to adjust
# the (back)slashes as specified. This can only be done on "known" paths
# because (back)slashes occur in other strings, too, where they must not be
# changed.
path_replacements = {
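    # F.ex., with the entries below, a library item stored as
    # "D:/Films/Alien (1979)/movie.mkv" (hypothetical file) would end up as
    # "/media/Films/Alien (1979)/movie.mkv" in the migrated database.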
"target_path_slash": "/", # Paths to your libraries "D:/Films": "/media/Films", "D:/Séries": "/media/Séries", # Paths to the different parts of the jellyfin database. Determine these # by comparing your existing installation with the paths in your new # installation. "C:/ProgramData/Jellyfin/Server/config": "/config/config", "C:/ProgramData/Jellyfin/Server/cache": "/cache", "C:/ProgramData/Jellyfin/Server": "/config", # everything else: metadata, plugins, ... "C:/Program Files/Jellyfin/Server/ffmpeg.exe": "usr/lib/jellyfin-ffmpeg/ffmpeg", "%MetadataPath%": "%MetadataPath%", "%AppDataPath%": "%AppDataPath%", } # This additional replacement dict is required to convert from the paths docker # shows to jellyfin back to the actual file system paths to figure out where # the files shall be copied. If relative paths are provided, the replacements # are done relative to target_root. # # Even if you're not using docker or not using path mapping with docker, # you probably do need to add some entries for accessing the media files # and appdata/metadata files. This is because the script must read all the # file creation and modification dates _as seen by jellyfin_. # In that case and if you're sure that this list is 100% correct, # _and only then_ you can set "log_no_warnings" to True. Otherwise your logs # will be flooded with warnings that it couldn't find an entry to modify the # paths (which in that case would be fine because no modifications are needed). # # If you actually don't need any of this (f.ex. running the script in the # same environment as jellyfin), remove all entries except for # \* "log_no_warnings" (again, can be set to true if you're sure) # \* "target_path_slash" # \* %AppDataPath% # \* %MetadataPath%. fs_path_replacements = { "log_no_warnings": False, "target_path_slash": "/", "/config": "/", "%AppDataPath%": "/data", "%MetadataPath%": "/metadata", "/media/Séries": "Y:/", "/media/Films": "Z:/", } # Original root only needs to be filled if you're using auto target paths _and_ # if your source dir doesn't match the source paths specified above in # path_replacements. # auto target will first replace source_root with original_root in a given path # and then do the replacement according to the path_replacements dict. # This is required if you copied your jellyfin DB to another location and then # start processing it with this script. original_root = Path("C:/ProgramData/Jellyfin/Server") source_root = Path("C:/Users/Stanislas/Téléchargements/Server") target_root = Path("C:/Users/Stanislas/Téléchargements/NewData") ### The To-Do Lists: todo_list_paths, todo_list_id_paths and todo_list_ids. # If your installation is like mine, you don't need to change the following three todo_lists. # They contain which files should be modified and how. # The migration is a multistep process: # 1. Specified files are copied to the new location according to the path changes listed above # 2. All paths within those files are updated to match the new location # 3. The IDs that are used internally and are derived from the paths are updated # 1. They occur in jellyfins file paths, so these paths are updated both on the disk and in the databases. # 2. All remaining occurences of any IDs are updated throughout all files. # 4. Now that all files are where and how they should be, update the file creation and modification # dates in the database. 
# todo_list_paths is used for steps 1 and 2.
# todo_list_id_paths is used for step 3.1.
# todo_list_ids is used for step 3.2.
# The table and columns for step 4 are hardcoded / determined automatically.
#
# General Notes:
# * For step 1, "path_replacements" is used to determine the new file paths.
# * In step 2, the "replacements" entry from the todo_list is used, but it makes no sense to set it
#   to something different from what you used in step 1.
# * In step 3, the "replacements" entry in the todo_lists is auto-generated; no need to touch it either.
#
# Notes from my own jellyfin installation:
# 3.1 seems to concern "ancestor-str" and "ancestor" formatted IDs only (see jellyfin_id_scanner for
#     details on the format).
# 3.2 seemingly only certain .db files contain them.
#     Search for "ID types occurring in paths" to find the place in the code
#     where you can select the types to include.
todo_list_paths = [
    {
        "source": source_root / "data/library.db",
        "target": "auto",  # Usually you want to leave this on "auto". If you want to work on the source file, set it to the same path (YOU SHOULDN'T!).
        "replacements": path_replacements,  # Usually the same for all, but you could specify a specific one per db.
        "tables": {
            "TypedBaseItems": {  # Name of the table within the SQLite database file
                "path_columns": [  # All column names that can contain paths.
                    "path",
                ],
                "jf_image_columns": [  # All column names that can contain jellyfin's "image paths mixed with image properties" strings.
                    "Images",
                ],
                "json_columns": [  # All column names that can contain json data with paths.
                    "data",
                ],
            },
            "mediastreams": {
                "path_columns": [
                    "Path",
                ],
            },
            "Chapters2": {
                "jf_image_columns": [
                    "ImagePath",
                ],
            },
        },
    },
    {
        "source": source_root / "data/jellyfin.db",
        "target": "auto",
        "replacements": path_replacements,
        "tables": {
            "ImageInfos": {
                "path_columns": [
                    "Path",
                ],
            },
        },
    },
    # Copy all other .db files. Since it's copy-only (no path adjustments), omit the log output.
    {
        "source": source_root / "data/*.db",
        "target": "auto",
        "replacements": path_replacements,
        "copy_only": True,
        "no_log": True,
    },
    {
        "source": source_root / "plugins/**/*.json",
        "target": "auto",
        "replacements": path_replacements,
    },
    {
        "source": source_root / "config/*.xml",
        "target": "auto",
        "replacements": path_replacements,
    },
    {
        "source": source_root / "metadata/**/*.nfo",
        "target": "auto",
        "replacements": path_replacements,
    },
    {
        # .xml, .mblink, .collection files are here.
        "source": source_root / "root/**/*.*",
        "target": "auto",
        "replacements": path_replacements,
    },
    {
        "source": source_root / "data/collections/**/collection.xml",
        "target": "auto",
        "replacements": path_replacements,
    },
    {
        "source": source_root / "data/playlists/**/playlist.xml",
        "target": "auto",
        "replacements": path_replacements,
    },
    # Lastly, copy everything that's left. Any file that's already been processed/copied is skipped
    # ... you should delete the cache and the logs beforehand, though.
    {
        "source": source_root / "**/*.*",
        "target": "auto",
        "replacements": path_replacements,
        "copy_only": True,
        "no_log": True,
    },
]

# See the comment for todo_list_paths for details about this todo_list.
# "replacements" designates the source -> target path replacement dict.
# Same as for the matching job in todo_list_paths.
# The ID replacements are determined automatically.
todo_list_id_paths = [
    {
        "source": source_root / "data/library.db",
        "target": "auto-existing",  # If you used "auto" in todo_list_paths, leave this on "auto-existing". Otherwise specify the same path.
        "replacements": {"oldids": "newids"},  # Will be auto-generated during the migration.
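        # Once auto-generated, the dict above maps old ID strings to new ones, plus the slash
        # entry, f.ex. (hypothetical IDs):
        #     {"833addde992893e93d0572907f8b4cad": "1f2e3d4c...", "target_path_slash": "/"}
        # See id_replacements_path in __main__ below.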
"tables": { "TypedBaseItems": { # Name of the table within the SQLite database file "path_columns": [ # All column names that can contain paths. "path", ], "jf_image_columns": [ # All column names that can jellyfins "image paths mixed with image properties" strings. "Images", ], "json_columns": [ # All column names that can contain json data with paths OR IDs!! "data", ], }, "mediastreams": { "path_columns": [ "Path", ], }, "Chapters2": { "jf_image_columns": [ "ImagePath", ], }, }, }, { "source": source_root / "config/*.xml", "target": "auto-existing", # If you used "auto" in todo_list_paths, leave this on "auto-existing". Otherwise specify same path. "replacements": {"oldids": "newids"}, # Will be auto-generated during the migration. }, { "source": source_root / "metadata/**/*", "target": "auto-existing", # If you used "auto" in todo_list_paths, leave this on "auto-existing". Otherwise specify same path. "replacements": {"oldids": "newids"}, # Will be auto-generated during the migration. }, { # .xml, .mblink, .collection files are here. "source": source_root / "root/**/*", "target": "auto-existing", # If you used "auto" in todo_list_paths, leave this on "auto-existing". Otherwise specify same path. "replacements": {"oldids": "newids"}, # Will be auto-generated during the migration. }, { "source": source_root / "data/**/*", "target": "auto-existing", # If you used "auto" in todo_list_paths, leave this on "auto-existing". Otherwise specify same path. "replacements": {"oldids": "newids"}, # Will be auto-generated during the migration. }, ] # See comment from todo_list_paths for details about this todo_list. # "replacements" designates the source -> target path replacement dict. # The ID replacements are determined automatically. todo_list_ids = [ { "source": source_root / "data/library.db", "target": "auto-existing", # If you used "auto" in todo_list_paths, leave this on "auto-existing". Otherwise specify same path. "replacements": {"oldids": "newids"}, # Will be auto-generated during the migration. "tables": { "AncestorIds": { "str": [], "str-dash": [], "ancestor-str": [ "AncestorIdText", ], "ancestor-str-dash": [], "bin": [ "ItemId", "AncestorId", ], }, "Chapters2": { "str": [], "str-dash": [], "ancestor-str": [], "ancestor-str-dash": [], "bin": [ "ItemId", ], }, "ItemValues": { "str": [], "str-dash": [], "ancestor-str": [], "ancestor-str-dash": [], "bin": [ "ItemId", ], }, "People": { "str": [], "str-dash": [], "ancestor-str": [], "ancestor-str-dash": [], "bin": [ "ItemId", ], }, "TypedBaseItems": { "str": [], "str-dash": [], "ancestor-str": [ "TopParentId", "PresentationUniqueKey", "SeriesPresentationUniqueKey", ], "ancestor-str-dash": [ "UserDataKey", "ExtraIds", ], "bin": [ "guid", "ParentId", "SeasonId", "SeriesId", "OwnerId" ], }, "UserDatas": { "str": [], "str-dash": [], "ancestor-str": [], "ancestor-str-dash": [ "key", ], "bin": [], }, "mediaattachments": { "str": [], "str-dash": [], "ancestor-str": [], "ancestor-str-dash": [], "bin": [ "ItemId", ], }, "mediastreams": { "str": [], "str-dash": [], "ancestor-str": [], "ancestor-str-dash": [], "bin": [ "ItemId", ], }, }, } ] # Since library.db will be needed throughout the process, its location is stored # here once it's been moved and updated with the new paths. library_db_target_path = Path() library_db_source_path = Path() # Similarly, the IDs are used in "hard-to-reach" places and are thus global, too. 
ids = dict()


# Custom print function that prints to the console as well as to a log file.
logging_newline = False
def print_log(*args, **kwargs):
    global log_file, logging_newline
    print(*args, **kwargs)
    # Each new line gets a timestamp. That requires tracking of (previous)
    # line endings, though. This is not perfect, but perfectly fine for this
    # script.
    dt = ""
    if logging_newline:
        dt = "[" + datetime.datetime.now().isoformat(sep=" ") + "] "
    if kwargs.get("end", "\n") == "\n":
        logging_newline = True
    else:
        logging_newline = False
    with open(log_file, "a", encoding="utf-8") as f:
        print(dt, *args, **kwargs, file=f)


# Recursively replace all paths in "d", which can be
# * a path object
# * a path string
# * a dictionary (only values are checked, no keys)
# * a list
# * any nested structure of the above
# * anything else is returned unmodified.
# Returns the (un)modified object as well as how many items have been modified or ignored.
def recursive_root_path_replacer(d, to_replace: dict):
    modified, ignored = 0, 0
    if type(d) is dict:
        for k, v in d.items():
            d[k], mo, ig = recursive_root_path_replacer(v, to_replace)
            modified += mo
            ignored += ig
    elif type(d) is list:
        for i, e in enumerate(d):
            d[i], mo, ig = recursive_root_path_replacer(e, to_replace)
            modified += mo
            ignored += ig
    elif type(d) is str or isinstance(d, pathlib.PurePath):
        try:
            p = Path(d)
        except:
            # This actually doesn't occur, I think; Path() can pretty much convert any string into a
            # Path object (which is equivalent to saying it doesn't have any restrictions on filenames).
            ignored += 1
        else:
            found = False
            for src, dst in to_replace.items():
                if p.is_relative_to(src):
                    # This filters out all the "garbage" paths that actually were no paths to begin
                    # with, and of course all the paths that are actually not relative to the
                    # src, dst couple currently checked.
                    p = dst / p.relative_to(src)
                    # I guess 99% of the users won't migrate _to_ Windows, but the script could
                    # generate \ paths anyway.
                    # p.as_posix() makes sure that we always get a string with "/". Otherwise, on
                    # Windows, str(p) would automatically return "\" paths.
                    d = p.as_posix().replace("/", to_replace["target_path_slash"])
                    found = True
                    break
            if found:
                modified += 1
            else:
                ignored += 1
                # No need to consider all the Path("sometext") objects. This might not be 100%
                # accurate, but it eliminates 99.9999% of the false-positives. This output is
                # after all only there to give you a hint whether you missed a path.
                # Also exclude URLs. Btw: pathlib can be quite handy for messing with URLs.
                if len(p.parents) > 1 \
                        and not d.startswith("https:") \
                        and not d.startswith("http:") \
                        and not to_replace.get("log_no_warnings", False):
                    print_log(f"No entry for this (presumed) path: {d}")
    return d, modified, ignored


# Almost the same as recursive_root_path_replacer, but for replacing ID parts somewhere in
# the paths, including file names (can't use "is_relative_to" for checking).
# ID paths usually have the format '.../83/833addde992893e93d0572907f8b4cad/...'. It's
# important to note and change the parent folder containing the first byte of the ID, too.
# Sometimes the parent folder is just a single digit. This code handles any substring that
# starts at the beginning of the ID string.
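# F.ex. (the new ID is hypothetical):
#     %MetadataPath%/library/83/833addde992893e93d0572907f8b4cad/poster.jpg
# becomes
#     %MetadataPath%/library/1f/1f2e3d4c5b6a798890a1b2c3d4e5f607/poster.jpg
# i.e. both the ID folder and its first-byte parent folder are renamed.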
def recursive_id_path_replacer(d, to_replace: dict):
    modified, ignored = 0, 0
    if type(d) is dict:
        for k, v in d.items():
            d[k], mo, ig = recursive_id_path_replacer(v, to_replace)
            modified += mo
            ignored += ig
    elif type(d) is list:
        for i, e in enumerate(d):
            d[i], mo, ig = recursive_id_path_replacer(e, to_replace)
            modified += mo
            ignored += ig
    elif type(d) is str or isinstance(d, pathlib.PurePath):
        try:
            p = Path(d)
        except:
            # This actually doesn't occur, I think; Path() can pretty much convert any string into a
            # Path object (which is equivalent to saying it doesn't have any restrictions on filenames).
            ignored += 1
        else:
            found = False
            src, dst = "", ""
            if set(p.stem).issubset(set("0123456789abcdef-")):
                dst = to_replace.get(p.stem, "")
                if dst:
                    found = True
                    p = p.with_stem(dst)
            if not found:
                for part in p.parts[:-1]:
                    # Check if it can actually be an ID. If so, look it up (which is expensive).
                    if set(part).issubset(set("0123456789abcdef-")):
                        src = part
                        dst = to_replace.get(part, "")
                        if dst:
                            break
                if dst:
                    found = True
                    q = Path()
                    # Find the folder that needs to be changed as a path object.
                    q = p
                    while p.name != src:
                        p = p.parent
                    # q becomes the part relative to the now determined p part (with p.stem = id).
                    q = q.relative_to(p)
                    p = p.with_name(dst)
                    # Check if the parent folder starts with byte(s) from the ID.
                    if src.startswith(p.parent.name):
                        # If so, move the already replaced part from p to q...
                        q = p.name / q
                        p = p.parent
                        # ...and replace the required number of bytes.
                        p = p.with_name(dst[:len(p.name)])
                    # Merge q and p back together.
                    p = p / q
            if found:
                modified += 1
                # I guess 99% of the users won't migrate _to_ Windows, but the script could generate
                # \ paths anyway.
                # p.as_posix() makes sure that we always get a string with "/". Otherwise, on
                # Windows, str(p) would automatically return "\" paths.
                d = p.as_posix().replace("/", to_replace["target_path_slash"])
            else:
                ignored += 1
                # Unlike recursive_root_path_replacer, there is no need to warn the user about
                # potential paths that haven't been altered. In case you suspect that something is
                # being overlooked, check out jellyfin_id_scanner.py.
    # "ignored" is purely maintained for signature compatibility with recursive_root_path_replacer.
    return d, modified, ignored


def update_db_table(
        file,
        replace_dict,
        replace_func,
        table,
        path_columns=(),
        json_columns=(),
        jf_image_columns=(),
        preview=False
):
    # Initialize local variables
    rows_count, modified, ignored = 0, 0, 0

    # Initialize sqlite3 objects
    con = sqlite3.connect(file)
    cur = con.cursor()

    # If only one item has been specified, convert it to a list with one item instead.
    if type(path_columns) not in (tuple, set, list):
        path_columns = [path_columns]
    if type(json_columns) not in (tuple, set, list):
        json_columns = [json_columns]
    if type(jf_image_columns) not in (tuple, set, list):
        jf_image_columns = [jf_image_columns]

    # These indices will be used to separate the json from the path columns in the cur.execute
    # result further below.
    json_stop = len(json_columns)
    path_stop = json_stop + len(path_columns)

    # For the SQL query, the desired column names should be enclosed in ` ` and comma separated.
    # It's important to note that the json columns come first, followed by the path columns.
    columns = ", ".join([f"`{e}`" for e in list(json_columns) + list(path_columns)] + list(jf_image_columns))

    # Query the unique IDs of all rows. Note: we cannot iterate over the rows using
    #     for row in cur.execute(<get rows>)
    # because the rows are modified by the loop, which breaks that iterator. Hence
    # the solution with reading all row ids and iterating over them instead.
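    # F.ex. for a table whose rows have rowids 1, 2 and 5, todo ends up as [(1,), (2,), (5,)].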
    # Note: cur.execute yields tuples with all the columns queried. Which means that
    # the list below actually contains _tuples_ with the id. This is, however, desirable
    # in our case; see below where id is used.
    todo = [rowid for rowid in cur.execute(f"SELECT `rowid` FROM `{table}`") if rowid[0]]
    rows_count = len(todo)
    t = time()
    for progress, id in enumerate(todo):
        # Print the progress every second. Note: this is the only usage of the "progress" variable.
        now = time()
        if now - t > 1:
            print_log(f"Progress: {progress} / {rows_count} rows")
            t = now

        # Query the columns we want to check/modify of the current row (selected by id).
        # Since the id is a binary object, it's not directly included in the f-string.
        # cur.execute expects as second argument a _tuple_ with as many elements as
        # there are ? characters in the query string. This is the reason why we kept the
        # IDs as tuples. The only other place where this id is used is in the update query
        # at the end of the loop, which requires - just like here - a tuple.
        row = [r for r in cur.execute(f"SELECT {columns} FROM `{table}` WHERE `rowid` = ?", id)]
        # This _should_ not occur, but I think I have seen it happen rarely. Safe is safe.
        if len(row) != 1:
            print_log(f"Error with rowid {id}! Resulted in {len(row)} rows instead of 1. Skipping.")
            continue
        # cur.execute returns a 2D tuple, containing all rows matching the query, and then
        # in each row the selected columns. We only selected a single row, hence row[0] is
        # all we care about (and all there is, see error handling above).
        # Secondly, we want row to be modifiable, hence the conversion to a list.
        # list(row[0]) would btw return a list with 1 element: the tuple of the columns.
        row = [e for e in row[0]]

        # result has the structure {column_name: updated_data}, which makes it very easy to build
        # the update query at the end.
        result = dict()

        # It's important to note that the tuple from cur.execute contains the columns _in the order
        # of the query string_. Therefore, we can separate json and path entries like this.
        jsons = row[:json_stop]
        paths = row[json_stop:path_stop]
        jf_imgs = row[path_stop:]
        for i, data in enumerate(jsons):
            if data:
                # There are numerous rows that have empty columns, which would result in an error
                # from json.loads. Just skip them.
                data = json.loads(data)
                data, mo, ig = replace_func(data, replace_dict)
                modified += mo
                ignored += ig
                result[json_columns[i]] = json.dumps(data)
        for i, path in enumerate(paths):
            # One could also skip the empty objects here, but replace_func handles them
            # just fine (leaves them untouched).
            path, mo, ig = replace_func(path, replace_dict)
            modified += mo
            ignored += ig
            result[path_columns[i]] = path
        for i, imgs in enumerate(jf_imgs):
            # Jellyfin Image Metadata. Some DB entries look like this:
            # %MetadataPath%\library\71\71d037e6e74015a5a6231ce1b7912acf\poster.jpg*637693022742223153*Primary*198*198*eJC5#hK#Dj9GR/V@j]xuX8NG0x+xgN%MxaX7spNGnitQ$kK0wyV@Rj  # noqa
            # Yeah. That's a path and some other data within the same string, separated by *. More specifically:
            #     path * last modified date * image type * width * height * blur hash
            # where width, height and blur hash are apparently optional.
            # In theory, the * could occur as a normal character within regular paths, but it's unlikely.
            # Oh, and did I mention that such strings can contain multiple of these structures, separated by a | ?
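            # Splitting the example string above on * yields (abbreviated):
            #     ["%MetadataPath%\library\71\...\poster.jpg", "637693022742223153",
            #      "Primary", "198", "198", "eJC5#hK#..."]
            # which is exactly what the loop below relies on: only element 0 is a path.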
            # Source (Jellyfin Server 10.7.7): DeserializeImages, AppendItemImageInfo:
            # https://github.com/jellyfin/jellyfin/blob/045761605531f98c55f379ac9eb5b5b6004ef670/Emby.Server.Implementations/Data/SqliteItemRepository.cs#L1118  # noqa
            if not imgs:
                continue
            imgs = imgs.split("|")
            for j, img_properties in enumerate(imgs):
                if not img_properties:
                    continue
                img_properties = img_properties.split("*")
                # path = first property
                img_properties[0], mo, ig = replace_func(img_properties[0], replace_dict)
                imgs[j] = "*".join(img_properties)
                modified += mo
                ignored += ig
            imgs = "|".join(imgs)
            result[jf_image_columns[i]] = imgs

        # Similar to the initial query, we construct a comma separated list of the columns, only this
        # time we write
        #     `columnname` = ?
        # While the new values are all strings, the question mark avoids any issues with handling
        # backslashes etc. The library offers an easy, built-in way to do it, so there's no reason
        # to mess with it myself.
        # Note that this relies on result.keys() and result.values() returning the entries in the
        # same order (which is guaranteed).
        # Note: it can happen that no changes are made at all. In this case we can abort here and
        # go for the next job from the todo_list.
        if not result:
            continue
        keys = ", ".join([f"`{k}` = ?" for k in result.keys()])
        query = f"UPDATE `{table}` SET {keys} WHERE `rowid` = ?"
        # The query has a question mark for each updated column plus one for the id to identify
        # the correct row.
        args = tuple(result.values()) + id
        try:
            cur.execute(query, args)
        except Exception as e:
            # This was mainly for debugging purposes and shouldn't be reached anymore. Doesn't
            # hurt to have it, though.
            print_log("Error:", e)
            print_log("Query:", query)
            print_log("Args: ", args)
            exit()
        else:
            if cur.rowcount < 1:
                # This was mainly for debugging purposes and shouldn't be reached anymore.
                # Doesn't hurt to have it, though.
                print_log("No data modified!")
                print_log("Query:", query)
                print_log("Args: ", args)
                exit()

    print_log(f"Processed {rows_count} rows in table {table}.")
    print_log(f"{modified} paths have been modified.")

    # Once again, this came from the development and is not required anymore, especially
    # since by default the script is working on copies of the original files.
    if not preview:
        # Write the updated database back to the file.
        con.commit()
    con.close()


# Walks through an XML file and checks _all_ entries.
# WARNING: The documentation of this parser explicitly mentions that it's not hardened against
# known XML vulnerabilities. It is NOT suitable for unknown/unsafe XML files. Shouldn't be an
# issue here, though.
def update_xml(file: Path, replace_dict: dict, replace_func) -> None:
    modified, ignored = 0, 0
    tree = ET.parse(file)
    root = tree.getroot()
    for el in root.iter():
        # Exclude a few tags known to contain no paths.
        # biography, outline: These often contain lots of text (= slow to process) and generate
        # false-positives for the missed path detection (see recursive_root_path_replacer).
        if el.tag in ("biography", "outline"):
            continue
        el.text, mo, ig = replace_func(el.text, replace_dict)
        modified += mo
        ignored += ig
    print_log(f"Processed {ignored + modified} elements. {modified} paths have been modified.")
    tree.write(file)  # , encoding="utf-8")


# Remember if the user wants to ignore all future warnings.
user_wants_inplace_warning = True


def get_target(
        source: Path,
        target: Path,
        replacements: dict,
        no_log: bool = False,
) -> Path:
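    # A worked example of the "auto" resolution below, assuming the dicts at the top of this
    # file (the concrete file is hypothetical):
    #     source          = <source_root>/data/library.db
    #     original_source = C:/ProgramData/Jellyfin/Server/data/library.db
    #     path_replacements    -> /config/data/library.db
    #     fs_path_replacements -> /data/library.db, which is treated as relative, hence
    #     target          = <target_root>/data/library.db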
    # Not the cleanest solution for remembering it between function calls, but good enough here.
    global user_wants_inplace_warning
    # global all_path_changes

    source = Path(source)
    target = Path(target)
    skip_copy = False
    # "auto" means the target path is generated by the same path replacement dictionary that's
    # also used to update all the path strings.
    # In this case we don't care about the stats returned by recursive_root_path_replacer, hence
    # the variable names.
    if len(target.parts) == 1 and target.name.startswith("auto"):
        if target.name == "auto-existing":
            skip_copy = True
        original_source = original_root / source.relative_to(source_root)
        target, idgaf1, idgaf2 = recursive_root_path_replacer(original_source, to_replace=replacements)
        target, idgaf1, idgaf2 = recursive_root_path_replacer(target, to_replace=fs_path_replacements)
        target = Path(target)
        if not target.is_absolute():
            if target.is_relative_to("/"):
                # Otherwise the line below would make target relative to the _root_ of target_root
                # instead of relative to target_root.
                target = target.relative_to("/")
            target = target_root / target

    # If source and target are the same, there are three possibilities:
    #   1. The user actually wants to work on the given source files; maybe they already created
    #      a copy and directly pointed this script towards that copy.
    #   2. The user forgot that they shouldn't touch the original files.
    #   3. Something's wrong with the path replacement dict.
    # In any case, the user is notified and can decide whether they want to continue this time,
    # all the remaining times, too, or abort.
    #
    # Program: Are you sure? User: I don't know [yet].
    usure = "idk"
    if source == target:
        if user_wants_inplace_warning:
            while usure not in "yna":
                usure = input("Warning! Working on original file! Continue? [Y]es, [N]o, [A]lways ")
                # j is for the German "ja", which means yes.
                usure = usure[0].lower().replace("j", "y")
            if usure == "n":
                print_log("Skipping this file. If you want to abort the whole process, stop the script "
                          "with CTRL + C.")
                target = None
            elif usure == "a":
                # Don't warn about this anymore.
                user_wants_inplace_warning = False
    elif not skip_copy:
        if not target.parent.exists():
            target.parent.mkdir(parents=True)
        if not no_log:
            print_log("Copying...", target, end=" ")
        copy(source, target)
        if not no_log:
            print_log("Done.")
    return target


def process_file(
        source: Path,
        target: Path,
        replacements: dict,
        replace_func,
        tables: dict = None,
        copy_only: bool = False,
        no_log: bool = False,
) -> None:
    if tables is None:
        tables = dict()
    # What do you want me to do with no input?
    if not target:
        return
    # Files only.
    if target.is_dir():
        return
    if not no_log:
        print_log("Processing", target)
    if copy_only:
        # No need to do any further checks.
        return
    elif target.suffix == ".db":
        # If it's "library.db", save its location for later (see comment at the declaration):
        if target.name == "library.db":
            global library_db_source_path, library_db_target_path
            library_db_source_path = source
            library_db_target_path = target
        # sqlite file. In this case, "tables" specifies which tables within that file have columns
        # to check. Iterate over those.
        for table, kwargs in tables.items():
            print_log("Processing table", table)
            # The remaining function arguments (**kwargs) contain the details about the columns to
            # process. See update_db_table and/or the todo_list.
            update_db_table(file=target, replace_dict=replacements, replace_func=replace_func, table=table, **kwargs)
    elif target.suffix == ".xml" or target.suffix == ".nfo":
        update_xml(file=target, replace_dict=replacements, replace_func=replace_func)
    elif target.suffix == ".mblink":
        # .mblink files only contain a path, nothing else.
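        # A hypothetical example of the full file content:
        #     D:/Films/Alien (1979)
        # which the replacement below would turn into /media/Films/Alien (1979).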
        with open(target, "r", encoding="utf-8") as f:
            path = f.read()
        path, modified, ignored = replace_func(path, replacements)
        print_log(f"Processed {modified + ignored} paths, {modified} paths have been modified.")
        with open(target, "w", encoding="utf-8") as f:
            f.write(path)
    elif target.suffix == ".json":
        # There are also json files with the ending .js, but I haven't found any with paths.
        # Load the file with the json module (resulting in a dict or list object) and process
        # it with replace_func, which handles these structures.
        with open(target, "r", encoding="utf-8") as f:
            j = json.load(f)
        j, modified, ignored = replace_func(j, replacements)
        print_log(f"Processed {modified + ignored} paths, {modified} paths have been modified.")
        with open(target, "w", encoding="utf-8") as f:
            # indent=2 seems to be the default formatting for jellyfin json files.
            json.dump(j, f, indent=2)

    # If we're updating path IDs, we also need to check the paths of the files themselves
    # and move the files if their paths contain an ID.
    # This obviously leaves empty folders behind; they are cleaned up afterwards.
    if replace_func == recursive_id_path_replacer:
        source = target
        target, modified, ignored = recursive_id_path_replacer(source, replacements)
        if modified:
            print_log("Changing ID in filepath: ->", target)
            target = Path(target)
            target.parent.mkdir(parents=True, exist_ok=True)
            source.replace(target)


# Processes the todo_list.
# It handles potential wildcards in the file paths and keeps track of
# which files have already been processed. This allows you to have an
# automatic, wildcard copy in your todo_list that just copies the files
# to the (modified) destinations without processing them and without
# modifying those that have already been copied _and_ modified.
# Obviously, this requires you to have the files that need processing
# first in the todo_list and only then the wildcard copies.
#
# lst: job list
# process_func: function to apply to the jobs of lst.
# replace_func: function used by process_func to do the replacing of paths, ...
def process_files(lst: list, process_func, replace_func, path_replacements):
    done = set()
    for job in lst:
        if "no_log" not in job:
            job["no_log"] = False
        source = job["source"]
        print_log(f"Current job from todo_list: {source}")
        if "*" in str(source):
            # Path has wildcards; process all matching files.
            #
            # Ironically, Path.glob can't handle Path objects, hence the need
            # to convert them to a string...
            # It is expected that all these paths are relative to source_root.
            source = source.relative_to(source_root)
            for src in source_root.glob(str(source)):
                if src.is_dir():
                    continue
                if src in done:
                    # File has already been processed by this script.
                    continue
                done.add(src)
                target = get_target(
                    source=src,
                    target=job["target"],
                    replacements=path_replacements,
                    no_log=job["no_log"],
                )
                # Pass the job as-is, but with a non-wildcard source path.
                process_func(
                    replace_func=replace_func,
                    source=src,
                    target=target,
                    **{k: v for k, v in job.items() if k not in ("source", "target")},
                )
        else:
            # No wildcards; process the path directly - if it hasn't already
            # been processed.
            if source in done:
                continue
            done.add(source)
            target = get_target(
                source=source,
                target=job["target"],
                replacements=path_replacements,
                no_log=job["no_log"],
            )
            process_func(
                replace_func=replace_func,
                source=source,
                target=target,
                **{k: v for k, v in job.items() if k not in ("source", "target")},
            )
        print_log("")


# Note: The .NET .Unicode method encodes as UTF-16 little endian:
# https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding.unicode?view=net-6.0
def get_dotnet_MD5(s: str):
    return hashlib.md5(s.encode("utf-16-le")).digest()


# Derived/copied from update_db_table. I couldn't see a good way to do this without
# copying. The data structures and processing are too different for path and id jobs.
# Note: kwargs is there because of how process_files works. It passes a lot of stuff from
# the job list that's not needed here.
def update_db_table_ids(
        source,
        target,
        tables,
        preview=False,
        **kwargs
):
    global ids
    print_log("Updating Item IDs in database... ")

    # Initialize sqlite3 objects
    con = sqlite3.connect(target)
    cur = con.cursor()

    updated_ids_count = 0

    # That's a very nested loop and could probably be written more efficiently using
    # multiprocessing and more advanced sqlite queries.
    for table, columns_by_id_type in tables.items():
        for id_type, columns in columns_by_id_type.items():
            for column in columns:
                print_log(f"Updating {column} IDs in table {table}...")
                # See the comment about iterating over rows while modifying them in update_db_table.
                rows = [r for r in cur.execute(f"SELECT DISTINCT `{column}` FROM `{table}`")]
                progress = 0
                rowcount = len(rows)
                t = time()
                for old_id, in rows:
                    progress += 1
                    # Print the progress every second. Note: this is the only usage of the "progress" variable.
                    now = time()
                    if now - t > 1:
                        print_log(f"Progress: {progress} / {rowcount} rows")
                        t = now
                    if old_id in ids[id_type]:
                        new_id = ids[id_type][old_id]
                        try:
                            cur.execute(f"UPDATE `{table}` SET `{column}` = ? WHERE `{column}` = ?", (new_id, old_id))
                        except sqlite3.IntegrityError:
                            col_names = [x[0] for x in cur.execute(f"SELECT name FROM PRAGMA_TABLE_INFO('{table}')")]
                            dup_rows = [x for x in cur.execute(f"SELECT * FROM `{table}` WHERE `{column}` = ?", (old_id,))]
                            dup_rows = [dict(zip(col_names, row)) for row in dup_rows]
                            print_log(f"Encountered {len(dup_rows)} duplicated entries")
                            for i, row in enumerate(dup_rows):
                                print_log(f"Deleting ({i+1}/{len(dup_rows)}): ", row)
                            cur.execute(f"DELETE FROM `{table}` WHERE `{column}` = ?", (old_id,))
                        updated_ids_count += 1

    # Once again, this came from the development and is not required anymore, especially
    # since by default the script is working on copies of the original files.
    if not preview:
        # Write the updated database back to the file.
        con.commit()
    con.close()
    print_log(f"{updated_ids_count} IDs updated.")


def get_ids():
    global library_db_target_path, ids

    con = sqlite3.connect(library_db_target_path)
    cur = con.cursor()

    id_replacements_bin = dict()
    for guid, item_type, path in cur.execute("SELECT `guid`, `type`, `Path` FROM `TypedBaseItems`"):
        if not path or path.startswith("%"):
            continue
        # Source: https://github.com/jellyfin/jellyfin/blob/7e8428e588b3f0a0574da44081098c64fe1a47d7/Emby.Server.Implementations/Library/LibraryManager.cs#L504  # noqa
        new_guid = get_dotnet_MD5(item_type + path)
        # Omit IDs that haven't changed at all.
        # Happens if not _all_ paths are modified.
        if new_guid != guid:
            id_replacements_bin[guid] = new_guid

    ### Adapted from jellyfin_id_scanner
    id_replacements_str = {bid2sid(k): bid2sid(v) for k, v in id_replacements_bin.items()}
    id_replacements_str_dash = {sid2did(k): sid2did(v) for k, v in id_replacements_str.items()}
    id_replacements_ancestor_str = {convert_ancestor_id(k): convert_ancestor_id(v) for k, v in id_replacements_str.items()}
    id_replacements_ancestor_bin = {sid2bid(k): sid2bid(v) for k, v in id_replacements_ancestor_str.items()}
    id_replacements_ancestor_str_dash = {sid2did(k): sid2did(v) for k, v in id_replacements_ancestor_str.items()}
    ids = {
        "bin": id_replacements_bin,
        "str": id_replacements_str,
        "str-dash": id_replacements_str_dash,
        "ancestor-bin": id_replacements_ancestor_bin,
        "ancestor-str": id_replacements_ancestor_str,
        "ancestor-str-dash": id_replacements_ancestor_str_dash,
    }
    ### End of adapted code

    # Check for collisions between old and new ids in both the normal and the ancestor format.
    # If there are collisions, get the (new) filepaths causing them.
    uniques = set()
    duplicates = list()
    for id in id_replacements_str.values():
        if id in uniques:
            duplicates.append(id)
        else:
            uniques.add(id)
    # If there are duplicates, find the matching old_ids to query the lines from the database.
    if duplicates:
        old_ids = []
        for k, v in id_replacements_str.items():
            if v in duplicates:
                old_ids.append(sid2bid(k))
        duplicates_new = [next(cur.execute("SELECT `guid`, `Path` FROM `TypedBaseItems` WHERE `guid` = ?", (guid,)))
                          for guid in old_ids]
        # Also fetch the old paths for better understanding/debugging.
        con.close()
        con = sqlite3.connect(library_db_source_path)
        cur = con.cursor()
        duplicates_old = [next(cur.execute("SELECT `guid`, `Path` FROM `TypedBaseItems` WHERE `guid` = ?", (guid,)))
                          for guid in old_ids]
        duplicates_old = dict(duplicates_old)
        con.close()
        print_log(f"Warning! {len(duplicates)} duplicates detected within the new ids. This indicates that you're "
                  f"merging media files from different directories into fewer ones. If that's the case for all the "
                  f"collisions listed below, you can likely ignore this warning; otherwise recheck your path settings. "
                  f"IMPORTANT: The duplicated entries will be removed from the database. You got a backup of the "
                  f"database, right?")
        print_log("Duplicates: ")
        for id, newpath in duplicates_new:
            print_log(f"    Item ID: {bid2sid(id)}, Paths (old -> new): {duplicates_old[id]} -> {newpath}")
        input("Press Enter to continue or CTRL+C to abort. ")
    return ids


def update_ids():
    return


def jf_date_str_to_python_ns(s: str):
    # Python's datetime only supports microseconds because of resolution
    # problems. To convert from a date+time to ticks, the fractional seconds
    # part doesn't matter anyway (it remains the same). Hence, it's cut off
    # and added back later.
    subseconds = "0"
    if "." in s:
        s, subseconds = s.rsplit(".", 1)
        # In case subseconds has a higher resolution than 100ns and/or additional
        # information (f.ex. the timezone, which is known to be UTC+00:00 for jellyfin),
        # strip all of it.
        # Add trailing zeros up to the ns digit, then convert to int, and we have ns.
        subseconds = int(subseconds.split("+")[0].rstrip(ascii_letters).ljust(9, "0"))
    # Add explicit information about the timezone (UTC+00:00).
    s += "+00:00"
    t = int(datetime.datetime.fromisoformat(s).timestamp())
    # Convert to ns
    t *= 1000000000
    t += subseconds
    return t


# Converts a _python_ timestamp (nanoseconds since the epoch, which is OS dependent)
# to an ISO-like date string as found in the jellyfin database.
# I have no idea if this works for all OSes in all timezones. Very likely not, but that whole
# topic is about as much of a mess as jellyfin's databases. If you get any issues,
# I'm sorry. If you find a solution, please let me know!
def get_datestr_from_python_time_ns(time_ns: int):
    # datetime has no support for sub-microsecond resolution (which is required here).
    # Doesn't matter anyway; we can add the whole sub-second part afterwards.
    time_s = time_ns // 1000000000
    time_frac_s_100ns = (time_ns // 100) % 10000000
    timestamp = datetime.datetime.utcfromtimestamp(time_s).isoformat(sep=" ", timespec="seconds")
    # Add back the sub-seconds part and the UTC time zone.
    timestamp += "." + str(time_frac_s_100ns).rjust(7, "0").rstrip("0") + "Z"
    return timestamp


def delete_empty_folders(dir: str):
    dir = Path(dir)
    done = False
    while not done:
        done = True
        for p in dir.glob("**"):
            if not list(p.iterdir()):
                print_log("Removing empty folder", p)
                p.rmdir()
                done = False


def update_file_dates():
    global library_db_target_path, fs_path_replacements

    print_log("Updating file dates... Note: Reading file dates seems to be quite slow. "
              "This will take a couple of minutes.")
    con = sqlite3.connect(library_db_target_path)
    cur = con.cursor()
    rows = [r for r in cur.execute("SELECT `rowid`, `Path`, `DateCreated`, `DateModified` FROM `TypedBaseItems`")]
    progress = 0
    rowcount = len(rows)
    t = time()
    for rowid, target, date_created, date_modified in rows:
        progress += 1
        # Print the progress every second. Note: this is the only usage of the "progress" variable.
        now = time()
        if now - t > 1:
            print_log(f"Progress: {progress} / {rowcount} rows")
            t = now
        if not target:
            continue
        # Determine the file path as seen by this script (see fs_path_replacements for details).
        # Code taken from get_target.
        target, idgaf1, idgaf2 = recursive_root_path_replacer(target, to_replace=fs_path_replacements)
        target = Path(target)
        if not target.is_absolute():
            if target.is_relative_to("/"):
                # Otherwise the line below would make target relative to the _root_ of target_root
                # instead of relative to target_root.
                target = target.relative_to("/")
            target = target_root / target
        # End of code taken from get_target.
        if not target.exists():
            print_log("File doesn't seem to exist; can't update its dates in the database:", target)
            continue
        date_created_ns = jf_date_str_to_python_ns(date_created)
        date_modified_ns = jf_date_str_to_python_ns(date_modified)
        if date_created_ns >= 0 and date_modified_ns >= 0:
            continue
        filestats = os.stat(target)
        if date_created_ns < 0:
            new_date_created = get_datestr_from_python_time_ns(filestats.st_ctime_ns)
            cur.execute("UPDATE `TypedBaseItems` SET `DateCreated` = ? WHERE `rowid` = ?", (new_date_created, rowid))
        if date_modified_ns < 0:
            new_date_modified = get_datestr_from_python_time_ns(filestats.st_mtime_ns)
            cur.execute("UPDATE `TypedBaseItems` SET `DateModified` = ? WHERE `rowid` = ?", (new_date_modified, rowid))
    con.commit()
    print_log("Done.")


if __name__ == "__main__":
    print_log("")
    print_log("Starting Jellyfin Database Migration")

    ### Copy relevant files and adjust all paths to the new locations.
    process_files(
        todo_list_paths,
        process_func=process_file,
        replace_func=recursive_root_path_replacer,
        path_replacements=path_replacements,
    )

    ### Update IDs
    # Generate IDs based on those new paths and save them in the global variable.
    get_ids()

    # ID types occurring in paths (<- search for that to find another comment with more details if you missed it).
    # Include/exclude types (see get_ids) to specify which are used for looking through paths.
    # Currently, all are included, just to be safe.
    id_replacements_path = {**ids["ancestor-str"],
                            **ids["ancestor-str-dash"],
                            **ids["str"],
                            **ids["str-dash"],
                            "target_path_slash": path_replacements["target_path_slash"]}

    # To (mostly) reuse the same functions from step 1, the replacements dict needs to be updated with
    # id_replacements_path. It can't be replaced entirely, since it's also used to find the files (which
    # uses the same source -> target processing/conversion as step 1). In theory, this alters the process,
    # since the dict used to convert from source -> target is different; in reality, this is not an issue,
    # since step 1 only processes the roots of the paths (which cannot be similar to anything in
    # id_replacements_path).
    for i, job in enumerate(todo_list_id_paths):
        todo_list_id_paths[i]["replacements"] = id_replacements_path
    # Replace all paths with ids - both in the file system and within files.
    process_files(
        todo_list_id_paths,
        process_func=process_file,
        replace_func=recursive_id_path_replacer,
        path_replacements={**path_replacements, **id_replacements_path},
    )
    # Clean up empty folders that may be left behind in the target directory.
    # delete_empty_folders(target_root)

    # Replace the remaining ids.
    process_files(
        todo_list_ids,
        process_func=update_db_table_ids,
        replace_func=None,
        path_replacements=path_replacements,
    )

    # Finally, update the file dates in the db.
    update_file_dates()

    print_log("")
    print_log("Jellyfin Database Migration complete.")
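
# Usage note: adjust the settings at the top of this file, then run the script directly,
# f.ex. "python jf-migrator.py" (the file name is an assumption; use whatever it is named
# on disk). Path.is_relative_to() and Path.with_stem(), as used above, require Python 3.9
# or newer.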