From 7961cc47f4ce5c4a67bed7b2f818726751c24e85 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Fri, 19 Apr 2024 14:21:59 +0000 Subject: [PATCH 01/34] hi --- src/kagglehub/gcs_upload.py | 132 ++++++++++++++++++++++-------------- 1 file changed, 81 insertions(+), 51 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index c551d350..4db52c5f 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -7,7 +7,7 @@ from multiprocessing import Pool from pathlib import Path from tempfile import TemporaryDirectory -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import requests from requests.exceptions import ConnectionError, Timeout @@ -75,7 +75,7 @@ def _check_uploaded_size(session_uri: str, file_size: int, backoff_factor: int = return 0 # Return 0 if all retries fail -def _upload_blob(file_path: str, model_type: str) -> str: +def _upload_blob(file_path: str, model_type: str, relative_path: str) -> str: """Uploads a file to a remote server as a blob and returns an upload token. Parameters @@ -84,9 +84,11 @@ def _upload_blob(file_path: str, model_type: str) -> str: model_type : The type of the model associated with the file. """ file_size = os.path.getsize(file_path) + string_paths = [str(path) for path in relative_path] + print(relative_path) data = { "type": model_type, - "name": os.path.basename(file_path), + "name": relative_path, "contentLength": file_size, "lastModifiedEpochSeconds": int(os.path.getmtime(file_path)), } @@ -137,52 +139,80 @@ def _upload_blob(file_path: str, model_type: str) -> str: return response["token"] +def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str]: # noqa: FBT002, FBT001 + # Count the total number of files + file_count = 0 + for _, _, files in os.walk(folder): + file_count += len(files) + + if file_count > MAX_FILES_TO_UPLOAD: + if not quiet: + logger.info(f"More than {MAX_FILES_TO_UPLOAD} files detected, creating a zip archive...") + + with TemporaryDirectory() as temp_dir: + zip_path = os.path.join(temp_dir, TEMP_ARCHIVE_FILE) + with zipfile.ZipFile(zip_path, "w") as zipf: + for root, _, files in os.walk(folder): + for file in files: + file_path = os.path.join(root, file) + zipf.write(file_path, os.path.relpath(file_path, folder)) + + # Upload the zip file + return [ + token + for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, folder, model_type, quiet)] + if token is not None + ] + + tokens = [] + for root, _, files in os.walk(folder): + for file in files: + token = _upload_file_or_folder(root, file, folder, model_type, quiet) + if token is not None: + tokens.append(token) + + return tokens + + +def _upload_file_or_folder( + parent_path: str, file_or_folder_name: str, base_path: str, model_type: str, quiet: bool = False # noqa: FBT002, FBT001 +) -> Optional[str]: + """ + Uploads a file or each file inside a folder individually from a specified path to a remote service. + Parameters + ========== + parent_path: The parent directory path from where the file or folder is to be uploaded. + file_or_folder_name: The name of the file or folder to be uploaded. + dir_mode: The mode to handle directories. Accepts 'zip', 'tar', or other values for skipping. + model_type: Type of the model that is being uploaded. + quiet: suppress verbose output (default is False) + :return: A token if the upload is successful, or None if the file is skipped or the upload fails. + """ + full_path = os.path.join(parent_path, file_or_folder_name) + relative_path = os.path.relpath(full_path, start=base_path) + if os.path.isfile(full_path): + return _upload_file(file_or_folder_name, full_path, relative_path, quiet, model_type) + elif not quiet: + logger.info("Skipping: " + file_or_folder_name) + return None + + +def _upload_file(file_name: str, full_path: str, relative_path: str, quiet: bool, model_type: str) -> Optional[str]: # noqa: FBT001 + """Helper function to upload a single file + Parameters + ========== + file_name: name of the file to upload + full_path: path to the file to upload + quiet: suppress verbose output + model_type: Type of the model that is being uploaded. + :return: None - upload unsuccessful; instance of UploadFile - upload successful + """ + + if not quiet: + logger.info("Starting upload for file " + file_name) -def zip_file(args: Tuple[Path, Path, Path]) -> int: - file_path, zip_path, source_path_obj = args - arcname = file_path.relative_to(source_path_obj) - size = file_path.stat().st_size - with zipfile.ZipFile(zip_path, "a", zipfile.ZIP_STORED, allowZip64=True) as zipf: - zipf.write(file_path, arcname) - return size - - -def zip_files(source_path_obj: Path, zip_path: Path) -> List[int]: - files = [file for file in source_path_obj.rglob("*") if file.is_file()] - args = [(file, zip_path, source_path_obj) for file in files] - - with Pool() as pool: - sizes = pool.map(zip_file, args) - return sizes - - -def upload_files(source_path: str, model_type: str) -> List[str]: - source_path_obj = Path(source_path) - with TemporaryDirectory() as temp_dir: - temp_dir_path = Path(temp_dir) - total_size = 0 - - if source_path_obj.is_dir(): - for file_path in source_path_obj.rglob("*"): - if file_path.is_file(): - total_size += file_path.stat().st_size - elif source_path_obj.is_file(): - total_size = source_path_obj.stat().st_size - else: - path_error_message = "The source path does not point to a valid file or directory." - raise ValueError(path_error_message) - - with tqdm(total=total_size, desc="Zipping", unit="B", unit_scale=True, unit_divisor=1024) as pbar: - if source_path_obj.is_dir(): - zip_path = temp_dir_path / "archive.zip" - sizes = zip_files(source_path_obj, zip_path) - for size in sizes: - pbar.update(size) - upload_path = str(zip_path) - elif source_path_obj.is_file(): - temp_file_path = temp_dir_path / source_path_obj.name - shutil.copy(source_path_obj, temp_file_path) - pbar.update(temp_file_path.stat().st_size) - upload_path = str(temp_file_path) - - return [token for token in [_upload_blob(upload_path, model_type)] if token] + content_length = os.path.getsize(full_path) + token = _upload_blob(full_path, model_type, relative_path) + if not quiet: + logger.info("Upload successful: " + file_name + " (" + File.get_size(content_length) + ")") + return token From 2a37e7e3ebabbfefbbe3a1513c0a7654482866d9 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Fri, 19 Apr 2024 14:21:59 +0000 Subject: [PATCH 02/34] hi --- src/kagglehub/gcs_upload.py | 164 ++++++++++++++++++++++---------- src/kagglehub/models_helpers.py | 26 +++-- 2 files changed, 131 insertions(+), 59 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index c551d350..d4b0b90a 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -7,7 +7,7 @@ from multiprocessing import Pool from pathlib import Path from tempfile import TemporaryDirectory -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import requests from requests.exceptions import ConnectionError, Timeout @@ -25,6 +25,18 @@ REQUEST_TIMEOUT = 600 +class UploadFileInfo: + token: str + +class Directory: + name: str + files: List[UploadFileInfo] + directories: List['Directory'] + +FileStructure = Union[UploadFileInfo, Directory] + + + def parse_datetime_string(string: str) -> Union[datetime, str]: time_formats = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%fZ"] for t in time_formats: @@ -75,7 +87,7 @@ def _check_uploaded_size(session_uri: str, file_size: int, backoff_factor: int = return 0 # Return 0 if all retries fail -def _upload_blob(file_path: str, model_type: str) -> str: +def _upload_blob(file_path: str, model_type: str, relative_path: str) -> str: """Uploads a file to a remote server as a blob and returns an upload token. Parameters @@ -84,9 +96,11 @@ def _upload_blob(file_path: str, model_type: str) -> str: model_type : The type of the model associated with the file. """ file_size = os.path.getsize(file_path) + string_paths = [str(path) for path in relative_path] + print(relative_path) data = { "type": model_type, - "name": os.path.basename(file_path), + "name": str(relative_path), "contentLength": file_size, "lastModifiedEpochSeconds": int(os.path.getmtime(file_path)), } @@ -137,52 +151,100 @@ def _upload_blob(file_path: str, model_type: str) -> str: return response["token"] +def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str]: # noqa: FBT002, FBT001 + # Count the total number of files + # file_count = 0 + # for _, _, files in os.walk(folder): + # file_count += len(files) + + # if file_count > MAX_FILES_TO_UPLOAD: + # if not quiet: + # logger.info(f"More than {MAX_FILES_TO_UPLOAD} files detected, creating a zip archive...") + + # with TemporaryDirectory() as temp_dir: + # zip_path = os.path.join(temp_dir, TEMP_ARCHIVE_FILE) + # with zipfile.ZipFile(zip_path, "w") as zipf: + # for root, _, files in os.walk(folder): + # for file in files: + # file_path = os.path.join(root, file) + # zipf.write(file_path, os.path.relpath(file_path, folder)) + + # # Upload the zip file + # return [ + # token + # for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, folder, model_type, quiet)] + # if token is not None + # ] + + root_dict = {'files': [], 'directories': []} + for root, dirs, files in os.walk(folder): + # Path of the current folder relative to the base folder + path = os.path.relpath(root, folder) + + # Navigate or create the dictionary path to the current folder + current_dict = root_dict + if path != ".": + for part in path.split(os.sep): + # Find or create the subdirectory in the current dictionary + for subdir in current_dict['directories']: + if subdir['name'] == part: + current_dict = subdir + break + else: + # If the directory is not found, create a new one + new_dir = {'name': part, 'files': [], 'directories': []} + current_dict['directories'].append(new_dir) + current_dict = new_dir + + # Add file tokens to the current directory in the dictionary + for file in files: + token = _upload_file_or_folder(root, file, folder, model_type, quiet) + if token: + current_dict['files'].append(token) + + + return root_dict + + +def _upload_file_or_folder( + parent_path: str, file_or_folder_name: str, base_path: str, model_type: str, quiet: bool = False # noqa: FBT002, FBT001 +) -> Optional[str]: + """ + Uploads a file or each file inside a folder individually from a specified path to a remote service. + Parameters + ========== + parent_path: The parent directory path from where the file or folder is to be uploaded. + file_or_folder_name: The name of the file or folder to be uploaded. + dir_mode: The mode to handle directories. Accepts 'zip', 'tar', or other values for skipping. + model_type: Type of the model that is being uploaded. + quiet: suppress verbose output (default is False) + :return: A token if the upload is successful, or None if the file is skipped or the upload fails. + """ + full_path = os.path.join(parent_path, file_or_folder_name) + relative_path = os.path.relpath(full_path, start=base_path) + if os.path.isfile(full_path): + return _upload_file(file_or_folder_name, full_path, relative_path, quiet, model_type) + elif not quiet: + logger.info("Skipping: " + file_or_folder_name) + return None + + +def _upload_file(file_name: str, full_path: str, relative_path: str, quiet: bool, model_type: str) -> Optional[str]: # noqa: FBT001 + """Helper function to upload a single file + Parameters + ========== + file_name: name of the file to upload + full_path: path to the file to upload + quiet: suppress verbose output + model_type: Type of the model that is being uploaded. + :return: None - upload unsuccessful; instance of UploadFile - upload successful + """ + + if not quiet: + logger.info("Starting upload for file " + file_name) -def zip_file(args: Tuple[Path, Path, Path]) -> int: - file_path, zip_path, source_path_obj = args - arcname = file_path.relative_to(source_path_obj) - size = file_path.stat().st_size - with zipfile.ZipFile(zip_path, "a", zipfile.ZIP_STORED, allowZip64=True) as zipf: - zipf.write(file_path, arcname) - return size - - -def zip_files(source_path_obj: Path, zip_path: Path) -> List[int]: - files = [file for file in source_path_obj.rglob("*") if file.is_file()] - args = [(file, zip_path, source_path_obj) for file in files] - - with Pool() as pool: - sizes = pool.map(zip_file, args) - return sizes - - -def upload_files(source_path: str, model_type: str) -> List[str]: - source_path_obj = Path(source_path) - with TemporaryDirectory() as temp_dir: - temp_dir_path = Path(temp_dir) - total_size = 0 - - if source_path_obj.is_dir(): - for file_path in source_path_obj.rglob("*"): - if file_path.is_file(): - total_size += file_path.stat().st_size - elif source_path_obj.is_file(): - total_size = source_path_obj.stat().st_size - else: - path_error_message = "The source path does not point to a valid file or directory." - raise ValueError(path_error_message) - - with tqdm(total=total_size, desc="Zipping", unit="B", unit_scale=True, unit_divisor=1024) as pbar: - if source_path_obj.is_dir(): - zip_path = temp_dir_path / "archive.zip" - sizes = zip_files(source_path_obj, zip_path) - for size in sizes: - pbar.update(size) - upload_path = str(zip_path) - elif source_path_obj.is_file(): - temp_file_path = temp_dir_path / source_path_obj.name - shutil.copy(source_path_obj, temp_file_path) - pbar.update(temp_file_path.stat().st_size) - upload_path = str(temp_file_path) - - return [token for token in [_upload_blob(upload_path, model_type)] if token] + content_length = os.path.getsize(full_path) + token = _upload_blob(full_path, model_type, relative_path) + if not quiet: + logger.info("Upload successful: " + file_name + " (" + File.get_size(content_length) + ")") + return token diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 4bdaef10..21adc302 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -1,6 +1,6 @@ import logging from http import HTTPStatus -from typing import List, Optional +from typing import List, Optional, Union from kagglehub.clients import KaggleApiV1Client from kagglehub.exceptions import KaggleApiHTTPError @@ -8,6 +8,13 @@ logger = logging.getLogger(__name__) +class Directory: + name: str + files: List[str] + directories: List['Directory'] + +FileStructure = List[Directory] + def _create_model(owner_slug: str, model_slug: str) -> None: data = {"ownerSlug": owner_slug, "slug": model_slug, "title": model_slug, "isPrivate": True} @@ -16,11 +23,14 @@ def _create_model(owner_slug: str, model_slug: str) -> None: logger.info(f"Model '{model_slug}' Created.") -def _create_model_instance(model_handle: ModelHandle, files: List[str], license_name: Optional[str] = None) -> None: +def _create_model_instance(model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None) -> None: + print([subdir for subdir in files_and_directories['directories']]) + print([{"token": file_token} for file_token in files_and_directories['files']]) data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, - "files": [{"token": file_token} for file_token in files], + "files": [{"token": file_token} for file_token in files_and_directories['files']], + "directories": [subdir for subdir in files_and_directories['directories']] } if license_name is not None: data["licenseName"] = license_name @@ -30,8 +40,8 @@ def _create_model_instance(model_handle: ModelHandle, files: List[str], license_ logger.info(f"Your model instance has been created.\nFiles are being processed...\nSee at: {model_handle.to_url()}") -def _create_model_instance_version(model_handle: ModelHandle, files: List[str], version_notes: str = "") -> None: - data = {"versionNotes": version_notes, "files": [{"token": file_token} for file_token in files]} +def _create_model_instance_version(model_handle: ModelHandle, files_and_directories: List[str], version_notes: str = "") -> None: + data = {"versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories]} api_client = KaggleApiV1Client() api_client.post( f"/models/{model_handle.owner}/{model_handle.model}/{model_handle.framework}/{model_handle.variation}/create/version", @@ -43,19 +53,19 @@ def _create_model_instance_version(model_handle: ModelHandle, files: List[str], def create_model_instance_or_version( - model_handle: ModelHandle, files: List[str], license_name: Optional[str], version_notes: str = "" + model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str], version_notes: str = "" ) -> None: try: api_client = KaggleApiV1Client() api_client.get(f"/models/{model_handle}/get", model_handle) # the instance exist, create a new version. - _create_model_instance_version(model_handle, files, version_notes) + _create_model_instance_version(model_handle, files_and_directories, version_notes) except KaggleApiHTTPError as e: if e.response is not None and ( e.response.status_code == HTTPStatus.NOT_FOUND # noqa: PLR1714 or e.response.status_code == HTTPStatus.FORBIDDEN ): - _create_model_instance(model_handle, files, license_name) + _create_model_instance(model_handle, files_and_directories, license_name) else: raise (e) From 79719f6d8b4f791fcf4d00570794c7396d0a46a9 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 13:50:22 +0000 Subject: [PATCH 03/34] remove --- src/kagglehub/gcs_upload.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 38b86c2a..92a26ac5 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -8,7 +8,6 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import List, Optional, Tuple, Union -from typing import List, Optional, Tuple, Union import requests from requests.exceptions import ConnectionError, Timeout @@ -88,7 +87,6 @@ def _check_uploaded_size(session_uri: str, file_size: int, backoff_factor: int = return 0 # Return 0 if all retries fail -def _upload_blob(file_path: str, model_type: str, relative_path: str) -> str: def _upload_blob(file_path: str, model_type: str, relative_path: str) -> str: """Uploads a file to a remote server as a blob and returns an upload token. From 94ea4a3daad55c73ad2f8999bdefd57530fd6edf Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 13:53:45 +0000 Subject: [PATCH 04/34] r --- src/kagglehub/gcs_upload.py | 63 ++++++++++++++++++--------------- src/kagglehub/models_helpers.py | 17 ++++++--- 2 files changed, 47 insertions(+), 33 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 92a26ac5..2c8912fc 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -28,13 +28,14 @@ class UploadFileInfo: token: str + class Directory: name: str files: List[UploadFileInfo] directories: List['Directory'] -FileStructure = Union[UploadFileInfo, Directory] +FileStructure = Union[UploadFileInfo, Directory] def parse_datetime_string(string: str) -> Union[datetime, str]: @@ -98,8 +99,6 @@ def _upload_blob(file_path: str, model_type: str, relative_path: str) -> str: file_size = os.path.getsize(file_path) string_paths = [str(path) for path in relative_path] print(relative_path) - string_paths = [str(path) for path in relative_path] - print(relative_path) data = { "type": model_type, "name": str(relative_path), @@ -153,36 +152,37 @@ def _upload_blob(file_path: str, model_type: str, relative_path: str) -> str: return response["token"] + def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str]: # noqa: FBT002, FBT001 # Count the total number of files - # file_count = 0 - # for _, _, files in os.walk(folder): - # file_count += len(files) - - # if file_count > MAX_FILES_TO_UPLOAD: - # if not quiet: - # logger.info(f"More than {MAX_FILES_TO_UPLOAD} files detected, creating a zip archive...") - - # with TemporaryDirectory() as temp_dir: - # zip_path = os.path.join(temp_dir, TEMP_ARCHIVE_FILE) - # with zipfile.ZipFile(zip_path, "w") as zipf: - # for root, _, files in os.walk(folder): - # for file in files: - # file_path = os.path.join(root, file) - # zipf.write(file_path, os.path.relpath(file_path, folder)) - - # # Upload the zip file - # return [ - # token - # for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, folder, model_type, quiet)] - # if token is not None - # ] + file_count = 0 + for _, _, files in os.walk(folder): + file_count += len(files) + + if file_count > MAX_FILES_TO_UPLOAD: + if not quiet: + logger.info(f"More than {MAX_FILES_TO_UPLOAD} files detected, creating a zip archive...") + + with TemporaryDirectory() as temp_dir: + zip_path = os.path.join(temp_dir, TEMP_ARCHIVE_FILE) + with zipfile.ZipFile(zip_path, "w") as zipf: + for root, _, files in os.walk(folder): + for file in files: + file_path = os.path.join(root, file) + zipf.write(file_path, os.path.relpath(file_path, folder)) + + # Upload the zip file + return [ + token + for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, folder, model_type, quiet)] + if token is not None + ] root_dict = {'files': [], 'directories': []} for root, dirs, files in os.walk(folder): # Path of the current folder relative to the base folder path = os.path.relpath(root, folder) - + # Navigate or create the dictionary path to the current folder current_dict = root_dict if path != ".": @@ -204,12 +204,15 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] if token: current_dict['files'].append(token) - return root_dict def _upload_file_or_folder( - parent_path: str, file_or_folder_name: str, base_path: str, model_type: str, quiet: bool = False # noqa: FBT002, FBT001 + parent_path: str, + file_or_folder_name: str, + base_path: str, + model_type: str, + quiet: bool = False, # noqa: FBT002, FBT001 ) -> Optional[str]: """ Uploads a file or each file inside a folder individually from a specified path to a remote service. @@ -231,7 +234,9 @@ def _upload_file_or_folder( return None -def _upload_file(file_name: str, full_path: str, relative_path: str, quiet: bool, model_type: str) -> Optional[str]: # noqa: FBT001 +def _upload_file( + file_name: str, full_path: str, relative_path: str, quiet: bool, model_type: str +) -> Optional[str]: # noqa: FBT001 """Helper function to upload a single file Parameters ========== diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 21adc302..f3229254 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -8,11 +8,13 @@ logger = logging.getLogger(__name__) + class Directory: name: str files: List[str] directories: List['Directory'] + FileStructure = List[Directory] @@ -23,14 +25,16 @@ def _create_model(owner_slug: str, model_slug: str) -> None: logger.info(f"Model '{model_slug}' Created.") -def _create_model_instance(model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None) -> None: +def _create_model_instance( + model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None +) -> None: print([subdir for subdir in files_and_directories['directories']]) print([{"token": file_token} for file_token in files_and_directories['files']]) data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, "files": [{"token": file_token} for file_token in files_and_directories['files']], - "directories": [subdir for subdir in files_and_directories['directories']] + "directories": [subdir for subdir in files_and_directories['directories']], } if license_name is not None: data["licenseName"] = license_name @@ -40,7 +44,9 @@ def _create_model_instance(model_handle: ModelHandle, files_and_directories: Fil logger.info(f"Your model instance has been created.\nFiles are being processed...\nSee at: {model_handle.to_url()}") -def _create_model_instance_version(model_handle: ModelHandle, files_and_directories: List[str], version_notes: str = "") -> None: +def _create_model_instance_version( + model_handle: ModelHandle, files_and_directories: List[str], version_notes: str = "" +) -> None: data = {"versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories]} api_client = KaggleApiV1Client() api_client.post( @@ -53,7 +59,10 @@ def _create_model_instance_version(model_handle: ModelHandle, files_and_director def create_model_instance_or_version( - model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str], version_notes: str = "" + model_handle: ModelHandle, + files_and_directories: FileStructure, + license_name: Optional[str], + version_notes: str = "", ) -> None: try: api_client = KaggleApiV1Client() From be6406bffa9cce1638e3e8df0259eb0c56e7a5a2 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 14:09:19 +0000 Subject: [PATCH 05/34] r --- src/kagglehub/gcs_upload.py | 16 ++++++---------- src/kagglehub/models_helpers.py | 17 ++++------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 2c8912fc..4746cf38 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -88,7 +88,7 @@ def _check_uploaded_size(session_uri: str, file_size: int, backoff_factor: int = return 0 # Return 0 if all retries fail -def _upload_blob(file_path: str, model_type: str, relative_path: str) -> str: +def _upload_blob(file_path: str, model_type: str) -> str: """Uploads a file to a remote server as a blob and returns an upload token. Parameters @@ -97,11 +97,9 @@ def _upload_blob(file_path: str, model_type: str, relative_path: str) -> str: model_type : The type of the model associated with the file. """ file_size = os.path.getsize(file_path) - string_paths = [str(path) for path in relative_path] - print(relative_path) data = { "type": model_type, - "name": str(relative_path), + "name": str(file_path), "contentLength": file_size, "lastModifiedEpochSeconds": int(os.path.getmtime(file_path)), } @@ -174,7 +172,7 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] # Upload the zip file return [ token - for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, folder, model_type, quiet)] + for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, model_type, quiet)] if token is not None ] @@ -210,7 +208,6 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] def _upload_file_or_folder( parent_path: str, file_or_folder_name: str, - base_path: str, model_type: str, quiet: bool = False, # noqa: FBT002, FBT001 ) -> Optional[str]: @@ -226,16 +223,15 @@ def _upload_file_or_folder( :return: A token if the upload is successful, or None if the file is skipped or the upload fails. """ full_path = os.path.join(parent_path, file_or_folder_name) - relative_path = os.path.relpath(full_path, start=base_path) if os.path.isfile(full_path): - return _upload_file(file_or_folder_name, full_path, relative_path, quiet, model_type) + return _upload_file(file_or_folder_name, full_path, quiet, model_type) elif not quiet: logger.info("Skipping: " + file_or_folder_name) return None def _upload_file( - file_name: str, full_path: str, relative_path: str, quiet: bool, model_type: str + file_name: str, full_path: str, quiet: bool, model_type: str ) -> Optional[str]: # noqa: FBT001 """Helper function to upload a single file Parameters @@ -251,7 +247,7 @@ def _upload_file( logger.info("Starting upload for file " + file_name) content_length = os.path.getsize(full_path) - token = _upload_blob(full_path, model_type, relative_path) + token = _upload_blob(full_path, model_type) if not quiet: logger.info("Upload successful: " + file_name + " (" + File.get_size(content_length) + ")") return token diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index f3229254..21adc302 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -8,13 +8,11 @@ logger = logging.getLogger(__name__) - class Directory: name: str files: List[str] directories: List['Directory'] - FileStructure = List[Directory] @@ -25,16 +23,14 @@ def _create_model(owner_slug: str, model_slug: str) -> None: logger.info(f"Model '{model_slug}' Created.") -def _create_model_instance( - model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None -) -> None: +def _create_model_instance(model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None) -> None: print([subdir for subdir in files_and_directories['directories']]) print([{"token": file_token} for file_token in files_and_directories['files']]) data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, "files": [{"token": file_token} for file_token in files_and_directories['files']], - "directories": [subdir for subdir in files_and_directories['directories']], + "directories": [subdir for subdir in files_and_directories['directories']] } if license_name is not None: data["licenseName"] = license_name @@ -44,9 +40,7 @@ def _create_model_instance( logger.info(f"Your model instance has been created.\nFiles are being processed...\nSee at: {model_handle.to_url()}") -def _create_model_instance_version( - model_handle: ModelHandle, files_and_directories: List[str], version_notes: str = "" -) -> None: +def _create_model_instance_version(model_handle: ModelHandle, files_and_directories: List[str], version_notes: str = "") -> None: data = {"versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories]} api_client = KaggleApiV1Client() api_client.post( @@ -59,10 +53,7 @@ def _create_model_instance_version( def create_model_instance_or_version( - model_handle: ModelHandle, - files_and_directories: FileStructure, - license_name: Optional[str], - version_notes: str = "", + model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str], version_notes: str = "" ) -> None: try: api_client = KaggleApiV1Client() From 4801a74b79ccef624aca1377c2281962c694801a Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 14:50:18 +0000 Subject: [PATCH 06/34] r --- src/kagglehub/gcs_upload.py | 66 ++++++++++++++++++--------------- src/kagglehub/models_helpers.py | 23 +++++++++--- tests/test_model_upload.py | 12 +++--- 3 files changed, 60 insertions(+), 41 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 4746cf38..6f87d958 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -99,7 +99,7 @@ def _upload_blob(file_path: str, model_type: str) -> str: file_size = os.path.getsize(file_path) data = { "type": model_type, - "name": str(file_path), + "name": os.path.basename(file_path), "contentLength": file_size, "lastModifiedEpochSeconds": int(os.path.getmtime(file_path)), } @@ -170,37 +170,45 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] zipf.write(file_path, os.path.relpath(file_path, folder)) # Upload the zip file - return [ + return {'files': [ token for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, model_type, quiet)] if token is not None - ] + ], 'directories': []} root_dict = {'files': [], 'directories': []} - for root, dirs, files in os.walk(folder): - # Path of the current folder relative to the base folder - path = os.path.relpath(root, folder) - - # Navigate or create the dictionary path to the current folder - current_dict = root_dict - if path != ".": - for part in path.split(os.sep): - # Find or create the subdirectory in the current dictionary - for subdir in current_dict['directories']: - if subdir['name'] == part: - current_dict = subdir - break - else: - # If the directory is not found, create a new one - new_dir = {'name': part, 'files': [], 'directories': []} - current_dict['directories'].append(new_dir) - current_dict = new_dir - - # Add file tokens to the current directory in the dictionary - for file in files: - token = _upload_file_or_folder(root, file, folder, model_type, quiet) - if token: - current_dict['files'].append(token) + if os.path.isfile(folder): + # Directly upload the file if the path is a file + file_name = os.path.basename(folder) + token = _upload_file_or_folder(os.path.dirname(folder), file_name, model_type, quiet) + if token: + root_dict['files'].append(token) + else: + for root, dirs, files in os.walk(folder): + print("dfkafsss") + # Path of the current folder relative to the base folder + path = os.path.relpath(root, folder) + + # Navigate or create the dictionary path to the current folder + current_dict = root_dict + if path != ".": + for part in path.split(os.sep): + # Find or create the subdirectory in the current dictionary + for subdir in current_dict['directories']: + if subdir['name'] == part: + current_dict = subdir + break + else: + # If the directory is not found, create a new one + new_dir = {'name': part, 'files': [], 'directories': []} + current_dict['directories'].append(new_dir) + current_dict = new_dir + + # Add file tokens to the current directory in the dictionary + for file in files: + token = _upload_file_or_folder(root, file, model_type, quiet) + if token: + current_dict['files'].append(token) return root_dict @@ -230,9 +238,7 @@ def _upload_file_or_folder( return None -def _upload_file( - file_name: str, full_path: str, quiet: bool, model_type: str -) -> Optional[str]: # noqa: FBT001 +def _upload_file(file_name: str, full_path: str, quiet: bool, model_type: str) -> Optional[str]: # noqa: FBT001 """Helper function to upload a single file Parameters ========== diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 21adc302..60ca5b73 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -8,11 +8,13 @@ logger = logging.getLogger(__name__) + class Directory: name: str files: List[str] directories: List['Directory'] + FileStructure = List[Directory] @@ -23,14 +25,16 @@ def _create_model(owner_slug: str, model_slug: str) -> None: logger.info(f"Model '{model_slug}' Created.") -def _create_model_instance(model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None) -> None: +def _create_model_instance( + model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None +) -> None: print([subdir for subdir in files_and_directories['directories']]) print([{"token": file_token} for file_token in files_and_directories['files']]) data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, "files": [{"token": file_token} for file_token in files_and_directories['files']], - "directories": [subdir for subdir in files_and_directories['directories']] + "directories": [subdir for subdir in files_and_directories['directories']], } if license_name is not None: data["licenseName"] = license_name @@ -40,8 +44,14 @@ def _create_model_instance(model_handle: ModelHandle, files_and_directories: Fil logger.info(f"Your model instance has been created.\nFiles are being processed...\nSee at: {model_handle.to_url()}") -def _create_model_instance_version(model_handle: ModelHandle, files_and_directories: List[str], version_notes: str = "") -> None: - data = {"versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories]} +def _create_model_instance_version( + model_handle: ModelHandle, files_and_directories: List[str], version_notes: str = "" +) -> None: + data = { + "versionNotes": version_notes, + "files": [{"token": file_token} for file_token in files_and_directories['files']], + "directories": [subdir for subdir in files_and_directories['directories']], + } api_client = KaggleApiV1Client() api_client.post( f"/models/{model_handle.owner}/{model_handle.model}/{model_handle.framework}/{model_handle.variation}/create/version", @@ -53,7 +63,10 @@ def _create_model_instance_version(model_handle: ModelHandle, files_and_director def create_model_instance_or_version( - model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str], version_notes: str = "" + model_handle: ModelHandle, + files_and_directories: FileStructure, + license_name: Optional[str], + version_notes: str = "", ) -> None: try: api_client = KaggleApiV1Client() diff --git a/tests/test_model_upload.py b/tests/test_model_upload.py index a13cc68f..22c52868 100644 --- a/tests/test_model_upload.py +++ b/tests/test_model_upload.py @@ -140,7 +140,7 @@ def test_model_upload_instance_with_valid_handle(self) -> None: test_filepath.touch() # Create a temporary file in the temporary directory model_upload("metaresearch/new-model/pyTorch/new-variation", temp_dir, APACHE_LICENSE, "model_type") self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 1) - self.assertIn(TEMP_ARCHIVE_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) + self.assertIn(TEMP_TEST_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) def test_model_upload_instance_with_nested_directories(self) -> None: # execution path: get_model -> create_model -> get_instance -> create_version @@ -156,7 +156,7 @@ def test_model_upload_instance_with_nested_directories(self) -> None: test_filepath.touch() model_upload("metaresearch/new-model/pyTorch/new-variation", temp_dir, APACHE_LICENSE, "model_type") self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 1) - self.assertIn(TEMP_ARCHIVE_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) + self.assertIn(TEMP_TEST_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) def test_model_upload_version_with_valid_handle(self) -> None: # execution path: get_model -> get_instance -> create_instance @@ -168,7 +168,7 @@ def test_model_upload_version_with_valid_handle(self) -> None: test_filepath.touch() # Create a temporary file in the temporary directory model_upload("metaresearch/llama-2/pyTorch/7b", temp_dir, APACHE_LICENSE, "model_type") self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 1) - self.assertIn(TEMP_ARCHIVE_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) + self.assertIn(TEMP_TEST_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) def test_model_upload_with_too_many_files(self) -> None: with create_test_http_server(KaggleAPIHandler): @@ -199,7 +199,7 @@ def test_model_upload_resumable(self) -> None: # Check that GcsAPIHandler received two PUT requests self.assertEqual(GcsAPIHandler.put_requests_count, 2) self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 1) - self.assertIn(TEMP_ARCHIVE_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) + self.assertIn(TEMP_TEST_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) def test_model_upload_with_none_license(self) -> None: with create_test_http_server(KaggleAPIHandler): @@ -209,7 +209,7 @@ def test_model_upload_with_none_license(self) -> None: test_filepath.touch() # Create a temporary file in the temporary directory model_upload("metaresearch/new-model/pyTorch/new-variation", temp_dir, None, "model_type") self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 1) - self.assertIn(TEMP_ARCHIVE_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) + self.assertIn(TEMP_TEST_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) def test_model_upload_without_license(self) -> None: with create_test_http_server(KaggleAPIHandler): @@ -219,7 +219,7 @@ def test_model_upload_without_license(self) -> None: test_filepath.touch() # Create a temporary file in the temporary directory model_upload("metaresearch/new-model/pyTorch/new-variation", temp_dir, version_notes="model_type") self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 1) - self.assertIn(TEMP_ARCHIVE_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) + self.assertIn(TEMP_TEST_FILE, KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) def test_model_upload_with_invalid_license_fails(self) -> None: with create_test_http_server(KaggleAPIHandler): From 35c9c5c8abee530e36dd5adedbb6716e9eff2d01 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 14:50:39 +0000 Subject: [PATCH 07/34] r --- src/kagglehub/gcs_upload.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 6f87d958..78b4da45 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -170,11 +170,14 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] zipf.write(file_path, os.path.relpath(file_path, folder)) # Upload the zip file - return {'files': [ - token - for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, model_type, quiet)] - if token is not None - ], 'directories': []} + return { + 'files': [ + token + for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, model_type, quiet)] + if token is not None + ], + 'directories': [], + } root_dict = {'files': [], 'directories': []} if os.path.isfile(folder): From be68ec9cc4ef68208509b1d6d39c252e08c914ff Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 14:59:32 +0000 Subject: [PATCH 08/34] r --- src/kagglehub/gcs_upload.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 78b4da45..ab42a2c7 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -1,13 +1,10 @@ import logging import os -import shutil import time import zipfile from datetime import datetime -from multiprocessing import Pool -from pathlib import Path from tempfile import TemporaryDirectory -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Union import requests from requests.exceptions import ConnectionError, Timeout @@ -32,7 +29,7 @@ class UploadFileInfo: class Directory: name: str files: List[UploadFileInfo] - directories: List['Directory'] + directories: List["Directory"] FileStructure = Union[UploadFileInfo, Directory] @@ -171,12 +168,12 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] # Upload the zip file return { - 'files': [ + "files": [ token for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, model_type, quiet)] if token is not None ], - 'directories': [], + "directories": [], } root_dict = {'files': [], 'directories': []} @@ -185,10 +182,9 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] file_name = os.path.basename(folder) token = _upload_file_or_folder(os.path.dirname(folder), file_name, model_type, quiet) if token: - root_dict['files'].append(token) + root_dict["files"].append(token) else: for root, dirs, files in os.walk(folder): - print("dfkafsss") # Path of the current folder relative to the base folder path = os.path.relpath(root, folder) @@ -197,21 +193,21 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] if path != ".": for part in path.split(os.sep): # Find or create the subdirectory in the current dictionary - for subdir in current_dict['directories']: - if subdir['name'] == part: + for subdir in current_dict["directories"]: + if subdir["name"] == part: current_dict = subdir break else: # If the directory is not found, create a new one - new_dir = {'name': part, 'files': [], 'directories': []} - current_dict['directories'].append(new_dir) + new_dir = {"name": part, "files": [], "directories": []} + current_dict["directories"].append(new_dir) current_dict = new_dir # Add file tokens to the current directory in the dictionary for file in files: token = _upload_file_or_folder(root, file, model_type, quiet) if token: - current_dict['files'].append(token) + current_dict["files"].append(token) return root_dict From a1e4c3449f1563b2fc255fdf8b1aa0cac9e3d763 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 15:01:31 +0000 Subject: [PATCH 09/34] r --- src/kagglehub/models_helpers.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 60ca5b73..460ff8dc 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -12,7 +12,7 @@ class Directory: name: str files: List[str] - directories: List['Directory'] + directories: List["Directory"] FileStructure = List[Directory] @@ -28,13 +28,13 @@ def _create_model(owner_slug: str, model_slug: str) -> None: def _create_model_instance( model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None ) -> None: - print([subdir for subdir in files_and_directories['directories']]) - print([{"token": file_token} for file_token in files_and_directories['files']]) + print([subdir for subdir in files_and_directories["directories"]]) + print([{"token": file_token} for file_token in files_and_directories["files"]]) data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, - "files": [{"token": file_token} for file_token in files_and_directories['files']], - "directories": [subdir for subdir in files_and_directories['directories']], + "files": [{"token": file_token} for file_token in files_and_directories["files"]], + "directories": [subdir for subdir in files_and_directories["directories"]], } if license_name is not None: data["licenseName"] = license_name @@ -49,8 +49,8 @@ def _create_model_instance_version( ) -> None: data = { "versionNotes": version_notes, - "files": [{"token": file_token} for file_token in files_and_directories['files']], - "directories": [subdir for subdir in files_and_directories['directories']], + "files": [{"token": file_token} for file_token in files_and_directories["files"]], + "directories": [subdir for subdir in files_and_directories["directories"]], } api_client = KaggleApiV1Client() api_client.post( From 520cad4a58a3e8b54a343bbf8777462a52ab160e Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 18:40:30 +0000 Subject: [PATCH 10/34] r --- src/kagglehub/gcs_upload.py | 45 +++++++++++++++------------------ src/kagglehub/models.py | 4 +-- src/kagglehub/models_helpers.py | 30 ++++++++++------------ 3 files changed, 37 insertions(+), 42 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index ab42a2c7..68e2f4b8 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -23,16 +23,18 @@ class UploadFileInfo: - token: str + def __init__(self, token: str): + self.token = token -class Directory: - name: str - files: List[UploadFileInfo] - directories: List["Directory"] +class UploadDirectoryInfo: + def __init__(self, name: str, files: List[UploadFileInfo] = None, directories: List['UploadDirectoryInfo'] = None): + self.name = name + self.files = files if files is not None else [] + self.directories = directories if directories is not None else [] -FileStructure = Union[UploadFileInfo, Directory] +FileStructure = Union[UploadFileInfo, UploadDirectoryInfo] def parse_datetime_string(string: str) -> Union[datetime, str]: @@ -148,7 +150,9 @@ def _upload_blob(file_path: str, model_type: str) -> str: return response["token"] -def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str]: # noqa: FBT002, FBT001 +def upload_files_and_directories( + folder: str, model_type: str, quiet: bool = False +) -> UploadDirectoryInfo: # noqa: FBT002, FBT001 # Count the total number of files file_count = 0 for _, _, files in os.walk(folder): @@ -166,23 +170,16 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] file_path = os.path.join(root, file) zipf.write(file_path, os.path.relpath(file_path, folder)) - # Upload the zip file - return { - "files": [ - token - for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, model_type, quiet)] - if token is not None - ], - "directories": [], - } - - root_dict = {'files': [], 'directories': []} + tokens = _upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, model_type, quiet) + return UploadDirectoryInfo(name="archive", files=tokens) + + root_dict = UploadDirectoryInfo(name="root") if os.path.isfile(folder): # Directly upload the file if the path is a file file_name = os.path.basename(folder) token = _upload_file_or_folder(os.path.dirname(folder), file_name, model_type, quiet) if token: - root_dict["files"].append(token) + root_dict.files.append(token) else: for root, dirs, files in os.walk(folder): # Path of the current folder relative to the base folder @@ -193,21 +190,21 @@ def upload_files(folder: str, model_type: str, quiet: bool = False) -> List[str] if path != ".": for part in path.split(os.sep): # Find or create the subdirectory in the current dictionary - for subdir in current_dict["directories"]: - if subdir["name"] == part: + for subdir in current_dict.directories: + if subdir.name == part: current_dict = subdir break else: # If the directory is not found, create a new one - new_dir = {"name": part, "files": [], "directories": []} - current_dict["directories"].append(new_dir) + new_dir = UploadDirectoryInfo(name=part) + current_dict.directories.append(new_dir) current_dict = new_dir # Add file tokens to the current directory in the dictionary for file in files: token = _upload_file_or_folder(root, file, model_type, quiet) if token: - current_dict["files"].append(token) + current_dict.files.append(token) return root_dict diff --git a/src/kagglehub/models.py b/src/kagglehub/models.py index d5e3d5e9..bfa5dd98 100644 --- a/src/kagglehub/models.py +++ b/src/kagglehub/models.py @@ -2,7 +2,7 @@ from typing import Optional from kagglehub import registry -from kagglehub.gcs_upload import upload_files +from kagglehub.gcs_upload import upload_files_and_directories from kagglehub.handle import parse_model_handle from kagglehub.models_helpers import create_model_if_missing, create_model_instance_or_version @@ -47,7 +47,7 @@ def model_upload( create_model_if_missing(h.owner, h.model) # Upload the model files to GCS - tokens = upload_files(local_model_dir, "model") + tokens = upload_files_and_directories(local_model_dir, "model") # Create a model instance if it doesn't exist, and create a new instance version if an instance exists create_model_instance_or_version(h, tokens, license_name, version_notes) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 460ff8dc..1b65f0cd 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -5,19 +5,11 @@ from kagglehub.clients import KaggleApiV1Client from kagglehub.exceptions import KaggleApiHTTPError from kagglehub.handle import ModelHandle +from kagglehub.gcs_upload import FileStructure logger = logging.getLogger(__name__) -class Directory: - name: str - files: List[str] - directories: List["Directory"] - - -FileStructure = List[Directory] - - def _create_model(owner_slug: str, model_slug: str) -> None: data = {"ownerSlug": owner_slug, "slug": model_slug, "title": model_slug, "isPrivate": True} api_client = KaggleApiV1Client() @@ -28,13 +20,15 @@ def _create_model(owner_slug: str, model_slug: str) -> None: def _create_model_instance( model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None ) -> None: - print([subdir for subdir in files_and_directories["directories"]]) - print([{"token": file_token} for file_token in files_and_directories["files"]]) + serialized_data = [ + {'name': d.name, 'files': [{'token': file} for file in d.files], 'directories': d.directories} + for d in files_and_directories.directories + ] data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, - "files": [{"token": file_token} for file_token in files_and_directories["files"]], - "directories": [subdir for subdir in files_and_directories["directories"]], + "files": [{"token": file_token} for file_token in files_and_directories.files], + "directories": serialized_data, } if license_name is not None: data["licenseName"] = license_name @@ -45,12 +39,16 @@ def _create_model_instance( def _create_model_instance_version( - model_handle: ModelHandle, files_and_directories: List[str], version_notes: str = "" + model_handle: ModelHandle, files_and_directories: FileStructure, version_notes: str = "" ) -> None: + serialized_data = [ + {'name': d.name, 'files': [{'token': file} for file in d.files], 'directories': d.directories} + for d in files_and_directories.directories + ] data = { "versionNotes": version_notes, - "files": [{"token": file_token} for file_token in files_and_directories["files"]], - "directories": [subdir for subdir in files_and_directories["directories"]], + "files": [{"token": file_token} for file_token in files_and_directories.files], + "directories": serialized_data, } api_client = KaggleApiV1Client() api_client.post( From c607c64dceb246c4ea6de346501d179a17b8093c Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 18:47:34 +0000 Subject: [PATCH 11/34] r --- src/kagglehub/gcs_upload.py | 8 ++++---- src/kagglehub/models_helpers.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 68e2f4b8..68f80cdd 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -28,7 +28,7 @@ def __init__(self, token: str): class UploadDirectoryInfo: - def __init__(self, name: str, files: List[UploadFileInfo] = None, directories: List['UploadDirectoryInfo'] = None): + def __init__(self, name: str, files: List[UploadFileInfo] = None, directories: List["UploadDirectoryInfo"] = None): self.name = name self.files = files if files is not None else [] self.directories = directories if directories is not None else [] @@ -151,8 +151,8 @@ def _upload_blob(file_path: str, model_type: str) -> str: def upload_files_and_directories( - folder: str, model_type: str, quiet: bool = False -) -> UploadDirectoryInfo: # noqa: FBT002, FBT001 + folder: str, model_type: str, quiet: bool = False # noqa: FBT002, FBT001 +) -> UploadDirectoryInfo: # Count the total number of files file_count = 0 for _, _, files in os.walk(folder): @@ -181,7 +181,7 @@ def upload_files_and_directories( if token: root_dict.files.append(token) else: - for root, dirs, files in os.walk(folder): + for root, _, files in os.walk(folder): # Path of the current folder relative to the base folder path = os.path.relpath(root, folder) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 1b65f0cd..50866b78 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -21,7 +21,7 @@ def _create_model_instance( model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None ) -> None: serialized_data = [ - {'name': d.name, 'files': [{'token': file} for file in d.files], 'directories': d.directories} + {"name": d.name, "files": [{"token": file} for file in d.files], "directories": d.directories} for d in files_and_directories.directories ] data = { @@ -42,7 +42,7 @@ def _create_model_instance_version( model_handle: ModelHandle, files_and_directories: FileStructure, version_notes: str = "" ) -> None: serialized_data = [ - {'name': d.name, 'files': [{'token': file} for file in d.files], 'directories': d.directories} + {"name": d.name, "files": [{"token": file} for file in d.files], "directories": d.directories} for d in files_and_directories.directories ] data = { From b7b31ad5c21f65ecbb9d9e20581d44c51154b3d5 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 18:48:15 +0000 Subject: [PATCH 12/34] r --- src/kagglehub/gcs_upload.py | 2 +- src/kagglehub/models_helpers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 68f80cdd..158d7965 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -151,7 +151,7 @@ def _upload_blob(file_path: str, model_type: str) -> str: def upload_files_and_directories( - folder: str, model_type: str, quiet: bool = False # noqa: FBT002, FBT001 + folder: str, model_type: str, quiet: bool = False # noqa: FBT002, FBT001 ) -> UploadDirectoryInfo: # Count the total number of files file_count = 0 diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 50866b78..d40fba24 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -1,6 +1,6 @@ import logging from http import HTTPStatus -from typing import List, Optional, Union +from typing import Optional from kagglehub.clients import KaggleApiV1Client from kagglehub.exceptions import KaggleApiHTTPError From 777de49770bca0ff77cf32682a59c3e42aee1cc9 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 18:50:22 +0000 Subject: [PATCH 13/34] r --- src/kagglehub/gcs_upload.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 158d7965..4ef9264a 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -28,7 +28,12 @@ def __init__(self, token: str): class UploadDirectoryInfo: - def __init__(self, name: str, files: List[UploadFileInfo] = None, directories: List["UploadDirectoryInfo"] = None): + def __init__( + self, + name: str, + files: Optional[List["UploadFileInfo"]] = None, + directories: Optional[List["UploadDirectoryInfo"]] = None, + ): self.name = name self.files = files if files is not None else [] self.directories = directories if directories is not None else [] From 4085044456041cec3bf238dbdb4ee6b00b2ddcda Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 18:52:20 +0000 Subject: [PATCH 14/34] r --- src/kagglehub/models_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index d40fba24..cf225f31 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -4,8 +4,8 @@ from kagglehub.clients import KaggleApiV1Client from kagglehub.exceptions import KaggleApiHTTPError -from kagglehub.handle import ModelHandle from kagglehub.gcs_upload import FileStructure +from kagglehub.handle import ModelHandle logger = logging.getLogger(__name__) From 9b7050fc51bd03e0ad73a30690dbb468c094b325 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 18:58:08 +0000 Subject: [PATCH 15/34] r --- src/kagglehub/gcs_upload.py | 9 ++------- src/kagglehub/models_helpers.py | 5 +++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 4ef9264a..05d33cac 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -22,16 +22,11 @@ REQUEST_TIMEOUT = 600 -class UploadFileInfo: - def __init__(self, token: str): - self.token = token - - class UploadDirectoryInfo: def __init__( self, name: str, - files: Optional[List["UploadFileInfo"]] = None, + files: Optional[List[str]] = None, directories: Optional[List["UploadDirectoryInfo"]] = None, ): self.name = name @@ -39,7 +34,7 @@ def __init__( self.directories = directories if directories is not None else [] -FileStructure = Union[UploadFileInfo, UploadDirectoryInfo] +FileStructure = List[UploadDirectoryInfo] def parse_datetime_string(string: str) -> Union[datetime, str]: diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index cf225f31..1cba7c1a 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -24,11 +24,12 @@ def _create_model_instance( {"name": d.name, "files": [{"token": file} for file in d.files], "directories": d.directories} for d in files_and_directories.directories ] + print([{"token": file_token} for file_token in files_and_directories.files]) data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, "files": [{"token": file_token} for file_token in files_and_directories.files], - "directories": serialized_data, + # "directories": serialized_data, } if license_name is not None: data["licenseName"] = license_name @@ -48,7 +49,7 @@ def _create_model_instance_version( data = { "versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories.files], - "directories": serialized_data, + # "directories": serialized_data, } api_client = KaggleApiV1Client() api_client.post( From 823eda307d8f4b96e64a436cedb62b24e783712b Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 19:01:41 +0000 Subject: [PATCH 16/34] r --- src/kagglehub/gcs_upload.py | 6 +++++- src/kagglehub/models_helpers.py | 5 ++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 05d33cac..64fc3dec 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -170,7 +170,11 @@ def upload_files_and_directories( file_path = os.path.join(root, file) zipf.write(file_path, os.path.relpath(file_path, folder)) - tokens = _upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, model_type, quiet) + tokens = [ + token + for token in [_upload_file_or_folder(temp_dir, TEMP_ARCHIVE_FILE, model_type, quiet)] + if token is not None + ] return UploadDirectoryInfo(name="archive", files=tokens) root_dict = UploadDirectoryInfo(name="root") diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 1cba7c1a..cf225f31 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -24,12 +24,11 @@ def _create_model_instance( {"name": d.name, "files": [{"token": file} for file in d.files], "directories": d.directories} for d in files_and_directories.directories ] - print([{"token": file_token} for file_token in files_and_directories.files]) data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, "files": [{"token": file_token} for file_token in files_and_directories.files], - # "directories": serialized_data, + "directories": serialized_data, } if license_name is not None: data["licenseName"] = license_name @@ -49,7 +48,7 @@ def _create_model_instance_version( data = { "versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories.files], - # "directories": serialized_data, + "directories": serialized_data, } api_client = KaggleApiV1Client() api_client.post( From 05deed5e5b1fb3ad688f47a3a3db02ed7c59b64d Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 19:07:28 +0000 Subject: [PATCH 17/34] r --- src/kagglehub/gcs_upload.py | 3 --- src/kagglehub/models_helpers.py | 9 +++++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 64fc3dec..130943ac 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -34,9 +34,6 @@ def __init__( self.directories = directories if directories is not None else [] -FileStructure = List[UploadDirectoryInfo] - - def parse_datetime_string(string: str) -> Union[datetime, str]: time_formats = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%fZ"] for t in time_formats: diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index cf225f31..c115ea66 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -4,8 +4,8 @@ from kagglehub.clients import KaggleApiV1Client from kagglehub.exceptions import KaggleApiHTTPError -from kagglehub.gcs_upload import FileStructure from kagglehub.handle import ModelHandle +from kagglehub.gcs_upload import UploadDirectoryInfo logger = logging.getLogger(__name__) @@ -18,12 +18,13 @@ def _create_model(owner_slug: str, model_slug: str) -> None: def _create_model_instance( - model_handle: ModelHandle, files_and_directories: FileStructure, license_name: Optional[str] = None + model_handle: ModelHandle, files_and_directories: UploadDirectoryInfo, license_name: Optional[str] = None ) -> None: serialized_data = [ {"name": d.name, "files": [{"token": file} for file in d.files], "directories": d.directories} for d in files_and_directories.directories ] + print(files_and_directories) data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, @@ -39,7 +40,7 @@ def _create_model_instance( def _create_model_instance_version( - model_handle: ModelHandle, files_and_directories: FileStructure, version_notes: str = "" + model_handle: ModelHandle, files_and_directories: UploadDirectoryInfo, version_notes: str = "" ) -> None: serialized_data = [ {"name": d.name, "files": [{"token": file} for file in d.files], "directories": d.directories} @@ -62,7 +63,7 @@ def _create_model_instance_version( def create_model_instance_or_version( model_handle: ModelHandle, - files_and_directories: FileStructure, + files_and_directories: UploadDirectoryInfo, license_name: Optional[str], version_notes: str = "", ) -> None: From b724c7d2ee203da7b52fc2c4e366a1151c662783 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 19:09:57 +0000 Subject: [PATCH 18/34] r --- src/kagglehub/models_helpers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index c115ea66..b02cb2f3 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -24,7 +24,6 @@ def _create_model_instance( {"name": d.name, "files": [{"token": file} for file in d.files], "directories": d.directories} for d in files_and_directories.directories ] - print(files_and_directories) data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, From afd1095e80e372218c6e3295ac1638bf17070bce Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 19:13:44 +0000 Subject: [PATCH 19/34] r --- src/kagglehub/models_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index b02cb2f3..a41a8942 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -4,8 +4,8 @@ from kagglehub.clients import KaggleApiV1Client from kagglehub.exceptions import KaggleApiHTTPError -from kagglehub.handle import ModelHandle from kagglehub.gcs_upload import UploadDirectoryInfo +from kagglehub.handle import ModelHandle logger = logging.getLogger(__name__) From 6d379411117e8aad186349fa229e8b1e20cb8be3 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 20:41:42 +0000 Subject: [PATCH 20/34] r --- src/kagglehub/models_helpers.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index a41a8942..42fbc662 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -2,7 +2,7 @@ from http import HTTPStatus from typing import Optional -from kagglehub.clients import KaggleApiV1Client +from kagglehub.clients import BackendError, KaggleApiV1Client from kagglehub.exceptions import KaggleApiHTTPError from kagglehub.gcs_upload import UploadDirectoryInfo from kagglehub.handle import ModelHandle @@ -71,12 +71,10 @@ def create_model_instance_or_version( api_client.get(f"/models/{model_handle}/get", model_handle) # the instance exist, create a new version. _create_model_instance_version(model_handle, files_and_directories, version_notes) - except KaggleApiHTTPError as e: - if e.response is not None and ( - e.response.status_code == HTTPStatus.NOT_FOUND # noqa: PLR1714 - or e.response.status_code == HTTPStatus.FORBIDDEN - ): - _create_model_instance(model_handle, files_and_directories, license_name) + except BackendError as e: + if e.error_code == HTTPStatus.CONFLICT: + # Instance already exist, creating a new version instead. + _create_model_instance_version(model_handle, files, version_notes) else: raise (e) From ff1f5949d3ca327e206b1157dc2eb2c44e2c532b Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 20:43:31 +0000 Subject: [PATCH 21/34] r --- src/kagglehub/models_helpers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 42fbc662..2f7d0fed 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -63,7 +63,6 @@ def _create_model_instance_version( def create_model_instance_or_version( model_handle: ModelHandle, files_and_directories: UploadDirectoryInfo, - license_name: Optional[str], version_notes: str = "", ) -> None: try: @@ -74,7 +73,7 @@ def create_model_instance_or_version( except BackendError as e: if e.error_code == HTTPStatus.CONFLICT: # Instance already exist, creating a new version instead. - _create_model_instance_version(model_handle, files, version_notes) + _create_model_instance_version(model_handle, files_and_directories, version_notes) else: raise (e) From 008e55a1ec8af197c1848aa6da974f61bff02685 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 20:53:12 +0000 Subject: [PATCH 22/34] r --- src/kagglehub/models_helpers.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 2f7d0fed..2dfe46bf 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -1,6 +1,6 @@ import logging from http import HTTPStatus -from typing import Optional +from typing import List, Optional from kagglehub.clients import BackendError, KaggleApiV1Client from kagglehub.exceptions import KaggleApiHTTPError @@ -61,19 +61,14 @@ def _create_model_instance_version( def create_model_instance_or_version( - model_handle: ModelHandle, - files_and_directories: UploadDirectoryInfo, - version_notes: str = "", + model_handle: ModelHandle, files: List[str], license_name: Optional[str], version_notes: str = "" ) -> None: try: - api_client = KaggleApiV1Client() - api_client.get(f"/models/{model_handle}/get", model_handle) - # the instance exist, create a new version. - _create_model_instance_version(model_handle, files_and_directories, version_notes) + _create_model_instance(model_handle, files, license_name) except BackendError as e: if e.error_code == HTTPStatus.CONFLICT: # Instance already exist, creating a new version instead. - _create_model_instance_version(model_handle, files_and_directories, version_notes) + _create_model_instance_version(model_handle, files, version_notes) else: raise (e) From eaf095c7670bdb13712cb012fb22b1d7ec844615 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 20:57:15 +0000 Subject: [PATCH 23/34] r --- src/kagglehub/models_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 2dfe46bf..42e013ba 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -61,7 +61,7 @@ def _create_model_instance_version( def create_model_instance_or_version( - model_handle: ModelHandle, files: List[str], license_name: Optional[str], version_notes: str = "" + model_handle: ModelHandle, files: UploadDirectoryInfo, license_name: Optional[str], version_notes: str = "" ) -> None: try: _create_model_instance(model_handle, files, license_name) From 4dc0f499c908b3064ba840e212812b1b7fbd557b Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Wed, 24 Apr 2024 21:00:48 +0000 Subject: [PATCH 24/34] r --- src/kagglehub/models_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 42e013ba..82cc8050 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -1,6 +1,6 @@ import logging from http import HTTPStatus -from typing import List, Optional +from typing import Optional from kagglehub.clients import BackendError, KaggleApiV1Client from kagglehub.exceptions import KaggleApiHTTPError From d40061caaa5ee3a9ee86cb9f115ef3c77d74176d Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 17:49:32 +0000 Subject: [PATCH 25/34] r --- src/kagglehub/gcs_upload.py | 11 ++++++++--- src/kagglehub/models_helpers.py | 10 ++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 130943ac..33aeacb8 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -4,7 +4,7 @@ import zipfile from datetime import datetime from tempfile import TemporaryDirectory -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import requests from requests.exceptions import ConnectionError, Timeout @@ -33,6 +33,13 @@ def __init__( self.files = files if files is not None else [] self.directories = directories if directories is not None else [] + def serialize(self) -> Dict: + return { + "name": self.name, + "files": [{"token": file} for file in self.files], + "directories": [directory.serialize() for directory in self.directories] + } + def parse_datetime_string(string: str) -> Union[datetime, str]: time_formats = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%fZ"] @@ -230,8 +237,6 @@ def _upload_file_or_folder( full_path = os.path.join(parent_path, file_or_folder_name) if os.path.isfile(full_path): return _upload_file(file_or_folder_name, full_path, quiet, model_type) - elif not quiet: - logger.info("Skipping: " + file_or_folder_name) return None diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 82cc8050..5985fffd 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -20,10 +20,7 @@ def _create_model(owner_slug: str, model_slug: str) -> None: def _create_model_instance( model_handle: ModelHandle, files_and_directories: UploadDirectoryInfo, license_name: Optional[str] = None ) -> None: - serialized_data = [ - {"name": d.name, "files": [{"token": file} for file in d.files], "directories": d.directories} - for d in files_and_directories.directories - ] + serialized_data = files_and_directories.serialize() data = { "instanceSlug": model_handle.variation, "framework": model_handle.framework, @@ -41,10 +38,7 @@ def _create_model_instance( def _create_model_instance_version( model_handle: ModelHandle, files_and_directories: UploadDirectoryInfo, version_notes: str = "" ) -> None: - serialized_data = [ - {"name": d.name, "files": [{"token": file} for file in d.files], "directories": d.directories} - for d in files_and_directories.directories - ] + serialized_data = files_and_directories.serialize() data = { "versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories.files], From 6d896f09ed16d1f5c135f5df3b4104426ab331c3 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 18:44:02 +0000 Subject: [PATCH 26/34] r --- integration_tests/test_model_upload.py | 16 ++++++++++++++++ src/kagglehub/gcs_upload.py | 2 +- tests/test_model_upload.py | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/integration_tests/test_model_upload.py b/integration_tests/test_model_upload.py index 53c24d23..2728fe98 100644 --- a/integration_tests/test_model_upload.py +++ b/integration_tests/test_model_upload.py @@ -102,6 +102,22 @@ def test_model_upload_directory(self) -> None: # Create Version model_upload(self.handle, temp_dir, LICENSE_NAME) + def test_model_upload_directory_structure(self) -> None: + nested_dir = Path(self.temp_dir) / "nested" + nested_dir.mkdir() + + with open(Path(self.temp_dir) / "file1.txt", "w") as f: + f.write("dummy content in nested file") + + # Create dummy files in the nested directory + nested_dummy_files = ["nested_model.h5", "nested_config.json", "nested_metadata.json"] + for file in nested_dummy_files: + with open(nested_dir / file, "w") as f: + f.write("dummy content in nested file") + + # Call the model upload function with the base directory + model_upload(self.handle, self.temp_dir, LICENSE_NAME) + def test_model_upload_nested_dir(self) -> None: # Create a nested directory within self.temp_dir nested_dir = Path(self.temp_dir) / "nested" diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 33aeacb8..d3129168 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -37,7 +37,7 @@ def serialize(self) -> Dict: return { "name": self.name, "files": [{"token": file} for file in self.files], - "directories": [directory.serialize() for directory in self.directories] + "directories": [directory.serialize() for directory in self.directories], } diff --git a/tests/test_model_upload.py b/tests/test_model_upload.py index 22c52868..a2401f58 100644 --- a/tests/test_model_upload.py +++ b/tests/test_model_upload.py @@ -50,6 +50,7 @@ def do_POST(self) -> None: # noqa: N802 content_length = int(self.headers["Content-Length"]) post_data = self.rfile.read(content_length) data = json.loads(post_data.decode("utf-8")) + print(data) # Extracting the 'name' from the data name = data.get("name", None) @@ -244,3 +245,25 @@ def test_single_file_upload(self) -> None: self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 1) self.assertIn("single_dummy_file.txt", KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) + + def test_model_upload_with_directory_structure(self): + with create_test_http_server(KaggleAPIHandler): + with create_test_http_server(GcsAPIHandler, "http://localhost:7778"): + with TemporaryDirectory() as temp_dir: + base_path = Path(temp_dir) + (base_path / "dir1").mkdir() + (base_path / "dir2").mkdir() + + (base_path / "file1.txt").touch() + + (base_path / "dir1" / "file2.txt").touch() + (base_path / "dir1" / "file3.txt").touch() + + (base_path / "dir1" / "subdir1").mkdir() + (base_path / "dir1" / "subdir1" / "file4.txt").touch() + + model_upload("metaresearch/new-model/pyTorch/new-variation", temp_dir, APACHE_LICENSE, "model_type") + + self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 0) + expected_files = {"file1.txt", "ile2.txt", "file3.txt", "file4.txt"} + self.assertTrue(set(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES).issubset(expected_files)) From 7736410ec6ebbe5dfb0e860fdfe810e856220dd7 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 18:44:38 +0000 Subject: [PATCH 27/34] r --- tests/test_model_upload.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_model_upload.py b/tests/test_model_upload.py index a2401f58..6128644c 100644 --- a/tests/test_model_upload.py +++ b/tests/test_model_upload.py @@ -50,7 +50,6 @@ def do_POST(self) -> None: # noqa: N802 content_length = int(self.headers["Content-Length"]) post_data = self.rfile.read(content_length) data = json.loads(post_data.decode("utf-8")) - print(data) # Extracting the 'name' from the data name = data.get("name", None) From cb54f48d2d166cfe66d1b385fd2ce6a157bc3974 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 18:46:44 +0000 Subject: [PATCH 28/34] r --- tests/test_model_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model_upload.py b/tests/test_model_upload.py index 6128644c..ed9a5146 100644 --- a/tests/test_model_upload.py +++ b/tests/test_model_upload.py @@ -245,7 +245,7 @@ def test_single_file_upload(self) -> None: self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 1) self.assertIn("single_dummy_file.txt", KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES) - def test_model_upload_with_directory_structure(self): + def test_model_upload_with_directory_structure(self) -> None: with create_test_http_server(KaggleAPIHandler): with create_test_http_server(GcsAPIHandler, "http://localhost:7778"): with TemporaryDirectory() as temp_dir: From e9aff93e5239cce5192893265e418a80033c164c Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 19:24:18 +0000 Subject: [PATCH 29/34] ir --- tests/test_model_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model_upload.py b/tests/test_model_upload.py index ed9a5146..733e13d1 100644 --- a/tests/test_model_upload.py +++ b/tests/test_model_upload.py @@ -263,6 +263,6 @@ def test_model_upload_with_directory_structure(self) -> None: model_upload("metaresearch/new-model/pyTorch/new-variation", temp_dir, APACHE_LICENSE, "model_type") - self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 0) + self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 4) expected_files = {"file1.txt", "ile2.txt", "file3.txt", "file4.txt"} self.assertTrue(set(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES).issubset(expected_files)) From 490813fd18df9eadead44c0abe738d5a991e8e2a Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 19:27:04 +0000 Subject: [PATCH 30/34] r --- tests/test_model_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model_upload.py b/tests/test_model_upload.py index 733e13d1..36708aa3 100644 --- a/tests/test_model_upload.py +++ b/tests/test_model_upload.py @@ -264,5 +264,5 @@ def test_model_upload_with_directory_structure(self) -> None: model_upload("metaresearch/new-model/pyTorch/new-variation", temp_dir, APACHE_LICENSE, "model_type") self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 4) - expected_files = {"file1.txt", "ile2.txt", "file3.txt", "file4.txt"} + expected_files = {"file1.txt", "file2.txt", "file3.txt", "file4.txt"} self.assertTrue(set(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES).issubset(expected_files)) From 1afe0fcf35555bbb468738eb3f838c91a88b2c20 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 20:13:42 +0000 Subject: [PATCH 31/34] r --- tests/test_model_upload.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_model_upload.py b/tests/test_model_upload.py index 36708aa3..ccfb1988 100644 --- a/tests/test_model_upload.py +++ b/tests/test_model_upload.py @@ -266,3 +266,7 @@ def test_model_upload_with_directory_structure(self) -> None: self.assertEqual(len(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES), 4) expected_files = {"file1.txt", "file2.txt", "file3.txt", "file4.txt"} self.assertTrue(set(KaggleAPIHandler.UPLOAD_BLOB_FILE_NAMES).issubset(expected_files)) + + # TODO: Add assertions on CreateModelInstanceRequest.Directories and + # CreateModelInstanceRequest.Files to verify the expected structure + # is sent. From b2eb30f90bfe54d3b90f754d38ee381abe457bf2 Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 20:52:05 +0000 Subject: [PATCH 32/34] r --- src/kagglehub/models_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index 5985fffd..f92239b1 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -25,7 +25,7 @@ def _create_model_instance( "instanceSlug": model_handle.variation, "framework": model_handle.framework, "files": [{"token": file_token} for file_token in files_and_directories.files], - "directories": serialized_data, + "directories": [serialized_data], } if license_name is not None: data["licenseName"] = license_name @@ -42,7 +42,7 @@ def _create_model_instance_version( data = { "versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories.files], - "directories": serialized_data, + "directories": [serialized_data], } api_client = KaggleApiV1Client() api_client.post( From 500cd3b0d132da3a44d69df01f58d09524bdd6aa Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 21:06:59 +0000 Subject: [PATCH 33/34] r --- src/kagglehub/models_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index f92239b1..a787e57c 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -25,7 +25,7 @@ def _create_model_instance( "instanceSlug": model_handle.variation, "framework": model_handle.framework, "files": [{"token": file_token} for file_token in files_and_directories.files], - "directories": [serialized_data], + "directories": serialized_data['directories'], } if license_name is not None: data["licenseName"] = license_name @@ -42,7 +42,7 @@ def _create_model_instance_version( data = { "versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories.files], - "directories": [serialized_data], + "directories": serialized_data['directories'], } api_client = KaggleApiV1Client() api_client.post( From ba688d11d3b103daa8d87ba850a3b5120585715c Mon Sep 17 00:00:00 2001 From: Mohamed Amin Date: Thu, 25 Apr 2024 21:26:10 +0000 Subject: [PATCH 34/34] r --- src/kagglehub/models_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kagglehub/models_helpers.py b/src/kagglehub/models_helpers.py index a787e57c..45efed79 100644 --- a/src/kagglehub/models_helpers.py +++ b/src/kagglehub/models_helpers.py @@ -25,7 +25,7 @@ def _create_model_instance( "instanceSlug": model_handle.variation, "framework": model_handle.framework, "files": [{"token": file_token} for file_token in files_and_directories.files], - "directories": serialized_data['directories'], + "directories": serialized_data["directories"], } if license_name is not None: data["licenseName"] = license_name @@ -42,7 +42,7 @@ def _create_model_instance_version( data = { "versionNotes": version_notes, "files": [{"token": file_token} for file_token in files_and_directories.files], - "directories": serialized_data['directories'], + "directories": serialized_data["directories"], } api_client = KaggleApiV1Client() api_client.post(