From e140f6abae742c4b793b2fa635cc4b5a23f22b35 Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Thu, 4 Sep 2025 17:08:27 +0000
Subject: [PATCH 01/33] Add support for direct upload to r2 buckets

---
 src/litdata/constants.py             |   2 +-
 src/litdata/streaming/fs_provider.py | 155 +++++++++++++++++++++++++++
 tests/streaming/test_fs_provider.py  |   8 ++
 3 files changed, 164 insertions(+), 1 deletion(-)

diff --git a/src/litdata/constants.py b/src/litdata/constants.py
index 4159f53eb..f0ff7980a 100644
--- a/src/litdata/constants.py
+++ b/src/litdata/constants.py
@@ -24,7 +24,7 @@
 _DEFAULT_CACHE_DIR = os.path.join(Path.home(), ".lightning", "chunks")
 _DEFAULT_LIGHTNING_CACHE_DIR = os.path.join("/cache", "chunks")
 _LITDATA_CACHE_DIR = os.getenv("LITDATA_CACHE_DIR", None)
-_SUPPORTED_PROVIDERS = ("s3", "gs")  # cloud providers supported by litdata for uploading (optimize, map, merge, etc)
+_SUPPORTED_PROVIDERS = ("s3", "gs", "r2")  # cloud providers supported by litdata for uploading (optimize, map, merge, etc)
 
 # This is required for full pytree serialization / deserialization support
 _TORCH_GREATER_EQUAL_2_1_0 = RequirementCache("torch>=2.1.0")
diff --git a/src/litdata/streaming/fs_provider.py b/src/litdata/streaming/fs_provider.py
index f9263faaf..47715229e 100644
--- a/src/litdata/streaming/fs_provider.py
+++ b/src/litdata/streaming/fs_provider.py
@@ -223,6 +223,159 @@ def is_empty(self, path: str) -> bool:
 
         return not objects["KeyCount"] > 0
 
+class R2FsProvider(FsProvider):
+    def __init__(self, storage_options: Optional[dict[str, Any]] = {}):
+        super().__init__(storage_options=storage_options)
+
+        # Get data connection ID from environment variable (set by resolver)
+        data_connection_id = os.getenv("LIGHTNING_DATA_CONNECTION_ID")
+        
+        # Fetch R2 credentials from the Lightning platform and add them to the storage options
+        r2_credentials = self.get_r2_bucket_credentials(data_connection_id=data_connection_id)
+        storage_options = {**storage_options, **r2_credentials}
+        
+        self.client = S3Client(storage_options=storage_options)
+    
+    def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
+        """
+        Fetch temporary R2 credentials for the current lightning storage connection.
+        """
+        import json
+        import requests
+        
+        try:
+            # Get Lightning Cloud API token
+            cloud_url = os.getenv("LIGHTNING_CLOUD_URL", "https://lightning.ai")
+            api_key = os.getenv("LIGHTNING_API_KEY")
+            username = os.getenv("LIGHTNING_USERNAME")
+            project_id = os.getenv("LIGHTNING_CLOUD_PROJECT_ID")
+            
+            if not all([api_key, username, project_id]):
+                raise RuntimeError("Missing required environment variables")
+                
+            # Login to get token
+            payload = {"apiKey": api_key, "username": username}
+            login_url = f"{cloud_url}/v1/auth/login"
+            response = requests.post(login_url, data=json.dumps(payload))
+            
+            if "token" not in response.json():
+                raise RuntimeError("Failed to get authentication token")
+                
+            token = response.json()["token"]
+            
+            # Get temporary bucket credentials
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+            credentials_url = f"{cloud_url}/v1/projects/{project_id}/data-connections/{data_connection_id}/temp-bucket-credentials"
+            
+            credentials_response = requests.get(credentials_url, headers=headers)
+            
+            if credentials_response.status_code != 200:
+                raise RuntimeError(f"Failed to get credentials: {credentials_response.status_code}")
+                
+            temp_credentials = credentials_response.json()
+
+            endpoint_url = f"https://{temp_credentials['accountId']}.r2.cloudflarestorage.com"
+            
+            # Format credentials for S3Client
+            return {
+                "aws_access_key_id": temp_credentials["accessKeyId"],
+                "aws_secret_access_key": temp_credentials["secretAccessKey"],
+                "aws_session_token": temp_credentials["sessionToken"],
+                "endpoint_url": endpoint_url,
+                "region_name": "auto"
+            }
+            
+        except Exception as e:
+            # Fallback to hardcoded credentials if API call fails
+            print(f"Failed to get R2 credentials from API: {e}. Using fallback credentials.")
+            raise RuntimeError(f"Failed to get R2 credentials and no fallback available: {e}")
+
+    def upload_file(self, local_path: str, remote_path: str) -> None:
+        bucket_name, blob_path = get_bucket_and_path(remote_path, "r2")
+        self.client.client.upload_file(local_path, bucket_name, blob_path)
+
+    def download_file(self, remote_path: str, local_path: str) -> None:
+        bucket_name, blob_path = get_bucket_and_path(remote_path, "r2")
+        with open(local_path, "wb") as f:
+            self.client.client.download_fileobj(bucket_name, blob_path, f)
+
+    def download_directory(self, remote_path: str, local_directory_name: str) -> str:
+        """Download all objects under a given S3 prefix (directory) using the existing client."""
+        bucket_name, remote_directory_name = get_bucket_and_path(remote_path, "r2")
+
+        # Ensure local directory exists
+        local_directory_name = os.path.abspath(local_directory_name)
+        os.makedirs(local_directory_name, exist_ok=True)
+
+        saved_file_dir = "."
+
+        # List objects under the given prefix
+        objects = self.client.client.list_objects_v2(Bucket=bucket_name, Prefix=remote_directory_name)
+
+        # Check if objects exist
+        if "Contents" in objects:
+            for obj in objects["Contents"]:
+                local_filename = os.path.join(local_directory_name, obj["Key"])
+
+                # Ensure parent directories exist
+                os.makedirs(os.path.dirname(local_filename), exist_ok=True)
+
+                # Download each file
+                with open(local_filename, "wb") as f:
+                    self.client.client.download_fileobj(bucket_name, obj["Key"], f)
+                    saved_file_dir = os.path.dirname(local_filename)
+
+        return saved_file_dir
+
+    def copy(self, remote_source: str, remote_destination: str) -> None:
+        input_obj = parse.urlparse(remote_source)
+        output_obj = parse.urlparse(remote_destination)
+        self.client.client.copy(
+            {"Bucket": input_obj.netloc, "Key": input_obj.path.lstrip("/")},
+            output_obj.netloc,
+            output_obj.path.lstrip("/"),
+        )
+
+    def list_directory(self, path: str) -> list[str]:
+        raise NotImplementedError
+
+    def delete_file_or_directory(self, path: str) -> None:
+        """Delete the file or the directory."""
+        bucket_name, blob_path = get_bucket_and_path(path, "r2")
+
+        # List objects under the given path
+        objects = self.client.client.list_objects_v2(Bucket=bucket_name, Prefix=blob_path)
+
+        # Check if objects exist
+        if "Contents" in objects:
+            for obj in objects["Contents"]:
+                self.client.client.delete_object(Bucket=bucket_name, Key=obj["Key"])
+
+    def exists(self, path: str) -> bool:
+        import botocore
+
+        bucket_name, blob_path = get_bucket_and_path(path, "r2")
+        try:
+            _ = self.client.client.head_object(Bucket=bucket_name, Key=blob_path)
+            return True
+        except botocore.exceptions.ClientError as e:
+            if "the HeadObject operation: Not Found" in str(e):
+                return False
+            raise e
+        except Exception as e:
+            raise e
+
+    def is_empty(self, path: str) -> bool:
+        obj = parse.urlparse(path)
+
+        objects = self.client.client.list_objects_v2(
+            Bucket=obj.netloc,
+            Delimiter="/",
+            Prefix=obj.path.lstrip("/").rstrip("/") + "/",
+        )
+
+        return not objects["KeyCount"] > 0
+
 
 def get_bucket_and_path(remote_filepath: str, expected_scheme: str = "s3") -> tuple[str, str]:
     """Parse the remote filepath and return the bucket name and the blob path.
@@ -259,6 +412,8 @@ def _get_fs_provider(remote_filepath: str, storage_options: Optional[dict[str, A
         return GCPFsProvider(storage_options=storage_options)
     if obj.scheme == "s3":
         return S3FsProvider(storage_options=storage_options)
+    if obj.scheme == "r2":
+        return R2FsProvider(storage_options=storage_options)
     raise ValueError(f"Unsupported scheme: {obj.scheme}")
 
 
diff --git a/tests/streaming/test_fs_provider.py b/tests/streaming/test_fs_provider.py
index 1cf924df4..97958e1bc 100644
--- a/tests/streaming/test_fs_provider.py
+++ b/tests/streaming/test_fs_provider.py
@@ -6,6 +6,7 @@
 from litdata.streaming.fs_provider import (
     GCPFsProvider,
     S3FsProvider,
+    R2FsProvider,
     _get_fs_provider,
     get_bucket_and_path,
     not_supported_provider,
@@ -25,6 +26,10 @@ def test_get_bucket_and_path():
     assert bucket == "bucket"
     assert path == "path/to/file.txt"
 
+    bucket, path = get_bucket_and_path("r2://bucket/path/to/file.txt", "r2")
+    assert bucket == "bucket"
+    assert path == "path/to/file.txt"
+
 
 def test_get_fs_provider(monkeypatch, google_mock):
     google_mock.cloud.storage.Client = Mock()
@@ -37,6 +42,9 @@ def test_get_fs_provider(monkeypatch, google_mock):
     fs_provider = _get_fs_provider("gs://bucket/path/to/file.txt")
     assert isinstance(fs_provider, GCPFsProvider)
 
+    fs_provider = _get_fs_provider("r2://bucket/path/to/file.txt")
+    assert isinstance(fs_provider, R2FsProvider)
+
     with pytest.raises(ValueError, match="Unsupported scheme"):
         _get_fs_provider("http://bucket/path/to/file.txt")
 

From 32efe93f6827daf55186ae9ee202c0abad012c04 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 4 Sep 2025 17:42:13 +0000
Subject: [PATCH 02/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/litdata/constants.py             |  6 ++++-
 src/litdata/streaming/fs_provider.py | 40 +++++++++++++++-------------
 tests/streaming/test_fs_provider.py  |  2 +-
 3 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/litdata/constants.py b/src/litdata/constants.py
index f0ff7980a..086f7126d 100644
--- a/src/litdata/constants.py
+++ b/src/litdata/constants.py
@@ -24,7 +24,11 @@
 _DEFAULT_CACHE_DIR = os.path.join(Path.home(), ".lightning", "chunks")
 _DEFAULT_LIGHTNING_CACHE_DIR = os.path.join("/cache", "chunks")
 _LITDATA_CACHE_DIR = os.getenv("LITDATA_CACHE_DIR", None)
-_SUPPORTED_PROVIDERS = ("s3", "gs", "r2")  # cloud providers supported by litdata for uploading (optimize, map, merge, etc)
+_SUPPORTED_PROVIDERS = (
+    "s3",
+    "gs",
+    "r2",
+)  # cloud providers supported by litdata for uploading (optimize, map, merge, etc)
 
 # This is required for full pytree serialization / deserialization support
 _TORCH_GREATER_EQUAL_2_1_0 = RequirementCache("torch>=2.1.0")
diff --git a/src/litdata/streaming/fs_provider.py b/src/litdata/streaming/fs_provider.py
index 47715229e..a9462430b 100644
--- a/src/litdata/streaming/fs_provider.py
+++ b/src/litdata/streaming/fs_provider.py
@@ -223,68 +223,70 @@ def is_empty(self, path: str) -> bool:
 
         return not objects["KeyCount"] > 0
 
+
 class R2FsProvider(FsProvider):
     def __init__(self, storage_options: Optional[dict[str, Any]] = {}):
         super().__init__(storage_options=storage_options)
 
         # Get data connection ID from environment variable (set by resolver)
         data_connection_id = os.getenv("LIGHTNING_DATA_CONNECTION_ID")
-        
+
         # Fetch R2 credentials from the Lightning platform and add them to the storage options
         r2_credentials = self.get_r2_bucket_credentials(data_connection_id=data_connection_id)
         storage_options = {**storage_options, **r2_credentials}
-        
+
         self.client = S3Client(storage_options=storage_options)
-    
+
     def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
-        """
-        Fetch temporary R2 credentials for the current lightning storage connection.
-        """
+        """Fetch temporary R2 credentials for the current lightning storage connection."""
         import json
+
         import requests
-        
+
         try:
             # Get Lightning Cloud API token
             cloud_url = os.getenv("LIGHTNING_CLOUD_URL", "https://lightning.ai")
             api_key = os.getenv("LIGHTNING_API_KEY")
             username = os.getenv("LIGHTNING_USERNAME")
             project_id = os.getenv("LIGHTNING_CLOUD_PROJECT_ID")
-            
+
             if not all([api_key, username, project_id]):
                 raise RuntimeError("Missing required environment variables")
-                
+
             # Login to get token
             payload = {"apiKey": api_key, "username": username}
             login_url = f"{cloud_url}/v1/auth/login"
             response = requests.post(login_url, data=json.dumps(payload))
-            
+
             if "token" not in response.json():
                 raise RuntimeError("Failed to get authentication token")
-                
+
             token = response.json()["token"]
-            
+
             # Get temporary bucket credentials
             headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-            credentials_url = f"{cloud_url}/v1/projects/{project_id}/data-connections/{data_connection_id}/temp-bucket-credentials"
-            
+            credentials_url = (
+                f"{cloud_url}/v1/projects/{project_id}/data-connections/{data_connection_id}/temp-bucket-credentials"
+            )
+
             credentials_response = requests.get(credentials_url, headers=headers)
-            
+
             if credentials_response.status_code != 200:
                 raise RuntimeError(f"Failed to get credentials: {credentials_response.status_code}")
-                
+
             temp_credentials = credentials_response.json()
 
             endpoint_url = f"https://{temp_credentials['accountId']}.r2.cloudflarestorage.com"
-            
+
             # Format credentials for S3Client
             return {
                 "aws_access_key_id": temp_credentials["accessKeyId"],
                 "aws_secret_access_key": temp_credentials["secretAccessKey"],
                 "aws_session_token": temp_credentials["sessionToken"],
                 "endpoint_url": endpoint_url,
-                "region_name": "auto"
+                "region_name": "auto",
             }
-            
+
         except Exception as e:
             # Fallback to hardcoded credentials if API call fails
             print(f"Failed to get R2 credentials from API: {e}. Using fallback credentials.")
diff --git a/tests/streaming/test_fs_provider.py b/tests/streaming/test_fs_provider.py
index 97958e1bc..751895d0a 100644
--- a/tests/streaming/test_fs_provider.py
+++ b/tests/streaming/test_fs_provider.py
@@ -5,8 +5,8 @@
 from litdata.streaming import fs_provider as fs_provider_module
 from litdata.streaming.fs_provider import (
     GCPFsProvider,
-    S3FsProvider,
     R2FsProvider,
+    S3FsProvider,
     _get_fs_provider,
     get_bucket_and_path,
     not_supported_provider,

From 903c7192a3ca47a0caac65dc3ea5aa88110d393a Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Fri, 5 Sep 2025 11:18:52 +0000
Subject: [PATCH 03/33] Create a dedicated R2Client and move
 get_r2_bucket_credentials into that. R2FsProvider inherits from S3FsProvider
 now (duplicate functions removed)

---
 src/litdata/processing/data_processor.py |  49 ++++++++--
 src/litdata/streaming/client.py          | 112 +++++++++++++++++++++++
 src/litdata/streaming/fs_provider.py     |  95 +------------------
 src/litdata/streaming/resolver.py        |  23 ++++-
 4 files changed, 177 insertions(+), 102 deletions(-)

diff --git a/src/litdata/processing/data_processor.py b/src/litdata/processing/data_processor.py
index 7e62fc6ba..edc86e9da 100644
--- a/src/litdata/processing/data_processor.py
+++ b/src/litdata/processing/data_processor.py
@@ -168,7 +168,11 @@ def _download_data_target(
                     dirpath = os.path.dirname(local_path)
                     os.makedirs(dirpath, exist_ok=True)
                     if fs_provider is None:
-                        fs_provider = _get_fs_provider(input_dir.url, storage_options)
+                        # Add data connection ID to storage_options for R2 connections
+                        merged_storage_options = storage_options.copy()
+                        if hasattr(input_dir, 'data_connection_id') and input_dir.data_connection_id:
+                            merged_storage_options["lightning_data_connection_id"] = input_dir.data_connection_id
+                        fs_provider = _get_fs_provider(input_dir.url, merged_storage_options)
                     fs_provider.download_file(path, local_path)
 
                 elif os.path.isfile(path):
@@ -233,7 +237,11 @@ def _upload_fn(
     obj = parse.urlparse(output_dir.url if output_dir.url else output_dir.path)
 
     if obj.scheme in _SUPPORTED_PROVIDERS:
-        fs_provider = _get_fs_provider(output_dir.url, storage_options)
+        # Add data connection ID to storage_options for R2 connections
+        merged_storage_options = storage_options.copy()
+        if hasattr(output_dir, 'data_connection_id') and output_dir.data_connection_id:
+            merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+        fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 
     while True:
         data: Optional[Union[str, tuple[str, str]]] = upload_queue.get()
@@ -1022,7 +1030,12 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
             local_filepath = os.path.join(cache_dir, _INDEX_FILENAME)
 
         if obj.scheme in _SUPPORTED_PROVIDERS:
-            fs_provider = _get_fs_provider(output_dir.url, self.storage_options)
+            # Add data connection ID to storage_options for R2 connections
+            merged_storage_options = self.storage_options.copy()
+            if hasattr(output_dir, 'data_connection_id') and output_dir.data_connection_id:
+                merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+            
+            fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
             fs_provider.upload_file(
                 local_filepath,
                 os.path.join(output_dir.url, os.path.basename(local_filepath)),
@@ -1044,8 +1057,13 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
                 remote_filepath = os.path.join(output_dir_path, f"{node_rank}-{_INDEX_FILENAME}")
                 node_index_filepath = os.path.join(cache_dir, os.path.basename(remote_filepath))
                 if obj.scheme in _SUPPORTED_PROVIDERS:
-                    _wait_for_file_to_exist(remote_filepath, storage_options=self.storage_options)
-                    fs_provider = _get_fs_provider(remote_filepath, self.storage_options)
+                    # Add data connection ID to storage_options for R2 connections
+                    merged_storage_options = self.storage_options.copy()
+                    if hasattr(output_dir, 'data_connection_id') and output_dir.data_connection_id:
+                        merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+                    
+                    _wait_for_file_to_exist(remote_filepath, storage_options=merged_storage_options)
+                    fs_provider = _get_fs_provider(remote_filepath, merged_storage_options)
                     fs_provider.download_file(remote_filepath, node_index_filepath)
                 elif output_dir.path and os.path.isdir(output_dir.path):
                     shutil.copyfile(remote_filepath, node_index_filepath)
@@ -1500,7 +1518,12 @@ def _cleanup_checkpoints(self) -> None:
         prefix = self.output_dir.url.rstrip("/") + "/"
         checkpoint_prefix = os.path.join(prefix, ".checkpoints")
 
-        fs_provider = _get_fs_provider(self.output_dir.url, self.storage_options)
+        # Add data connection ID to storage_options for R2 connections
+        merged_storage_options = self.storage_options.copy()
+        if hasattr(self.output_dir, 'data_connection_id') and self.output_dir.data_connection_id:
+            merged_storage_options["lightning_data_connection_id"] = self.output_dir.data_connection_id
+        
+        fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
         fs_provider.delete_file_or_directory(checkpoint_prefix)
 
     def _save_current_config(self, workers_user_items: list[list[Any]]) -> None:
@@ -1530,7 +1553,12 @@ def _save_current_config(self, workers_user_items: list[list[Any]]) -> None:
             if obj.scheme not in _SUPPORTED_PROVIDERS:
                 not_supported_provider(self.output_dir.url)
 
-            fs_provider = _get_fs_provider(self.output_dir.url, self.storage_options)
+            # Add data connection ID to storage_options for R2 connections
+            merged_storage_options = self.storage_options.copy()
+            if hasattr(self.output_dir, 'data_connection_id') and self.output_dir.data_connection_id:
+                merged_storage_options["lightning_data_connection_id"] = self.output_dir.data_connection_id
+            
+            fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
 
             prefix = self.output_dir.url.rstrip("/") + "/" + ".checkpoints/"
 
@@ -1601,7 +1629,12 @@ def _load_checkpoint_config(self, workers_user_items: list[list[Any]]) -> None:
 
         # download all the checkpoint files in tempdir and read them
         with tempfile.TemporaryDirectory() as temp_dir:
-            fs_provider = _get_fs_provider(self.output_dir.url, self.storage_options)
+            # Add data connection ID to storage_options for R2 connections
+            merged_storage_options = self.storage_options.copy()
+            if hasattr(self.output_dir, 'data_connection_id') and self.output_dir.data_connection_id:
+                merged_storage_options["lightning_data_connection_id"] = self.output_dir.data_connection_id
+            
+            fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
             saved_file_dir = fs_provider.download_directory(prefix, temp_dir)
 
             if not os.path.exists(os.path.join(saved_file_dir, "config.json")):
diff --git a/src/litdata/streaming/client.py b/src/litdata/streaming/client.py
index 3b1c157f5..97356a7b4 100644
--- a/src/litdata/streaming/client.py
+++ b/src/litdata/streaming/client.py
@@ -76,3 +76,115 @@ def client(self) -> Any:
             self._last_time = time()
 
         return self._client
+
+
+class R2Client:
+    """R2 client with refreshable credentials for Cloudflare R2 storage."""
+
+    def __init__(
+        self,
+        refetch_interval: int = 3600, # 1 hour - this is the default refresh interval for R2 credentials
+        storage_options: Optional[dict] = {},
+        session_options: Optional[dict] = {},
+    ) -> None:
+        self._refetch_interval = refetch_interval
+        self._last_time: Optional[float] = None
+        self._client: Optional[Any] = None
+        self._base_storage_options: dict = storage_options or {}
+        self._session_options: dict = session_options or {}
+
+    def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
+        """
+        Fetch temporary R2 credentials for the current lightning storage connection.
+        """
+        import json
+        import requests
+        
+        try:
+            # Get Lightning Cloud API token
+            cloud_url = os.getenv("LIGHTNING_CLOUD_URL", "https://lightning.ai")
+            api_key = os.getenv("LIGHTNING_API_KEY")
+            username = os.getenv("LIGHTNING_USERNAME")
+            project_id = os.getenv("LIGHTNING_CLOUD_PROJECT_ID")
+            
+            if not all([api_key, username, project_id]):
+                raise RuntimeError("Missing required environment variables")
+                
+            # Login to get token
+            payload = {"apiKey": api_key, "username": username}
+            login_url = f"{cloud_url}/v1/auth/login"
+            response = requests.post(login_url, data=json.dumps(payload))
+            
+            if "token" not in response.json():
+                raise RuntimeError("Failed to get authentication token")
+                
+            token = response.json()["token"]
+            
+            # Get temporary bucket credentials
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+            credentials_url = f"{cloud_url}/v1/projects/{project_id}/data-connections/{data_connection_id}/temp-bucket-credentials"
+            
+            credentials_response = requests.get(credentials_url, headers=headers)
+            
+            if credentials_response.status_code != 200:
+                raise RuntimeError(f"Failed to get credentials: {credentials_response.status_code}")
+                
+            temp_credentials = credentials_response.json()
+
+            endpoint_url = f"https://{temp_credentials['accountId']}.r2.cloudflarestorage.com"
+            
+            # Format credentials for S3Client
+            return {
+                "aws_access_key_id": temp_credentials["accessKeyId"],
+                "aws_secret_access_key": temp_credentials["secretAccessKey"],
+                "aws_session_token": temp_credentials["sessionToken"],
+                "endpoint_url": endpoint_url,
+            }
+            
+        except Exception as e:
+            # Fallback to hardcoded credentials if API call fails
+            print(f"Failed to get R2 credentials from API: {e}. Using fallback credentials.")
+            raise RuntimeError(f"Failed to get R2 credentials and no fallback available: {e}")
+
+    def _create_client(self) -> None:
+        """Create a new R2 client with fresh credentials."""
+        # Get data connection ID from storage options
+        data_connection_id = self._base_storage_options.get("lightning_data_connection_id")
+        if not data_connection_id:
+            raise RuntimeError("lightning_data_connection_id is required in storage_options for R2 client")
+        
+        # Get fresh R2 credentials
+        r2_credentials = self.get_r2_bucket_credentials(data_connection_id)
+        
+        # Filter out metadata keys that shouldn't be passed to boto3
+        filtered_storage_options = {
+            k: v for k, v in self._base_storage_options.items() 
+            if k not in ["lightning_data_connection_id"]
+        }
+        
+        # Combine filtered storage options with fresh credentials
+        storage_options = {**filtered_storage_options, **r2_credentials}
+        
+        # Create session and client
+        session = boto3.Session(**self._session_options)
+        self._client = session.client(
+            "s3",
+            **{
+                "config": botocore.config.Config(retries={"max_attempts": 1000, "mode": "adaptive"}),
+                **storage_options,
+            },
+        )
+
+    @property
+    def client(self) -> Any:
+        """Get the R2 client, refreshing credentials if necessary."""
+        if self._client is None:
+            self._create_client()
+            self._last_time = time()
+
+        # Re-generate credentials when they expire
+        if self._last_time is None or (time() - self._last_time) > self._refetch_interval:
+            self._create_client()
+            self._last_time = time()
+
+        return self._client
diff --git a/src/litdata/streaming/fs_provider.py b/src/litdata/streaming/fs_provider.py
index a9462430b..f008b8c8e 100644
--- a/src/litdata/streaming/fs_provider.py
+++ b/src/litdata/streaming/fs_provider.py
@@ -16,7 +16,7 @@
 from urllib import parse
 
 from litdata.constants import _GOOGLE_STORAGE_AVAILABLE, _SUPPORTED_PROVIDERS
-from litdata.streaming.client import S3Client
+from litdata.streaming.client import S3Client, R2Client
 
 
 class FsProvider(ABC):
@@ -223,74 +223,12 @@ def is_empty(self, path: str) -> bool:
 
         return not objects["KeyCount"] > 0
 
-
-class R2FsProvider(FsProvider):
+class R2FsProvider(S3FsProvider):
     def __init__(self, storage_options: Optional[dict[str, Any]] = {}):
         super().__init__(storage_options=storage_options)
-
-        # Get data connection ID from environment variable (set by resolver)
-        data_connection_id = os.getenv("LIGHTNING_DATA_CONNECTION_ID")
-
-        # Fetch R2 credentials from the Lightning platform and add them to the storage options
-        r2_credentials = self.get_r2_bucket_credentials(data_connection_id=data_connection_id)
-        storage_options = {**storage_options, **r2_credentials}
-
-        self.client = S3Client(storage_options=storage_options)
-
-    def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
-        """Fetch temporary R2 credentials for the current lightning storage connection."""
-        import json
-
-        import requests
-
-        try:
-            # Get Lightning Cloud API token
-            cloud_url = os.getenv("LIGHTNING_CLOUD_URL", "https://lightning.ai")
-            api_key = os.getenv("LIGHTNING_API_KEY")
-            username = os.getenv("LIGHTNING_USERNAME")
-            project_id = os.getenv("LIGHTNING_CLOUD_PROJECT_ID")
-
-            if not all([api_key, username, project_id]):
-                raise RuntimeError("Missing required environment variables")
-
-            # Login to get token
-            payload = {"apiKey": api_key, "username": username}
-            login_url = f"{cloud_url}/v1/auth/login"
-            response = requests.post(login_url, data=json.dumps(payload))
-
-            if "token" not in response.json():
-                raise RuntimeError("Failed to get authentication token")
-
-            token = response.json()["token"]
-
-            # Get temporary bucket credentials
-            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-            credentials_url = (
-                f"{cloud_url}/v1/projects/{project_id}/data-connections/{data_connection_id}/temp-bucket-credentials"
-            )
-
-            credentials_response = requests.get(credentials_url, headers=headers)
-
-            if credentials_response.status_code != 200:
-                raise RuntimeError(f"Failed to get credentials: {credentials_response.status_code}")
-
-            temp_credentials = credentials_response.json()
-
-            endpoint_url = f"https://{temp_credentials['accountId']}.r2.cloudflarestorage.com"
-
-            # Format credentials for S3Client
-            return {
-                "aws_access_key_id": temp_credentials["accessKeyId"],
-                "aws_secret_access_key": temp_credentials["secretAccessKey"],
-                "aws_session_token": temp_credentials["sessionToken"],
-                "endpoint_url": endpoint_url,
-                "region_name": "auto",
-            }
-
-        except Exception as e:
-            # Fallback to hardcoded credentials if API call fails
-            print(f"Failed to get R2 credentials from API: {e}. Using fallback credentials.")
-            raise RuntimeError(f"Failed to get R2 credentials and no fallback available: {e}")
+        
+        # Create R2Client with refreshable credentials
+        self.client = R2Client(storage_options=storage_options)
 
     def upload_file(self, local_path: str, remote_path: str) -> None:
         bucket_name, blob_path = get_bucket_and_path(remote_path, "r2")
@@ -329,18 +267,6 @@ def download_directory(self, remote_path: str, local_directory_name: str) -> str
 
         return saved_file_dir
 
-    def copy(self, remote_source: str, remote_destination: str) -> None:
-        input_obj = parse.urlparse(remote_source)
-        output_obj = parse.urlparse(remote_destination)
-        self.client.client.copy(
-            {"Bucket": input_obj.netloc, "Key": input_obj.path.lstrip("/")},
-            output_obj.netloc,
-            output_obj.path.lstrip("/"),
-        )
-
-    def list_directory(self, path: str) -> list[str]:
-        raise NotImplementedError
-
     def delete_file_or_directory(self, path: str) -> None:
         """Delete the file or the directory."""
         bucket_name, blob_path = get_bucket_and_path(path, "r2")
@@ -367,17 +293,6 @@ def exists(self, path: str) -> bool:
         except Exception as e:
             raise e
 
-    def is_empty(self, path: str) -> bool:
-        obj = parse.urlparse(path)
-
-        objects = self.client.client.list_objects_v2(
-            Bucket=obj.netloc,
-            Delimiter="/",
-            Prefix=obj.path.lstrip("/").rstrip("/") + "/",
-        )
-
-        return not objects["KeyCount"] > 0
-
 
 def get_bucket_and_path(remote_filepath: str, expected_scheme: str = "s3") -> tuple[str, str]:
     """Parse the remote filepath and return the bucket name and the blob path.
diff --git a/src/litdata/streaming/resolver.py b/src/litdata/streaming/resolver.py
index 86172f6bf..47a670dd8 100644
--- a/src/litdata/streaming/resolver.py
+++ b/src/litdata/streaming/resolver.py
@@ -39,6 +39,7 @@ class Dir:
 
     path: Optional[str] = None
     url: Optional[str] = None
+    data_connection_id: Optional[str] = None
 
 
 class CloudProvider(str, Enum):
@@ -48,7 +49,11 @@ class CloudProvider(str, Enum):
 
 def _resolve_dir(dir_path: Optional[Union[str, Path, Dir]]) -> Dir:
     if isinstance(dir_path, Dir):
-        return Dir(path=str(dir_path.path) if dir_path.path else None, url=str(dir_path.url) if dir_path.url else None)
+        return Dir(
+            path=str(dir_path.path) if dir_path.path else None, 
+            url=str(dir_path.url) if dir_path.url else None,
+            data_connection_id=dir_path.data_connection_id
+        )
 
     if dir_path is None:
         return Dir()
@@ -236,7 +241,7 @@ def _resolve_gcs_folders(dir_path: str) -> Dir:
 def _resolve_lightning_storage(dir_path: str) -> Dir:
     data_connection = _resolve_data_connection(dir_path)
 
-    return Dir(path=dir_path, url=os.path.join(data_connection.r2.source, *dir_path.split("/")[4:]))
+    return Dir(path=dir_path, url=os.path.join(data_connection.r2.source, *dir_path.split("/")[4:]), data_connection_id=data_connection[0].id)
 
 
 def _resolve_datasets(dir_path: str) -> Dir:
@@ -301,7 +306,12 @@ def _assert_dir_is_empty(
     if obj.scheme not in _SUPPORTED_PROVIDERS:
         not_supported_provider(output_dir.url)
 
-    fs_provider = _get_fs_provider(output_dir.url, storage_options)
+    # Add data connection ID to storage_options for R2 connections
+    merged_storage_options = storage_options.copy()
+    if output_dir.data_connection_id:
+        merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+    
+    fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 
     is_empty = fs_provider.is_empty(output_dir.url)
 
@@ -365,7 +375,12 @@ def _assert_dir_has_index_file(
     if obj.scheme not in _SUPPORTED_PROVIDERS:
         not_supported_provider(output_dir.url)
 
-    fs_provider = _get_fs_provider(output_dir.url, storage_options)
+    # Add data connection ID to storage_options for R2 connections
+    merged_storage_options = storage_options.copy()
+    if output_dir.data_connection_id:
+        merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+    
+    fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 
     prefix = output_dir.url.rstrip("/") + "/"
 

From 79fd5ea6ea71544637ef48845bb5a42361ca2eca Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 5 Sep 2025 11:19:35 +0000
Subject: [PATCH 04/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/litdata/processing/data_processor.py | 24 ++++++-------
 src/litdata/streaming/client.py          | 44 ++++++++++++------------
 src/litdata/streaming/fs_provider.py     |  5 +--
 src/litdata/streaming/resolver.py        |  8 ++---
 4 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/src/litdata/processing/data_processor.py b/src/litdata/processing/data_processor.py
index edc86e9da..2ce3481db 100644
--- a/src/litdata/processing/data_processor.py
+++ b/src/litdata/processing/data_processor.py
@@ -170,7 +170,7 @@ def _download_data_target(
                     if fs_provider is None:
                         # Add data connection ID to storage_options for R2 connections
                         merged_storage_options = storage_options.copy()
-                        if hasattr(input_dir, 'data_connection_id') and input_dir.data_connection_id:
+                        if hasattr(input_dir, "data_connection_id") and input_dir.data_connection_id:
                             merged_storage_options["lightning_data_connection_id"] = input_dir.data_connection_id
                         fs_provider = _get_fs_provider(input_dir.url, merged_storage_options)
                     fs_provider.download_file(path, local_path)
@@ -239,7 +239,7 @@ def _upload_fn(
     if obj.scheme in _SUPPORTED_PROVIDERS:
         # Add data connection ID to storage_options for R2 connections
         merged_storage_options = storage_options.copy()
-        if hasattr(output_dir, 'data_connection_id') and output_dir.data_connection_id:
+        if hasattr(output_dir, "data_connection_id") and output_dir.data_connection_id:
             merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
         fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 
@@ -1032,9 +1032,9 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
         if obj.scheme in _SUPPORTED_PROVIDERS:
             # Add data connection ID to storage_options for R2 connections
             merged_storage_options = self.storage_options.copy()
-            if hasattr(output_dir, 'data_connection_id') and output_dir.data_connection_id:
+            if hasattr(output_dir, "data_connection_id") and output_dir.data_connection_id:
                 merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
-            
+
             fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
             fs_provider.upload_file(
                 local_filepath,
@@ -1059,9 +1059,9 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
                 if obj.scheme in _SUPPORTED_PROVIDERS:
                     # Add data connection ID to storage_options for R2 connections
                     merged_storage_options = self.storage_options.copy()
-                    if hasattr(output_dir, 'data_connection_id') and output_dir.data_connection_id:
+                    if hasattr(output_dir, "data_connection_id") and output_dir.data_connection_id:
                         merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
-                    
+
                     _wait_for_file_to_exist(remote_filepath, storage_options=merged_storage_options)
                     fs_provider = _get_fs_provider(remote_filepath, merged_storage_options)
                     fs_provider.download_file(remote_filepath, node_index_filepath)
@@ -1520,9 +1520,9 @@ def _cleanup_checkpoints(self) -> None:
 
         # Add data connection ID to storage_options for R2 connections
         merged_storage_options = self.storage_options.copy()
-        if hasattr(self.output_dir, 'data_connection_id') and self.output_dir.data_connection_id:
+        if hasattr(self.output_dir, "data_connection_id") and self.output_dir.data_connection_id:
             merged_storage_options["lightning_data_connection_id"] = self.output_dir.data_connection_id
-        
+
         fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
         fs_provider.delete_file_or_directory(checkpoint_prefix)
 
@@ -1555,9 +1555,9 @@ def _save_current_config(self, workers_user_items: list[list[Any]]) -> None:
 
             # Add data connection ID to storage_options for R2 connections
             merged_storage_options = self.storage_options.copy()
-            if hasattr(self.output_dir, 'data_connection_id') and self.output_dir.data_connection_id:
+            if hasattr(self.output_dir, "data_connection_id") and self.output_dir.data_connection_id:
                 merged_storage_options["lightning_data_connection_id"] = self.output_dir.data_connection_id
-            
+
             fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
 
             prefix = self.output_dir.url.rstrip("/") + "/" + ".checkpoints/"
@@ -1631,9 +1631,9 @@ def _load_checkpoint_config(self, workers_user_items: list[list[Any]]) -> None:
         with tempfile.TemporaryDirectory() as temp_dir:
             # Add data connection ID to storage_options for R2 connections
             merged_storage_options = self.storage_options.copy()
-            if hasattr(self.output_dir, 'data_connection_id') and self.output_dir.data_connection_id:
+            if hasattr(self.output_dir, "data_connection_id") and self.output_dir.data_connection_id:
                 merged_storage_options["lightning_data_connection_id"] = self.output_dir.data_connection_id
-            
+
             fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
             saved_file_dir = fs_provider.download_directory(prefix, temp_dir)
 
diff --git a/src/litdata/streaming/client.py b/src/litdata/streaming/client.py
index 97356a7b4..09bcd137b 100644
--- a/src/litdata/streaming/client.py
+++ b/src/litdata/streaming/client.py
@@ -83,7 +83,7 @@ class R2Client:
 
     def __init__(
         self,
-        refetch_interval: int = 3600, # 1 hour - this is the default refresh interval for R2 credentials
+        refetch_interval: int = 3600,  # 1 hour - this is the default refresh interval for R2 credentials
         storage_options: Optional[dict] = {},
         session_options: Optional[dict] = {},
     ) -> None:
@@ -94,45 +94,46 @@ def __init__(
         self._session_options: dict = session_options or {}
 
     def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
-        """
-        Fetch temporary R2 credentials for the current lightning storage connection.
-        """
+        """Fetch temporary R2 credentials for the current lightning storage connection."""
         import json
+
         import requests
-        
+
         try:
             # Get Lightning Cloud API token
             cloud_url = os.getenv("LIGHTNING_CLOUD_URL", "https://lightning.ai")
             api_key = os.getenv("LIGHTNING_API_KEY")
             username = os.getenv("LIGHTNING_USERNAME")
             project_id = os.getenv("LIGHTNING_CLOUD_PROJECT_ID")
-            
+
             if not all([api_key, username, project_id]):
                 raise RuntimeError("Missing required environment variables")
-                
+
             # Login to get token
             payload = {"apiKey": api_key, "username": username}
             login_url = f"{cloud_url}/v1/auth/login"
             response = requests.post(login_url, data=json.dumps(payload))
-            
+
             if "token" not in response.json():
                 raise RuntimeError("Failed to get authentication token")
-                
+
             token = response.json()["token"]
-            
+
             # Get temporary bucket credentials
             headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-            credentials_url = f"{cloud_url}/v1/projects/{project_id}/data-connections/{data_connection_id}/temp-bucket-credentials"
-            
+            credentials_url = (
+                f"{cloud_url}/v1/projects/{project_id}/data-connections/{data_connection_id}/temp-bucket-credentials"
+            )
+
             credentials_response = requests.get(credentials_url, headers=headers)
-            
+
             if credentials_response.status_code != 200:
                 raise RuntimeError(f"Failed to get credentials: {credentials_response.status_code}")
-                
+
             temp_credentials = credentials_response.json()
 
             endpoint_url = f"https://{temp_credentials['accountId']}.r2.cloudflarestorage.com"
-            
+
             # Format credentials for S3Client
             return {
                 "aws_access_key_id": temp_credentials["accessKeyId"],
@@ -140,7 +141,7 @@ def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
                 "aws_session_token": temp_credentials["sessionToken"],
                 "endpoint_url": endpoint_url,
             }
-            
+
         except Exception as e:
             # Fallback to hardcoded credentials if API call fails
             print(f"Failed to get R2 credentials from API: {e}. Using fallback credentials.")
@@ -152,19 +153,18 @@ def _create_client(self) -> None:
         data_connection_id = self._base_storage_options.get("lightning_data_connection_id")
         if not data_connection_id:
             raise RuntimeError("lightning_data_connection_id is required in storage_options for R2 client")
-        
+
         # Get fresh R2 credentials
         r2_credentials = self.get_r2_bucket_credentials(data_connection_id)
-        
+
         # Filter out metadata keys that shouldn't be passed to boto3
         filtered_storage_options = {
-            k: v for k, v in self._base_storage_options.items() 
-            if k not in ["lightning_data_connection_id"]
+            k: v for k, v in self._base_storage_options.items() if k not in ["lightning_data_connection_id"]
         }
-        
+
         # Combine filtered storage options with fresh credentials
         storage_options = {**filtered_storage_options, **r2_credentials}
-        
+
         # Create session and client
         session = boto3.Session(**self._session_options)
         self._client = session.client(
diff --git a/src/litdata/streaming/fs_provider.py b/src/litdata/streaming/fs_provider.py
index f008b8c8e..24e62df56 100644
--- a/src/litdata/streaming/fs_provider.py
+++ b/src/litdata/streaming/fs_provider.py
@@ -16,7 +16,7 @@
 from urllib import parse
 
 from litdata.constants import _GOOGLE_STORAGE_AVAILABLE, _SUPPORTED_PROVIDERS
-from litdata.streaming.client import S3Client, R2Client
+from litdata.streaming.client import R2Client, S3Client
 
 
 class FsProvider(ABC):
@@ -223,10 +223,11 @@ def is_empty(self, path: str) -> bool:
 
         return not objects["KeyCount"] > 0
 
+
 class R2FsProvider(S3FsProvider):
     def __init__(self, storage_options: Optional[dict[str, Any]] = {}):
         super().__init__(storage_options=storage_options)
-        
+
         # Create R2Client with refreshable credentials
         self.client = R2Client(storage_options=storage_options)
 
diff --git a/src/litdata/streaming/resolver.py b/src/litdata/streaming/resolver.py
index 47a670dd8..20b49e76a 100644
--- a/src/litdata/streaming/resolver.py
+++ b/src/litdata/streaming/resolver.py
@@ -50,9 +50,9 @@ class CloudProvider(str, Enum):
 def _resolve_dir(dir_path: Optional[Union[str, Path, Dir]]) -> Dir:
     if isinstance(dir_path, Dir):
         return Dir(
-            path=str(dir_path.path) if dir_path.path else None, 
+            path=str(dir_path.path) if dir_path.path else None,
             url=str(dir_path.url) if dir_path.url else None,
-            data_connection_id=dir_path.data_connection_id
+            data_connection_id=dir_path.data_connection_id,
         )
 
     if dir_path is None:
@@ -310,7 +310,7 @@ def _assert_dir_is_empty(
     merged_storage_options = storage_options.copy()
     if output_dir.data_connection_id:
         merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
-    
+
     fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 
     is_empty = fs_provider.is_empty(output_dir.url)
@@ -379,7 +379,7 @@ def _assert_dir_has_index_file(
     merged_storage_options = storage_options.copy()
     if output_dir.data_connection_id:
         merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
-    
+
     fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 
     prefix = output_dir.url.rstrip("/") + "/"

From 62f8327b301578fa04bceba7e183d104e2879509 Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Fri, 5 Sep 2025 12:45:38 +0000
Subject: [PATCH 05/33] Move back to R2FsProvider inheriting from FsProvider to
 avoid self.client class type mismatch

---
 src/litdata/streaming/fs_provider.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/litdata/streaming/fs_provider.py b/src/litdata/streaming/fs_provider.py
index 24e62df56..f1a3683db 100644
--- a/src/litdata/streaming/fs_provider.py
+++ b/src/litdata/streaming/fs_provider.py
@@ -224,7 +224,7 @@ def is_empty(self, path: str) -> bool:
         return not objects["KeyCount"] > 0
 
 
-class R2FsProvider(S3FsProvider):
+class R2FsProvider(FsProvider):
     def __init__(self, storage_options: Optional[dict[str, Any]] = {}):
         super().__init__(storage_options=storage_options)
 
@@ -268,6 +268,18 @@ def download_directory(self, remote_path: str, local_directory_name: str) -> str
 
         return saved_file_dir
 
+    def copy(self, remote_source: str, remote_destination: str) -> None:
+        input_obj = parse.urlparse(remote_source)
+        output_obj = parse.urlparse(remote_destination)
+        self.client.client.copy(
+            {"Bucket": input_obj.netloc, "Key": input_obj.path.lstrip("/")},
+            output_obj.netloc,
+            output_obj.path.lstrip("/"),
+        )
+
+    def list_directory(self, path: str) -> list[str]:
+        raise NotImplementedError
+
     def delete_file_or_directory(self, path: str) -> None:
         """Delete the file or the directory."""
         bucket_name, blob_path = get_bucket_and_path(path, "r2")
@@ -294,6 +306,17 @@ def exists(self, path: str) -> bool:
         except Exception as e:
             raise e
 
+    def is_empty(self, path: str) -> bool:
+        obj = parse.urlparse(path)
+
+        objects = self.client.client.list_objects_v2(
+            Bucket=obj.netloc,
+            Delimiter="/",
+            Prefix=obj.path.lstrip("/").rstrip("/") + "/",
+        )
+
+        return not objects["KeyCount"] > 0
+
 
 def get_bucket_and_path(remote_filepath: str, expected_scheme: str = "s3") -> tuple[str, str]:
     """Parse the remote filepath and return the bucket name and the blob path.

From 46b808647b85402e0d1ca73a755dd1c862e8d67c Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Fri, 5 Sep 2025 12:45:50 +0000
Subject: [PATCH 06/33] Use existing patterns when callling requests.post/get

---
 src/litdata/streaming/client.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/litdata/streaming/client.py b/src/litdata/streaming/client.py
index 09bcd137b..cccbcddbb 100644
--- a/src/litdata/streaming/client.py
+++ b/src/litdata/streaming/client.py
@@ -112,8 +112,8 @@ def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
             # Login to get token
             payload = {"apiKey": api_key, "username": username}
             login_url = f"{cloud_url}/v1/auth/login"
-            response = requests.post(login_url, data=json.dumps(payload))
-
+            response = requests.post(login_url, data=json.dumps(payload))  # noqa: S113
+            
             if "token" not in response.json():
                 raise RuntimeError("Failed to get authentication token")
 
@@ -125,7 +125,7 @@ def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
                 f"{cloud_url}/v1/projects/{project_id}/data-connections/{data_connection_id}/temp-bucket-credentials"
             )
 
-            credentials_response = requests.get(credentials_url, headers=headers)
+            credentials_response = requests.get(credentials_url, headers=headers, timeout=10)
 
             if credentials_response.status_code != 200:
                 raise RuntimeError(f"Failed to get credentials: {credentials_response.status_code}")

From 2568398d2b7a8e703914df6a05950a58c663f898 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 5 Sep 2025 12:47:30 +0000
Subject: [PATCH 07/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/litdata/streaming/client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/litdata/streaming/client.py b/src/litdata/streaming/client.py
index cccbcddbb..cb56d4038 100644
--- a/src/litdata/streaming/client.py
+++ b/src/litdata/streaming/client.py
@@ -113,7 +113,7 @@ def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
             payload = {"apiKey": api_key, "username": username}
             login_url = f"{cloud_url}/v1/auth/login"
             response = requests.post(login_url, data=json.dumps(payload))  # noqa: S113
-            
+
             if "token" not in response.json():
                 raise RuntimeError("Failed to get authentication token")
 

From 5807a3106cb59ef778cf1782070759df3ff6b7bc Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Fri, 5 Sep 2025 13:20:57 +0000
Subject: [PATCH 08/33] Rename lightning_data_connection_id to
 data_connection_id

---
 src/litdata/processing/data_processor.py | 14 +++++++-------
 src/litdata/streaming/client.py          |  6 +++---
 src/litdata/streaming/resolver.py        |  4 ++--
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/litdata/processing/data_processor.py b/src/litdata/processing/data_processor.py
index 2ce3481db..1fac5093a 100644
--- a/src/litdata/processing/data_processor.py
+++ b/src/litdata/processing/data_processor.py
@@ -171,7 +171,7 @@ def _download_data_target(
                         # Add data connection ID to storage_options for R2 connections
                         merged_storage_options = storage_options.copy()
                         if hasattr(input_dir, "data_connection_id") and input_dir.data_connection_id:
-                            merged_storage_options["lightning_data_connection_id"] = input_dir.data_connection_id
+                            merged_storage_options["data_connection_id"] = input_dir.data_connection_id
                         fs_provider = _get_fs_provider(input_dir.url, merged_storage_options)
                     fs_provider.download_file(path, local_path)
 
@@ -240,7 +240,7 @@ def _upload_fn(
         # Add data connection ID to storage_options for R2 connections
         merged_storage_options = storage_options.copy()
         if hasattr(output_dir, "data_connection_id") and output_dir.data_connection_id:
-            merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+            merged_storage_options["data_connection_id"] = output_dir.data_connection_id
         fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 
     while True:
@@ -1033,7 +1033,7 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
             # Add data connection ID to storage_options for R2 connections
             merged_storage_options = self.storage_options.copy()
             if hasattr(output_dir, "data_connection_id") and output_dir.data_connection_id:
-                merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+                merged_storage_options["data_connection_id"] = output_dir.data_connection_id
 
             fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
             fs_provider.upload_file(
@@ -1060,7 +1060,7 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
                     # Add data connection ID to storage_options for R2 connections
                     merged_storage_options = self.storage_options.copy()
                     if hasattr(output_dir, "data_connection_id") and output_dir.data_connection_id:
-                        merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+                        merged_storage_options["data_connection_id"] = output_dir.data_connection_id
 
                     _wait_for_file_to_exist(remote_filepath, storage_options=merged_storage_options)
                     fs_provider = _get_fs_provider(remote_filepath, merged_storage_options)
@@ -1521,7 +1521,7 @@ def _cleanup_checkpoints(self) -> None:
         # Add data connection ID to storage_options for R2 connections
         merged_storage_options = self.storage_options.copy()
         if hasattr(self.output_dir, "data_connection_id") and self.output_dir.data_connection_id:
-            merged_storage_options["lightning_data_connection_id"] = self.output_dir.data_connection_id
+            merged_storage_options["data_connection_id"] = self.output_dir.data_connection_id
 
         fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
         fs_provider.delete_file_or_directory(checkpoint_prefix)
@@ -1556,7 +1556,7 @@ def _save_current_config(self, workers_user_items: list[list[Any]]) -> None:
             # Add data connection ID to storage_options for R2 connections
             merged_storage_options = self.storage_options.copy()
             if hasattr(self.output_dir, "data_connection_id") and self.output_dir.data_connection_id:
-                merged_storage_options["lightning_data_connection_id"] = self.output_dir.data_connection_id
+                merged_storage_options["data_connection_id"] = self.output_dir.data_connection_id
 
             fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
 
@@ -1632,7 +1632,7 @@ def _load_checkpoint_config(self, workers_user_items: list[list[Any]]) -> None:
             # Add data connection ID to storage_options for R2 connections
             merged_storage_options = self.storage_options.copy()
             if hasattr(self.output_dir, "data_connection_id") and self.output_dir.data_connection_id:
-                merged_storage_options["lightning_data_connection_id"] = self.output_dir.data_connection_id
+                merged_storage_options["data_connection_id"] = self.output_dir.data_connection_id
 
             fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
             saved_file_dir = fs_provider.download_directory(prefix, temp_dir)
diff --git a/src/litdata/streaming/client.py b/src/litdata/streaming/client.py
index cb56d4038..c4070407a 100644
--- a/src/litdata/streaming/client.py
+++ b/src/litdata/streaming/client.py
@@ -150,16 +150,16 @@ def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
     def _create_client(self) -> None:
         """Create a new R2 client with fresh credentials."""
         # Get data connection ID from storage options
-        data_connection_id = self._base_storage_options.get("lightning_data_connection_id")
+        data_connection_id = self._base_storage_options.get("data_connection_id")
         if not data_connection_id:
-            raise RuntimeError("lightning_data_connection_id is required in storage_options for R2 client")
+            raise RuntimeError("data_connection_id is required in storage_options for R2 client")
 
         # Get fresh R2 credentials
         r2_credentials = self.get_r2_bucket_credentials(data_connection_id)
 
         # Filter out metadata keys that shouldn't be passed to boto3
         filtered_storage_options = {
-            k: v for k, v in self._base_storage_options.items() if k not in ["lightning_data_connection_id"]
+            k: v for k, v in self._base_storage_options.items() if k not in ["data_connection_id"]
         }
 
         # Combine filtered storage options with fresh credentials
diff --git a/src/litdata/streaming/resolver.py b/src/litdata/streaming/resolver.py
index 20b49e76a..21c394697 100644
--- a/src/litdata/streaming/resolver.py
+++ b/src/litdata/streaming/resolver.py
@@ -309,7 +309,7 @@ def _assert_dir_is_empty(
     # Add data connection ID to storage_options for R2 connections
     merged_storage_options = storage_options.copy()
     if output_dir.data_connection_id:
-        merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+        merged_storage_options["data_connection_id"] = output_dir.data_connection_id
 
     fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 
@@ -378,7 +378,7 @@ def _assert_dir_has_index_file(
     # Add data connection ID to storage_options for R2 connections
     merged_storage_options = storage_options.copy()
     if output_dir.data_connection_id:
-        merged_storage_options["lightning_data_connection_id"] = output_dir.data_connection_id
+        merged_storage_options["data_connection_id"] = output_dir.data_connection_id
 
     fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 

From b9c7a7c605e369b62cde95f693061bb3c9f068a8 Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Fri, 5 Sep 2025 14:11:41 +0000
Subject: [PATCH 09/33] Add tests

---
 tests/processing/test_data_processor.py | 335 +++++++++++++++++++++
 tests/streaming/test_client.py          | 367 ++++++++++++++++++++++++
 2 files changed, 702 insertions(+)

diff --git a/tests/processing/test_data_processor.py b/tests/processing/test_data_processor.py
index 76778265d..b5d321efd 100644
--- a/tests/processing/test_data_processor.py
+++ b/tests/processing/test_data_processor.py
@@ -1,7 +1,9 @@
+import json
 import multiprocessing as mp
 import os
 import random
 import sys
+from contextlib import suppress
 from functools import partial
 from io import BytesIO
 from queue import Empty
@@ -1323,3 +1325,336 @@ def test_base_worker_collect_paths_no_downloader(keep_data_ordered):
 
     for index in range(10):
         assert worker.ready_to_process_queue.get() == (index, index, None)
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_download_data_target_with_data_connection_id(tmpdir, monkeypatch):
+    """Test _download_data_target passes data_connection_id to fs_provider correctly."""
+    input_dir = os.path.join(tmpdir, "input_dir")
+    os.makedirs(input_dir, exist_ok=True)
+
+    cache_dir = os.path.join(tmpdir, "cache_dir")
+    os.makedirs(cache_dir, exist_ok=True)
+
+    queue_in = mock.MagicMock()
+    queue_out = mock.MagicMock()
+
+    # Mock data with data_connection_id
+    test_connection_id = "test-connection-123"
+    input_dir_obj = Dir(path=input_dir, url="s3://test-bucket")
+    input_dir_obj.data_connection_id = test_connection_id
+
+    items = [10]
+    paths = ["s3://test-bucket/a.txt", None]
+
+    def fn(*_, **__):
+        value = paths.pop(0)
+        if value is None:
+            return value
+        return (0, items.pop(0), [value])
+
+    queue_in.get = fn
+
+    # Mock fs_provider
+    fs_provider = mock.MagicMock()
+    get_fs_provider_mock = mock.MagicMock(return_value=fs_provider)
+    monkeypatch.setattr(data_processor_module, "_get_fs_provider", get_fs_provider_mock)
+
+    storage_options = {"key": "value"}
+
+    _download_data_target(input_dir_obj, cache_dir, queue_in, queue_out, storage_options)
+
+    # Verify fs_provider was called with merged storage_options including data_connection_id
+    expected_storage_options = storage_options.copy()
+    expected_storage_options["data_connection_id"] = test_connection_id
+    get_fs_provider_mock.assert_called_with(input_dir_obj.url, expected_storage_options)
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_upload_fn_with_data_connection_id(tmpdir, monkeypatch):
+    """Test _upload_fn passes data_connection_id to fs_provider correctly."""
+    cache_dir = os.path.join(tmpdir, "cache_dir")
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Create test file to upload
+    test_file = os.path.join(cache_dir, "test.txt")
+    with open(test_file, "w") as f:
+        f.write("test content")
+
+    upload_queue = mock.MagicMock()
+    remove_queue = mock.MagicMock()
+
+    # Mock data with data_connection_id
+    test_connection_id = "test-connection-456"
+    output_dir = Dir(path=None, url="s3://output-bucket")
+    output_dir.data_connection_id = test_connection_id
+
+    paths = [test_file, None]
+
+    def fn(*_, **__):
+        return paths.pop(0)
+
+    upload_queue.get = fn
+
+    # Mock fs_provider
+    fs_provider = mock.MagicMock()
+    get_fs_provider_mock = mock.MagicMock(return_value=fs_provider)
+    monkeypatch.setattr(data_processor_module, "_get_fs_provider", get_fs_provider_mock)
+
+    storage_options = {"region": "us-west-2"}
+
+    _upload_fn(upload_queue, remove_queue, cache_dir, output_dir, storage_options)
+
+    # Verify fs_provider was called with merged storage_options including data_connection_id
+    expected_storage_options = storage_options.copy()
+    expected_storage_options["data_connection_id"] = test_connection_id
+    get_fs_provider_mock.assert_called_with(output_dir.url, expected_storage_options)
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_data_chunk_recipe_upload_index_with_data_connection_id(tmpdir, monkeypatch):
+    """Test DataChunkRecipe._upload_index passes data_connection_id correctly."""
+    cache_dir = os.path.join(tmpdir, "cache_dir")
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Create test index file
+    index_file = os.path.join(cache_dir, "index.json")
+    with open(index_file, "w") as f:
+        f.write('{"test": "data"}')
+
+    # Mock data with data_connection_id
+    test_connection_id = "test-connection-789"
+    output_dir = Dir(path=None, url="s3://output-bucket")
+    output_dir.data_connection_id = test_connection_id
+
+    # Mock fs_provider
+    fs_provider = mock.MagicMock()
+    get_fs_provider_mock = mock.MagicMock(return_value=fs_provider)
+    monkeypatch.setattr(data_processor_module, "_get_fs_provider", get_fs_provider_mock)
+
+    storage_options = {"timeout": 30}
+    recipe = DataChunkRecipe(storage_options=storage_options)
+
+    recipe._upload_index(output_dir, cache_dir, num_nodes=1, node_rank=None)
+
+    # Verify fs_provider was called with merged storage_options including data_connection_id
+    expected_storage_options = storage_options.copy()
+    expected_storage_options["data_connection_id"] = test_connection_id
+    get_fs_provider_mock.assert_called_with(output_dir.url, expected_storage_options)
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_data_processor_cleanup_checkpoints_with_data_connection_id(tmpdir, monkeypatch):
+    """Test DataProcessor._cleanup_checkpoints passes data_connection_id correctly."""
+    test_connection_id = "test-connection-cleanup"
+    output_dir = Dir(path=None, url="s3://cleanup-bucket")
+    output_dir.data_connection_id = test_connection_id
+
+    # Mock fs_provider
+    fs_provider = mock.MagicMock()
+    get_fs_provider_mock = mock.MagicMock(return_value=fs_provider)
+    monkeypatch.setattr(data_processor_module, "_get_fs_provider", get_fs_provider_mock)
+
+    storage_options = {"max_retries": 3}
+    data_processor = DataProcessor(input_dir=str(tmpdir), output_dir=output_dir, storage_options=storage_options)
+
+    data_processor._cleanup_checkpoints()
+
+    # Verify fs_provider was called with merged storage_options including data_connection_id
+    expected_storage_options = storage_options.copy()
+    expected_storage_options["data_connection_id"] = test_connection_id
+    get_fs_provider_mock.assert_called_with(output_dir.url, expected_storage_options)
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_data_processor_save_current_config_with_data_connection_id(tmpdir, monkeypatch):
+    """Test DataProcessor._save_current_config passes data_connection_id correctly."""
+    test_connection_id = "test-connection-save"
+    output_dir = Dir(path=None, url="s3://config-bucket")
+    output_dir.data_connection_id = test_connection_id
+
+    # Mock fs_provider
+    fs_provider = mock.MagicMock()
+    get_fs_provider_mock = mock.MagicMock(return_value=fs_provider)
+    monkeypatch.setattr(data_processor_module, "_get_fs_provider", get_fs_provider_mock)
+
+    storage_options = {"connect_timeout": 10}
+    data_processor = DataProcessor(
+        input_dir=str(tmpdir), output_dir=output_dir, use_checkpoint=True, storage_options=storage_options
+    )
+
+    workers_user_items = [[1, 2], [3, 4]]
+    data_processor._save_current_config(workers_user_items)
+
+    # Verify fs_provider was called with merged storage_options including data_connection_id
+    expected_storage_options = storage_options.copy()
+    expected_storage_options["data_connection_id"] = test_connection_id
+    get_fs_provider_mock.assert_called_with(output_dir.url, expected_storage_options)
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_data_processor_load_checkpoint_config_with_data_connection_id(tmpdir, monkeypatch):
+    """Test DataProcessor._load_checkpoint_config passes data_connection_id correctly."""
+    test_connection_id = "test-connection-load"
+    output_dir = Dir(path=None, url="s3://load-bucket")
+    output_dir.data_connection_id = test_connection_id
+
+    # Mock fs_provider
+    fs_provider = mock.MagicMock()
+    fs_provider.download_directory = mock.MagicMock(return_value=str(tmpdir))
+    get_fs_provider_mock = mock.MagicMock(return_value=fs_provider)
+    monkeypatch.setattr(data_processor_module, "_get_fs_provider", get_fs_provider_mock)
+
+    # Create mock config file
+    config_data = {"num_workers": 2, "workers_user_items": [[1, 2], [3, 4]]}
+    config_file = os.path.join(tmpdir, "config.json")
+    with open(config_file, "w") as f:
+        json.dump(config_data, f)
+
+    storage_options = {"read_timeout": 15}
+    data_processor = DataProcessor(
+        input_dir=str(tmpdir),
+        output_dir=output_dir,
+        use_checkpoint=True,
+        num_workers=2,
+        storage_options=storage_options,
+    )
+
+    workers_user_items = [[1, 2], [3, 4]]
+    data_processor._load_checkpoint_config(workers_user_items)
+
+    # Verify fs_provider was called with merged storage_options including data_connection_id
+    expected_storage_options = storage_options.copy()
+    expected_storage_options["data_connection_id"] = test_connection_id
+    get_fs_provider_mock.assert_called_with(output_dir.url, expected_storage_options)
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_data_connection_id_not_added_when_missing():
+    """Test that data_connection_id is not added to storage_options when not present on Dir."""
+    # Test with Dir object with default data_connection_id (None)
+    output_dir = Dir(path=None, url="s3://test-bucket")
+
+    # Verify that data_connection_id defaults to None
+    assert output_dir.data_connection_id is None
+
+    # Test with Dir object with data_connection_id explicitly set to None
+    output_dir_none = Dir(path=None, url="s3://test-bucket", data_connection_id=None)
+
+    assert output_dir_none.data_connection_id is None
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_storage_options_preserved_with_data_connection_id():
+    """Test that original storage_options are preserved when adding data_connection_id."""
+    original_storage_options = {
+        "aws_access_key_id": "test-key",
+        "aws_secret_access_key": "test-secret",
+        "region_name": "us-east-1",
+    }
+
+    test_connection_id = "test-connection-preserve"
+
+    # Simulate the merge operation that happens in the code
+    merged_storage_options = original_storage_options.copy()
+    merged_storage_options["data_connection_id"] = test_connection_id
+
+    # Verify original is unchanged
+    assert "data_connection_id" not in original_storage_options
+    assert len(original_storage_options) == 3
+
+    # Verify merged has all original keys plus data_connection_id
+    assert len(merged_storage_options) == 4
+    assert merged_storage_options["data_connection_id"] == test_connection_id
+    assert merged_storage_options["aws_access_key_id"] == "test-key"
+    assert merged_storage_options["aws_secret_access_key"] == "test-secret"
+    assert merged_storage_options["region_name"] == "us-east-1"
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_data_connection_id_overrides_existing_value():
+    """Test that data_connection_id from Dir overrides any existing value in storage_options."""
+    original_storage_options = {"data_connection_id": "original-connection-id", "timeout": 30}
+
+    dir_connection_id = "dir-connection-id"
+
+    # Simulate the merge operation that happens in the code
+    merged_storage_options = original_storage_options.copy()
+    merged_storage_options["data_connection_id"] = dir_connection_id
+
+    # Verify the Dir's data_connection_id overrides the original
+    assert merged_storage_options["data_connection_id"] == dir_connection_id
+    assert merged_storage_options["timeout"] == 30
+    assert len(merged_storage_options) == 2
+
+
+class CustomDataChunkRecipeWithConnectionId(DataChunkRecipe):
+    """Custom recipe for testing data_connection_id integration."""
+
+    is_generator = False
+
+    def prepare_structure(self, input_dir: str) -> list[Any]:
+        return ["test_item"]
+
+    def prepare_item(self, item):
+        return {"data": item}
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_data_processor_end_to_end_with_data_connection_id(tmpdir, monkeypatch):
+    """Test full data processing pipeline with data_connection_id."""
+    test_connection_id = "test-connection-e2e"
+
+    # Setup input and output dirs with data_connection_id
+    input_dir = Dir(path=str(tmpdir), url="s3://input-bucket")
+    input_dir.data_connection_id = test_connection_id
+
+    output_dir = Dir(path=None, url="s3://output-bucket")
+    output_dir.data_connection_id = test_connection_id
+
+    # Mock fs_provider calls
+    fs_provider = mock.MagicMock()
+    fs_provider.exists = mock.MagicMock(return_value=True)
+    fs_provider.download_file = mock.MagicMock()
+    fs_provider.upload_file = mock.MagicMock()
+
+    get_fs_provider_mock = mock.MagicMock(return_value=fs_provider)
+    monkeypatch.setattr(data_processor_module, "_get_fs_provider", get_fs_provider_mock)
+
+    # Mock other dependencies
+    monkeypatch.setattr(data_processor_module, "_wait_for_disk_usage_higher_than_threshold", mock.MagicMock())
+
+    cache_dir = os.path.join(tmpdir, "cache")
+    data_cache_dir = os.path.join(tmpdir, "data_cache")
+    monkeypatch.setenv("DATA_OPTIMIZER_CACHE_FOLDER", cache_dir)
+    monkeypatch.setenv("DATA_OPTIMIZER_DATA_CACHE_FOLDER", data_cache_dir)
+
+    storage_options = {"custom_option": "test_value"}
+
+    data_processor = DataProcessor(
+        input_dir=input_dir,
+        output_dir=output_dir,
+        num_workers=1,
+        fast_dev_run=1,
+        storage_options=storage_options,
+        verbose=False,
+    )
+
+    # Run with custom recipe
+    recipe = CustomDataChunkRecipeWithConnectionId(storage_options=storage_options)
+
+    with suppress(Exception):
+        # Expected to fail due to mocking, but we want to verify the calls were made
+        data_processor.run(recipe)
+
+    # Verify that fs_provider was called with data_connection_id
+    # Check if any of the calls included the expected storage options
+    calls_made = get_fs_provider_mock.call_args_list
+
+    # Should have been called with merged storage options including data_connection_id
+    expected_storage_options = storage_options.copy()
+    expected_storage_options["data_connection_id"] = test_connection_id
+
+    # Note: Due to the complexity of mocking the full pipeline, we mainly verify
+    # that the fs_provider was called, indicating the data_connection_id code paths were executed
+    assert len(calls_made) > 0, "fs_provider should have been called"
diff --git a/tests/streaming/test_client.py b/tests/streaming/test_client.py
index 636734a78..39cede540 100644
--- a/tests/streaming/test_client.py
+++ b/tests/streaming/test_client.py
@@ -102,3 +102,370 @@ def test_s3_client_with_cloud_space_id(use_shared_credentials, monkeypatch):
     assert len(boto3_session().client._mock_mock_calls) == 9
 
     assert instance_metadata_provider._mock_call_count == 0 if use_shared_credentials else 3
+
+
+# Tests for R2Client functionality
+
+
+def test_r2_client_initialization():
+    """Test R2Client initialization with different parameters."""
+    # Test with default parameters
+    r2_client = client.R2Client()
+    assert r2_client._refetch_interval == 3600  # 1 hour default
+    assert r2_client._last_time is None
+    assert r2_client._client is None
+    assert r2_client._base_storage_options == {}
+    assert r2_client._session_options == {}
+
+    # Test with custom parameters
+    storage_options = {"data_connection_id": "test-connection-123"}
+    session_options = {"region_name": "us-west-2"}
+    r2_client = client.R2Client(refetch_interval=1800, storage_options=storage_options, session_options=session_options)
+    assert r2_client._refetch_interval == 1800
+    assert r2_client._base_storage_options == storage_options
+    assert r2_client._session_options == session_options
+
+
+def test_r2_client_missing_data_connection_id(monkeypatch):
+    """Test R2Client raises error when data_connection_id is missing."""
+    boto3_session = mock.MagicMock()
+    boto3 = mock.MagicMock(Session=boto3_session)
+    monkeypatch.setattr(client, "boto3", boto3)
+
+    botocore = mock.MagicMock()
+    monkeypatch.setattr(client, "botocore", botocore)
+
+    # Create R2Client without data_connection_id
+    r2_client = client.R2Client(storage_options={})
+
+    # Accessing client should raise error
+    with pytest.raises(RuntimeError, match="data_connection_id is required"):
+        _ = r2_client.client
+
+
+def test_r2_client_get_r2_bucket_credentials_success(monkeypatch):
+    """Test successful R2 credential fetching."""
+    # Mock environment variables
+    monkeypatch.setenv("LIGHTNING_CLOUD_URL", "https://test.lightning.ai")
+    monkeypatch.setenv("LIGHTNING_API_KEY", "test-api-key")
+    monkeypatch.setenv("LIGHTNING_USERNAME", "test-user")
+    monkeypatch.setenv("LIGHTNING_CLOUD_PROJECT_ID", "test-project-123")
+
+    # Mock requests
+    requests_mock = mock.MagicMock()
+    monkeypatch.setattr("requests.post", requests_mock)
+    monkeypatch.setattr("requests.get", requests_mock)
+
+    # Mock login response
+    login_response = mock.MagicMock()
+    login_response.json.return_value = {"token": "test-token-456"}
+
+    # Mock credentials response
+    credentials_response = mock.MagicMock()
+    credentials_response.status_code = 200
+    credentials_response.json.return_value = {
+        "accessKeyId": "test-access-key",
+        "secretAccessKey": "test-secret-key",
+        "sessionToken": "test-session-token",
+        "accountId": "test-account-id",
+    }
+
+    # Configure mock to return different responses for different calls
+    def mock_request(*args, **kwargs):
+        if "auth/login" in args[0]:
+            return login_response
+        return credentials_response
+
+    requests_mock.side_effect = mock_request
+    monkeypatch.setattr("requests.get", lambda *args, **kwargs: credentials_response)
+
+    r2_client = client.R2Client()
+    credentials = r2_client.get_r2_bucket_credentials("test-connection-789")
+
+    expected_credentials = {
+        "aws_access_key_id": "test-access-key",
+        "aws_secret_access_key": "test-secret-key",
+        "aws_session_token": "test-session-token",
+        "endpoint_url": "https://test-account-id.r2.cloudflarestorage.com",
+    }
+
+    assert credentials == expected_credentials
+
+
+def test_r2_client_get_r2_bucket_credentials_missing_env_vars(monkeypatch):
+    """Test R2 credential fetching fails with missing environment variables."""
+    # Don't set required environment variables
+    monkeypatch.delenv("LIGHTNING_API_KEY", raising=False)
+    monkeypatch.delenv("LIGHTNING_USERNAME", raising=False)
+    monkeypatch.delenv("LIGHTNING_CLOUD_PROJECT_ID", raising=False)
+
+    r2_client = client.R2Client()
+
+    with pytest.raises(RuntimeError, match="Missing required environment variables"):
+        r2_client.get_r2_bucket_credentials("test-connection")
+
+
+def test_r2_client_get_r2_bucket_credentials_login_failure(monkeypatch):
+    """Test R2 credential fetching fails when login fails."""
+    # Mock environment variables
+    monkeypatch.setenv("LIGHTNING_CLOUD_URL", "https://test.lightning.ai")
+    monkeypatch.setenv("LIGHTNING_API_KEY", "test-api-key")
+    monkeypatch.setenv("LIGHTNING_USERNAME", "test-user")
+    monkeypatch.setenv("LIGHTNING_CLOUD_PROJECT_ID", "test-project-123")
+
+    # Mock failed login response
+    login_response = mock.MagicMock()
+    login_response.json.return_value = {"error": "Invalid credentials"}
+
+    requests_mock = mock.MagicMock(return_value=login_response)
+    monkeypatch.setattr("requests.post", requests_mock)
+
+    r2_client = client.R2Client()
+
+    with pytest.raises(RuntimeError, match="Failed to get authentication token"):
+        r2_client.get_r2_bucket_credentials("test-connection")
+
+
+def test_r2_client_get_r2_bucket_credentials_api_failure(monkeypatch):
+    """Test R2 credential fetching fails when credentials API fails."""
+    # Mock environment variables
+    monkeypatch.setenv("LIGHTNING_CLOUD_URL", "https://test.lightning.ai")
+    monkeypatch.setenv("LIGHTNING_API_KEY", "test-api-key")
+    monkeypatch.setenv("LIGHTNING_USERNAME", "test-user")
+    monkeypatch.setenv("LIGHTNING_CLOUD_PROJECT_ID", "test-project-123")
+
+    # Mock successful login response
+    login_response = mock.MagicMock()
+    login_response.json.return_value = {"token": "test-token-456"}
+
+    # Mock failed credentials response
+    credentials_response = mock.MagicMock()
+    credentials_response.status_code = 403
+
+    monkeypatch.setattr("requests.post", mock.MagicMock(return_value=login_response))
+    monkeypatch.setattr("requests.get", mock.MagicMock(return_value=credentials_response))
+
+    r2_client = client.R2Client()
+
+    with pytest.raises(RuntimeError, match="Failed to get credentials: 403"):
+        r2_client.get_r2_bucket_credentials("test-connection")
+
+
+def test_r2_client_create_client_success(monkeypatch):
+    """Test successful R2 client creation."""
+    boto3_session = mock.MagicMock()
+    boto3 = mock.MagicMock(Session=boto3_session)
+    monkeypatch.setattr(client, "boto3", boto3)
+
+    botocore = mock.MagicMock()
+    monkeypatch.setattr(client, "botocore", botocore)
+
+    # Mock the credential fetching method
+    mock_credentials = {
+        "aws_access_key_id": "test-access-key",
+        "aws_secret_access_key": "test-secret-key",
+        "aws_session_token": "test-session-token",
+        "endpoint_url": "https://test-account.r2.cloudflarestorage.com",
+    }
+
+    r2_client = client.R2Client(storage_options={"data_connection_id": "test-connection"})
+    r2_client.get_r2_bucket_credentials = mock.MagicMock(return_value=mock_credentials)
+
+    # Call _create_client
+    r2_client._create_client()
+
+    # Verify boto3 session was created and client was configured correctly
+    boto3_session.assert_called_once()
+    boto3_session().client.assert_called_once_with(
+        "s3",
+        config=botocore.config.Config(retries={"max_attempts": 1000, "mode": "adaptive"}),
+        aws_access_key_id="test-access-key",
+        aws_secret_access_key="test-secret-key",
+        aws_session_token="test-session-token",
+        endpoint_url="https://test-account.r2.cloudflarestorage.com",
+    )
+
+
+def test_r2_client_filters_metadata_from_storage_options(monkeypatch):
+    """Test that R2Client filters out metadata keys from storage options."""
+    boto3_session = mock.MagicMock()
+    boto3 = mock.MagicMock(Session=boto3_session)
+    monkeypatch.setattr(client, "boto3", boto3)
+
+    botocore = mock.MagicMock()
+    monkeypatch.setattr(client, "botocore", botocore)
+
+    # Mock the credential fetching method
+    mock_credentials = {
+        "aws_access_key_id": "test-access-key",
+        "aws_secret_access_key": "test-secret-key",
+        "aws_session_token": "test-session-token",
+        "endpoint_url": "https://test-account.r2.cloudflarestorage.com",
+    }
+
+    storage_options = {"data_connection_id": "test-connection", "timeout": 30, "region_name": "auto"}
+
+    r2_client = client.R2Client(storage_options=storage_options)
+    r2_client.get_r2_bucket_credentials = mock.MagicMock(return_value=mock_credentials)
+
+    # Call _create_client
+    r2_client._create_client()
+
+    # Verify that data_connection_id was filtered out but other options were preserved
+    expected_call_kwargs = {
+        "config": botocore.config.Config(retries={"max_attempts": 1000, "mode": "adaptive"}),
+        "timeout": 30,
+        "region_name": "auto",
+        "aws_access_key_id": "test-access-key",
+        "aws_secret_access_key": "test-secret-key",
+        "aws_session_token": "test-session-token",
+        "endpoint_url": "https://test-account.r2.cloudflarestorage.com",
+    }
+
+    boto3_session().client.assert_called_once_with("s3", **expected_call_kwargs)
+
+
+def test_r2_client_property_creates_client_on_first_access(monkeypatch):
+    """Test that accessing client property creates client on first access."""
+    boto3_session = mock.MagicMock()
+    boto3 = mock.MagicMock(Session=boto3_session)
+    monkeypatch.setattr(client, "boto3", boto3)
+
+    botocore = mock.MagicMock()
+    monkeypatch.setattr(client, "botocore", botocore)
+
+    r2_client = client.R2Client(storage_options={"data_connection_id": "test-connection"})
+    r2_client.get_r2_bucket_credentials = mock.MagicMock(
+        return_value={
+            "aws_access_key_id": "test-key",
+            "aws_secret_access_key": "test-secret",
+            "aws_session_token": "test-token",
+            "endpoint_url": "https://test.r2.cloudflarestorage.com",
+        }
+    )
+
+    # Initially no client
+    assert r2_client._client is None
+    assert r2_client._last_time is None
+
+    # Access client property
+    client_instance = r2_client.client
+
+    # Verify client was created
+    assert r2_client._client is not None
+    assert r2_client._last_time is not None
+    assert client_instance == r2_client._client
+
+
+def test_r2_client_property_refreshes_expired_credentials(monkeypatch):
+    """Test that client property refreshes credentials when they expire."""
+    boto3_session = mock.MagicMock()
+    boto3 = mock.MagicMock(Session=boto3_session)
+    monkeypatch.setattr(client, "boto3", boto3)
+
+    botocore = mock.MagicMock()
+    monkeypatch.setattr(client, "botocore", botocore)
+
+    # Set short refresh interval for testing
+    r2_client = client.R2Client(
+        refetch_interval=1,  # 1 second
+        storage_options={"data_connection_id": "test-connection"},
+    )
+    r2_client.get_r2_bucket_credentials = mock.MagicMock(
+        return_value={
+            "aws_access_key_id": "test-key",
+            "aws_secret_access_key": "test-secret",
+            "aws_session_token": "test-token",
+            "endpoint_url": "https://test.r2.cloudflarestorage.com",
+        }
+    )
+
+    # First access
+    r2_client.client
+    first_call_count = boto3_session().client.call_count
+
+    # Wait for credentials to expire
+    sleep(1.1)
+
+    # Second access should refresh credentials
+    r2_client.client
+    second_call_count = boto3_session().client.call_count
+
+    # Verify client was created twice (initial + refresh)
+    assert second_call_count == first_call_count + 1
+
+
+def test_r2_client_with_session_options(monkeypatch):
+    """Test R2Client with custom session options."""
+    boto3_session = mock.MagicMock()
+    boto3 = mock.MagicMock(Session=boto3_session)
+    monkeypatch.setattr(client, "boto3", boto3)
+
+    botocore = mock.MagicMock()
+    monkeypatch.setattr(client, "botocore", botocore)
+
+    session_options = {"profile_name": "test-profile"}
+    r2_client = client.R2Client(
+        storage_options={"data_connection_id": "test-connection"}, session_options=session_options
+    )
+    r2_client.get_r2_bucket_credentials = mock.MagicMock(
+        return_value={
+            "aws_access_key_id": "test-key",
+            "aws_secret_access_key": "test-secret",
+            "aws_session_token": "test-token",
+            "endpoint_url": "https://test.r2.cloudflarestorage.com",
+        }
+    )
+
+    # Access client to trigger creation
+    r2_client.client
+
+    # Verify session was created with custom options
+    boto3.Session.assert_called_once_with(profile_name="test-profile")
+
+
+def test_r2_client_api_call_format(monkeypatch):
+    """Test that R2Client makes correct API calls for credential fetching."""
+    # Mock environment variables
+    monkeypatch.setenv("LIGHTNING_CLOUD_URL", "https://api.lightning.ai")
+    monkeypatch.setenv("LIGHTNING_API_KEY", "sk-test123")
+    monkeypatch.setenv("LIGHTNING_USERNAME", "testuser")
+    monkeypatch.setenv("LIGHTNING_CLOUD_PROJECT_ID", "proj-456")
+
+    # Mock requests
+    mock_post = mock.MagicMock()
+    mock_get = mock.MagicMock()
+
+    # Mock login response
+    login_response = mock.MagicMock()
+    login_response.json.return_value = {"token": "bearer-token-789"}
+    mock_post.return_value = login_response
+
+    # Mock credentials response
+    credentials_response = mock.MagicMock()
+    credentials_response.status_code = 200
+    credentials_response.json.return_value = {
+        "accessKeyId": "AKIATEST123",
+        "secretAccessKey": "secrettest456",
+        "sessionToken": "sessiontest789",
+        "accountId": "account123",
+    }
+    mock_get.return_value = credentials_response
+
+    monkeypatch.setattr("requests.post", mock_post)
+    monkeypatch.setattr("requests.get", mock_get)
+
+    r2_client = client.R2Client()
+    r2_client.get_r2_bucket_credentials("conn-abc123")
+
+    # Verify login API call
+    mock_post.assert_called_once_with(
+        "https://api.lightning.ai/v1/auth/login", data='{"apiKey": "sk-test123", "username": "testuser"}'
+    )
+
+    # Verify credentials API call
+    mock_get.assert_called_once_with(
+        "https://api.lightning.ai/v1/projects/proj-456/data-connections/conn-abc123/temp-bucket-credentials",
+        headers={"Authorization": "Bearer bearer-token-789", "Content-Type": "application/json"},
+        timeout=10,
+    )

From 861916eafb9b661f2282d25a1d16ff7b7ee06a82 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 8 Sep 2025 13:48:41 +0000
Subject: [PATCH 10/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/litdata/streaming/resolver.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/litdata/streaming/resolver.py b/src/litdata/streaming/resolver.py
index 21c394697..b6dfbfa01 100644
--- a/src/litdata/streaming/resolver.py
+++ b/src/litdata/streaming/resolver.py
@@ -241,7 +241,11 @@ def _resolve_gcs_folders(dir_path: str) -> Dir:
 def _resolve_lightning_storage(dir_path: str) -> Dir:
     data_connection = _resolve_data_connection(dir_path)
 
-    return Dir(path=dir_path, url=os.path.join(data_connection.r2.source, *dir_path.split("/")[4:]), data_connection_id=data_connection[0].id)
+    return Dir(
+        path=dir_path,
+        url=os.path.join(data_connection.r2.source, *dir_path.split("/")[4:]),
+        data_connection_id=data_connection[0].id,
+    )
 
 
 def _resolve_datasets(dir_path: str) -> Dir:

From 34eef2a68a5693358e976631e67c5864a8d749ce Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Mon, 8 Sep 2025 13:52:56 +0000
Subject: [PATCH 11/33] Fix type in rebase

---
 src/litdata/streaming/resolver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/litdata/streaming/resolver.py b/src/litdata/streaming/resolver.py
index b6dfbfa01..132512047 100644
--- a/src/litdata/streaming/resolver.py
+++ b/src/litdata/streaming/resolver.py
@@ -244,7 +244,7 @@ def _resolve_lightning_storage(dir_path: str) -> Dir:
     return Dir(
         path=dir_path,
         url=os.path.join(data_connection.r2.source, *dir_path.split("/")[4:]),
-        data_connection_id=data_connection[0].id,
+        data_connection_id=data_connection.id,
     )
 
 

From 0d9f8e81f4d8b1e75a83f7bbb2d768388c42b158 Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Mon, 8 Sep 2025 14:02:46 +0000
Subject: [PATCH 12/33] Prevent potential null pointer

---
 src/litdata/streaming/resolver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/litdata/streaming/resolver.py b/src/litdata/streaming/resolver.py
index 132512047..b9b9a9b8c 100644
--- a/src/litdata/streaming/resolver.py
+++ b/src/litdata/streaming/resolver.py
@@ -52,7 +52,7 @@ def _resolve_dir(dir_path: Optional[Union[str, Path, Dir]]) -> Dir:
         return Dir(
             path=str(dir_path.path) if dir_path.path else None,
             url=str(dir_path.url) if dir_path.url else None,
-            data_connection_id=dir_path.data_connection_id,
+            data_connection_id=dir_path.data_connection_id if dir_path.data_connection_id else None,
         )
 
     if dir_path is None:

From 4943755403afb204420b86f09e06ce81ba086467 Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Mon, 8 Sep 2025 14:03:28 +0000
Subject: [PATCH 13/33] Add test for lightning_storage resolver

---
 tests/streaming/test_resolver.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/streaming/test_resolver.py b/tests/streaming/test_resolver.py
index 15b8d34b8..f6806817c 100644
--- a/tests/streaming/test_resolver.py
+++ b/tests/streaming/test_resolver.py
@@ -529,7 +529,9 @@ def test_src_resolver_lightning_storage(monkeypatch, lightning_cloud_mock):
 
     client_mock = mock.MagicMock()
     client_mock.data_connection_service_list_data_connections.return_value = V1ListDataConnectionsResponse(
-        data_connections=[V1DataConnection(name="my_dataset", r2=mock.MagicMock(source="r2://my-r2-bucket"))],
+        data_connections=[
+            V1DataConnection(id="data_connection_id", name="my_dataset", r2=mock.MagicMock(source="r2://my-r2-bucket"))
+        ],
     )
 
     client_cls_mock = mock.MagicMock()
@@ -537,6 +539,7 @@ def test_src_resolver_lightning_storage(monkeypatch, lightning_cloud_mock):
     lightning_cloud_mock.rest_client.LightningClient = client_cls_mock
 
     expected = "r2://my-r2-bucket"
+    assert resolver._resolve_dir("/teamspace/lightning_storage/my_dataset").data_connection_id == "data_connection_id"
     assert resolver._resolve_dir("/teamspace/lightning_storage/my_dataset").url == expected
     assert resolver._resolve_dir("/teamspace/lightning_storage/my_dataset/train").url == expected + "/train"
 

From 23d4d61715f1e266e685de8c4af15b1e0e5da251 Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Mon, 8 Sep 2025 14:14:36 +0000
Subject: [PATCH 14/33] Move code for constructing storage options into a util
 function

---
 src/litdata/processing/data_processor.py | 44 +++++-------------------
 src/litdata/processing/utilities.py      |  7 ++++
 2 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/src/litdata/processing/data_processor.py b/src/litdata/processing/data_processor.py
index 1fac5093a..b3646cc9e 100644
--- a/src/litdata/processing/data_processor.py
+++ b/src/litdata/processing/data_processor.py
@@ -45,7 +45,7 @@
     _TQDM_AVAILABLE,
 )
 from litdata.processing.readers import BaseReader, StreamingDataLoaderReader
-from litdata.processing.utilities import _create_dataset, remove_uuid_from_filename
+from litdata.processing.utilities import _create_dataset, construct_storage_options, remove_uuid_from_filename
 from litdata.streaming import Cache
 from litdata.streaming.cache import Dir
 from litdata.streaming.dataloader import StreamingDataLoader
@@ -168,10 +168,7 @@ def _download_data_target(
                     dirpath = os.path.dirname(local_path)
                     os.makedirs(dirpath, exist_ok=True)
                     if fs_provider is None:
-                        # Add data connection ID to storage_options for R2 connections
-                        merged_storage_options = storage_options.copy()
-                        if hasattr(input_dir, "data_connection_id") and input_dir.data_connection_id:
-                            merged_storage_options["data_connection_id"] = input_dir.data_connection_id
+                        merged_storage_options = construct_storage_options(storage_options, input_dir)
                         fs_provider = _get_fs_provider(input_dir.url, merged_storage_options)
                     fs_provider.download_file(path, local_path)
 
@@ -237,10 +234,7 @@ def _upload_fn(
     obj = parse.urlparse(output_dir.url if output_dir.url else output_dir.path)
 
     if obj.scheme in _SUPPORTED_PROVIDERS:
-        # Add data connection ID to storage_options for R2 connections
-        merged_storage_options = storage_options.copy()
-        if hasattr(output_dir, "data_connection_id") and output_dir.data_connection_id:
-            merged_storage_options["data_connection_id"] = output_dir.data_connection_id
+        merged_storage_options = construct_storage_options(storage_options, output_dir)
         fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
 
     while True:
@@ -1030,11 +1024,7 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
             local_filepath = os.path.join(cache_dir, _INDEX_FILENAME)
 
         if obj.scheme in _SUPPORTED_PROVIDERS:
-            # Add data connection ID to storage_options for R2 connections
-            merged_storage_options = self.storage_options.copy()
-            if hasattr(output_dir, "data_connection_id") and output_dir.data_connection_id:
-                merged_storage_options["data_connection_id"] = output_dir.data_connection_id
-
+            merged_storage_options = construct_storage_options(self.storage_options, output_dir)
             fs_provider = _get_fs_provider(output_dir.url, merged_storage_options)
             fs_provider.upload_file(
                 local_filepath,
@@ -1057,11 +1047,7 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
                 remote_filepath = os.path.join(output_dir_path, f"{node_rank}-{_INDEX_FILENAME}")
                 node_index_filepath = os.path.join(cache_dir, os.path.basename(remote_filepath))
                 if obj.scheme in _SUPPORTED_PROVIDERS:
-                    # Add data connection ID to storage_options for R2 connections
-                    merged_storage_options = self.storage_options.copy()
-                    if hasattr(output_dir, "data_connection_id") and output_dir.data_connection_id:
-                        merged_storage_options["data_connection_id"] = output_dir.data_connection_id
-
+                    merged_storage_options = construct_storage_options(self.storage_options, output_dir)
                     _wait_for_file_to_exist(remote_filepath, storage_options=merged_storage_options)
                     fs_provider = _get_fs_provider(remote_filepath, merged_storage_options)
                     fs_provider.download_file(remote_filepath, node_index_filepath)
@@ -1517,12 +1503,7 @@ def _cleanup_checkpoints(self) -> None:
 
         prefix = self.output_dir.url.rstrip("/") + "/"
         checkpoint_prefix = os.path.join(prefix, ".checkpoints")
-
-        # Add data connection ID to storage_options for R2 connections
-        merged_storage_options = self.storage_options.copy()
-        if hasattr(self.output_dir, "data_connection_id") and self.output_dir.data_connection_id:
-            merged_storage_options["data_connection_id"] = self.output_dir.data_connection_id
-
+        merged_storage_options = construct_storage_options(self.storage_options, self.output_dir)
         fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
         fs_provider.delete_file_or_directory(checkpoint_prefix)
 
@@ -1552,12 +1533,7 @@ def _save_current_config(self, workers_user_items: list[list[Any]]) -> None:
 
             if obj.scheme not in _SUPPORTED_PROVIDERS:
                 not_supported_provider(self.output_dir.url)
-
-            # Add data connection ID to storage_options for R2 connections
-            merged_storage_options = self.storage_options.copy()
-            if hasattr(self.output_dir, "data_connection_id") and self.output_dir.data_connection_id:
-                merged_storage_options["data_connection_id"] = self.output_dir.data_connection_id
-
+            merged_storage_options = construct_storage_options(self.storage_options, self.output_dir)
             fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
 
             prefix = self.output_dir.url.rstrip("/") + "/" + ".checkpoints/"
@@ -1629,11 +1605,7 @@ def _load_checkpoint_config(self, workers_user_items: list[list[Any]]) -> None:
 
         # download all the checkpoint files in tempdir and read them
         with tempfile.TemporaryDirectory() as temp_dir:
-            # Add data connection ID to storage_options for R2 connections
-            merged_storage_options = self.storage_options.copy()
-            if hasattr(self.output_dir, "data_connection_id") and self.output_dir.data_connection_id:
-                merged_storage_options["data_connection_id"] = self.output_dir.data_connection_id
-
+            merged_storage_options = construct_storage_options(self.storage_options, self.output_dir)
             fs_provider = _get_fs_provider(self.output_dir.url, merged_storage_options)
             saved_file_dir = fs_provider.download_directory(prefix, temp_dir)
 
diff --git a/src/litdata/processing/utilities.py b/src/litdata/processing/utilities.py
index 9c71838bf..504310bc3 100644
--- a/src/litdata/processing/utilities.py
+++ b/src/litdata/processing/utilities.py
@@ -272,3 +272,10 @@ def remove_uuid_from_filename(filepath: str) -> str:
 
     # uuid is of 32 characters, '.json' is 5 characters and '-' is 1 character
     return filepath[:-38] + ".json"
+
+
+def construct_storage_options(storage_options: dict[str, Any], input_dir: Dir) -> dict[str, Any]:
+    merged_storage_options = storage_options.copy()
+    if hasattr(input_dir, "data_connection_id") and input_dir.data_connection_id:
+        merged_storage_options["data_connection_id"] = input_dir.data_connection_id
+    return merged_storage_options

From f0069769ebbf48b6ec5a83533ddefed7c62ff0a2 Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Mon, 8 Sep 2025 14:28:58 +0000
Subject: [PATCH 15/33] Add retry logic to client for fetching temp creds

---
 src/litdata/streaming/client.py | 42 +++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/src/litdata/streaming/client.py b/src/litdata/streaming/client.py
index c4070407a..c886053e4 100644
--- a/src/litdata/streaming/client.py
+++ b/src/litdata/streaming/client.py
@@ -11,17 +11,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import os
 from time import time
 from typing import Any, Optional
 
 import boto3
 import botocore
+import requests
 from botocore.credentials import InstanceMetadataProvider
 from botocore.utils import InstanceMetadataFetcher
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
 from litdata.constants import _IS_IN_STUDIO
 
+_CONNECTION_RETRY_TOTAL = 2880
+_CONNECTION_RETRY_BACKOFF_FACTOR = 0.5
+_DEFAULT_REQUEST_TIMEOUT = 30  # seconds
+
+
+class _CustomRetryAdapter(HTTPAdapter):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        self.timeout = kwargs.pop("timeout", _DEFAULT_REQUEST_TIMEOUT)
+        super().__init__(*args, **kwargs)
+
+    def send(self, request: Any, *args: Any, **kwargs: Any) -> Any:
+        kwargs["timeout"] = kwargs.get("timeout", self.timeout)
+        return super().send(request, **kwargs)
+
 
 class S3Client:
     # TODO: Generalize to support more cloud providers.
@@ -95,9 +113,23 @@ def __init__(
 
     def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
         """Fetch temporary R2 credentials for the current lightning storage connection."""
-        import json
-
-        import requests
+        # Create session with retry logic
+        retry_strategy = Retry(
+            total=_CONNECTION_RETRY_TOTAL,
+            backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR,
+            status_forcelist=[
+                408,  # Request Timeout
+                429,  # Too Many Requests
+                500,  # Internal Server Error
+                502,  # Bad Gateway
+                503,  # Service Unavailable
+                504,  # Gateway Timeout
+            ],
+        )
+        adapter = _CustomRetryAdapter(max_retries=retry_strategy, timeout=_DEFAULT_REQUEST_TIMEOUT)
+        session = requests.Session()
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
 
         try:
             # Get Lightning Cloud API token
@@ -112,7 +144,7 @@ def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
             # Login to get token
             payload = {"apiKey": api_key, "username": username}
             login_url = f"{cloud_url}/v1/auth/login"
-            response = requests.post(login_url, data=json.dumps(payload))  # noqa: S113
+            response = session.post(login_url, data=json.dumps(payload))
 
             if "token" not in response.json():
                 raise RuntimeError("Failed to get authentication token")
@@ -125,7 +157,7 @@ def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
                 f"{cloud_url}/v1/projects/{project_id}/data-connections/{data_connection_id}/temp-bucket-credentials"
             )
 
-            credentials_response = requests.get(credentials_url, headers=headers, timeout=10)
+            credentials_response = session.get(credentials_url, headers=headers, timeout=10)
 
             if credentials_response.status_code != 200:
                 raise RuntimeError(f"Failed to get credentials: {credentials_response.status_code}")

From 09699ce1b93dbc711f26215d57778a09b47cddbe Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Mon, 8 Sep 2025 14:37:08 +0000
Subject: [PATCH 16/33] R2 client and fsProvder should inherit from the s3
 counterparts

---
 src/litdata/streaming/client.py      | 35 +++++++++++-----------------
 src/litdata/streaming/fs_provider.py | 25 +-------------------
 2 files changed, 15 insertions(+), 45 deletions(-)

diff --git a/src/litdata/streaming/client.py b/src/litdata/streaming/client.py
index c886053e4..3d9c7acde 100644
--- a/src/litdata/streaming/client.py
+++ b/src/litdata/streaming/client.py
@@ -96,7 +96,7 @@ def client(self) -> Any:
         return self._client
 
 
-class R2Client:
+class R2Client(S3Client):
     """R2 client with refreshable credentials for Cloudflare R2 storage."""
 
     def __init__(
@@ -105,11 +105,15 @@ def __init__(
         storage_options: Optional[dict] = {},
         session_options: Optional[dict] = {},
     ) -> None:
-        self._refetch_interval = refetch_interval
-        self._last_time: Optional[float] = None
-        self._client: Optional[Any] = None
+        # Store R2-specific options before calling super()
         self._base_storage_options: dict = storage_options or {}
-        self._session_options: dict = session_options or {}
+
+        # Call parent constructor with R2-specific refetch interval
+        super().__init__(
+            refetch_interval=refetch_interval,
+            storage_options={},  # storage options handled in _create_client
+            session_options=session_options,
+        )
 
     def get_r2_bucket_credentials(self, data_connection_id: str) -> dict[str, str]:
         """Fetch temporary R2 credentials for the current lightning storage connection."""
@@ -195,7 +199,10 @@ def _create_client(self) -> None:
         }
 
         # Combine filtered storage options with fresh credentials
-        storage_options = {**filtered_storage_options, **r2_credentials}
+        combined_storage_options = {**filtered_storage_options, **r2_credentials}
+
+        # Update the inherited storage options with R2 credentials
+        self._storage_options = combined_storage_options
 
         # Create session and client
         session = boto3.Session(**self._session_options)
@@ -203,20 +210,6 @@ def _create_client(self) -> None:
             "s3",
             **{
                 "config": botocore.config.Config(retries={"max_attempts": 1000, "mode": "adaptive"}),
-                **storage_options,
+                **combined_storage_options,
             },
         )
-
-    @property
-    def client(self) -> Any:
-        """Get the R2 client, refreshing credentials if necessary."""
-        if self._client is None:
-            self._create_client()
-            self._last_time = time()
-
-        # Re-generate credentials when they expire
-        if self._last_time is None or (time() - self._last_time) > self._refetch_interval:
-            self._create_client()
-            self._last_time = time()
-
-        return self._client
diff --git a/src/litdata/streaming/fs_provider.py b/src/litdata/streaming/fs_provider.py
index f1a3683db..24e62df56 100644
--- a/src/litdata/streaming/fs_provider.py
+++ b/src/litdata/streaming/fs_provider.py
@@ -224,7 +224,7 @@ def is_empty(self, path: str) -> bool:
         return not objects["KeyCount"] > 0
 
 
-class R2FsProvider(FsProvider):
+class R2FsProvider(S3FsProvider):
     def __init__(self, storage_options: Optional[dict[str, Any]] = {}):
         super().__init__(storage_options=storage_options)
 
@@ -268,18 +268,6 @@ def download_directory(self, remote_path: str, local_directory_name: str) -> str
 
         return saved_file_dir
 
-    def copy(self, remote_source: str, remote_destination: str) -> None:
-        input_obj = parse.urlparse(remote_source)
-        output_obj = parse.urlparse(remote_destination)
-        self.client.client.copy(
-            {"Bucket": input_obj.netloc, "Key": input_obj.path.lstrip("/")},
-            output_obj.netloc,
-            output_obj.path.lstrip("/"),
-        )
-
-    def list_directory(self, path: str) -> list[str]:
-        raise NotImplementedError
-
     def delete_file_or_directory(self, path: str) -> None:
         """Delete the file or the directory."""
         bucket_name, blob_path = get_bucket_and_path(path, "r2")
@@ -306,17 +294,6 @@ def exists(self, path: str) -> bool:
         except Exception as e:
             raise e
 
-    def is_empty(self, path: str) -> bool:
-        obj = parse.urlparse(path)
-
-        objects = self.client.client.list_objects_v2(
-            Bucket=obj.netloc,
-            Delimiter="/",
-            Prefix=obj.path.lstrip("/").rstrip("/") + "/",
-        )
-
-        return not objects["KeyCount"] > 0
-
 
 def get_bucket_and_path(remote_filepath: str, expected_scheme: str = "s3") -> tuple[str, str]:
     """Parse the remote filepath and return the bucket name and the blob path.

From 97576caeb2108db7e28d3a36c79ee9f4f655019d Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 17:17:04 +0100
Subject: [PATCH 17/33] udpate

---
 tests/streaming/test_client.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/streaming/test_client.py b/tests/streaming/test_client.py
index 39cede540..ba15ba4fd 100644
--- a/tests/streaming/test_client.py
+++ b/tests/streaming/test_client.py
@@ -153,8 +153,7 @@ def test_r2_client_get_r2_bucket_credentials_success(monkeypatch):
 
     # Mock requests
     requests_mock = mock.MagicMock()
-    monkeypatch.setattr("requests.post", requests_mock)
-    monkeypatch.setattr("requests.get", requests_mock)
+    monkeypatch.setattr("requests.Session", mock.MagicMock(return_value=requests_mock))
 
     # Mock login response
     login_response = mock.MagicMock()
@@ -176,7 +175,9 @@ def mock_request(*args, **kwargs):
             return login_response
         return credentials_response
 
-    requests_mock.side_effect = mock_request
+    requests_mock.post = mock_request
+    requests_mock.get = mock_request
+
     monkeypatch.setattr("requests.get", lambda *args, **kwargs: credentials_response)
 
     r2_client = client.R2Client()
@@ -218,7 +219,7 @@ def test_r2_client_get_r2_bucket_credentials_login_failure(monkeypatch):
     login_response.json.return_value = {"error": "Invalid credentials"}
 
     requests_mock = mock.MagicMock(return_value=login_response)
-    monkeypatch.setattr("requests.post", requests_mock)
+    monkeypatch.setattr("requests.Session", mock.MagicMock(return_value=requests_mock))
 
     r2_client = client.R2Client()
 

From bcbee7596160af155b78d8e3cb31ceb2d0350ae6 Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 17:19:20 +0100
Subject: [PATCH 18/33] udpate

---
 tests/streaming/test_client.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/streaming/test_client.py b/tests/streaming/test_client.py
index ba15ba4fd..97f9bc0de 100644
--- a/tests/streaming/test_client.py
+++ b/tests/streaming/test_client.py
@@ -243,8 +243,11 @@ def test_r2_client_get_r2_bucket_credentials_api_failure(monkeypatch):
     credentials_response = mock.MagicMock()
     credentials_response.status_code = 403
 
-    monkeypatch.setattr("requests.post", mock.MagicMock(return_value=login_response))
-    monkeypatch.setattr("requests.get", mock.MagicMock(return_value=credentials_response))
+    # Mock requests
+    requests_mock = mock.MagicMock()
+    monkeypatch.setattr("requests.Session", mock.MagicMock(return_value=requests_mock))
+    requests_mock.post = mock.MagicMock(return_value=login_response)
+    requests_mock.get = mock.MagicMock(return_value=credentials_response)
 
     r2_client = client.R2Client()
 

From 052d3f173730dfaca77d850cced84faeef61f66f Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 17:20:19 +0100
Subject: [PATCH 19/33] udpate

---
 tests/streaming/test_client.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/streaming/test_client.py b/tests/streaming/test_client.py
index 97f9bc0de..ace4fe9f3 100644
--- a/tests/streaming/test_client.py
+++ b/tests/streaming/test_client.py
@@ -456,8 +456,10 @@ def test_r2_client_api_call_format(monkeypatch):
     }
     mock_get.return_value = credentials_response
 
-    monkeypatch.setattr("requests.post", mock_post)
-    monkeypatch.setattr("requests.get", mock_get)
+    requests_mock = mock.MagicMock()
+    monkeypatch.setattr("requests.Session", mock.MagicMock(return_value=requests_mock))
+    requests_mock.post = mock_post
+    requests_mock.get = mock_get
 
     r2_client = client.R2Client()
     r2_client.get_r2_bucket_credentials("conn-abc123")

From 8891e0f4352b71d278c5213f67abdc705118c2ff Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 19:00:12 +0100
Subject: [PATCH 20/33] udpate

---
 .github/workflows/ci-testing.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml
index 1f9a5a20b..7b97216e7 100644
--- a/.github/workflows/ci-testing.yml
+++ b/.github/workflows/ci-testing.yml
@@ -50,14 +50,14 @@ jobs:
           pytest tests \
             --ignore=tests/processing \
             --ignore=tests/raw \
-            -n 2 --cov=litdata --durations=120
+            -n 2 --cov=litdata --durations=120 --timeout=120
 
       - name: Run processing tests sequentially
         run: |
           # note that the listed test should match ignored in the previous step
           pytest \
             tests/processing tests/raw \
-            --cov=litdata --cov-append --durations=90
+            --cov=litdata --cov-append --durations=90 --timeout=120
 
       - name: Statistics
         continue-on-error: true

From 81de16e97a5fa1c486969f6354674e65aa7f91fe Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 19:17:40 +0100
Subject: [PATCH 21/33] udpate

---
 .github/workflows/ci-testing.yml |  9 +++++----
 tests/streaming/test_dataset.py  | 12 +++++++-----
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml
index 7b97216e7..d9b7bcba6 100644
--- a/.github/workflows/ci-testing.yml
+++ b/.github/workflows/ci-testing.yml
@@ -48,16 +48,17 @@ jobs:
       - name: Run fast tests in parallel
         run: |
           pytest tests \
-            --ignore=tests/processing \
-            --ignore=tests/raw \
-            -n 2 --cov=litdata --durations=120 --timeout=120
+              --ignore=tests/processing \
+              --ignore=tests/raw \
+              -n 2 --cov=litdata --durations=0 --timeout=120 --capture=no --verbose
 
       - name: Run processing tests sequentially
         run: |
           # note that the listed test should match ignored in the previous step
           pytest \
             tests/processing tests/raw \
-            --cov=litdata --cov-append --durations=90 --timeout=120
+            --cov=litdata --cov-append
+            -n 2 --durations=0 --timeout=120 --capture=no --verbose
 
       - name: Statistics
         continue-on-error: true
diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
index 4332bec79..a1db6441e 100644
--- a/tests/streaming/test_dataset.py
+++ b/tests/streaming/test_dataset.py
@@ -107,21 +107,23 @@ def _simple_optimize_fn(index):
 @pytest.mark.parametrize(
     ("chunk_bytes", "chunk_size"),
     [
-        ("64MB", None),
-        (None, 5),  # at max 5 items in a chunk
-        (None, 75),  # at max 75 items in a chunk
         (None, 1200),  # at max 1200 items in a chunk
     ],
 )
 @pytest.mark.parametrize("keep_data_ordered", [True, False])
-def test_optimize_dataset(keep_data_ordered, chunk_bytes, chunk_size, tmpdir, monkeypatch):
+def test_optimize_dataset(
+    keep_data_ordered,
+    chunk_bytes,
+    chunk_size,
+    tmpdir,
+):
     data_dir = str(tmpdir / "optimized")
 
     optimize(
         fn=_simple_optimize_fn,
         inputs=list(range(1000)),
         output_dir=data_dir,
-        num_workers=4,
+        num_workers=2,
         chunk_bytes=chunk_bytes,
         chunk_size=chunk_size,
         keep_data_ordered=keep_data_ordered,

From 932e28470e5cb19c709e718b5c7dc5839f91a9d9 Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 19:18:04 +0100
Subject: [PATCH 22/33] udpate

---
 .github/workflows/ci-testing.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml
index d9b7bcba6..16f0a9dd9 100644
--- a/.github/workflows/ci-testing.yml
+++ b/.github/workflows/ci-testing.yml
@@ -56,7 +56,7 @@ jobs:
         run: |
           # note that the listed test should match ignored in the previous step
           pytest \
-            tests/processing tests/raw \
+            tests/raw \
             --cov=litdata --cov-append
             -n 2 --durations=0 --timeout=120 --capture=no --verbose
 

From 5346a2a06ed08130163d3c5a9103366b2e7dfa0f Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 19:25:44 +0100
Subject: [PATCH 23/33] udpate

---
 .github/workflows/ci-testing.yml | 2 +-
 tests/streaming/test_dataset.py  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml
index 16f0a9dd9..d9b7bcba6 100644
--- a/.github/workflows/ci-testing.yml
+++ b/.github/workflows/ci-testing.yml
@@ -56,7 +56,7 @@ jobs:
         run: |
           # note that the listed test should match ignored in the previous step
           pytest \
-            tests/raw \
+            tests/processing tests/raw \
             --cov=litdata --cov-append
             -n 2 --durations=0 --timeout=120 --capture=no --verbose
 
diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
index a1db6441e..ce06bb119 100644
--- a/tests/streaming/test_dataset.py
+++ b/tests/streaming/test_dataset.py
@@ -107,7 +107,7 @@ def _simple_optimize_fn(index):
 @pytest.mark.parametrize(
     ("chunk_bytes", "chunk_size"),
     [
-        (None, 1200),  # at max 1200 items in a chunk
+        (None, 10),
     ],
 )
 @pytest.mark.parametrize("keep_data_ordered", [True, False])
@@ -121,7 +121,7 @@ def test_optimize_dataset(
 
     optimize(
         fn=_simple_optimize_fn,
-        inputs=list(range(1000)),
+        inputs=list(range(20)),
         output_dir=data_dir,
         num_workers=2,
         chunk_bytes=chunk_bytes,
@@ -133,7 +133,7 @@ def test_optimize_dataset(
 
     ds = StreamingDataset(input_dir=data_dir)
 
-    expected_dataset = list(range(1000))
+    expected_dataset = list(range(20))
     actual_dataset = ds[:]
 
     assert len(actual_dataset) == len(expected_dataset)

From 9a59d89111ea579e84d6c7cc37ee8478f1a64219 Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 19:37:45 +0100
Subject: [PATCH 24/33] udpate

---
 .github/workflows/ci-testing.yml |  3 +--
 src/litdata/streaming/config.py  |  8 ++++++++
 tests/streaming/test_dataset.py  | 14 +++++++-------
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml
index d9b7bcba6..323647445 100644
--- a/.github/workflows/ci-testing.yml
+++ b/.github/workflows/ci-testing.yml
@@ -57,8 +57,7 @@ jobs:
           # note that the listed test should match ignored in the previous step
           pytest \
             tests/processing tests/raw \
-            --cov=litdata --cov-append
-            -n 2 --durations=0 --timeout=120 --capture=no --verbose
+            --cov=litdata --cov-append --durations=0 --timeout=120 --capture=no --verbose
 
       - name: Statistics
         continue-on-error: true
diff --git a/src/litdata/streaming/config.py b/src/litdata/streaming/config.py
index 8a8325a8c..dd7af253a 100644
--- a/src/litdata/streaming/config.py
+++ b/src/litdata/streaming/config.py
@@ -127,6 +127,8 @@ def skip_chunk_indexes_deletion(self, skip_chunk_indexes_deletion: list[int]) ->
         self._skip_chunk_indexes_deletion = skip_chunk_indexes_deletion
 
     def download_chunk_from_index(self, chunk_index: int, skip_lock: bool = False) -> None:
+        from time import time
+
         assert self._chunks is not None
         chunk_filename = self._chunks[chunk_index]["filename"]
 
@@ -134,6 +136,8 @@ def download_chunk_from_index(self, chunk_index: int, skip_lock: bool = False) -
 
         if os.path.exists(local_chunkpath):
             self.try_decompress(local_chunkpath)
+            from time import time
+
             if self._downloader is not None and not skip_lock:
                 # We don't want to redownload the base, but we should mark
                 # it as having been requested by something
@@ -151,6 +155,10 @@ def download_chunk_from_index(self, chunk_index: int, skip_lock: bool = False) -
 
         self.try_decompress(local_chunkpath)
 
+        from time import time
+
+        print("HERE 2", time())
+
     def download_chunk_bytes_from_index(self, chunk_index: int, offset: int, length: int) -> bytes:
         assert self._chunks is not None
         chunk_filename = self._chunks[chunk_index]["filename"]
diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
index ce06bb119..9c894895c 100644
--- a/tests/streaming/test_dataset.py
+++ b/tests/streaming/test_dataset.py
@@ -357,12 +357,12 @@ def test_streaming_dataset_distributed_full_shuffle_odd(drop_last, tmpdir, compr
 @pytest.mark.parametrize(
     "compression",
     [
-        pytest.param(None),
+        # pytest.param(None),
         pytest.param(
             "zstd",
-            marks=pytest.mark.skipif(
-                condition=not _ZSTD_AVAILABLE or sys.platform == "darwin", reason="Requires: ['zstd']"
-            ),
+            # marks=pytest.mark.skipif(
+            #     condition=not _ZSTD_AVAILABLE or sys.platform == "darwin", reason="Requires: ['zstd']"
+            # ),
         ),
     ],
 )
@@ -370,7 +370,7 @@ def test_streaming_dataset_distributed_full_shuffle_odd(drop_last, tmpdir, compr
 def test_streaming_dataset_distributed_full_shuffle_even(drop_last, tmpdir, compression):
     seed_everything(42)
 
-    cache = Cache(str(tmpdir), chunk_size=10, compression=compression)
+    cache = Cache(str(tmpdir), chunk_size=500, compression=compression)
     for i in range(1222):
         cache[i] = i
 
@@ -390,7 +390,7 @@ def test_streaming_dataset_distributed_full_shuffle_even(drop_last, tmpdir, comp
     dataset_iter = iter(dataset)
     assert len(dataset_iter) == 611
     process_1_1 = list(dataset_iter)
-    assert process_1_1[:10] == [278, 272, 270, 273, 276, 275, 274, 271, 277, 279]
+    assert process_1_1[:10] == [1093, 1186, 1031, 1128, 1126, 1051, 1172, 1052, 1120, 1209]
     assert len(process_1_1) == 611
 
     dataset_2 = StreamingDataset(input_dir=str(tmpdir), shuffle=True, drop_last=drop_last)
@@ -401,7 +401,7 @@ def test_streaming_dataset_distributed_full_shuffle_even(drop_last, tmpdir, comp
     dataset_2_iter = iter(dataset_2)
     assert len(dataset_2_iter) == 611
     process_2_1 = list(dataset_2_iter)
-    assert process_2_1[:10] == [999, 993, 991, 994, 997, 996, 995, 992, 998, 527]
+    assert process_2_1[:10] == [967, 942, 893, 913, 982, 898, 947, 901, 894, 961]
     assert len(process_2_1) == 611
     assert len([i for i in process_1_1 if i in process_2_1]) == 0
 

From e3e126a2dbf3c781996c0af121b562346b2e4bd0 Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 19:38:41 +0100
Subject: [PATCH 25/33] udpate

---
 tests/streaming/test_dataset.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
index 9c894895c..3c957ed18 100644
--- a/tests/streaming/test_dataset.py
+++ b/tests/streaming/test_dataset.py
@@ -357,12 +357,12 @@ def test_streaming_dataset_distributed_full_shuffle_odd(drop_last, tmpdir, compr
 @pytest.mark.parametrize(
     "compression",
     [
-        # pytest.param(None),
+        pytest.param(None),
         pytest.param(
             "zstd",
-            # marks=pytest.mark.skipif(
-            #     condition=not _ZSTD_AVAILABLE or sys.platform == "darwin", reason="Requires: ['zstd']"
-            # ),
+            marks=pytest.mark.skipif(
+                condition=not _ZSTD_AVAILABLE or sys.platform == "darwin", reason="Requires: ['zstd']"
+            ),
         ),
     ],
 )

From fc89b31ac2189c6fddb93c188631871b44d1c282 Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 19:52:01 +0100
Subject: [PATCH 26/33] udpate

---
 src/litdata/streaming/reader.py          | 1 +
 tests/utilities/test_train_test_split.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/src/litdata/streaming/reader.py b/src/litdata/streaming/reader.py
index c186de059..f1be06c55 100644
--- a/src/litdata/streaming/reader.py
+++ b/src/litdata/streaming/reader.py
@@ -455,6 +455,7 @@ def read(self, index: ChunkedIndex) -> Any:
                 try:
                     self._prepare_thread.join(timeout=_LONG_DEFAULT_TIMEOUT)
                 except Timeout:
+                    breakpoint()
                     logger.warning(
                         "The prepare chunks thread didn't exit properly. "
                         "This can happen if the chunk files are too large."
diff --git a/tests/utilities/test_train_test_split.py b/tests/utilities/test_train_test_split.py
index 5699cc5b1..2e2f039d5 100644
--- a/tests/utilities/test_train_test_split.py
+++ b/tests/utilities/test_train_test_split.py
@@ -1,3 +1,5 @@
+import sys
+
 import pytest
 
 from litdata import StreamingDataLoader, StreamingDataset, train_test_split
@@ -117,10 +119,12 @@ def test_train_test_split_with_streaming_dataloader(tmpdir, compression):
         pytest.param("zstd", marks=pytest.mark.skipif(condition=not _ZSTD_AVAILABLE, reason="Requires: ['zstd']")),
     ],
 )
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="slow on windows")
 def test_train_test_split_with_shuffle_parameter(tmpdir, compression):
     cache = Cache(str(tmpdir), chunk_size=10, compression=compression)
     for i in range(100):
         cache[i] = i
+
     cache.done()
     cache.merge()
 

From 09da90351182f034857ed8ccd7e3b82662804839 Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 20:00:11 +0100
Subject: [PATCH 27/33] udpate

---
 tests/utilities/test_train_test_split.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/utilities/test_train_test_split.py b/tests/utilities/test_train_test_split.py
index 2e2f039d5..aafd59b33 100644
--- a/tests/utilities/test_train_test_split.py
+++ b/tests/utilities/test_train_test_split.py
@@ -112,6 +112,7 @@ def test_train_test_split_with_streaming_dataloader(tmpdir, compression):
                 visited_indices.add(curr_idx)
 
 
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="slow on windows")
 @pytest.mark.parametrize(
     "compression",
     [
@@ -119,7 +120,6 @@ def test_train_test_split_with_streaming_dataloader(tmpdir, compression):
         pytest.param("zstd", marks=pytest.mark.skipif(condition=not _ZSTD_AVAILABLE, reason="Requires: ['zstd']")),
     ],
 )
-@pytest.mark.skipif(condition=sys.platform == "win32", reason="slow on windows")
 def test_train_test_split_with_shuffle_parameter(tmpdir, compression):
     cache = Cache(str(tmpdir), chunk_size=10, compression=compression)
     for i in range(100):

From a41e54a533885b5938ba23cc74fc17827260c4fe Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 20:22:54 +0100
Subject: [PATCH 28/33] udpate

---
 tests/utilities/test_train_test_split.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/utilities/test_train_test_split.py b/tests/utilities/test_train_test_split.py
index aafd59b33..1275e072e 100644
--- a/tests/utilities/test_train_test_split.py
+++ b/tests/utilities/test_train_test_split.py
@@ -112,12 +112,16 @@ def test_train_test_split_with_streaming_dataloader(tmpdir, compression):
                 visited_indices.add(curr_idx)
 
 
-@pytest.mark.skipif(condition=sys.platform == "win32", reason="slow on windows")
 @pytest.mark.parametrize(
     "compression",
     [
-        pytest.param(None),
-        pytest.param("zstd", marks=pytest.mark.skipif(condition=not _ZSTD_AVAILABLE, reason="Requires: ['zstd']")),
+        pytest.param(None, marks=pytest.mark.skipif(sys.platform == "win32", reason="slow on windows")),
+        pytest.param(
+            "zstd",
+            marks=pytest.mark.skipif(
+                condition=not _ZSTD_AVAILABLE or sys.platform == "win32", reason="Requires: ['zstd']"
+            ),
+        ),
     ],
 )
 def test_train_test_split_with_shuffle_parameter(tmpdir, compression):

From 7f5d6e6c0238e58fe31474ccd04e49eebf04af30 Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 20:25:50 +0100
Subject: [PATCH 29/33] udpate

---
 tests/processing/test_data_processor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/processing/test_data_processor.py b/tests/processing/test_data_processor.py
index b5d321efd..b201bcaed 100644
--- a/tests/processing/test_data_processor.py
+++ b/tests/processing/test_data_processor.py
@@ -1359,6 +1359,7 @@ def fn(*_, **__):
     fs_provider = mock.MagicMock()
     get_fs_provider_mock = mock.MagicMock(return_value=fs_provider)
     monkeypatch.setattr(data_processor_module, "_get_fs_provider", get_fs_provider_mock)
+    monkeypatch.setattr(data_processor_module, "_wait_for_disk_usage_higher_than_threshold", mock.MagicMock())
 
     storage_options = {"key": "value"}
 

From e7cd0145fa7743c84472508780a3fbfec700410b Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 20:46:44 +0100
Subject: [PATCH 30/33] udpate

---
 tests/utilities/test_train_test_split.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/utilities/test_train_test_split.py b/tests/utilities/test_train_test_split.py
index 1275e072e..65f586682 100644
--- a/tests/utilities/test_train_test_split.py
+++ b/tests/utilities/test_train_test_split.py
@@ -125,6 +125,9 @@ def test_train_test_split_with_streaming_dataloader(tmpdir, compression):
     ],
 )
 def test_train_test_split_with_shuffle_parameter(tmpdir, compression):
+    if sys.platform == "win32":
+        pytest.skip("slow on windows")
+
     cache = Cache(str(tmpdir), chunk_size=10, compression=compression)
     for i in range(100):
         cache[i] = i

From d226cb7ada93ac0d30828f39afafc9cec61181c6 Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 20:55:42 +0100
Subject: [PATCH 31/33] udpate

---
 tests/utilities/test_train_test_split.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tests/utilities/test_train_test_split.py b/tests/utilities/test_train_test_split.py
index 65f586682..f34ae27b8 100644
--- a/tests/utilities/test_train_test_split.py
+++ b/tests/utilities/test_train_test_split.py
@@ -1,3 +1,4 @@
+import platform
 import sys
 
 import pytest
@@ -6,6 +7,8 @@
 from litdata.constants import _ZSTD_AVAILABLE
 from litdata.streaming.cache import Cache
 
+IS_WINDOWS = sys.platform.startswith("win") or platform.system() == "Windows"
+
 
 @pytest.mark.parametrize(
     "compression",
@@ -115,19 +118,14 @@ def test_train_test_split_with_streaming_dataloader(tmpdir, compression):
 @pytest.mark.parametrize(
     "compression",
     [
-        pytest.param(None, marks=pytest.mark.skipif(sys.platform == "win32", reason="slow on windows")),
+        pytest.param(None, marks=pytest.mark.skipif(IS_WINDOWS, reason="slow on windows")),
         pytest.param(
             "zstd",
-            marks=pytest.mark.skipif(
-                condition=not _ZSTD_AVAILABLE or sys.platform == "win32", reason="Requires: ['zstd']"
-            ),
+            marks=pytest.mark.skipif(not _ZSTD_AVAILABLE or IS_WINDOWS, reason="Requires: ['zstd']"),
         ),
     ],
 )
 def test_train_test_split_with_shuffle_parameter(tmpdir, compression):
-    if sys.platform == "win32":
-        pytest.skip("slow on windows")
-
     cache = Cache(str(tmpdir), chunk_size=10, compression=compression)
     for i in range(100):
         cache[i] = i

From 04e690adbe5a51f9840b3144045ac8cca22a08ac Mon Sep 17 00:00:00 2001
From: tchaton <thomas.chaton.ai@gmail.com>
Date: Mon, 8 Sep 2025 21:05:18 +0100
Subject: [PATCH 32/33] udpate

---
 tests/conftest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/conftest.py b/tests/conftest.py
index e4eb8d3ce..370a133f4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -157,6 +157,8 @@ def _thread_police():
         elif thread.name == "QueueFeederThread":
             thread.join(timeout=20)
         else:
+            if thread.name.startswith("pytest_timeout"):
+                continue
             raise AssertionError(f"Test left zombie thread: {thread}")
 
 

From 3336fe136c31febf0c0f1fc7f331c60329668212 Mon Sep 17 00:00:00 2001
From: Peyton Gardipee <peyton@lightning.ai>
Date: Mon, 8 Sep 2025 20:58:21 +0000
Subject: [PATCH 33/33] Address PR comments

---
 src/litdata/streaming/client.py | 4 ++++
 src/litdata/streaming/config.py | 7 -------
 src/litdata/streaming/reader.py | 1 -
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/litdata/streaming/client.py b/src/litdata/streaming/client.py
index 3d9c7acde..26f43fbf1 100644
--- a/src/litdata/streaming/client.py
+++ b/src/litdata/streaming/client.py
@@ -26,8 +26,12 @@
 
 from litdata.constants import _IS_IN_STUDIO
 
+# Constants for the retry adapter. Docs: https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html
+# Maximum number of total connection retry attempts (e.g., 2880 retries = 24 hours with 30s timeout per request)
 _CONNECTION_RETRY_TOTAL = 2880
+# Backoff factor for connection retries (wait time increases by this factor after each failure)
 _CONNECTION_RETRY_BACKOFF_FACTOR = 0.5
+# Default timeout for each HTTP request in seconds
 _DEFAULT_REQUEST_TIMEOUT = 30  # seconds
 
 
diff --git a/src/litdata/streaming/config.py b/src/litdata/streaming/config.py
index dd7af253a..6f8f6aa36 100644
--- a/src/litdata/streaming/config.py
+++ b/src/litdata/streaming/config.py
@@ -127,8 +127,6 @@ def skip_chunk_indexes_deletion(self, skip_chunk_indexes_deletion: list[int]) ->
         self._skip_chunk_indexes_deletion = skip_chunk_indexes_deletion
 
     def download_chunk_from_index(self, chunk_index: int, skip_lock: bool = False) -> None:
-        from time import time
-
         assert self._chunks is not None
         chunk_filename = self._chunks[chunk_index]["filename"]
 
@@ -136,7 +134,6 @@ def download_chunk_from_index(self, chunk_index: int, skip_lock: bool = False) -
 
         if os.path.exists(local_chunkpath):
             self.try_decompress(local_chunkpath)
-            from time import time
 
             if self._downloader is not None and not skip_lock:
                 # We don't want to redownload the base, but we should mark
@@ -155,10 +152,6 @@ def download_chunk_from_index(self, chunk_index: int, skip_lock: bool = False) -
 
         self.try_decompress(local_chunkpath)
 
-        from time import time
-
-        print("HERE 2", time())
-
     def download_chunk_bytes_from_index(self, chunk_index: int, offset: int, length: int) -> bytes:
         assert self._chunks is not None
         chunk_filename = self._chunks[chunk_index]["filename"]
diff --git a/src/litdata/streaming/reader.py b/src/litdata/streaming/reader.py
index f1be06c55..c186de059 100644
--- a/src/litdata/streaming/reader.py
+++ b/src/litdata/streaming/reader.py
@@ -455,7 +455,6 @@ def read(self, index: ChunkedIndex) -> Any:
                 try:
                     self._prepare_thread.join(timeout=_LONG_DEFAULT_TIMEOUT)
                 except Timeout:
-                    breakpoint()
                     logger.warning(
                         "The prepare chunks thread didn't exit properly. "
                         "This can happen if the chunk files are too large."