Skip to content

Commit

Permalink
apacheGH-34386: [C++] Add a PathFromUriOrPath method (apache#34420)
Browse files Browse the repository at this point in the history
### Rationale for this change

We have some URI parsing indirectly exposed through FilesystemFromUri.  However, this isn't very useful if the user has multiple URIs or if they already have a filesystem.  This method allows that same URI handling to be used even if the user already has a filesystem.

### What changes are included in this PR?

Adds a new arrow::fs::PathFromUriOrPath method

### Are these changes tested?

Yes, via unit tests

### Are there any user-facing changes?

There is a new API method but no changes to any existing APIs
* Closes: apache#34386

Authored-by: Weston Pace <weston.pace@gmail.com>
Signed-off-by: Weston Pace <weston.pace@gmail.com>
  • Loading branch information
westonpace authored and liujiacheng777 committed May 11, 2023
1 parent e71b28c commit 169db77
Show file tree
Hide file tree
Showing 20 changed files with 303 additions and 68 deletions.
33 changes: 15 additions & 18 deletions cpp/src/arrow/filesystem/filesystem.cc
Expand Up @@ -60,6 +60,7 @@ using internal::ConcatAbstractPath;
using internal::EnsureTrailingSlash;
using internal::GetAbstractPathParent;
using internal::kSep;
using internal::ParseFileSystemUri;
using internal::RemoveLeadingSlash;
using internal::RemoveTrailingSlash;
using internal::ToSlashes;
Expand Down Expand Up @@ -254,6 +255,10 @@ Result<std::shared_ptr<io::OutputStream>> FileSystem::OpenAppendStream(
return OpenAppendStream(path, std::shared_ptr<const KeyValueMetadata>{});
}

Result<std::string> FileSystem::PathFromUri(const std::string& uri_string) const {
return Status::NotImplemented("PathFromUri is not yet supported on this filesystem");
}

//////////////////////////////////////////////////////////////////////////
// SubTreeFileSystem implementation

Expand Down Expand Up @@ -484,6 +489,10 @@ Result<std::shared_ptr<io::OutputStream>> SubTreeFileSystem::OpenAppendStream(
return base_fs_->OpenAppendStream(real_path, metadata);
}

Result<std::string> SubTreeFileSystem::PathFromUri(const std::string& uri_string) const {
return base_fs_->PathFromUri(uri_string);
}

//////////////////////////////////////////////////////////////////////////
// SlowFileSystem implementation

Expand All @@ -505,6 +514,10 @@ SlowFileSystem::SlowFileSystem(std::shared_ptr<FileSystem> base_fs,

bool SlowFileSystem::Equals(const FileSystem& other) const { return this == &other; }

Result<std::string> SlowFileSystem::PathFromUri(const std::string& uri_string) const {
return base_fs_->PathFromUri(uri_string);
}

Result<FileInfo> SlowFileSystem::GetFileInfo(const std::string& path) {
latencies_->Sleep();
return base_fs_->GetFileInfo(path);
Expand Down Expand Up @@ -662,23 +675,6 @@ Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs,

namespace {

Result<Uri> ParseFileSystemUri(const std::string& uri_string) {
Uri uri;
auto status = uri.Parse(uri_string);
if (!status.ok()) {
#ifdef _WIN32
// Could be a "file:..." URI with backslashes instead of regular slashes.
RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string)));
if (uri.scheme() != "file") {
return status;
}
#else
return status;
#endif
}
return std::move(uri);
}

Result<std::shared_ptr<FileSystem>> FileSystemFromUriReal(const Uri& uri,
const std::string& uri_string,
const io::IOContext& io_context,
Expand Down Expand Up @@ -763,7 +759,8 @@ Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
if (internal::DetectAbsolutePath(uri_string)) {
// Normalize path separators
if (out_path != nullptr) {
*out_path = ToSlashes(uri_string);
*out_path =
std::string(RemoveTrailingSlash(ToSlashes(uri_string), /*preserve_root=*/true));
}
return std::make_shared<LocalFileSystem>();
}
Expand Down
22 changes: 22 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.h
Expand Up @@ -171,6 +171,26 @@ class ARROW_EXPORT FileSystem : public std::enable_shared_from_this<FileSystem>
/// may allow normalizing irregular path forms (such as Windows local paths).
virtual Result<std::string> NormalizePath(std::string path);

/// \brief Ensure a URI (or path) is compatible with the given filesystem and return the
/// path
///
/// \param uri_string A URI representing a resource in the given filesystem.
///
/// This method will check to ensure the given filesystem is compatible with the
/// URI. This can be useful when the user provides both a URI and a filesystem or
/// when a user provides multiple URIs that should be compatible with the same
/// filesystem.
///
/// uri_string can be an absolute path instead of a URI. In that case it will ensure
/// the filesystem (if supplied) is the local filesystem (or some custom filesystem that
/// is capable of reading local paths) and will normalize the path's file separators.
///
/// Note, this method only checks to ensure the URI scheme is valid. It will not detect
/// inconsistencies like a mismatching region or endpoint override.
///
/// \return The path inside the filesystem that is indicated by the URI.
virtual Result<std::string> PathFromUri(const std::string& uri_string) const;

virtual bool Equals(const FileSystem& other) const = 0;

virtual bool Equals(const std::shared_ptr<FileSystem>& other) const {
Expand Down Expand Up @@ -336,6 +356,7 @@ class ARROW_EXPORT SubTreeFileSystem : public FileSystem {
std::shared_ptr<FileSystem> base_fs() const { return base_fs_; }

Result<std::string> NormalizePath(std::string path) override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;

bool Equals(const FileSystem& other) const override;

Expand Down Expand Up @@ -410,6 +431,7 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem {

std::string type_name() const override { return "slow"; }
bool Equals(const FileSystem& other) const override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;

using FileSystem::GetFileInfo;
Result<FileInfo> GetFileInfo(const std::string& path) override;
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/filesystem/gcsfs.cc
Expand Up @@ -873,6 +873,12 @@ bool GcsFileSystem::Equals(const FileSystem& other) const {
return impl_->options().Equals(fs.impl_->options());
}

Result<std::string> GcsFileSystem::PathFromUri(const std::string& uri_string) const {
return internal::PathFromUriHelper(uri_string, {"gs", "gcs"},
/*accept_local_paths=*/false,
internal::AuthorityHandlingBehavior::kPrepend);
}

Result<FileInfo> GcsFileSystem::GetFileInfo(const std::string& path) {
ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path));
return impl_->GetFileInfo(p);
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/filesystem/gcsfs.h
Expand Up @@ -178,6 +178,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem {
const GcsOptions& options() const;

bool Equals(const FileSystem& other) const override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;

Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
Expand Down
17 changes: 15 additions & 2 deletions cpp/src/arrow/filesystem/gcsfs_test.cc
Expand Up @@ -54,6 +54,7 @@
#include "arrow/filesystem/path_util.h"
#include "arrow/filesystem/test_util.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"
#include "arrow/testing/util.h"
#include "arrow/util/future.h"
#include "arrow/util/key_value_metadata.h"
Expand Down Expand Up @@ -1383,12 +1384,24 @@ TEST_F(GcsIntegrationTest, OpenInputFileClosed) {

TEST_F(GcsIntegrationTest, TestFileSystemFromUri) {
// Smoke test for FileSystemFromUri
ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(std::string("gs://anonymous@") +
PreexistingBucketPath()));
std::string path;
ASSERT_OK_AND_ASSIGN(
auto fs,
FileSystemFromUri(std::string("gs://anonymous@") + PreexistingBucketPath(), &path));
EXPECT_EQ(fs->type_name(), "gcs");
EXPECT_EQ(path, PreexistingBucketName());
ASSERT_OK_AND_ASSIGN(
path, fs->PathFromUri(std::string("gs://anonymous@") + PreexistingBucketPath()));
EXPECT_EQ(path, PreexistingBucketName());
ASSERT_OK_AND_ASSIGN(auto fs2, FileSystemFromUri(std::string("gcs://anonymous@") +
PreexistingBucketPath()));
EXPECT_EQ(fs2->type_name(), "gcs");
ASSERT_THAT(fs->PathFromUri("/foo/bar"),
Raises(StatusCode::Invalid, testing::HasSubstr("Expected a URI")));
ASSERT_THAT(
fs->PathFromUri("s3:///foo/bar"),
Raises(StatusCode::Invalid,
testing::HasSubstr("expected a URI with one of the schemes (gs, gcs)")));
}

} // namespace
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/filesystem/hdfs.cc
Expand Up @@ -473,6 +473,12 @@ bool HadoopFileSystem::Equals(const FileSystem& other) const {
return options().Equals(hdfs.options());
}

Result<std::string> HadoopFileSystem::PathFromUri(const std::string& uri_string) const {
return internal::PathFromUriHelper(uri_string, {"hdfs", "viewfs"},
/*accept_local_paths=*/false,
internal::AuthorityHandlingBehavior::kIgnore);
}

Result<std::vector<FileInfo>> HadoopFileSystem::GetFileInfo(const FileSelector& select) {
return impl_->GetFileInfo(select);
}
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/filesystem/hdfs.h
Expand Up @@ -66,6 +66,7 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem {
std::string type_name() const override { return "hdfs"; }
HdfsOptions options() const;
bool Equals(const FileSystem& other) const override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;

/// \cond FALSE
using FileSystem::GetFileInfo;
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/filesystem/hdfs_test.cc
Expand Up @@ -119,6 +119,8 @@ class TestHadoopFileSystem : public ::testing::Test, public HadoopFileSystemTest
ARROW_LOG(INFO) << "!!! uri = " << ss.str();
ASSERT_OK_AND_ASSIGN(uri_fs, FileSystemFromUri(ss.str(), &path));
ASSERT_EQ(path, "/");
ASSERT_OK_AND_ASSIGN(path, uri_fs->PathFromUri(ss.str()));
ASSERT_EQ(path, "/");

// Sanity check
ASSERT_OK(uri_fs->CreateDir("AB"));
Expand Down
57 changes: 21 additions & 36 deletions cpp/src/arrow/filesystem/localfs.cc
Expand Up @@ -52,37 +52,6 @@ using ::arrow::internal::IOErrorFromWinError;
using ::arrow::internal::NativePathString;
using ::arrow::internal::PlatformFilename;

namespace internal {

#ifdef _WIN32
static bool IsDriveLetter(char c) {
// Can't use locale-dependent functions from the C/C++ stdlib
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
#endif

bool DetectAbsolutePath(const std::string& s) {
// Is it a /-prefixed local path?
if (s.length() >= 1 && s[0] == '/') {
return true;
}
#ifdef _WIN32
// Is it a \-prefixed local path?
if (s.length() >= 1 && s[0] == '\\') {
return true;
}
// Does it start with a drive letter in addition to being /- or \-prefixed,
// e.g. "C:\..."?
if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') &&
IsDriveLetter(s[0])) {
return true;
}
#endif
return false;
}

} // namespace internal

namespace {

Status ValidatePath(std::string_view s) {
Expand All @@ -92,6 +61,12 @@ Status ValidatePath(std::string_view s) {
return Status::OK();
}

Result<std::string> DoNormalizePath(std::string path) {
RETURN_NOT_OK(ValidatePath(path));
ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path));
return fn.ToString();
}

#ifdef _WIN32

std::string NativeToString(const NativePathString& ns) {
Expand Down Expand Up @@ -263,13 +238,15 @@ Result<LocalFileSystemOptions> LocalFileSystemOptions::FromUri(
#ifdef _WIN32
std::stringstream ss;
ss << "//" << host << "/" << internal::RemoveLeadingSlash(uri.path());
*out_path = ss.str();
*out_path =
std::string(internal::RemoveTrailingSlash(ss.str(), /*preserve_root=*/true));
#else
return Status::Invalid("Unsupported hostname in non-Windows local URI: '",
uri.ToString(), "'");
#endif
} else {
*out_path = uri.path();
*out_path =
std::string(internal::RemoveTrailingSlash(uri.path(), /*preserve_root=*/true));
}

// TODO handle use_mmap option
Expand All @@ -286,9 +263,17 @@ LocalFileSystem::LocalFileSystem(const LocalFileSystemOptions& options,
LocalFileSystem::~LocalFileSystem() {}

Result<std::string> LocalFileSystem::NormalizePath(std::string path) {
RETURN_NOT_OK(ValidatePath(path));
ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path));
return fn.ToString();
return DoNormalizePath(std::move(path));
}

Result<std::string> LocalFileSystem::PathFromUri(const std::string& uri_string) const {
#ifdef _WIN32
auto authority_handling = internal::AuthorityHandlingBehavior::kWindows;
#else
auto authority_handling = internal::AuthorityHandlingBehavior::kDisallow;
#endif
return internal::PathFromUriHelper(uri_string, {"file"}, /*accept_local_paths=*/true,
authority_handling);
}

bool LocalFileSystem::Equals(const FileSystem& other) const {
Expand Down
9 changes: 1 addition & 8 deletions cpp/src/arrow/filesystem/localfs.h
Expand Up @@ -82,6 +82,7 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem {
std::string type_name() const override { return "local"; }

Result<std::string> NormalizePath(std::string path) override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;

bool Equals(const FileSystem& other) const override;

Expand Down Expand Up @@ -121,13 +122,5 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem {
LocalFileSystemOptions options_;
};

namespace internal {

// Return whether the string is detected as a local absolute path.
ARROW_EXPORT
bool DetectAbsolutePath(const std::string& s);

} // namespace internal

} // namespace fs
} // namespace arrow

0 comments on commit 169db77

Please sign in to comment.