-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
23 additions
and
95 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,106 +1,45 @@ | ||
#include "dedup.hpp" | ||
|
||
/// @brief Removes duplicate source lines from a line-aligned pair of text files.
///
/// Reads `src` and `tgt` in lockstep, one line at a time. The first occurrence
/// of every distinct source line is written to `src + ".dedup"`, and the target
/// line at the same position is written to `tgt + ".dedup"`. Later occurrences
/// of a source line are dropped from both outputs. Only the source line decides
/// whether a pair counts as a duplicate.
///
/// Lines are compared by content rather than by a 32-bit hash, so two different
/// lines can never be mistaken for duplicates. The cost is memory proportional
/// to the total size of the distinct source lines.
///
/// @param src Path of the source-side input file.
/// @param tgt Path of the target-side input file; expected to hold one line per
///            source line. If it is shorter, the missing target lines are
///            written as empty lines.
/// @return Tuple of (source output path, target output path, number of line
///         pairs removed).
/// @throws std::runtime_error if an input cannot be opened, or an output cannot
///         be created or written.
std::tuple<std::string, std::string, size_t> dedup(const std::string &src, const std::string &tgt){
    std::ifstream src_if(src);
    if (!src_if.is_open()) throw std::runtime_error("Cannot open " + src);

    std::ifstream tgt_if(tgt);
    if (!tgt_if.is_open()) throw std::runtime_error("Cannot open " + tgt);

    const std::string src_out = src + ".dedup";
    const std::string tgt_out = tgt + ".dedup";

    std::ofstream src_of(src_out, std::ios::trunc);
    if (!src_of.is_open()) throw std::runtime_error("Cannot open " + src_out);

    std::ofstream tgt_of(tgt_out, std::ios::trunc);
    if (!tgt_of.is_open()) throw std::runtime_error("Cannot open " + tgt_out);

    std::unordered_set<std::string> seen;
    std::string line_s;
    std::string line_t;
    size_t removed = 0;

    // Loop on the result of getline itself: testing eof() before reading would
    // run one extra iteration after the final newline and process a phantom
    // empty line.
    while (std::getline(src_if, line_s)) {
        // A failed getline leaves its buffer untouched, so clear it explicitly
        // instead of re-emitting the previous target line.
        if (!std::getline(tgt_if, line_t)) line_t.clear();

        // insert() reports whether the line was new, so one lookup is enough.
        if (seen.insert(line_s).second) {
            src_of << line_s << '\n';
            tgt_of << line_t << '\n';
        } else {
            ++removed;
        }
    }

    // Flush before returning so that a full disk or similar failure is reported
    // here rather than swallowed by the stream destructors.
    src_of.flush();
    tgt_of.flush();
    if (!src_of) throw std::runtime_error("Cannot write " + src_out);
    if (!tgt_of) throw std::runtime_error("Cannot write " + tgt_out);

    return std::make_tuple(src_out, tgt_out, removed);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,10 @@ | ||
#include <cstring>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <variant>
|
||
#if defined(_WIN32) || defined(_WIN64) | ||
#include <windows.h> | ||
#else | ||
#include <sys/mman.h> | ||
#include <sys/stat.h> | ||
#include <fcntl.h> | ||
#include <unistd.h> | ||
#endif | ||
|
||
#include "vendor/xxh32.hpp" | ||
|
||
/// @brief Removes repeated source lines from a pair of line-aligned files.
///
/// Writes the kept lines to `src + ".dedup"` and `tgt + ".dedup"`.
///
/// @param src Path of the source-side input file.
/// @param tgt Path of the target-side input file.
/// @return Tuple of (source output path, target output path, number of removed
///         lines).
/// @throws std::runtime_error if an input file cannot be opened.
std::tuple<std::string, std::string, size_t> dedup(const std::string &src, const std::string &tgt); |