Skip to content

Commit

Permalink
Use standard C++
Browse files: browse the repository at this point in the history
  • Loading branch information
pierotofy committed Jan 3, 2024
1 parent 64508ed commit 2094634
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 95 deletions.
107 changes: 23 additions & 84 deletions dedup.cpp
Original file line number Diff line number Diff line change
@@ -1,106 +1,45 @@
#include "dedup.hpp"

/**
 * Remove duplicate source lines from a pair of line-aligned files.
 *
 * Reads `src` and `tgt` in lockstep, one line from each per step. A pair is
 * kept the first time the xxh32 hash of its source line is seen and dropped
 * (and counted) on every later occurrence. Kept pairs are written to
 * `<src>.dedup` and `<tgt>.dedup`, each line terminated with "\n".
 *
 * If `tgt` has fewer lines than `src`, the missing target lines are written
 * as empty lines.
 *
 * NOTE(review): duplicates are detected by 32-bit hash only, so two distinct
 * source lines whose hashes collide are treated as duplicates. Confirm this
 * trade-off is acceptable for the corpus sizes in use.
 *
 * @param src Path of the file whose lines decide what counts as a duplicate.
 * @param tgt Path of the companion file, filtered in parallel with `src`.
 * @return Tuple of (source output path, target output path, pairs removed).
 * @throws std::runtime_error if an input or output file cannot be opened.
 */
std::tuple<std::string, std::string, size_t> dedup(const std::string &src, const std::string &tgt){
    std::ifstream src_if(src);
    if (!src_if.is_open()) throw std::runtime_error("Cannot open " + src);

    std::ifstream tgt_if(tgt);
    if (!tgt_if.is_open()) throw std::runtime_error("Cannot open " + tgt);

    std::string src_out = src + ".dedup";
    std::string tgt_out = tgt + ".dedup";

    // Fail loudly instead of silently discarding every write when an output
    // file cannot be created.
    std::ofstream src_of(src_out, std::ios::trunc);
    if (!src_of.is_open()) throw std::runtime_error("Cannot open " + src_out);

    std::ofstream tgt_of(tgt_out, std::ios::trunc);
    if (!tgt_of.is_open()) throw std::runtime_error("Cannot open " + tgt_out);

    std::unordered_map<std::uint32_t, bool> seen;
    std::string line_s;
    std::string line_t;
    size_t removed = 0;

    // Drive the loop with the result of getline rather than eof(): eof() only
    // becomes true after a read has already failed, which would process one
    // phantom empty line after the real last line.
    while (std::getline(src_if, line_s)) {
        // A failed read leaves line_t untouched, so clear it explicitly to
        // avoid re-emitting the previous target line.
        if (!std::getline(tgt_if, line_t)) line_t.clear();

        const std::uint32_t hash = xxh32::hash(line_s.c_str(), static_cast<uint32_t>(line_s.size()), 0);

        // emplace reports whether the hash was newly inserted: one lookup
        // instead of count() followed by operator[].
        if (seen.emplace(hash, true).second) {
            src_of << line_s << "\n";
            tgt_of << line_t << "\n";
        } else {
            ++removed;
        }
    }

    // All streams are closed by their destructors on return.
    return std::make_tuple(src_out, tgt_out, removed);
}
11 changes: 0 additions & 11 deletions dedup.hpp
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <string_view>
#include <cstring>
#include <tuple>
#include <variant>
#include <string>

#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#endif

#include "vendor/xxh32.hpp"

std::tuple<std::string, std::string, size_t> dedup(const std::string &src, const std::string &tgt);

0 comments on commit 2094634

Please sign in to comment.