Skip to content

Commit

Permalink
Merge 132a9e2 into d9b357f
Browse files Browse the repository at this point in the history
  • Loading branch information
Hendrik Muhs committed Dec 29, 2017
2 parents d9b357f + 132a9e2 commit 25741fb
Show file tree
Hide file tree
Showing 16 changed files with 653 additions and 703 deletions.
61 changes: 27 additions & 34 deletions keyvi/include/keyvi/compression/predictive_compression.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,17 @@
* Author: hendrik
*/

#ifndef PREDICTIVE_COMPRESSION_H_
#define PREDICTIVE_COMPRESSION_H_
#ifndef KEYVI_COMPRESSION_PREDICTIVE_COMPRESSION_H_
#define KEYVI_COMPRESSION_PREDICTIVE_COMPRESSION_H_

#include <inttypes.h>
#include <bitset>
#include <sstream>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

//#define ENABLE_TRACING
// #define ENABLE_TRACING
#include "dictionary/util/trace.h"

namespace keyvi {
Expand All @@ -40,22 +42,18 @@ namespace compression {
* Short string compression inspired by RFC 1978 (Predictor Compression Protocol)
*/
class PredictiveCompression final {

public:
PredictiveCompression(std::string file_name) {
explicit PredictiveCompression(std::string file_name) {
std::fstream infile(file_name, std::fstream::in | std::fstream::binary);
if (!infile.is_open()) throw std::invalid_argument("cannot read file");
read_stream(infile);
infile.close();
}

PredictiveCompression(std::istream& instream) {
read_stream(instream);
}
explicit PredictiveCompression(std::istream& instream) { read_stream(instream); }

// highly inefficient for now
std::string LookupBigram(const unsigned char* bigram){

std::string LookupBigram(const unsigned char* bigram) {
// skip for null bytes
if (bigram[0] == 0 || bigram[1] == 0) {
return "";
Expand All @@ -64,7 +62,7 @@ class PredictiveCompression final {
}
}

std::string Compress(const std::string& input){
std::string Compress(const std::string& input) {
std::ostringstream output_buffer;

size_t input_length = input.size();
Expand All @@ -73,7 +71,7 @@ class PredictiveCompression final {
char uncompressed_buf[8];
unsigned char bigram[2];
size_t bitset_position = 0;
size_t uncompressed_position = 0;
size_t uncompressed_position = 0;

TRACE("Compressing %s", input.c_str());

Expand All @@ -90,12 +88,11 @@ class PredictiveCompression final {
uncompressed_position = 2;

offset = 2;
while ( offset < input_length ) {

while (offset < input_length) {
std::string prediction = LookupBigram(bigram);
if (prediction.size() > 0 && prediction == input.substr(offset, prediction.size())) {
// prediction succeeded
TRACE ("Prediction success: %s", prediction.c_str());
TRACE("Prediction success: %s", prediction.c_str());
current_bitset.set(bitset_position++);
offset += prediction.size();

Expand All @@ -109,7 +106,7 @@ class PredictiveCompression final {

} else {
// prediction failed
TRACE ("Prediction failed: %s", prediction.c_str());
TRACE("Prediction failed: %s", prediction.c_str());

current_bitset.set(bitset_position++, 0);
uncompressed_buf[uncompressed_position++] = input[offset];
Expand All @@ -123,9 +120,9 @@ class PredictiveCompression final {
TRACE("Flush bitset:%s data: '%s' ", current_bitset.to_string().c_str(),
std::string(uncompressed_buf, uncompressed_position).c_str());

output_buffer.put(static_cast<unsigned char>( current_bitset.to_ulong() ));
output_buffer.put(static_cast<unsigned char>(current_bitset.to_ulong()));
output_buffer.write(uncompressed_buf, uncompressed_position);
bitset_position =0;
bitset_position = 0;
current_bitset.reset();
uncompressed_position = 0;
}
Expand All @@ -134,9 +131,9 @@ class PredictiveCompression final {
// write remainder
if (bitset_position != 0) {
TRACE("Last chunk bitset:%s data: '%s' ", current_bitset.to_string().c_str(),
std::string(uncompressed_buf, uncompressed_position).c_str());
std::string(uncompressed_buf, uncompressed_position).c_str());

output_buffer.put(static_cast<unsigned char>( current_bitset.to_ulong() ));
output_buffer.put(static_cast<unsigned char>(current_bitset.to_ulong()));
output_buffer.write(uncompressed_buf, uncompressed_position);
}

Expand All @@ -145,7 +142,7 @@ class PredictiveCompression final {
return output_buffer.str();
}

std::string Uncompress(const std::string & input) {
std::string Uncompress(const std::string& input) {
if (input.size() < 2) {
return input;
}
Expand All @@ -155,21 +152,21 @@ class PredictiveCompression final {
size_t input_length = input.size();
size_t offset = 0;
std::bitset<8> current_bitset;
//char uncompressed_buf[8];
// char uncompressed_buf[8];
unsigned char bigram[2];
size_t bitset_position = 0;

// setup first character
bigram[0] = input[1];
bigram[1] = input[2];
current_bitset = static_cast<unsigned long> (input[0]);
current_bitset = static_cast<std::bitset<8>>(input[0]);
bitset_position = 2;
output_buffer.put(input[1]);
output_buffer.put(input[2]);

offset = 3;

while ( offset < input_length ) {
while (offset < input_length) {
if (current_bitset[bitset_position++]) {
TRACE("Bit is set, do lookup ");
std::string prediction = LookupBigram(bigram);
Expand All @@ -192,7 +189,7 @@ class PredictiveCompression final {
if (bitset_position == 8) {
TRACE("Read next chunk");

current_bitset = static_cast<unsigned long> (input[offset++]);
current_bitset = static_cast<std::bitset<8>>(input[offset++]);
bitset_position = 0;
}
}
Expand All @@ -217,23 +214,19 @@ class PredictiveCompression final {
uint8_t length = instream.get();
if (length > 8) {
char error[100];
sprintf(error, "Invalid model: too long value (%u) for key %02x:%02x",
length, index >> 8, index & 0xFF);
snprintf(error, sizeof(error), "Invalid model: too long value (%u) for key %02x:%02x", length, index >> 8,
index & 0xFF);
throw std::invalid_argument(error);
}
if (!instream.read(buffer, length))
throw std::istream::failure("Incomplete model stream.");
if (!instream.read(buffer, length)) throw std::istream::failure("Incomplete model stream.");
predictor_table_[index] = std::string(buffer, length);
}
}

std::string predictor_table_[65536];
};


} /* namespace compression */
} /* namespace keyvi */



#endif /* PREDICTIVE_COMPRESSION_H_ */
#endif // KEYVI_COMPRESSION_PREDICTIVE_COMPRESSION_H_
157 changes: 0 additions & 157 deletions keyvi/include/keyvi/dictionary/compilation/compilation_utils.h

This file was deleted.

Loading

0 comments on commit 25741fb

Please sign in to comment.