# LZW Compression

## Encoder

Input: a base symbol table (dictionary) and a sequence of symbols.

Output: the sequence of codes (indices into the symbol table) that represents the input.

In [65]:
import numpy as np
import pandas as pd

In [66]:
class LZW:
    """LZW encoder/decoder built on a caller-supplied base symbol table.

    The base table maps each symbol (a string) to its integer code. Every
    ``encode``/``decode`` call starts again from that base table, so calls are
    independent of each other. The table learned by the most recent call is
    available through ``get_dict()`` / ``get_df()``.
    """

    # Lowest code that may be assigned to a sequence learned at run time.
    FIRST_DYNAMIC_CODE = 256

    def __init__(self, symbol_table):
        """Store a private copy of ``symbol_table`` (symbol -> code)."""
        # Copy so that learning new sequences never mutates the caller's dict.
        self._base_table = dict(symbol_table)
        self.symbol_table = dict(self._base_table)
        self.symbol_df = self.make_df(self.symbol_table)

    def get_dict(self):
        """Return the current symbol table (base entries plus learned ones)."""
        return self.symbol_table

    def get_df(self):
        """Return the current symbol table as a DataFrame."""
        return self.symbol_df

    def make_df(self, symbol_table):
        """Return ``symbol_table`` as a DataFrame with columns Symbol and Code.

        The rows are built directly from the dict items so that integer codes
        keep their type and an empty table yields an empty DataFrame.
        """
        rows = list(symbol_table.items())
        return pd.DataFrame(rows, columns=['Symbol', 'Code'])

    def _first_free_code(self, table):
        """Return the first code that cannot clash with a code in ``table``."""
        highest = max(
            (code for code in table.values() if isinstance(code, int)),
            default=-1,
        )
        return max(self.FIRST_DYNAMIC_CODE, highest + 1)

    def _publish(self, table):
        """Expose ``table`` as the current symbol table and refresh the DataFrame."""
        self.symbol_table = table
        self.symbol_df = self.make_df(table)

    @staticmethod
    def _code_of(table, symbol):
        """Look up ``symbol`` in ``table``, raising a descriptive KeyError if absent."""
        try:
            return table[symbol]
        except KeyError:
            raise KeyError(
                f"symbol {symbol!r} is not in the symbol table"
            ) from None

    def encode(self, symbols_sequence):
        """Encode an iterable of symbols into a list of LZW codes.

        Args:
            symbols_sequence: a string (or other iterable of string symbols).

        Returns:
            The list of codes; an empty input gives an empty list.

        Raises:
            KeyError: if a symbol that must be emitted is not in the table.
        """
        table = dict(self._base_table)
        next_code = self._first_free_code(table)
        output = []
        previous = ""
        for current in symbols_sequence:
            seq = previous + current
            if seq in table:
                # Keep extending the match while the table still knows it.
                previous = seq
                continue
            # Longest known match ends here: emit it and learn the extension.
            # With no pending match, ``current`` itself is the unknown symbol.
            output.append(self._code_of(table, previous or current))
            table[seq] = next_code
            next_code += 1
            previous = current
        if previous:
            # Flush the match that was still pending when the input ended.
            output.append(self._code_of(table, previous))
        self._publish(table)
        return output

    def decode(self, code_sequence):
        """Decode an iterable of LZW codes into the list of decoded strings.

        The input is only read, never modified.

        Args:
            code_sequence: iterable of codes as produced by ``encode``.

        Returns:
            A list with one decoded string per input code.

        Raises:
            ValueError: if a code is neither in the table nor the next code
                the decoder is about to define.
        """
        table = dict(self._base_table)
        # Reverse index for O(1) lookups; the first symbol wins on duplicate codes.
        by_code = {}
        for symbol, code in table.items():
            by_code.setdefault(code, symbol)
        next_code = self._first_free_code(table)
        output = []
        previous = ""
        for code in code_sequence:
            if code in by_code:
                symbol = by_code[code]
            elif previous and code == next_code:
                # The encoder used an entry it had only just created, so the
                # entry must be the previous string plus its own FIRST symbol.
                symbol = previous + previous[0]
            else:
                raise ValueError(f"invalid LZW code {code!r}")
            output.append(symbol)
            if previous:
                # Mirror the encoder: learn previous match + first new symbol.
                new_seq = previous + symbol[0]
                table[new_seq] = next_code
                by_code[next_code] = new_seq
                next_code += 1
            previous = symbol
        self._publish(table)
        return output


    


### Usage

In [67]:
# init_dict = {'a': 0,
#              'b': 1,
#              'c': 2,
#              }


# init_dict = {'M': 0,
#              'U': 1,
#              'L': 2,
#              'I': 3,
#              }

# Base table: each symbol is mapped to its character code (a -> 97, b -> 98).
init_dict = {symbol: ord(symbol) for symbol in "ab"}

In [68]:
# Build the codec from the base table, then display that table as a DataFrame.
lzw = LZW(init_dict)
lzw.get_df()

Unnamed: 0,Symbol,Code
0,a,97
1,b,98


In [69]:
# Decode a sample code sequence: 97 and 98 are base-table codes, while codes
# from 256 upward refer to sequences the decoder learns as it runs.
code_seq = [97, 256, 257, 256, 98, 260, 261]
lzw.decode(code_seq)


['a', 'aa', 'aaa', 'aa', 'b', 'bb', 'bbb']

In [19]:
x = "aaaaaabbbbaaabc"  # sample text; only used by the commented-out encode call below

# NOTE(review): codes 0, 1 and 3 are not in the {'a': 97, 'b': 98} table defined
# above; presumably left over from one of the commented-out tables. Confirm.
code_seq = [0, 1, 256, 257, 3, 258]
lzw.decode(code_seq)
#lzw.encode(x)

['ab']