In [43]:
import numpy as np
import pandas as pd

In [44]:
def init_ranges(data,cumulative):
    ranges = []
    for i in range(len(data)):
        symbol_range = []
        symbol_range.append(data[i][0])
        if i > 0:
            min_range = cumulative[i-1]
        else:
            min_range = 0

        max_range = cumulative[i]
        symbol_range.append(min_range)
        symbol_range.append(max_range)
        ranges.append(symbol_range)
    return ranges



In [45]:
def show_ranges(ranges):
    ranges_df = pd.DataFrame(ranges,columns=['Symbol','start range', 'end range'])
    print("------------ RANGES ------------")
    print(ranges_df)
    


In [46]:
def update_ranges(old_ranges,new_start, new_end,DECIMALS=5):
    #new val(C(s)) = new_start + C(s) * delta
    delta = float(new_end) - float(new_start)
    ranges = np.copy(old_ranges)
    for x in ranges:
        # symbol = x[0]
        old_min = np.round(float(x[1]),decimals= DECIMALS)
        old_max = np.round(float(x[2]),decimals= DECIMALS)

        new_min = float(new_start) + float(old_min) * delta
        new_min = np.round(new_min, decimals= DECIMALS)
        new_max = float(new_start) + float(old_max) * delta
        new_max = np.round(new_max, decimals= DECIMALS)

        x[1] = new_min
        x[2] = new_max

    return ranges

In [47]:
def encode_arithmetic(file, ranges, DECIMALS = 5):
    current_min_code = -1
    current_max_code = -1
    
    init_ranges = np.copy(ranges)
    current_ranges = init_ranges
    for char in file:
        for x in current_ranges:
            symbol = x[0]
            min_range = np.round(float(x[1]),decimals= DECIMALS)
            max_range = np.round(float(x[2]),decimals= DECIMALS)
            if symbol == char:
                print(f"symbol: {symbol} \nmin= {min_range}\nmax = {max_range}")
                current_min_code = min_range
                current_max_code = max_range
                current_ranges = update_ranges(init_ranges, min_range, max_range)
                show_ranges(current_ranges)
                print("==============================================")
    print("END")
    return current_min_code, current_max_code


In [48]:
def arithmetic_to_binary(num, min_range, max_range):
    binary_string = "0."
    x = 0
    counter = -1
    while(True):
        num = num * 2
        int_part = int(num)
        num -= int_part
        x += int_part * 2**(counter)
        binary_string += str(int_part)
        if (x > min_range and x < max_range):
            break
        counter -= 1
        print(x)
    return binary_string

### Usage

In [49]:
# data = [(symbol, count), ... ()]

#data = [('A',100), ('B', 100), ('C',100), ('D',500), ('E',200), ('F',100), ('G',50), ('H',50)]
#data = [('A',0.2), ('B', 0.3), ('C',0.25), ('D',0.25)]
#data = [('A',0.05), ('B', 0.1), ('C',0.1), ('D',0.25), ('E',0.5)]

data = [('A',100), ('B', 100), ('C',100), ('D',800), ('E',200) , ('F',100), ('G',50), ('H',50) ]



# 

In [50]:
data.sort(key=lambda a: a[1])
print(data)

[('G', 50), ('H', 50), ('A', 100), ('B', 100), ('C', 100), ('F', 100), ('E', 200), ('D', 800)]


In [51]:
freq = [count for symbol,count in data]
prob = freq / np.sum(freq)
cumulative = np.cumsum(prob)
#cumulative = [0.4, 0.7, 1]


In [52]:
init_r = init_ranges(data,cumulative)
show_ranges(init_r)

------------ RANGES ------------
  Symbol  start range  end range
0      G     0.000000   0.033333
1      H     0.033333   0.066667
2      A     0.066667   0.133333
3      B     0.133333   0.200000
4      C     0.200000   0.266667
5      F     0.266667   0.333333
6      E     0.333333   0.466667
7      D     0.466667   1.000000


In [53]:
file = "DFG"
min_code, max_code = encode_arithmetic(file,init_r,DECIMALS= 3)

symbol: D 
min= 0.467
max = 1.0
------------ RANGES ------------
  Symbol start range end range
0      G       0.467   0.48476
1      H     0.48476   0.50254
2      A     0.50254   0.53806
3      B     0.53806    0.5736
4      C      0.5736   0.60914
5      F     0.60914   0.64466
6      E     0.64466   0.71574
7      D     0.71574       1.0
symbol: F 
min= 0.609
max = 0.645
------------ RANGES ------------
  Symbol start range end range
0      G       0.609    0.6102
1      H      0.6102    0.6114
2      A      0.6114    0.6138
3      B      0.6138    0.6162
4      C      0.6162    0.6186
5      F      0.6186     0.621
6      E       0.621    0.6258
7      D      0.6258     0.645
symbol: G 
min= 0.609
max = 0.61
------------ RANGES ------------
  Symbol start range end range
0      G       0.609   0.60903
1      H     0.60903   0.60907
2      A     0.60907   0.60913
3      B     0.60913    0.6092
4      C      0.6092   0.60927
5      F     0.60927   0.60933
6      E     0.60933   0.60

In [54]:
print(f"Arithmetic code for \"{file}\": \n\tminimum: {min_code}, \n\tmaximum: {max_code}")
code = (min_code + max_code) / 2
print(f"Code (average) = {code}")

Arithmetic code for "DFG": 
	minimum: 0.609, 
	maximum: 0.61
Code (average) = 0.6094999999999999


### converting Float to binary code

In [55]:
binary_code = arithmetic_to_binary(code, min_code, max_code)
print(f"binary code = {binary_code}")

0.5
0.5
0.5
0.5625
0.59375
binary code = 0.100111


## Decoding

In [56]:
binary_code = '0.101100'

In [57]:
str_code = binary_code.split('.')[1]
list(str_code)
str_code

'101100'

In [58]:
#works only on fractions
def bincode_to_decimal(code):
    str_code = binary_code.split('.')[1]
    number=0
    power = -1
    for digit in str_code:
        if int(digit) == 1:
            number += 2** power
        power -= 1
    return number
decimal_code = bincode_to_decimal(str_code)
print(f"number in decimals = {decimal_code}")

number in decimals = 0.6875


In [73]:
def decode_arithmetic(code,ranges):
    init_ranges = np.copy(ranges)
    current_ranges = init_ranges

    decoded_string = ""
    SEQUENCE_LENGTH = 3
    COUNT = 0
    while True:
        for x in current_ranges:
            symbol = x[0]
            min_range = float(x[1])
            max_range = float(x[2])
            if  ((code >= min_range) and (code < max_range)):
                COUNT += 1
                print(f"decoded_symbol: {symbol} \nmin= {min_range}\nmax = {max_range}")
                decoded_string += symbol
                current_ranges = update_ranges(init_ranges, min_range, max_range)
                show_ranges(current_ranges)

                print("==============================================")
                if np.round(((min_range + max_range) / 2), decimals= 5) == np.round(code,decimals=5):
                    print("END")
                    return decoded_string
            if COUNT == SEQUENCE_LENGTH:
                return decoded_string



In [74]:
decoded_string = decode_arithmetic(decimal_code,init_r)
print("==============================================")

print(f"Decoded String : {decoded_string}")

decoded_symbol: D 
min= 0.4666666666666667
max = 1.0
------------ RANGES ------------
  Symbol start range end range
0      G     0.46667   0.48444
1      H     0.48444   0.50222
2      A     0.50222   0.53778
3      B     0.53778   0.57333
4      C     0.57333   0.60889
5      F     0.60889   0.64444
6      E     0.64444   0.71556
7      D     0.71556       1.0
decoded_symbol: E 
min= 0.64444
max = 0.71556
------------ RANGES ------------
  Symbol start range end range
0      G     0.64444   0.64681
1      H     0.64681   0.64918
2      A     0.64918   0.65392
3      B     0.65392   0.65866
4      C     0.65866   0.66341
5      F     0.66341   0.66815
6      E     0.66815   0.67763
7      D     0.67763   0.71556
decoded_symbol: D 
min= 0.67763
max = 0.71556
------------ RANGES ------------
  Symbol start range end range
0      G     0.67763   0.67889
1      H     0.67889   0.68016
2      A     0.68016   0.68269
3      B     0.68269   0.68522
4      C     0.68522   0.68774
5      F    

### To DO

handling Forced Exit problem in Decoding