In [10]:
! pip install PyPDF2 python-docx

import os
from decimal import Decimal, getcontext
from PyPDF2 import PdfReader
from docx import Document

# Set high precision for arithmetic calculations
getcontext().prec = 50

def calculate_symbol_probs(text):
    """Estimate each symbol's probability from its relative frequency.

    Args:
        text: The sequence of symbols (normally a string) to analyse.

    Returns:
        Dict mapping each distinct symbol to ``count / len(text)``, in order
        of first appearance. An empty input yields an empty dict.
    """
    counts = {}
    for symbol in text:
        if symbol in counts:
            counts[symbol] += 1
        else:
            counts[symbol] = 1

    size = len(text)
    result = {}
    for symbol, count in counts.items():
        result[symbol] = count / size
    return result

def get_cumulative_prob(symbol_probs):
    """Assign every symbol its slice of the cumulative distribution.

    Symbols are visited in sorted order. Each one receives the interval that
    starts where the previous symbol's interval ended and whose width is that
    symbol's probability.

    Args:
        symbol_probs: Mapping of symbol -> probability (any value that
            ``Decimal()`` accepts).

    Returns:
        Dict mapping each symbol to a ``(lower, upper)`` tuple of ``Decimal``
        bounds, inserted in sorted symbol order.
    """
    intervals = {}
    lower = Decimal(0.0)

    for symbol, probability in sorted(symbol_probs.items()):
        upper = lower + Decimal(probability)
        intervals[symbol] = (lower, upper)
        # The next symbol's interval begins exactly where this one ends.
        lower = upper

    return intervals

def encode(text, symbol_probs):
    """Arithmetic-encode ``text`` into a single ``Decimal``.

    Starting from the interval [0, 1), each symbol narrows the current
    interval to the sub-range that ``get_cumulative_prob`` assigns to it.
    The midpoint of the final interval is returned as the code value.

    Args:
        text: Sequence of symbols to encode.
        symbol_probs: Mapping of symbol -> probability; it must contain every
            symbol in ``text`` (a missing symbol raises ``KeyError``).

    Returns:
        A ``Decimal`` lying inside the final interval.
    """
    # NOTE(review): the interval shrinks with every symbol; inputs long enough
    # to exhaust the active decimal context precision presumably cannot be
    # decoded reliably - confirm before using on large texts.
    intervals = get_cumulative_prob(symbol_probs)
    lower, upper = Decimal(0.0), Decimal(1.0)

    for symbol in text:
        sym_lower, sym_upper = intervals[symbol]
        width = upper - lower
        # Both new bounds are derived from the *old* lower bound, hence the
        # simultaneous assignment.
        lower, upper = lower + width * sym_lower, lower + width * sym_upper

    return (lower + upper) / 2

def decode(encoded_value, length, symbol_probs):
    """Arithmetic-decode ``length`` symbols from ``encoded_value``.

    For every output position, the symbol whose cumulative interval contains
    the current value is emitted, and the value is then rescaled to that
    interval so the next symbol can be read the same way.

    Args:
        encoded_value: The code value produced by ``encode``.
        length: Number of decoding steps to perform.
        symbol_probs: The same symbol -> probability mapping used to encode.

    Returns:
        The decoded string. If no interval contains the value at some step,
        nothing is emitted for that step, so the result can be shorter than
        ``length``.
    """
    intervals = get_cumulative_prob(symbol_probs)
    symbols = []

    for _ in range(length):
        for symbol, (lower, upper) in intervals.items():
            if not (lower <= encoded_value < upper):
                continue
            symbols.append(symbol)
            # Map the value back onto [0, 1) relative to the chosen interval.
            encoded_value = (encoded_value - lower) / (upper - lower)
            break

    return "".join(symbols)

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The per-page ``extract_text()`` results joined with no separator.
    """
    pdf = PdfReader(file_path)
    return "".join(page.extract_text() for page in pdf.pages)

def extract_text_from_docx(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file_path: Location of the document, passed straight to ``Document``.

    Returns:
        Every paragraph's text followed by a newline, in document order
        (so a non-empty result always ends with a newline).
    """
    document = Document(file_path)
    return "".join(para.text + "\n" for para in document.paragraphs)


# Demo driver: read one file, arithmetic-encode its text, then decode it
# again so the round trip can be checked by eye.
file_path = "test.txt"
file_ext = os.path.splitext(file_path)[1].lower()

# Only PDF and Word files need a dedicated extractor; every other extension
# is read as plain text.
if file_ext not in (".pdf", ".docx"):
    with open(file_path, "r") as f:
        text = f.read()
elif file_ext == ".pdf":
    text = extract_text_from_pdf(file_path)
else:
    text = extract_text_from_docx(file_path)

print(f"Text extracted from file: {text.strip()}")

# The symbol model is built from the text being compressed.
symbol_probs = calculate_symbol_probs(text)
encoded_value = encode(text, symbol_probs)
print(f"Compressed Value: {encoded_value}")

# Decoding needs the original length and the same symbol model.
decompressed_text = decode(encoded_value, len(text), symbol_probs)
print(f"Decompressed Text: {decompressed_text.strip()}")


Text extracted from file: Who is the best team ? Ours of course
Compressed Value: 0.28507222833138890889960731294562520554577629747354
Decompressed Text: Who is the best team ? Ours of course
