In [4]:
## TO EXTRACT FEATURES AND CREATE DATASET FOR THE MODEL
import math, os, csv
from collections import Counter

# ---------- helper feature functions ----------
def calculate_entropy(text):
    if not text: return 0
    freq = Counter(text)
    total = len(text)
    probs = [f / total for f in freq.values()]
    return -sum(p * math.log2(p) for p in probs if p > 0)

def unique_symbol_count(text):
    return len(set(text)) if text else 0

def mean_run_length(text):
    if not text: return 0
    runs, run = [], 1
    for i in range(1, len(text)):
        if text[i] == text[i - 1]:
            run += 1
        else:
            runs.append(run)
            run = 1
    runs.append(run)
    return sum(runs) / len(runs)

def max_run_length(text):
    if not text: return 0
    max_run, run = 1, 1
    for i in range(1, len(text)):
        if text[i] == text[i - 1]:
            run += 1
            max_run = max(max_run, run)
        else:
            run = 1
    return max_run

def symbol_frequency_variance(text):
    if not text: return 0
    freq = Counter(text)
    values = list(freq.values())
    mean_val = sum(values) / len(values)
    return sum((v - mean_val) ** 2 for v in values) / len(values)

def bigram_repeat_rate(text):
    if len(text) < 2: return 0
    bigrams = [text[i:i+2] for i in range(len(text) - 1)]
    freq = Counter(bigrams)
    repeats = sum(1 for v in freq.values() if v > 1)
    return repeats / len(freq)

def trigram_repeat_rate(text):
    if len(text) < 3: return 0
    trigrams = [text[i:i+3] for i in range(len(text) - 2)]
    freq = Counter(trigrams)
    repeats = sum(1 for v in freq.values() if v > 1)
    return repeats / len(freq)

def character_repetition_ratio(text):
    if not text: return 0
    repeated = sum(1 for i in range(1, len(text)) if text[i] == text[i - 1])
    return repeated / len(text)

def avg_word_length(text):
    words = [w for w in text.split() if w]
    if not words: return 0
    return sum(len(w) for w in words) / len(words)

def whitespace_ratio(text):
    if not text: return 0
    whites = sum(1 for c in text if c.isspace())
    return whites / len(text)

# ---------- feature extraction for one file ----------
def extract_features(file_path):
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    if not text:
        return None

    feats = {
        "File": os.path.basename(file_path),
        "Entropy": calculate_entropy(text),
        "UniqueSymbols": unique_symbol_count(text),
        "MeanRunLength": mean_run_length(text),
        "MaxRunLength": max_run_length(text),
        "SymbolFreqVariance": symbol_frequency_variance(text),
        "BigramRepeatRate": bigram_repeat_rate(text),
        "TrigramRepeatRate": trigram_repeat_rate(text),
        "CharacterRepetitionRatio": character_repetition_ratio(text),
        "AvgWordLength": avg_word_length(text),
        "WhitespaceRatio": whitespace_ratio(text),
        "FileSize": os.path.getsize(file_path)
    }
    return feats

# ---------- main folder ‚Üí CSV ----------
def extract_folder_to_csv(folder_path, output_csv):
    print("üîç Extracting features from folder...")

    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    all_features = []

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):
            print(f"üìÑ Processing: {filename}")
            feats = extract_features(file_path)
            if feats:
                all_features.append(feats)

    if not all_features:
        print("‚ö†Ô∏è No valid files found in folder.")
        return

    columns = [
        "File", "Entropy", "UniqueSymbols", "MeanRunLength", "MaxRunLength",
        "SymbolFreqVariance", "BigramRepeatRate", "TrigramRepeatRate",
        "CharacterRepetitionRatio", "AvgWordLength", "WhitespaceRatio", "FileSize"
    ]

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()
        writer.writerows(all_features)

    print(f"\n‚úÖ Feature extraction complete.\nSaved to: {output_csv}\nTotal files processed: {len(all_features)}")

# ---------- run ----------
if __name__ == "__main__":
    input_folder = r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files"
    output_csv = r"C:\Users\mhars\Desktop\NITK Projects\IPC\Extracted features from files\features.csv"

    print(f"üìÅ Reading files from: {input_folder}")
    extract_folder_to_csv(input_folder, output_csv)


üìÅ Reading files from: C:\Users\mhars\Desktop\NITK Projects\IPC\Files
üîç Extracting features from folder...
üìÑ Processing: alice_in_wonderland.txt
üìÑ Processing: Bumps and His Buddies.txt
üìÑ Processing: class.py
üìÑ Processing: classtest2.3.py
üìÑ Processing: combined_data.csv


KeyboardInterrupt: 

In [9]:
import os
import time
import pandas as pd
from collections import Counter
import heapq

# ------------------------------
# SIMPLE RLE
# ------------------------------
def rle_compress(data: bytes) -> bytes:
    compressed = bytearray()
    i = 0
    while i < len(data):
        count = 1
        while i + 1 < len(data) and data[i] == data[i+1] and count < 255:
            i += 1
            count += 1
        compressed.extend([count, data[i]])
        i += 1
    return bytes(compressed)

def rle_decompress(data: bytes) -> bytes:
    decompressed = bytearray()
    i = 0
    while i < len(data):
        count = data[i]
        value = data[i+1]
        decompressed.extend([value] * count)
        i += 2
    return bytes(decompressed)

# ------------------------------
# FIXED LZW (NO OVERFLOW)
# ------------------------------
def lzw_compress(data: bytes) -> bytes:
    dictionary = {bytes([i]): i for i in range(256)}
    w = b""
    result = []
    code = 256

    for c in data:
        wc = w + bytes([c])
        if wc in dictionary:
            w = wc
        else:
            result.append(dictionary[w])
            dictionary[wc] = code
            code += 1
            w = bytes([c])

    if w:
        result.append(dictionary[w])

    # Determine how many bytes needed
    max_code = max(result)
    if max_code <= 0xFFFF:
        width = 2
    elif max_code <= 0xFFFFFF:
        width = 3
    else:
        width = 4

    compressed = bytearray([width])  # store width

    for num in result:
        compressed.extend(num.to_bytes(width, "big"))

    return bytes(compressed)

def lzw_decompress(data: bytes) -> bytes:
    width = data[0]      # read width
    data = data[1:]

    codes = [
        int.from_bytes(data[i:i+width], "big")
        for i in range(0, len(data), width)
    ]

    dictionary = {i: bytes([i]) for i in range(256)}
    code = 256

    w = bytes([codes[0]])
    result = bytearray(w)

    for k in codes[1:]:
        if k in dictionary:
            entry = dictionary[k]
        elif k == code:
            entry = w + w[:1]
        else:
            raise ValueError("Bad LZW code")

        result.extend(entry)
        dictionary[code] = w + entry[:1]
        code += 1
        w = entry

    return bytes(result)

# ------------------------------
# SIMPLE HUFFMAN
# ------------------------------
class HuffmanNode:
    def __init__(self, freq, byte=None, left=None, right=None):
        self.freq = freq
        self.byte = byte
        self.left = left
        self.right = right
    def __lt__(self, other):
        return self.freq < other.freq

def build_huffman_tree(data: bytes):
    counter = Counter(data)
    heap = [HuffmanNode(freq, b) for b, freq in counter.items()]
    heapq.heapify(heap)

    while len(heap) > 1:
        n1 = heapq.heappop(heap)
        n2 = heapq.heappop(heap)
        heapq.heappush(heap, HuffmanNode(n1.freq + n2.freq, None, n1, n2))

    return heap[0]

def build_huffman_codes(node, prefix="", codebook=None):
    if codebook is None:
        codebook = {}
    if node.byte is not None:
        codebook[node.byte] = prefix
    else:
        build_huffman_codes(node.left, prefix + "0", codebook)
        build_huffman_codes(node.right, prefix + "1", codebook)
    return codebook

def huffman_compress(data: bytes):
    tree = build_huffman_tree(data)
    codebook = build_huffman_codes(tree)
    bitstring = "".join(codebook[b] for b in data)
    extra = (8 - len(bitstring) % 8) % 8
    bitstring += "0" * extra
    compressed = bytearray(int(bitstring[i:i+8], 2) for i in range(0, len(bitstring), 8))
    return bytes(compressed), codebook

def huffman_decompress(compressed: bytes, codebook: dict) -> bytes:
    rev = {v: bytes([k]) for k, v in codebook.items()}
    bitstring = "".join(f"{b:08b}" for b in compressed)
    result = bytearray()

    code = ""
    for bit in bitstring:
        code += bit
        if code in rev:
            result.extend(rev[code])
            code = ""
    return bytes(result)

# ------------------------------
# DIRECTORIES
# ------------------------------
INPUT_FOLDER = r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files"
OUTPUT_CSV = r"C:\Users\mhars\Desktop\NITK Projects\IPC\Compressed info of different files\compression_results.csv"
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

files = [f for f in os.listdir(INPUT_FOLDER) if os.path.isfile(os.path.join(INPUT_FOLDER,f))]
print("Files to process:", files)

rows = []

# ------------------------------
# PROCESS ALL FILES
# ------------------------------
for i, fname in enumerate(files, 1):
    print(f"[{i}/{len(files)}] Processing {fname}...")
    fpath = os.path.join(INPUT_FOLDER, fname)

    with open(fpath, "rb") as f:
        data = f.read()

    orig_size = len(data)

    # -------- RLE --------
    t = time.time()
    rle_data = rle_compress(data)
    rle_time = time.time() - t
    rle_ok = (data == rle_decompress(rle_data))
    rle_size = len(rle_data)
    rle_ratio = orig_size / rle_size if rle_size else 0

    # -------- LZW --------
    t = time.time()
    lzw_data = lzw_compress(data)
    lzw_time = time.time() - t
    lzw_ok = (data == lzw_decompress(lzw_data))
    lzw_size = len(lzw_data)
    lzw_ratio = orig_size / lzw_size if lzw_size else 0

    # -------- Huffman --------
    t = time.time()
    huff_data, codebook = huffman_compress(data)
    huff_time = time.time() - t
    huff_ok = (data == huffman_decompress(huff_data, codebook))
    huff_size = len(huff_data)
    huff_ratio = orig_size / huff_size if huff_size else 0

    # -------- BEST --------
    size_dict = {
        "RLE": rle_size if rle_ok else float('inf'),
        "LZW": lzw_size if lzw_ok else float('inf'),
        "Huffman": huff_size if huff_ok else float('inf')
    }
    ratio_dict = {
        "RLE": rle_ratio if rle_ok else 0,
        "LZW": lzw_ratio if lzw_ok else 0,
        "Huffman": huff_ratio if huff_ok else 0
    }

    best_size = min(size_dict, key=size_dict.get)
    best_ratio = max(ratio_dict, key=ratio_dict.get)

    # -------- STORE RESULT --------
    rows.append({
        "file": fname,
        "original_size": orig_size,
        "RLE_size": rle_size,
        "RLE_ratio": rle_ratio,
        "RLE_time": rle_time,
        "RLE_ok": rle_ok,
        "LZW_size": lzw_size,
        "LZW_ratio": lzw_ratio,
        "LZW_time": lzw_time,
        "LZW_ok": lzw_ok,
        "Huffman_size": huff_size,
        "Huffman_ratio": huff_ratio,
        "Huffman_time": huff_time,
        "Huffman_ok": huff_ok,
        "best_by_size": best_size,
        "best_by_ratio": best_ratio
    })

# ------------------------------
# SAVE CSV
# ------------------------------
df = pd.DataFrame(rows)
df.to_csv(OUTPUT_CSV, index=False)
print(f"Compression results saved to {OUTPUT_CSV}")


Files to process: ['alice_in_wonderland.txt', 'Bumps and His Buddies.txt', 'Bumps and His Buddies.txt.lzw', 'Bumps and His Buddies_hybrid_compressed.bin', 'class.py', 'classtest2.3.py', 'combined_data.csv', 'c_cpp_properties.json', 'EEG_DATA.csv', 'EEG_DATA.csv.lzw', 'EEG_DATA_hybrid_compressed.bin', 'email_text.txt', 'launch.json', 'Rectangle.py', 'sensor_data.txt', 'sherlock_holmes.txt', 'system_log1.log', 'system_log2.log']
[1/18] Processing alice_in_wonderland.txt...
[2/18] Processing Bumps and His Buddies.txt...
[3/18] Processing Bumps and His Buddies.txt.lzw...
[4/18] Processing Bumps and His Buddies_hybrid_compressed.bin...
[5/18] Processing class.py...
[6/18] Processing classtest2.3.py...
[7/18] Processing combined_data.csv...


KeyboardInterrupt: 

In [4]:
import pandas as pd

# Load both datasets
features_df = pd.read_csv(r"C:\Users\mhars\Desktop\NITK Projects\IPC\Extracted features from files\features.csv")
compression_df = pd.read_csv(r"C:\Users\mhars\Desktop\NITK Projects\IPC\Compressed info of different files\compression_results.csv")

# Merge on filename
final_df = pd.merge(features_df, compression_df, on="file", how="inner")

# Save final merged file
output_path = r"C:\Users\mhars\Desktop\NITK Projects\IPC\final_merged_results.csv"
final_df.to_csv(output_path, index=False)

print("Merged CSV saved to:", output_path)


Merged CSV saved to: C:\Users\mhars\Desktop\NITK Projects\IPC\final_merged_results.csv


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ================= LOAD DATA =================
df = pd.read_csv(r"C:\Users\mhars\Desktop\NITK Projects\IPC\final_merged_results.csv")

# strip column names (just in case)
df.columns = df.columns.str.strip()

# ========= SELECT TARGET LABEL HERE =========
target = "best_by_ratio"     # or "best_by_size"

# ========= EXTRACT TARGET BEFORE DROPPING =========
y = df[target]
df = df.drop(columns=[target])

# ========= DROP NON-NUMERIC METADATA =========
df = df.drop(columns=["file"], errors="ignore")

# ========= DROP COMPRESSION RESULT COLUMNS =========
bad_cols = [c for c in df.columns if 
            "size" in c.lower() or
            "ratio" in c.lower() or
            "time" in c.lower() or
            "_ok" in c.lower()
           ]

df = df.drop(columns=bad_cols, errors="ignore")

# ========= ENSURE ALL FEATURES ARE NUMERIC =========
X = df.apply(pd.to_numeric, errors="coerce").dropna()

# truncate y to match X length
y = y.iloc[X.index]

# ========= ENCODE TARGET =========
label_enc = LabelEncoder()
y = label_enc.fit_transform(y)

# ========= TRAIN TEST SPLIT =========
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ========= TRAIN RANDOM FOREST =========
model = RandomForestClassifier(n_estimators=300)
model.fit(X_train, y_train)

# ========= EVALUATE =========
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReport:")
print(classification_report(y_test, y_pred))

# ========= LABEL MAPPING =========
print("\nLabel mapping:")
print(dict(zip(label_enc.classes_, label_enc.transform(label_enc.classes_))))

# ========= SAVE MODEL =========
joblib.dump(model, "best_algo_model.pkl")
joblib.dump(label_enc, "label_encoder.pkl")
joblib.dump(X.columns.tolist(), "feature_list.pkl")

print("\nModel saved successfully!")


In [15]:
##TO EXTRACT FEATURES FROM OUR TEST FILE

## TO EXTRACT FEATURES AND CREATE DATASET FOR THE MODEL
import math, os, csv
from collections import Counter

# ---------- helper feature functions ----------
def calculate_entropy(text):
    if not text: return 0
    freq = Counter(text)
    total = len(text)
    probs = [f / total for f in freq.values()]
    return -sum(p * math.log2(p) for p in probs if p > 0)

def unique_symbol_count(text):
    return len(set(text)) if text else 0

def mean_run_length(text):
    if not text: return 0
    runs, run = [], 1
    for i in range(1, len(text)):
        if text[i] == text[i - 1]:
            run += 1
        else:
            runs.append(run)
            run = 1
    runs.append(run)
    return sum(runs) / len(runs)

def max_run_length(text):
    if not text: return 0
    max_run, run = 1, 1
    for i in range(1, len(text)):
        if text[i] == text[i - 1]:
            run += 1
            max_run = max(max_run, run)
        else:
            run = 1
    return max_run

def symbol_frequency_variance(text):
    if not text: return 0
    freq = Counter(text)
    values = list(freq.values())
    mean_val = sum(values) / len(values)
    return sum((v - mean_val) ** 2 for v in values) / len(values)

def bigram_repeat_rate(text):
    if len(text) < 2: return 0
    bigrams = [text[i:i+2] for i in range(len(text) - 1)]
    freq = Counter(bigrams)
    repeats = sum(1 for v in freq.values() if v > 1)
    return repeats / len(freq)

def trigram_repeat_rate(text):
    if len(text) < 3: return 0
    trigrams = [text[i:i+3] for i in range(len(text) - 2)]
    freq = Counter(trigrams)
    repeats = sum(1 for v in freq.values() if v > 1)
    return repeats / len(freq)

def character_repetition_ratio(text):
    if not text: return 0
    repeated = sum(1 for i in range(1, len(text)) if text[i] == text[i - 1])
    return repeated / len(text)

def avg_word_length(text):
    words = [w for w in text.split() if w]
    if not words: return 0
    return sum(len(w) for w in words) / len(words)

def whitespace_ratio(text):
    if not text: return 0
    whites = sum(1 for c in text if c.isspace())
    return whites / len(text)


def extract_features(file_path):
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    if not text:
        return None

    feats = {
        "File": os.path.basename(file_path),
        "Entropy": calculate_entropy(text),
        "UniqueSymbols": unique_symbol_count(text),
        "MeanRunLength": mean_run_length(text),
        "MaxRunLength": max_run_length(text),
        "SymbolFreqVariance": symbol_frequency_variance(text),
        "BigramRepeatRate": bigram_repeat_rate(text),
        "TrigramRepeatRate": trigram_repeat_rate(text),
        "CharacterRepetitionRatio": character_repetition_ratio(text),
        "AvgWordLength": avg_word_length(text),
        "WhitespaceRatio": whitespace_ratio(text),
        "FileSize": os.path.getsize(file_path)
    }
    return feats

In [11]:
##TO COMPRESS THE DATA ALL THREE COMPRESSORS

import os
import time
import pandas as pd
from collections import Counter
import heapq

# ------------------------------
# SIMPLE RLE
# ------------------------------
def rle_compress(data: bytes) -> bytes:
    compressed = bytearray()
    i = 0
    while i < len(data):
        count = 1
        while i + 1 < len(data) and data[i] == data[i+1] and count < 255:
            i += 1
            count += 1
        compressed.extend([count, data[i]])
        i += 1
    return bytes(compressed)

def rle_decompress(data: bytes) -> bytes:
    decompressed = bytearray()
    i = 0
    while i < len(data):
        count = data[i]
        value = data[i+1]
        decompressed.extend([value] * count)
        i += 2
    return bytes(decompressed)

# ------------------------------
# FIXED LZW (NO OVERFLOW)
# ------------------------------
def lzw_compress(data: bytes) -> bytes:
    dictionary = {bytes([i]): i for i in range(256)}
    w = b""
    result = []
    code = 256

    for c in data:
        wc = w + bytes([c])
        if wc in dictionary:
            w = wc
        else:
            result.append(dictionary[w])
            dictionary[wc] = code
            code += 1
            w = bytes([c])

    if w:
        result.append(dictionary[w])

    # Determine how many bytes needed
    max_code = max(result)
    if max_code <= 0xFFFF:
        width = 2
    elif max_code <= 0xFFFFFF:
        width = 3
    else:
        width = 4

    compressed = bytearray([width])  # store width

    for num in result:
        compressed.extend(num.to_bytes(width, "big"))

    return bytes(compressed)


def lzw_decompress(data: bytes) -> bytes:
    width = data[0]      # read width
    data = data[1:]

    codes = [
        int.from_bytes(data[i:i+width], "big")
        for i in range(0, len(data), width)
    ]

    dictionary = {i: bytes([i]) for i in range(256)}
    code = 256

    w = bytes([codes[0]])
    result = bytearray(w)

    for k in codes[1:]:
        if k in dictionary:
            entry = dictionary[k]
        elif k == code:
            entry = w + w[:1]
        else:
            raise ValueError("Bad LZW code")

        result.extend(entry)
        dictionary[code] = w + entry[:1]
        code += 1
        w = entry

    return bytes(result)

# ------------------------------
# SIMPLE HUFFMAN
# ------------------------------
class HuffmanNode:
    def __init__(self, freq, byte=None, left=None, right=None):
        self.freq = freq
        self.byte = byte
        self.left = left
        self.right = right
    def __lt__(self, other):
        return self.freq < other.freq

def build_huffman_tree(data: bytes):
    counter = Counter(data)
    heap = [HuffmanNode(freq, b) for b, freq in counter.items()]
    heapq.heapify(heap)

    while len(heap) > 1:
        n1 = heapq.heappop(heap)
        n2 = heapq.heappop(heap)
        heapq.heappush(heap, HuffmanNode(n1.freq + n2.freq, None, n1, n2))

    return heap[0]

def build_huffman_codes(node, prefix="", codebook=None):
    if codebook is None:
        codebook = {}
    if node.byte is not None:
        codebook[node.byte] = prefix
    else:
        build_huffman_codes(node.left, prefix + "0", codebook)
        build_huffman_codes(node.right, prefix + "1", codebook)
    return codebook

def huffman_compress(data: bytes):
    tree = build_huffman_tree(data)
    codebook = build_huffman_codes(tree)
    bitstring = "".join(codebook[b] for b in data)
    extra = (8 - len(bitstring) % 8) % 8
    bitstring += "0" * extra
    compressed = bytearray(int(bitstring[i:i+8], 2) for i in range(0, len(bitstring), 8))
    return bytes(compressed), codebook

def huffman_decompress(compressed: bytes, codebook: dict) -> bytes:
    rev = {v: bytes([k]) for k, v in codebook.items()}
    bitstring = "".join(f"{b:08b}" for b in compressed)
    result = bytearray()

    code = ""
    for bit in bitstring:
        code += bit
        if code in rev:
            result.extend(rev[code])
            code = ""
    return bytes(result)



In [12]:
## TESTING USING OUR FILE

import joblib
import pandas as pd

# ========= LOAD TRAINED MODEL + ENCODER =========
model = joblib.load("best_algo_model.pkl")
label_enc = joblib.load("label_encoder.pkl")
feature_list = joblib.load("feature_list.pkl")


# ========= FINAL PREDICT + COMPRESS FUNCTION =========

def predict_and_compress(file_path):

    # ---- READ THE FILE ----
    with open(file_path, "rb") as f:
        raw_data = f.read()

    # ---- Extract features using YOUR function ----
    feats = extract_features(file_path)
    if feats is None:
        print("‚ùå Could not extract features.")
        return

    # ---- Convert to DataFrame with correct column order ----
    X_new = pd.DataFrame([feats])[feature_list]

    # ---- Predict best algorithm ----
    pred = model.predict(X_new)[0]
    algo = label_enc.inverse_transform([pred])[0]

    print("Best algorithm predicted:", algo)

    # ---- Run correct compression ----
    if algo == "RLE":
        compressed = rle_compress(raw_data)
        out_file = file_path + ".rle"

    elif algo == "LZW":
        compressed = lzw_compress(raw_data)
        out_file = file_path + ".lzw"

    elif algo == "Huffman":
        compressed, _ = huffman_compress(raw_data)
        out_file = file_path + ".huff"

    # ---- Save compressed output ----
    with open(out_file, "wb") as f:
        f.write(compressed)

    print("Compressed file saved as:", out_file)
    print("Original size:", len(raw_data), "bytes")
    print("Compressed size:", len(compressed), "bytes")
    print("Compression ratio:", len(raw_data) / len(compressed))

    return algo, out_file


In [13]:
import os

def hybrid_rle_lzw_huffman(file_path):
    with open(file_path, "rb") as f:
        data = f.read()

    original_size = len(data)

    print("STEP 1: RLE Compression...")
    rle_data = rle_compress(data)
    rle_size = len(rle_data)
    rle_ratio = original_size / rle_size if rle_size else 0

    print(f" - RLE compressed size: {rle_size} bytes")
    print(f" - RLE compression ratio: {rle_ratio}")

    print("\nSTEP 2: LZW Compression on RLE output...")
    lzw_data = lzw_compress(rle_data)
    lzw_size = len(lzw_data)
    lzw_ratio = original_size / lzw_size if lzw_size else 0

    print(f" - LZW compressed size: {lzw_size} bytes")
    print(f" - LZW compression ratio: {lzw_ratio}")

    print("\nSTEP 3: Huffman Compression on LZW output...")
    huff_data, codebook = huffman_compress(lzw_data)
    huff_size = len(huff_data)
    huff_ratio = original_size / huff_size if huff_size else 0

    print(f" - Huffman compressed size: {huff_size} bytes")
    print(f" - Huffman compression ratio: {huff_ratio}")

    # ==== SAVE ONLY FINAL COMPRESSED FILE ====
    save_path = os.path.splitext(file_path)[0] + "_hybrid_compressed.bin"
    with open(save_path, "wb") as f:
        f.write(huff_data)

    print(f"\nFinal compressed file saved as:\n {save_path}")

    print("\n==============================")
    print(" FINAL HYBRID COMPRESSION RESULT ")
    print("==============================")
    print(f"Original size: {original_size} bytes")
    print(f"Final compressed size: {huff_size} bytes")
    print(f"Final compression ratio: {huff_ratio}")

    return {
        "original_size": original_size,
        "after_rle_size": rle_size,
        "after_lzw_size": lzw_size,
        "final_huffman_size": huff_size,
        "final_ratio": huff_ratio,
        "compressed_file": save_path
    }


In [14]:
#TEST1 ON .TXT FILE USING PROPOSED METHOD

predict_and_compress(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\Bumps and His Buddies.txt"
)


Best algorithm predicted: LZW
Compressed file saved as: C:\Users\mhars\Desktop\NITK Projects\IPC\Files\Bumps and His Buddies.txt.lzw
Original size: 76295 bytes
Compressed size: 41197 bytes
Compression ratio: 1.8519552394591838


('LZW',
 'C:\\Users\\mhars\\Desktop\\NITK Projects\\IPC\\Files\\Bumps and His Buddies.txt.lzw')

In [15]:
#TEST1 ON .TXT FILE USING HYBRID METHOD

hybrid_rle_lzw_huffman(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\Bumps and His Buddies.txt"
)


STEP 1: RLE Compression...
 - RLE compressed size: 148370 bytes
 - RLE compression ratio: 0.5142212037473883

STEP 2: LZW Compression on RLE output...
 - LZW compressed size: 48687 bytes
 - LZW compression ratio: 1.5670507527676794

STEP 3: Huffman Compression on LZW output...
 - Huffman compressed size: 45853 bytes
 - Huffman compression ratio: 1.6639042156456503

Final compressed file saved as:
 C:\Users\mhars\Desktop\NITK Projects\IPC\Files\Bumps and His Buddies_hybrid_compressed.bin

 FINAL HYBRID COMPRESSION RESULT 
Original size: 76295 bytes
Final compressed size: 45853 bytes
Final compression ratio: 1.6639042156456503


{'original_size': 76295,
 'after_rle_size': 148370,
 'after_lzw_size': 48687,
 'final_huffman_size': 45853,
 'final_ratio': 1.6639042156456503,
 'compressed_file': 'C:\\Users\\mhars\\Desktop\\NITK Projects\\IPC\\Files\\Bumps and His Buddies_hybrid_compressed.bin'}

In [16]:
##TEST2 ON CSV FILE

predict_and_compress(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\EEG_DATA.csv"
)
hybrid_rle_lzw_huffman(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\EEG_DATA.csv"
)


Best algorithm predicted: LZW
Compressed file saved as: C:\Users\mhars\Desktop\NITK Projects\IPC\Files\EEG_DATA.csv.lzw
Original size: 151743556 bytes
Compressed size: 78020477 bytes
Compression ratio: 1.9449196138598333
STEP 1: RLE Compression...
 - RLE compressed size: 291224492 bytes
 - RLE compression ratio: 0.521053552048088

STEP 2: LZW Compression on RLE output...
 - LZW compressed size: 83559345 bytes
 - LZW compression ratio: 1.8159974327228152

STEP 3: Huffman Compression on LZW output...
 - Huffman compressed size: 71113585 bytes
 - Huffman compression ratio: 2.133819522669262

Final compressed file saved as:
 C:\Users\mhars\Desktop\NITK Projects\IPC\Files\EEG_DATA_hybrid_compressed.bin

 FINAL HYBRID COMPRESSION RESULT 
Original size: 151743556 bytes
Final compressed size: 71113585 bytes
Final compression ratio: 2.133819522669262


{'original_size': 151743556,
 'after_rle_size': 291224492,
 'after_lzw_size': 83559345,
 'final_huffman_size': 71113585,
 'final_ratio': 2.133819522669262,
 'compressed_file': 'C:\\Users\\mhars\\Desktop\\NITK Projects\\IPC\\Files\\EEG_DATA_hybrid_compressed.bin'}

In [16]:
##TEST3 ON .py FILE

predict_and_compress(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\class.py"
)
hybrid_rle_lzw_huffman(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\class.py"
)


Best algorithm predicted: Huffman
Compressed file saved as: C:\Users\mhars\Desktop\NITK Projects\IPC\Files\class.py.huff
Original size: 552 bytes
Compressed size: 325 bytes
Compression ratio: 1.6984615384615385
STEP 1: RLE Compression...
 - RLE compressed size: 956 bytes
 - RLE compression ratio: 0.5774058577405857

STEP 2: LZW Compression on RLE output...
 - LZW compressed size: 807 bytes
 - LZW compression ratio: 0.6840148698884758

STEP 3: Huffman Compression on LZW output...
 - Huffman compressed size: 491 bytes
 - Huffman compression ratio: 1.124236252545825

Final compressed file saved as:
 C:\Users\mhars\Desktop\NITK Projects\IPC\Files\class_hybrid_compressed.bin

 FINAL HYBRID COMPRESSION RESULT 
Original size: 552 bytes
Final compressed size: 491 bytes
Final compression ratio: 1.124236252545825


{'original_size': 552,
 'after_rle_size': 956,
 'after_lzw_size': 807,
 'final_huffman_size': 491,
 'final_ratio': 1.124236252545825,
 'compressed_file': 'C:\\Users\\mhars\\Desktop\\NITK Projects\\IPC\\Files\\class_hybrid_compressed.bin'}

In [17]:
##TEST4 ON .log FILE

predict_and_compress(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\system_log1.log"
)
hybrid_rle_lzw_huffman(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\system_log1.log"
)


Best algorithm predicted: Huffman
Compressed file saved as: C:\Users\mhars\Desktop\NITK Projects\IPC\Files\system_log1.log.huff
Original size: 996 bytes
Compressed size: 631 bytes
Compression ratio: 1.578446909667195
STEP 1: RLE Compression...
 - RLE compressed size: 1862 bytes
 - RLE compression ratio: 0.5349087003222341

STEP 2: LZW Compression on RLE output...
 - LZW compressed size: 1293 bytes
 - LZW compression ratio: 0.7703016241299304

STEP 3: Huffman Compression on LZW output...
 - Huffman compressed size: 856 bytes
 - Huffman compression ratio: 1.1635514018691588

Final compressed file saved as:
 C:\Users\mhars\Desktop\NITK Projects\IPC\Files\system_log1_hybrid_compressed.bin

 FINAL HYBRID COMPRESSION RESULT 
Original size: 996 bytes
Final compressed size: 856 bytes
Final compression ratio: 1.1635514018691588


{'original_size': 996,
 'after_rle_size': 1862,
 'after_lzw_size': 1293,
 'final_huffman_size': 856,
 'final_ratio': 1.1635514018691588,
 'compressed_file': 'C:\\Users\\mhars\\Desktop\\NITK Projects\\IPC\\Files\\system_log1_hybrid_compressed.bin'}

In [18]:
##TEST4 ON .log FILE

predict_and_compress(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\launch.json"
)
hybrid_rle_lzw_huffman(
    r"C:\Users\mhars\Desktop\NITK Projects\IPC\Files\launch.json"
)


Best algorithm predicted: Huffman
Compressed file saved as: C:\Users\mhars\Desktop\NITK Projects\IPC\Files\launch.json.huff
Original size: 609 bytes
Compressed size: 342 bytes
Compression ratio: 1.780701754385965
STEP 1: RLE Compression...
 - RLE compressed size: 978 bytes
 - RLE compression ratio: 0.6226993865030674

STEP 2: LZW Compression on RLE output...
 - LZW compressed size: 853 bytes
 - LZW compression ratio: 0.7139507620164126

STEP 3: Huffman Compression on LZW output...
 - Huffman compressed size: 509 bytes
 - Huffman compression ratio: 1.1964636542239686

Final compressed file saved as:
 C:\Users\mhars\Desktop\NITK Projects\IPC\Files\launch_hybrid_compressed.bin

 FINAL HYBRID COMPRESSION RESULT 
Original size: 609 bytes
Final compressed size: 509 bytes
Final compression ratio: 1.1964636542239686


{'original_size': 609,
 'after_rle_size': 978,
 'after_lzw_size': 853,
 'final_huffman_size': 509,
 'final_ratio': 1.1964636542239686,
 'compressed_file': 'C:\\Users\\mhars\\Desktop\\NITK Projects\\IPC\\Files\\launch_hybrid_compressed.bin'}