# BENCHMARKING 5 DATASET

In [None]:
# Import the libraries used in Google Colab
import time
import pandas as pd
import sys
import os
from google.colab import drive
from IPython import get_ipython
from IPython.display import display

In [None]:
# Mount Google Drive at /content/drive; the Drive-hosted dataset paths used in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Dataset locations, keyed by the file name that is later used both for
# display and for selecting the per-file column mapping.
# Datasets downloaded over HTTP from GitHub (raw file URLs).
github_datasets = {
    'penjualan barang.csv': 'https://raw.githubusercontent.com/Anggunsky/datasetPenjualan/main/penjualan%20barang.csv',
    'data_transaksi.csv': 'https://raw.githubusercontent.com/annisareida/Benchmarking-Algoritma/main/data_transaksi.csv',
}
# Datasets read from the mounted Google Drive.
drive_datasets = {
    'synthetic (1).csv': '/content/drive/MyDrive/DATASET/synthetic (1).csv',
    'Dataset Penjualan Buku.csv': '/content/drive/MyDrive/DATASET/Dataset Penjualan Buku.csv',
    'dataset_tiruan.xlsx': '/content/drive/MyDrive/DATASET/dataset_tiruan.xlsx',
    }
# Single lookup table: GitHub entries first, then Drive entries.
all_datasets = {**github_datasets, **drive_datasets}

In [None]:
# FUNCTION UTILITIES
def build_hash_index(data, key):
    """Group records by the value stored under ``key``.

    Args:
        data: Iterable of dict-like records.
        key: Field name whose value becomes the bucket key.

    Returns:
        A dict mapping each non-``None`` field value to the list of records
        carrying that value, in their original order. Records whose value is
        missing or ``None`` are left out.
    """
    buckets = {}
    for record in data:
        field = record.get(key)
        if field is None:
            continue
        buckets.setdefault(field, []).append(record)
    return buckets

In [None]:
def hash_search(index, value):
    """Look up ``value`` in a hash index.

    Args:
        index: Mapping from field value to a list of matching records.
        value: The field value to look for.

    Returns:
        The list stored under ``value``, or an empty list when it is absent.
    """
    if value in index:
        return index[value]
    return []

In [None]:
def jump_search(data, key, target):
    """Jump search over records that are already sorted by ``key``.

    Args:
        data: Sequence of dict-like records sorted ascending by ``key``.
        key: Field name to compare.
        target: Value to find.

    Returns:
        A one-element list holding the first matching record found in the
        candidate block, or an empty list when there is no match.
    """
    size = len(data)
    stride = int(size ** 0.5)

    # Hop forward one stride at a time until we land on a value that is not
    # smaller than the target (or run off the end).
    cursor = 0
    while cursor < size and data[cursor].get(key) < target:
        cursor += stride

    # The target, if present, lies between the previous hop and this one.
    block_start = max(0, cursor - stride)
    block_end = min(cursor + 1, size)
    for record in data[block_start:block_end]:
        if record.get(key) == target:
            return [record]
    return []

In [None]:
def bubble_sort(data, key):
    """Return a bubble-sorted copy of ``data`` ordered ascending by ``key``.

    Neighbouring records are only compared when both carry a non-``None``
    value for ``key``; otherwise the pair is left as it is. The input
    sequence itself is not modified.
    """
    items = data[:]
    # After each pass the largest remaining value has settled at the end, so
    # the unsorted region shrinks by one.
    for unsorted_end in range(len(items) - 1, 0, -1):
        for pos in range(unsorted_end):
            left = items[pos].get(key)
            right = items[pos + 1].get(key)
            if left is None or right is None:
                continue
            if left > right:
                items[pos], items[pos + 1] = items[pos + 1], items[pos]
    return items

In [None]:
def selection_sort(data, key):
    """Return a selection-sorted copy of ``data`` ordered ascending by ``key``.

    A candidate is only compared with the current minimum when both have a
    non-``None`` value for ``key``. The input sequence is not modified.
    """
    ordered = data[:]
    size = len(ordered)
    for slot in range(size):
        best = slot
        for probe in range(slot + 1, size):
            candidate = ordered[probe].get(key)
            current_min = ordered[best].get(key)
            if candidate is None or current_min is None:
                continue
            if candidate < current_min:
                best = probe
        if best != slot:
            ordered[slot], ordered[best] = ordered[best], ordered[slot]
    return ordered

In [None]:
def hash_bubble(data, key):
    """Order records by ``key`` using a hash index plus bubble sort.

    Records are bucketed by their ``key`` value, the bucket keys are visited
    in sorted order, and every bucket is passed through ``bubble_sort``
    before being appended to the result.
    """
    groups = build_hash_index(data, key)
    return [
        record
        for group_value in sorted(groups)
        for record in bubble_sort(groups[group_value], key)
    ]

In [None]:
def hash_selection(data, key):
    """Order records by ``key`` using a hash index plus selection sort.

    Records are bucketed by their ``key`` value, the bucket keys are visited
    in sorted order, and every bucket is passed through ``selection_sort``
    before being appended to the result.
    """
    groups = build_hash_index(data, key)
    return [
        record
        for group_value in sorted(groups)
        for record in selection_sort(groups[group_value], key)
    ]

In [None]:
def jump_bubble(data, key, target):
    """Bubble-sort ``data`` by ``key``, then jump-search the result for ``target``."""
    return jump_search(bubble_sort(data, key), key, target)

In [None]:
def jump_selection(data, key, target):
    """Selection-sort ``data`` by ``key``, then jump-search the result for ``target``."""
    return jump_search(selection_sort(data, key), key, target)

In [None]:
def memory_complexity(data):
    """Return the size of ``data`` in bytes as reported by ``sys.getsizeof``.

    Note that ``sys.getsizeof`` is shallow: it measures the container object
    itself, not the objects it references.
    """
    size_in_bytes = sys.getsizeof(data)
    return size_in_bytes

In [None]:
# BENCHMARKING FUNCTION
def _format_search_result(result):
    """Return the ``Hasil: ...`` line, showing at most the first five matches."""
    return f"Hasil: {result[:5]}" if len(result) > 5 else f"Hasil: {result}"


def benchmark(transactions):
    """Time every search/sort algorithm on each benchmark column and print a report.

    For each of the four columns the function runs, in order: hash search,
    jump search (on a pre-sorted copy), bubble sort, selection sort,
    hash-bubble, hash-selection, jump-bubble and jump-selection. For each
    step it prints the result preview, the elapsed time and the shallow
    ``sys.getsizeof`` of the relevant container.

    Timing uses ``time.perf_counter`` (monotonic, highest available
    resolution) rather than wall-clock ``time.time``, so very short runs are
    measured reliably.

    Args:
        transactions: List of dict records containing the keys 'ID',
            'Nama Pelanggan', 'Tanggal Pembelian' and 'Jumlah Pembelian'.

    Returns:
        None. All output is printed.
    """
    print("\n==================== PENGUJIAN ALGORITMA PENCARIAN DAN PENGURUTAN ====================")

    keys = ['ID', 'Nama Pelanggan', 'Tanggal Pembelian', 'Jumlah Pembelian']
    search_targets = {
        'ID': 30,
        'Nama Pelanggan': 'TOKO HERUNIAWATI',
        'Tanggal Pembelian': '01/02/2020',
        'Jumlah Pembelian': 9840000.0
    }
    # Adjust search targets for data_transaksi.csv.
    # NOTE(review): the detection relies on dates starting with '202'. The
    # loading code elsewhere in this file formats dates as dd/mm/YYYY, in which
    # case this test would never match -- confirm the intended detection rule.
    sample_dates = (t.get('Tanggal Pembelian') for t in transactions[:5])
    if any(isinstance(d, str) and d.startswith('202') for d in sample_dates):
        search_targets.update({
            'Nama Pelanggan': 'OMODF',
            'Tanggal Pembelian': '28/10/2024',
            'Jumlah Pembelian': 839.53
        })

    # (step number, label, callable) for steps that return a sorted list.
    sort_steps = [
        (3, "Bubble Sort", bubble_sort),
        (4, "Selection Sort", selection_sort),
        (5, "Hash-Bubble", hash_bubble),
        (6, "Hash-Selection", hash_selection),
    ]
    # (step number, label, callable) for steps that sort and then search.
    sort_then_search_steps = [
        (7, "Jump-Bubble", jump_bubble),
        (8, "Jump-Selection", jump_selection),
    ]

    for key in keys:
        print(f"\n--- Benchmarking untuk Kolom: {key} ---")
        target = search_targets[key]

        # Hash Search: the timed region covers index construction and lookup.
        print(f"\n[1] Hash Search (by {key})...")
        start = time.perf_counter()
        index = build_hash_index(transactions, key)
        result = hash_search(index, target)
        elapsed = time.perf_counter() - start
        print(_format_search_result(result))
        print(f"Waktu Hash Search: {elapsed:.6f} detik")
        print(f"Ruang Memori: {memory_complexity(index)} bytes")

        # Jump Search: sorting happens outside the timed region; None values
        # are ordered as '' so the built-in sort has something to compare.
        print(f"\n[2] Jump Search (by {key})...")
        try:
            sorted_txn = sorted(transactions, key=lambda x: x.get(key, '') if x.get(key) is not None else '')
            start = time.perf_counter()
            result = jump_search(sorted_txn, key, target)
            elapsed = time.perf_counter() - start
            print(_format_search_result(result))
            print(f"Waktu Jump Search: {elapsed:.6f} detik")
            print(f"Ruang Memori: {memory_complexity(sorted_txn)} bytes")
        except TypeError as e:
            print(f"⚠️ Jump Search (by {key}) failed due to data type issues: {e}")

        # Pure sorting algorithms.
        for number, label, sorter in sort_steps:
            print(f"\n[{number}] {label} (by {key})...")
            try:
                start = time.perf_counter()
                sorted_data = sorter(transactions, key)
                elapsed = time.perf_counter() - start
                print(f"Hasil (5 data pertama): {sorted_data[:5]}")
                print(f"Waktu {label}: {elapsed:.6f} detik")
                print(f"Ruang Memori: {memory_complexity(sorted_data)} bytes")
            except TypeError as e:
                print(f"⚠️ {label} (by {key}) failed due to data type issues: {e}")

        # Sort-then-search combinations; memory is reported for the input list.
        for number, label, searcher in sort_then_search_steps:
            print(f"\n[{number}] {label} (by {key})...")
            try:
                start = time.perf_counter()
                result = searcher(transactions, key, target)
                elapsed = time.perf_counter() - start
                print(_format_search_result(result))
                print(f"Waktu {label}: {elapsed:.6f} detik")
                print(f"Ruang Memori: {memory_complexity(transactions)} bytes")
            except TypeError as e:
                print(f"⚠️ {label} (by {key}) failed due to data type issues: {e}")

In [None]:
# Per-file mapping from the source column name to the canonical column name
# used by the benchmark ('ID', 'Nama Pelanggan', 'Tanggal Pembelian',
# 'Jumlah Pembelian').
_COLUMN_MAPPINGS = {
    'penjualan barang.csv': {
        'Unnamed: 0': 'ID',
        'nama.pembeli': 'Nama Pelanggan',
        'tanggal': 'Tanggal Pembelian',
        'nominal': 'Jumlah Pembelian',
    },
    'data_transaksi.csv': {
        'id': 'ID',
        'nama': 'Nama Pelanggan',
        'tanggal': 'Tanggal Pembelian',
        'jumlah': 'Jumlah Pembelian',
    },
    'Dataset Penjualan Buku.csv': {
        'ID': 'ID',
        'Nama_Pelanggan': 'Nama Pelanggan',
        'Tanggal_Pembelian': 'Tanggal Pembelian',
        'Jumlah_Pembelian': 'Jumlah Pembelian',
    },
    'synthetic (1).csv': {
        'ID Pelanggan': 'ID',
        'Nama Pelanggan': 'Nama Pelanggan',
        'Waktu Pembelian': 'Tanggal Pembelian',
        'Jumlah Pembelian': 'Jumlah Pembelian',
    },
    'dataset_tiruan.xlsx': {
        'ID': 'ID',
        'Nama Pelanggan': 'Nama Pelanggan',
        'Tanggal Pembelian': 'Tanggal Pembelian',
        'Jumlah Pembelian': 'Jumlah Pembelian',
    },
}


def normalize_columns(df, filename):
    """Rename a dataset's columns to the canonical benchmark names.

    The purchase-amount column is additionally coerced to numeric (invalid
    values become NaN). ID columns are deliberately left untouched so that
    non-numeric identifiers are preserved.

    Unlike the previous implementation, a missing amount column is NOT
    fabricated as an all-NaN column: it simply stays absent, so callers that
    check for the expected columns can report it, and the "no matching
    columns" warning is reachable. The caller's DataFrame is not modified.

    Args:
        df: Source ``pandas.DataFrame``.
        filename: Dataset file name used to select the column mapping.

    Returns:
        A new DataFrame with renamed columns, or ``df`` itself when no
        mapping exists for ``filename`` or none of the mapped columns are
        present.
    """
    mapping = _COLUMN_MAPPINGS.get(filename)
    if mapping is None:
        print(f"ℹ️ Tidak ada mapping kolom khusus untuk {filename}. Menggunakan kolom asli.")
        return df

    # Only rename columns that actually exist in this frame.
    valid_mapping = {src: dst for src, dst in mapping.items() if src in df.columns}
    if not valid_mapping:
        print(f"⚠️ Tidak ada kolom yang cocok dengan mapping untuk {filename}. Kolom yang ada: {list(df.columns)}")
        return df

    normalized = df.rename(columns=valid_mapping)
    for src, dst in valid_mapping.items():
        if dst == 'Jumlah Pembelian':
            # Read from the original frame so the source column is unambiguous.
            normalized[dst] = pd.to_numeric(df[src], errors='coerce')
    return normalized

In [None]:
# MAIN
# Entry point: load every dataset listed in all_datasets, normalize it to the
# four canonical columns, clean it, and run benchmark() on the resulting
# records. Failures for one dataset are reported and the loop moves on.
if __name__ == "__main__":
    for filename, path in all_datasets.items():
        print(f"\n================== MEMUAT DATASET: {filename} ==================")
        try:
            # Excel files need read_excel; everything else is read as CSV.
            if path.endswith('.xlsx'):
                df = pd.read_excel(path)
            else:
                df = pd.read_csv(path)

            # Strip surrounding whitespace from header names before mapping them.
            df.columns = [col.strip() for col in df.columns]
            print(f"Kolom asli: {list(df.columns)}")
            df = normalize_columns(df, filename)
            expected_columns = ['ID', 'Nama Pelanggan', 'Tanggal Pembelian', 'Jumlah Pembelian']

            if all(col in df.columns for col in expected_columns):
                # Keep only the canonical columns; copy so later assignments
                # do not write into a slice of the original frame.
                df = df[expected_columns].copy()
                # Only convert Jumlah Pembelian to numeric
                df['Jumlah Pembelian'] = pd.to_numeric(df['Jumlah Pembelian'], errors='coerce')
                # Handle date parsing: unparseable dates become NaT/NaN and the
                # rest are re-formatted as dd/mm/YYYY strings.
                if filename == 'data_transaksi.csv':
                    df['Tanggal Pembelian'] = pd.to_datetime(df['Tanggal Pembelian'], format='%Y-%m-%d', errors='coerce').dt.strftime('%d/%m/%Y')
                else:
                    df['Tanggal Pembelian'] = pd.to_datetime(df['Tanggal Pembelian'], errors='coerce').dt.strftime('%d/%m/%Y')

                # Log rows with missing values before dropping
                missing_rows = df[df[expected_columns].isna().any(axis=1)]
                if not missing_rows.empty:
                    print(f"⚠️ {len(missing_rows)} baris dengan nilai kosong di kolom kunci: {missing_rows.head().to_dict(orient='records')}")

                # Drop rows only if critical columns are missing (exclude ID for data_transaksi.csv)
                if filename == 'data_transaksi.csv':
                    df = df.dropna(subset=['Nama Pelanggan', 'Tanggal Pembelian', 'Jumlah Pembelian'])
                else:
                    df = df.dropna(subset=expected_columns)

                # The custom algorithms operate on a list of dict records.
                transaksi_data = df.to_dict(orient='records')
                print(f"✅ Jumlah Data: {len(transaksi_data)}")
                print("Contoh Data (5 data pertama):")
                for txn in transaksi_data[:5]:
                    print(txn)

                # Skip the benchmark when cleaning removed every row.
                if transaksi_data:
                    benchmark(transaksi_data)
                else:
                    print(f"⚠️ Tidak ada data transaksi yang valid di {filename} setelah pemrosesan. Melewati benchmark.")
            else:
                # Report which canonical columns could not be produced.
                missing_cols = [col for col in expected_columns if col not in df.columns]
                print(f"❌ Dataset {filename} tidak memiliki kolom yang sesuai setelah normalisasi. Kolom yang hilang: {missing_cols}")
                print(f"Kolom yang ada: {list(df.columns)}")
        except FileNotFoundError:
            print(f"❌ Gagal memproses dataset {filename}: File tidak ditemukan di lokasi: {path}")
        except Exception as e:
            # Top-level boundary: report any other failure and continue with the next dataset.
            print(f"❌ Gagal memproses dataset {filename}: {e}")


Kolom asli: ['Unnamed: 0', 'tanggal', 'nama.pembeli', 'nama.barang', 'kuantum', 'nominal']
✅ Jumlah Data: 1289
Contoh Data (5 data pertama):
{'ID': 1, 'Nama Pelanggan': 'TOKO HERUNIAWATI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 9840000.0}
{'ID': 2, 'Nama Pelanggan': 'TOKO HERUNIAWATI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 8400000.0}
{'ID': 3, 'Nama Pelanggan': 'TOKO APRILIA SUKRISNI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 62910000.0}
{'ID': 4, 'Nama Pelanggan': 'TOKO APRILIA SUKRISNI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 4855200.0}
{'ID': 5, 'Nama Pelanggan': 'TOKO APRILIA SUKRISNI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 1162000.0}


--- Benchmarking untuk Kolom: ID ---

[1] Hash Search (by ID)...
Hasil: [{'ID': 30, 'Nama Pelanggan': 'TOKO HARYATI ', 'Tanggal Pembelian': '10/01/2020', 'Jumlah Pembelian': 31500000.0}]
Waktu Hash Search: 0.000508 detik
Ruang Memori: 36952 bytes

[2] Jump Search (by ID

In [None]:
import pandas as pd
import math
import time

# Extract the numeric part of an ID string
def extract_numeric_id(id_str):
    """Return the integer part of an ID such as ``"TRX00012"``.

    Every occurrence of ``"TRX"`` is removed and the remainder is parsed as
    an integer.

    Args:
        id_str: The ID as a string.

    Returns:
        The parsed integer, or ``-1`` when ``id_str`` is not a string or the
        remainder is not a valid integer.
    """
    try:
        return int(id_str.replace("TRX", ""))
    except (AttributeError, TypeError, ValueError):
        # Only conversion problems are treated as "invalid ID"; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return -1

# Jump Search over a plain sorted list of comparable values
def jump_search(arr, target):
    """Find ``target`` in the ascending-sorted sequence ``arr`` by jump search.

    Args:
        arr: Sorted sequence of mutually comparable values.
        target: Value to locate.

    Returns:
        The index of the first occurrence found within the candidate block,
        or ``-1`` when ``target`` is absent or ``arr`` is empty.
    """
    size = len(arr)
    if not size:
        return -1

    stride = int(math.sqrt(size))
    block_start, block_end = 0, stride

    # Advance block by block while the last element of the current block is
    # still smaller than the target.
    while arr[min(block_end, size) - 1] < target:
        block_start = block_end
        block_end += stride
        if block_start >= size:
            return -1  # Ran past the last block

    # Linear scan inside the block that may contain the target.
    for position in range(block_start, min(block_end, size)):
        if arr[position] == target:
            return position
    return -1

# Jump-Bubble Sort: gapped pre-pass followed by a regular bubble sort
def jump_bubble_sort(arr, key):
    """Sort a list of dict records in place, ascending by ``key``.

    Phase 1 repeatedly compares elements ``step = int(sqrt(n))`` positions
    apart and swaps them when out of order, until a full sweep makes no swap.
    Phase 2 is a standard adjacent-element bubble sort that finishes the job.

    Phase 2 stops as soon as a pass makes no swap. The pre-pass leaves the
    list nearly sorted, so without this early exit its benefit was thrown
    away by always running every remaining pass. The final ordering is
    unchanged by the early exit.

    Pairs where either value is ``None`` are never swapped. Pairs whose
    values cannot be compared produce a warning and are left as they are.

    Args:
        arr: List of dict-like records; modified in place.
        key: Field name to order by.

    Returns:
        The same list object, sorted.
    """
    n = len(arr)
    if n == 0:
        return arr
    step = int(math.sqrt(n))

    # Phase 1: gapped compare-and-swap sweeps until nothing moves.
    swapped = True
    while swapped:
        swapped = False
        for i in range(n - step):
            val1 = arr[i].get(key)
            val2 = arr[i + step].get(key)
            if val1 is None or val2 is None:
                continue
            try:
                if val1 > val2:
                    arr[i], arr[i + step] = arr[i + step], arr[i]
                    swapped = True
            except TypeError:
                print(f"Warning: Cannot compare values for key '{key}' at indices {i} and {i+step}. Skipping swap.")

    # Phase 2: adjacent bubble sort with early exit once a pass is clean.
    for i in range(n - 1):
        swapped = False
        for j in range(n - i - 1):
            val1 = arr[j].get(key)
            val2 = arr[j + 1].get(key)
            if val1 is None or val2 is None:
                continue
            try:
                if val1 > val2:
                    arr[j], arr[j + 1] = arr[j + 1], arr[j]
                    swapped = True
            except TypeError:
                print(f"Warning: Cannot compare values for key '{key}' at indices {j} and {j+1}. Skipping swap.")
        if not swapped:
            break
    return arr


# Jump-Selection Sort
def jump_selection_sort(arr, key):
    """Sort a list of dict records in place, ascending by ``key``.

    For every position ``i`` the minimum of the whole unsorted suffix is
    selected and swapped into place.

    The previous implementation only inspected indices ``i+step, i+2*step,
    ...`` plus ``i+1 .. i+step-1``. Elements in later blocks that did not
    fall on a stride boundary were never examined, so an unsorted input could
    come back unsorted. Scanning the full suffix is required for a correct
    selection sort.

    Candidates are skipped when either value is ``None``. Values that cannot
    be compared produce a warning and are skipped.

    Args:
        arr: List of dict-like records; modified in place.
        key: Field name to order by.

    Returns:
        The same list object, sorted.
    """
    n = len(arr)
    for i in range(n):
        min_idx = i
        val_min = arr[i].get(key)
        if val_min is None:
            # Nothing can be compared against a missing value; leave it here.
            continue
        for j in range(i + 1, n):
            val_j = arr[j].get(key)
            if val_j is None:
                continue
            try:
                if val_j < val_min:
                    min_idx = j
                    val_min = val_j
            except TypeError:
                print(f"Warning: Cannot compare values for key '{key}' at index {j} and {min_idx}. Skipping comparison.")

        # Move the selected minimum into position i.
        if min_idx != i:
            arr[i], arr[min_idx] = arr[min_idx], arr[i]
    return arr


# Load one dataset, clean it, and run the ID-based algorithms on it
def process_dataset(dataset_path, dataset_name):
    """Load, clean and benchmark a single dataset by its numeric ID.

    Steps, in order: read the file (Excel or CSV), strip header whitespace,
    normalize the columns via ``normalize_columns``, keep the four canonical
    columns, coerce the amount to numeric, derive an ``ID_Numeric`` column via
    ``extract_numeric_id``, drop rows with missing critical values, then run
    and time jump search, jump-bubble sort and jump-selection sort on
    ``ID_Numeric``. All results are printed.

    Args:
        dataset_path: Path or URL handed to ``pd.read_excel`` / ``pd.read_csv``.
        dataset_name: File name used to select the column mapping and the
            search target, and shown in messages.

    Returns:
        None. Returns early (after printing a message) when expected columns
        are missing or no rows survive cleaning. ``FileNotFoundError`` and any
        other exception are caught and reported rather than propagated.
    """
    print(f"\nMemproses {dataset_name}")

    try:
        # Read the dataset
        if dataset_path.endswith('.xlsx'):
             df = pd.read_excel(dataset_path)
        else:
             df = pd.read_csv(dataset_path)

        # Normalize column names (same logic as the earlier loading code)
        df.columns = [col.strip() for col in df.columns]
        df = normalize_columns(df, dataset_name) # Use the previously defined normalize_columns

        # Ensure required columns exist after normalization
        expected_cols = ['ID', 'Nama Pelanggan', 'Tanggal Pembelian', 'Jumlah Pembelian']
        if not all(col in df.columns for col in expected_cols):
             missing = [col for col in expected_cols if col not in df.columns]
             print(f"❌ Skipping {dataset_name}: Missing expected columns after normalization: {missing}")
             print(f"Available columns: {list(df.columns)}")
             return # Skip if columns are not as expected

        # Select and process relevant columns
        df = df[expected_cols].copy()

        # Convert 'Jumlah Pembelian' to numeric, coercing errors
        df['Jumlah Pembelian'] = pd.to_numeric(df['Jumlah Pembelian'], errors='coerce')

        # Ensure 'ID' column exists before attempting to extract numeric ID
        # (defensive: the expected-columns check above already covers 'ID')
        if 'ID' not in df.columns:
             print(f"❌ Skipping {dataset_name}: 'ID' column not found after normalization. Cannot proceed with ID-based search/sort.")
             return

        # Extract numeric ID only if the column exists
        # Handle potential non-string IDs or missing IDs before applying replace
        # (missing IDs map to the sentinel -1)
        df['ID_Numeric'] = df['ID'].apply(lambda x: extract_numeric_id(str(x)) if pd.notna(x) else -1)

        # Drop rows where critical columns are missing after conversion/extraction
        # For data_transaksi.csv, exclude ID from essential dropna subset based on previous logic
        subset_cols_for_dropna = ['Nama Pelanggan', 'Tanggal Pembelian', 'Jumlah Pembelian']
        if dataset_name != 'data_transaksi.csv':
             subset_cols_for_dropna.append('ID') # Include ID for other datasets if needed

        # Ensure columns exist before dropping
        subset_cols_for_dropna = [col for col in subset_cols_for_dropna if col in df.columns]

        if subset_cols_for_dropna:
             initial_rows = len(df)
             df.dropna(subset=subset_cols_for_dropna, inplace=True)
             if len(df) < initial_rows:
                 print(f"⚠️ Dropped {initial_rows - len(df)} rows with missing values in {subset_cols_for_dropna}.")

        # Check for invalid ID_Numeric values after dropping rows
        # (rows are only reported here, not removed)
        if (df['ID_Numeric'] == -1).any():
            print(f"⚠️ {dataset_name}: Terdapat ID tidak valid (-1) setelah pemrosesan. Baris ini mungkin akan memengaruhi hasil pencarian/pengurutan berbasis ID.")
            # Option: df = df[df['ID_Numeric'] != -1].copy() # Uncomment if you want to remove these rows

        # Convert to list of dictionaries for custom algorithms
        data_list = df.to_dict('records')

        # Proceed only if there is data after cleaning
        if not data_list:
            print(f"⚠️ No valid data remaining in {dataset_name} after processing. Skipping algorithms.")
            return

        print(f"✅ Processed data count: {len(data_list)}")

        # 1. Jump Search (by ID_Numeric)
        # Jump search as implemented here expects a sorted list of *numeric* values
        # So we need to sort by 'ID_Numeric' first and pass only the numeric IDs.
        print("\nMenjalankan Jump Search (by ID_Numeric)...")
        # Sort data_list by ID_Numeric to prepare for Jump Search on the numeric IDs
        sorted_data_list_id_numeric = sorted(data_list, key=lambda x: x.get('ID_Numeric', -1)) # Use -1 for None/missing ID_Numeric
        id_numeric_values = [item.get('ID_Numeric', -1) for item in sorted_data_list_id_numeric] # Extract only the numeric IDs

        # Define target ID_Numeric based on the dataset
        target_id_numeric = None
        if dataset_name == 'data_transaksi.csv':
             # Assuming 'data_transaksi.csv' IDs are like TRX1, TRX2...
             # Let's pick a sample numeric ID. Find one from the actual data if possible.
             if id_numeric_values:
                 sample_ids = [id for id in id_numeric_values if id != -1][:5] # Get some valid sample IDs
                 if sample_ids:
                    # The list is sorted, so this is the smallest valid ID.
                    target_id_numeric = sample_ids[0] # Use the first valid sample as target
                    print(f"Using sample target ID_Numeric for {dataset_name}: {target_id_numeric}")
                 else:
                    print(f"Warning: No valid numeric IDs found in {dataset_name} to set target.")
                    target_id_numeric = 1 # Default if no valid samples found
            # Adjust target if needed based on actual data range
        elif dataset_name == 'Dataset Penjualan Buku.csv':
             # Assuming 'Dataset Penjualan Buku.csv' has integer IDs directly mappable
             if id_numeric_values:
                 sample_ids = [id for id in id_numeric_values if id != -1][:5]
                 if sample_ids:
                    target_id_numeric = sample_ids[0]
                    print(f"Using sample target ID_Numeric for {dataset_name}: {target_id_numeric}")
                 else:
                    print(f"Warning: No valid numeric IDs found in {dataset_name} to set target.")
                    target_id_numeric = 1 # Default
        else:
            print(f"Warning: No specific target ID_Numeric defined for {dataset_name}. Using default target 1.")
            target_id_numeric = 1 # Default target for other datasets

        if target_id_numeric is not None:
            start_time = time.time()
            # Perform jump search on the list of numeric IDs
            # (this is the two-argument jump_search(arr, target) that returns an index or -1)
            result_index = jump_search(id_numeric_values, target_id_numeric)
            end_time = time.time()

            if result_index != -1:
                # Retrieve the original dictionary using the index from the sorted numeric list
                original_data_item = sorted_data_list_id_numeric[result_index]
                print(f"ID_Numeric {target_id_numeric} found at index {result_index} in the sorted numeric list.")
                print(f"Corresponding original data item: {original_data_item}")
            else:
                print(f"ID_Numeric {target_id_numeric} not found")
            print(f"Waktu eksekusi Jump Search (by ID_Numeric): {end_time - start_time:.6f} detik")
        else:
            print("Skipping Jump Search (by ID_Numeric) due to no valid target.")

        # 2. Jump-Bubble (by ID_Numeric)
        print("\nMenjalankan Jump-Bubble (by ID_Numeric)...")
        start_time = time.time()
        # Pass a copy of the data_list to jump_bubble_sort
        # (a shallow copy: the sort reorders the list in place but does not edit the records)
        sorted_data_jb = jump_bubble_sort(data_list.copy(), 'ID_Numeric')
        end_time = time.time()
        # Print the ID_Numeric for the first few elements of the sorted list
        print(f"5 baris pertama setelah pengurutan Jump-Bubble: {[row.get('ID_Numeric', 'N/A') for row in sorted_data_jb[:5]]}")
        print(f"Waktu eksekusi Jump-Bubble: {end_time - start_time:.6f} detik")

        # 3. Jump-Selection (by ID_Numeric)
        print("\nMenjalankan Jump-Selection (by ID_Numeric)...")
        start_time = time.time()
        # Pass a copy of the data_list to jump_selection_sort
        sorted_data_js = jump_selection_sort(data_list.copy(), 'ID_Numeric')
        end_time = time.time()
        # Print the ID_Numeric for the first few elements of the sorted list
        print(f"5 baris pertama setelah pengurutan Jump-Selection: {[row.get('ID_Numeric', 'N/A') for row in sorted_data_js[:5]]}")
        print(f"Waktu eksekusi Jump-Selection: {end_time - start_time:.6f} detik")

    except FileNotFoundError:
        print(f"❌ Gagal memproses dataset {dataset_name}: File tidak ditemukan di lokasi: {dataset_path}")
    except Exception as e:
        # Top-level boundary for this dataset: report and return normally.
        print(f"❌ Gagal memproses dataset {dataset_name}: {e}")


# Define all_datasets dictionary as it was in the working notebook context
# (keyed by file name; values are a raw GitHub URL or a Google Drive path)
github_datasets = {
    'penjualan barang.csv': 'https://raw.githubusercontent.com/Anggunsky/datasetPenjualan/main/penjualan%20barang.csv',
    'data_transaksi.csv': 'https://raw.githubusercontent.com/annisareida/Benchmarking-Algoritma/main/data_transaksi.csv',
}
drive_datasets = {
    'synthetic (1).csv': '/content/drive/MyDrive/DATASET/synthetic (1).csv',
    'Dataset Penjualan Buku.csv': '/content/drive/MyDrive/DATASET/Dataset Penjualan Buku.csv',
    'dataset_tiruan.xlsx': '/content/drive/MyDrive/DATASET/dataset_tiruan.xlsx'
}
all_datasets = {**github_datasets, **drive_datasets}


# Run the ID-based benchmark for the two datasets, using the paths from all_datasets
# Make sure the keys used here match the keys in all_datasets
if 'data_transaksi.csv' in all_datasets:
    process_dataset(all_datasets['data_transaksi.csv'], "data_transaksi.csv")
else:
    print("❌ 'data_transaksi.csv' not found in all_datasets.")

if 'Dataset Penjualan Buku.csv' in all_datasets:
    process_dataset(all_datasets['Dataset Penjualan Buku.csv'], "Dataset Penjualan Buku.csv")
else:
    print("❌ 'Dataset Penjualan Buku.csv' not found in all_datasets.")


Memproses data_transaksi.csv
✅ Processed data count: 10000

Menjalankan Jump Search (by ID_Numeric)...
Using sample target ID_Numeric for data_transaksi.csv: 0
ID_Numeric 0 found at index 0 in the sorted numeric list.
Corresponding original data item: {'ID': 'TRX00000', 'Nama Pelanggan': 'OMODF', 'Tanggal Pembelian': '2024-10-28', 'Jumlah Pembelian': 839.53, 'ID_Numeric': 0}
Waktu eksekusi Jump Search (by ID_Numeric): 0.000008 detik

Menjalankan Jump-Bubble (by ID_Numeric)...
5 baris pertama setelah pengurutan Jump-Bubble: [0, 1, 2, 3, 4]
Waktu eksekusi Jump-Bubble: 7.396053 detik

Menjalankan Jump-Selection (by ID_Numeric)...
5 baris pertama setelah pengurutan Jump-Selection: [0, 1, 2, 3, 4]
Waktu eksekusi Jump-Selection: 0.199380 detik

Memproses Dataset Penjualan Buku.csv
✅ Processed data count: 10000

Menjalankan Jump Search (by ID_Numeric)...
Using sample target ID_Numeric for Dataset Penjualan Buku.csv: 1
ID_Numeric 1 found at index 0 in the sorted numeric list.
Corresponding or

# BENCHMARKING YANG DIPERBARUI

In [None]:
# Import the libraries used in Google Colab
import time
import pandas as pd
import sys
import os
from google.colab import drive
from IPython.display import display

In [None]:
# Mount Google Drive at /content/drive; the Drive-hosted dataset paths used in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dataset locations, keyed by file name.
# Datasets downloaded over HTTP from GitHub (raw file URLs).
github_datasets = {
    'penjualan barang.csv': 'https://raw.githubusercontent.com/Anggunsky/datasetPenjualan/main/penjualan%20barang.csv',
    'data_transaksi.csv': 'https://raw.githubusercontent.com/annisareida/Benchmarking-Algoritma/main/data_transaksi.csv',
}
# Datasets read from the mounted Google Drive.
drive_datasets = {
    'synthetic (1).csv': '/content/drive/MyDrive/DATASET/synthetic (1).csv',
    'Dataset Penjualan Buku.csv': '/content/drive/MyDrive/DATASET/Dataset Penjualan Buku.csv',
    'dataset_tiruan.xlsx': '/content/drive/MyDrive/DATASET/dataset_tiruan.xlsx',
    }
# Single lookup table: GitHub entries first, then Drive entries.
all_datasets = {**github_datasets, **drive_datasets}

In [None]:
# FUNCTION UTILITIES
def extract_numeric_id(id_str):
    """Convert an ID to an integer.

    Strings starting with ``'TRX'`` have every ``'TRX'`` occurrence removed
    before parsing; any other value is passed straight to ``int()``.

    Args:
        id_str: The ID, either a ``'TRX...'`` string or a directly numeric
            value (int, numeric string, float, ...).

    Returns:
        The integer ID, or ``-1`` when the value cannot be converted.
    """
    try:
        if isinstance(id_str, str) and id_str.startswith('TRX'):
            return int(id_str.replace('TRX', ''))
        return int(id_str)  # Directly numeric IDs
    except (TypeError, ValueError, OverflowError):
        # TypeError: None/unsupported type; ValueError: non-numeric text or
        # NaN; OverflowError: infinite float. A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit.
        return -1

In [None]:
# Sequential (linear) search
def sequence_search(data, key, target):
    """Scan ``data`` front to back for the first record whose ``key`` equals ``target``.

    Returns:
        A one-element list with the first matching record (kept as a list for
        consistency with the other search functions), or an empty list when
        nothing matches.
    """
    not_found = object()
    first_match = next(
        (record for record in data if record.get(key) == target),
        not_found,
    )
    if first_match is not_found:
        return []
    return [first_match]

In [None]:
# Fungsi Binary Search
def binary_search(data, key, target):
    """Binary search over records already sorted ascending by ``key``.

    Args:
        data: Sequence of dict-like records sorted by ``key``.
        key: Field name to compare.
        target: Value to look for.

    Returns:
        A one-element list with a matching record, or an empty list when the
        value is not present.
    """
    low = 0
    high = len(data) - 1
    while low <= high:
        middle = (low + high) // 2
        row = data[middle]
        value = row.get(key)
        if value == target:
            return [row]  # wrapped in a list for a uniform return shape
        if value < target:
            low = middle + 1
        else:
            high = middle - 1
    return []

In [None]:
# Fungsi Bubble Sort
def bubble_sort(data, key):
    """Return a copy of ``data`` bubble-sorted ascending by ``key``.

    Adjacent records are only compared (and possibly swapped) when both have
    a non-None value for ``key``; a pair involving None is left in place.
    The input list itself is not modified.

    Args:
        data: List of dict-like records.
        key: Field name to sort by.

    Returns:
        The sorted shallow copy.
    """
    items = data[:]
    last = len(items) - 1  # index of the last slot still unsorted
    while last >= 0:
        changed = False
        for pos in range(last):
            left = items[pos].get(key)
            right = items[pos + 1].get(key)
            if left is None or right is None:
                continue
            if left > right:
                items[pos], items[pos + 1] = items[pos + 1], items[pos]
                changed = True
        if not changed:
            # A full pass without swaps means the list is already ordered.
            break
        last -= 1
    return items

In [None]:
# Fungsi Selection Sort
def selection_sort(data, key):
    """Return a copy of ``data`` selection-sorted ascending by ``key``.

    A candidate only replaces the current minimum when both records have a
    non-None value for ``key``. The input list itself is not modified.

    Args:
        data: List of dict-like records.
        key: Field name to sort by.

    Returns:
        The sorted shallow copy.
    """
    items = data[:]
    size = len(items)
    for slot in range(size):
        smallest = slot
        for probe in range(slot + 1, size):
            candidate = items[probe].get(key)
            current_min = items[smallest].get(key)
            if candidate is None or current_min is None:
                continue
            if candidate < current_min:
                smallest = probe
        items[slot], items[smallest] = items[smallest], items[slot]
    return items

In [None]:
# Fungsi Sequence-Bubble (Bubble Sort dengan Sequence Search untuk verifikasi)
def sequence_bubble(data, key, target):
    """Run a sequence search for ``target``, print it, then bubble-sort.

    Args:
        data: List of dict-like records.
        key: Field used for both the search and the sort.
        target: Value to search for before sorting.

    Returns:
        The result of ``bubble_sort(data, key)``.
    """
    found = sequence_search(data, key, target)
    preview = found[:5] if found else 'Tidak ditemukan'
    print(f"Sequence Search sebelum Bubble Sort: {preview}")
    return bubble_sort(data, key)

In [None]:
# Fungsi Sequence-Selection (Selection Sort dengan Sequence Search untuk verifikasi)
def sequence_selection(data, key, target):
    """Run a sequence search for ``target``, print it, then selection-sort.

    Args:
        data: List of dict-like records.
        key: Field used for both the search and the sort.
        target: Value to search for before sorting.

    Returns:
        The result of ``selection_sort(data, key)``.
    """
    found = sequence_search(data, key, target)
    preview = found[:5] if found else 'Tidak ditemukan'
    print(f"Sequence Search sebelum Selection Sort: {preview}")
    return selection_sort(data, key)

In [None]:
# Fungsi Binary-Bubble (Bubble Sort dengan Binary Search untuk verifikasi)
def binary_bubble(data, key, target):
    """Binary-search a sorted copy for ``target``, print it, then bubble-sort.

    The binary search runs on a copy ordered with the built-in ``sorted``
    (records lacking ``key`` sort as 0); the returned list comes from
    ``bubble_sort`` applied to the original ``data``.

    Args:
        data: List of dict-like records.
        key: Field used for both the search and the sort.
        target: Value to search for before sorting.

    Returns:
        The result of ``bubble_sort(data, key)``.
    """
    def sort_value(row):
        return row.get(key, 0)

    ordered = sorted(data, key=sort_value)
    found = binary_search(ordered, key, target)
    preview = found[:5] if found else 'Tidak ditemukan'
    print(f"Binary Search sebelum Bubble Sort: {preview}")
    return bubble_sort(data, key)

In [None]:
# Fungsi Binary-Selection (Selection Sort dengan Binary Search untuk verifikasi)
def binary_selection(data, key, target):
    """Binary-search a sorted copy for ``target``, print it, then selection-sort.

    The binary search runs on a copy ordered with the built-in ``sorted``
    (records lacking ``key`` sort as 0); the returned list comes from
    ``selection_sort`` applied to the original ``data``.

    Args:
        data: List of dict-like records.
        key: Field used for both the search and the sort.
        target: Value to search for before sorting.

    Returns:
        The result of ``selection_sort(data, key)``.
    """
    def sort_value(row):
        return row.get(key, 0)

    ordered = sorted(data, key=sort_value)
    found = binary_search(ordered, key, target)
    preview = found[:5] if found else 'Tidak ditemukan'
    print(f"Binary Search sebelum Selection Sort: {preview}")
    return selection_sort(data, key)

In [None]:
def memory_complexity(data):
    """Return the approximate memory footprint of ``data`` in bytes.

    ``sys.getsizeof`` alone is shallow: for a list it only measures the
    pointer array, so every list of the same length reports the same size no
    matter what its records contain. This walks dicts, lists, tuples and sets
    and sums the size of every distinct object reached.

    Args:
        data: Any object; typically the list of record dicts being benchmarked.

    Returns:
        Total size in bytes of ``data`` and the objects it contains, with each
        shared object counted once.
    """
    seen = set()
    pending = [data]
    total = 0
    while pending:
        obj = pending.pop()
        if id(obj) in seen:
            continue  # shared object: already counted
        seen.add(id(obj))
        total += sys.getsizeof(obj)
        if isinstance(obj, dict):
            pending.extend(obj.keys())
            pending.extend(obj.values())
        elif isinstance(obj, (list, tuple, set, frozenset)):
            pending.extend(obj)
    return total

In [None]:
# BENCHMARKING FUNCTION
def benchmark(transactions):
    """Time every search and sort algorithm on each benchmark column.

    For each of the four columns this runs, in order: sequence search, binary
    search, bubble sort, selection sort, sequence-bubble, sequence-selection,
    binary-bubble and binary-selection, printing the result preview, the
    elapsed time and the memory figure for each step. The binary-search based
    steps only run for the numeric columns; for the others a "skipped" line
    is printed. A ``TypeError`` raised by a step (incomparable values) is
    reported and the benchmark continues with the next step.

    Timings use ``time.perf_counter``, the high-resolution clock intended for
    measuring short durations.

    Args:
        transactions: List of record dicts containing the benchmark columns.
    """
    print("\n==================== PENGUJIAN ALGORITMA PENCARIAN DAN PENGURUTAN ====================")

    keys = ['ID_Numeric', 'Jumlah Pembelian', 'Nama Pelanggan', 'Tanggal Pembelian']
    numeric_keys = ('ID_Numeric', 'Jumlah Pembelian')
    search_targets = {
        'ID_Numeric': 1,
        'Jumlah Pembelian': 1000.0,  # example value; adjust to the data
        'Nama Pelanggan': 'TOKO HERUNIAWATI',  # example value; adjust
        'Tanggal Pembelian': '01/01/2020'  # example value; adjust
    }

    def timed(func, *args):
        """Call ``func(*args)`` and return ``(result, elapsed_seconds)``."""
        start = time.perf_counter()
        outcome = func(*args)
        return outcome, time.perf_counter() - start

    def run_sort(number, label, key, func, *args):
        """Run one sorting step and print its preview, time and memory."""
        print(f"\n[{number}] {label} (by {key})...")
        try:
            sorted_data, elapsed = timed(func, *args)
            print(f"Hasil (5 data pertama): {[item[key] for item in sorted_data[:5]]}")
            print(f"Waktu {label}: {elapsed:.6f} detik")
            print(f"Ruang Memori: {memory_complexity(sorted_data)} bytes")
        except TypeError as e:
            print(f"⚠️ {label} (by {key}) gagal karena masalah tipe data: {e}")

    for key in keys:
        print(f"\n--- Benchmarking untuk Kolom: {key} ---")
        target = search_targets[key]
        is_numeric = key in numeric_keys

        # Sequence Search
        print(f"\n[1] Sequence Search (by {key})...")
        result, elapsed = timed(sequence_search, transactions, key, target)
        print(f"Hasil: {result[:5]}")
        print(f"Waktu Sequence Search: {elapsed:.6f} detik")
        print(f"Ruang Memori: {memory_complexity(transactions)} bytes")

        # Binary Search (numeric columns only)
        if is_numeric:
            print(f"\n[2] Binary Search (by {key})...")
            try:
                # Sorting is preparation, so it stays outside the timed call.
                sorted_txn = sorted(transactions, key=lambda x: x.get(key, 0))
                result, elapsed = timed(binary_search, sorted_txn, key, target)
                print(f"Hasil: {result[:5]}")
                print(f"Waktu Binary Search: {elapsed:.6f} detik")
                print(f"Ruang Memori: {memory_complexity(sorted_txn)} bytes")
            except TypeError as e:
                print(f"⚠️ Binary Search (by {key}) gagal karena masalah tipe data: {e}")
        else:
            print(f"\n[2] Binary Search (by {key}) dilewati karena kolom bukan numerik")

        run_sort(3, 'Bubble Sort', key, bubble_sort, transactions, key)
        run_sort(4, 'Selection Sort', key, selection_sort, transactions, key)
        run_sort(5, 'Sequence-Bubble', key, sequence_bubble, transactions, key, target)
        run_sort(6, 'Sequence-Selection', key, sequence_selection, transactions, key, target)

        # Binary-Bubble / Binary-Selection (numeric columns only)
        binary_steps = (
            (7, 'Binary-Bubble', binary_bubble),
            (8, 'Binary-Selection', binary_selection),
        )
        for number, label, func in binary_steps:
            if is_numeric:
                run_sort(number, label, key, func, transactions, key, target)
            else:
                print(f"\n[{number}] {label} (by {key}) dilewati karena kolom bukan numerik")

In [None]:
def normalize_columns(df, filename):
    """Rename a dataset's columns to the canonical benchmark schema.

    Known files have their ID and amount columns converted and all four
    source columns renamed to ``ID``, ``Nama Pelanggan``,
    ``Tanggal Pembelian`` and ``Jumlah Pembelian``. When the result has an
    ``ID`` column, an integer ``ID_Numeric`` column is added as well.

    Only columns that actually exist are converted and renamed. A missing
    source column is left missing instead of being created as an all-NaN
    column, so the caller can detect the incomplete schema. The input frame
    is never modified.

    Args:
        df: The raw DataFrame as loaded from the file.
        filename: Dataset name used to pick the column mapping.

    Returns:
        A new DataFrame with normalized column names.
    """
    # Per file: (source column -> canonical column, ID format).
    # 'numeric' IDs are parsed with pd.to_numeric; 'text' IDs go through
    # extract_numeric_id.
    specs = {
        'penjualan barang.csv': ({
            'Unnamed: 0': 'ID',
            'nama.pembeli': 'Nama Pelanggan',
            'tanggal': 'Tanggal Pembelian',
            'nominal': 'Jumlah Pembelian',
        }, 'numeric'),
        'data_transaksi.csv': ({
            'id': 'ID',
            'nama': 'Nama Pelanggan',
            'tanggal': 'Tanggal Pembelian',
            'jumlah': 'Jumlah Pembelian',
        }, 'text'),
        'synthetic (1).csv': ({
            'ID Pelanggan': 'ID',
            'Nama Pelanggan': 'Nama Pelanggan',
            'Waktu Pembelian': 'Tanggal Pembelian',
            'Jumlah Pembelian': 'Jumlah Pembelian',
        }, 'numeric'),
        'Dataset Penjualan Buku.csv': ({
            'ID': 'ID',
            'Nama_Pelanggan': 'Nama Pelanggan',
            'Tanggal_Pembelian': 'Tanggal Pembelian',
            'Jumlah_Pembelian': 'Jumlah Pembelian',
        }, 'text'),
        'dataset_tiruan.xlsx': ({
            'ID': 'ID',
            'Nama Pelanggan': 'Nama Pelanggan',
            'Tanggal Pembelian': 'Tanggal Pembelian',
            'Jumlah Pembelian': 'Jumlah Pembelian',
        }, 'numeric'),
    }
    mapping, id_format = specs.get(filename, ({}, None))
    df = df.copy()  # work on a copy so the caller's frame stays untouched

    if not mapping:
        print(f"ℹ️ Tidak ada mapping kolom khusus untuk {filename}. Menggunakan kolom asli.")
    else:
        valid_mapping = {k: v for k, v in mapping.items() if k in df.columns}
        for source, canonical in valid_mapping.items():
            if canonical == 'ID':
                if id_format == 'numeric':
                    df[source] = pd.to_numeric(df[source], errors='coerce').astype('Int64')
                else:
                    df[source] = df[source].apply(extract_numeric_id)
            elif canonical == 'Jumlah Pembelian':
                df[source] = pd.to_numeric(df[source], errors='coerce')
        if valid_mapping:
            df = df.rename(columns=valid_mapping)
        else:
            print(f"⚠️ Tidak ada kolom yang cocok dengan mapping untuk {filename}. Kolom yang ada: {list(df.columns)}")

    # Add the ID_Numeric column
    if 'ID' in df.columns:
        df['ID_Numeric'] = df['ID'].apply(extract_numeric_id)
    return df

In [None]:
# MAIN
# Script entry point: load every dataset, normalize it to the canonical
# schema, clean it and run the benchmark on it.
if __name__ == "__main__":
    # Print the runtime environment so missing-file problems are easy to spot.
    print("Direktori kerja saat ini:", os.getcwd())
    print("File di direktori kerja:", os.listdir(os.getcwd()))
    drive_path = '/content/drive/MyDrive/DATASET'
    if os.path.exists(drive_path):
        print("File di /content/drive/MyDrive/DATASET:", os.listdir(drive_path))
    else:
        print("Folder /content/drive/MyDrive/DATASET tidak ditemukan")

    # Every one of these columns is read further down (type conversion and
    # benchmark keys), so all of them are required.
    expected_columns = ['ID', 'Nama Pelanggan', 'Tanggal Pembelian', 'Jumlah Pembelian', 'ID_Numeric']

    for filename, path in all_datasets.items():
        print(f"\n================== MEMUAT DATASET: {filename} ==================")
        try:
            # Local paths are checked up front; URLs are left to pandas.
            if not path.startswith('http') and not os.path.exists(path):
                print(f"❌ File {filename} tidak ditemukan di: {path}")
                print("Pastikan file ada di folder /content/drive/MyDrive/DATASET/")
                continue

            # Read the dataset
            if path.endswith('.xlsx'):
                df = pd.read_excel(path)
            else:
                df = pd.read_csv(path)

            # str() keeps non-string headers (e.g. numeric ones) from
            # breaking .strip().
            df.columns = [str(col).strip() for col in df.columns]
            print(f"Kolom asli: {list(df.columns)}")
            df = normalize_columns(df, filename)

            # Skip the dataset if ANY required column is absent. Counting
            # the available columns is not enough because ID_Numeric is
            # derived from ID and would mask one missing column.
            missing_columns = [col for col in expected_columns if col not in df.columns]
            if missing_columns:
                print(f"❌ Dataset {filename} tidak memiliki kolom yang cukup. Kolom yang hilang: {missing_columns}")
                print(f"Kolom yang ada: {list(df.columns)}")
                continue
            df = df[expected_columns].copy()

            # Type conversion
            df['ID_Numeric'] = pd.to_numeric(df['ID_Numeric'], errors='coerce').astype('Int64')
            df['Jumlah Pembelian'] = pd.to_numeric(df['Jumlah Pembelian'], errors='coerce')
            if filename == 'data_transaksi.csv':
                df['Tanggal Pembelian'] = pd.to_datetime(df['Tanggal Pembelian'], format='%Y-%m-%d', errors='coerce').dt.strftime('%d/%m/%Y')
            else:
                df['Tanggal Pembelian'] = pd.to_datetime(df['Tanggal Pembelian'], errors='coerce').dt.strftime('%d/%m/%Y')

            # Log rows with missing values
            missing_rows = df[df[expected_columns].isna().any(axis=1)]
            if not missing_rows.empty:
                print(f"⚠️ {len(missing_rows)} baris dengan nilai kosong di kolom kunci: {missing_rows.head().to_dict(orient='records')}")

            # Drop rows with missing values
            df = df.dropna(subset=expected_columns)

            transaksi_data = df.to_dict(orient='records')
            print(f"✅ Jumlah Data: {len(transaksi_data)}")
            print("Contoh Data (5 data pertama):")
            for txn in transaksi_data[:5]:
                print(txn)

            if transaksi_data:
                benchmark(transaksi_data)
            else:
                print(f"⚠️ Tidak ada data transaksi yang valid di {filename} setelah pemrosesan. Melewati benchmark.")
        except FileNotFoundError:
            print(f"❌ Gagal memproses dataset {filename}: File tidak ditemukan di lokasi: {path}")
        except Exception as e:
            # Top-level boundary: report the failure and go on to the next dataset.
            print(f"❌ Gagal memproses dataset {filename}: {e}")

Direktori kerja saat ini: /content
File di direktori kerja: ['.config', 'drive', 'sample_data']
File di /content/drive/MyDrive/DATASET: ['Dataset Penjualan Buku.csv', 'dataset_tiruan.xlsx', 'synthetic (1).csv', 'olist_products_dataset.csv', 'olist_orders_dataset.csv']

Kolom asli: ['Unnamed: 0', 'tanggal', 'nama.pembeli', 'nama.barang', 'kuantum', 'nominal']
✅ Jumlah Data: 1289
Contoh Data (5 data pertama):
{'ID': 1, 'Nama Pelanggan': 'TOKO HERUNIAWATI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 9840000.0, 'ID_Numeric': 1}
{'ID': 2, 'Nama Pelanggan': 'TOKO HERUNIAWATI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 8400000.0, 'ID_Numeric': 2}
{'ID': 3, 'Nama Pelanggan': 'TOKO APRILIA SUKRISNI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 62910000.0, 'ID_Numeric': 3}
{'ID': 4, 'Nama Pelanggan': 'TOKO APRILIA SUKRISNI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 4855200.0, 'ID_Numeric': 4}
{'ID': 5, 'Nama Pelanggan': 'TOKO APRILIA SUKRISNI', '

In [None]:
import time
import pandas as pd
import sys
import os
from google.colab import drive
from IPython.display import display

# Mount Google Drive at /content/drive; the Drive-hosted dataset paths used
# by this cell live under /content/drive/MyDrive/DATASET.
drive.mount('/content/drive')

# Dataset paths
# Datasets fetched over HTTP from GitHub (display name -> raw file URL).
github_datasets = {
    'penjualan barang.csv': 'https://raw.githubusercontent.com/Anggunsky/datasetPenjualan/main/penjualan%20barang.csv',
    'data_transaksi.csv': 'https://raw.githubusercontent.com/annisareida/Benchmarking-Algoritma/main/data_transaksi.csv',
}
# Datasets read from the mounted Google Drive (display name -> local path).
drive_datasets = {
    'synthetic (1).csv': '/content/drive/MyDrive/DATASET/synthetic (1).csv',
    'Dataset Penjualan Buku.csv': '/content/drive/MyDrive/DATASET/Dataset Penjualan Buku.csv',
    'dataset_tiruan.xlsx': '/content/drive/MyDrive/DATASET/dataset_tiruan.xlsx',
}
# Combined lookup; the GitHub entries come first in iteration order.
all_datasets = {**github_datasets, **drive_datasets}

# FUNCTION UTILITIES
def extract_numeric_id(id_str):
    """Convert a raw transaction ID into an integer.

    Strings that start with ``TRX`` have every ``TRX`` occurrence removed
    before conversion; any other value is passed straight to ``int()``.

    Args:
        id_str: The raw ID, e.g. ``'TRX0012'``, ``'7'`` or ``7``.

    Returns:
        The integer ID, or ``-1`` when the value cannot be converted.
    """
    try:
        if isinstance(id_str, str) and id_str.startswith('TRX'):
            return int(id_str.replace('TRX', ''))
        return int(id_str)  # plain numeric IDs
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects; ValueError:
        # non-numeric text or NaN; OverflowError: infinity.  Anything else
        # (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        return -1

def sequence_search(data, key, target):
    """Linear search for the first record whose ``key`` equals ``target``.

    Args:
        data: Iterable of dict-like records.
        key: Field name to compare.
        target: Value to look for.

    Returns:
        A one-element list holding the first matching record, or an empty
        list when nothing matches.
    """
    matches = (row for row in data if row.get(key) == target)
    hit = next(matches, None)
    return [hit] if hit is not None else []

def binary_search(data, key, target):
    """Binary search over records already sorted ascending by ``key``.

    Works for any mutually comparable values (numbers or strings).

    Args:
        data: Sequence of dict-like records sorted by ``key``.
        key: Field name to compare.
        target: Value to look for.

    Returns:
        A one-element list with a matching record, or an empty list when the
        value is not present.
    """
    low = 0
    high = len(data) - 1
    while low <= high:
        middle = (low + high) // 2
        row = data[middle]
        value = row.get(key)
        if value == target:
            return [row]  # wrapped in a list for a uniform return shape
        if value < target:
            low = middle + 1
        else:
            high = middle - 1
    return []

def bubble_sort(data, key):
    """Return a copy of ``data`` bubble-sorted ascending by ``key``.

    Adjacent records are only compared (and possibly swapped) when both have
    a non-None value for ``key``; a pair involving None is left in place.
    The input list itself is not modified.

    Args:
        data: List of dict-like records.
        key: Field name to sort by.

    Returns:
        The sorted shallow copy.
    """
    items = data[:]
    last = len(items) - 1  # index of the last slot still unsorted
    while last >= 0:
        changed = False
        for pos in range(last):
            left = items[pos].get(key)
            right = items[pos + 1].get(key)
            if left is None or right is None:
                continue
            if left > right:
                items[pos], items[pos + 1] = items[pos + 1], items[pos]
                changed = True
        if not changed:
            # A full pass without swaps means the list is already ordered.
            break
        last -= 1
    return items

def selection_sort(data, key):
    """Return a copy of ``data`` selection-sorted ascending by ``key``.

    A candidate only replaces the current minimum when both records have a
    non-None value for ``key``. The input list itself is not modified.

    Args:
        data: List of dict-like records.
        key: Field name to sort by.

    Returns:
        The sorted shallow copy.
    """
    items = data[:]
    size = len(items)
    for slot in range(size):
        smallest = slot
        for probe in range(slot + 1, size):
            candidate = items[probe].get(key)
            current_min = items[smallest].get(key)
            if candidate is None or current_min is None:
                continue
            if candidate < current_min:
                smallest = probe
        items[slot], items[smallest] = items[smallest], items[slot]
    return items

def sequence_bubble(data, key, target):
    """Run a sequence search for ``target``, print it, then bubble-sort.

    Args:
        data: List of dict-like records.
        key: Field used for both the search and the sort.
        target: Value to search for before sorting.

    Returns:
        The result of ``bubble_sort(data, key)``.
    """
    found = sequence_search(data, key, target)
    preview = found[:5] if found else 'Tidak ditemukan'
    print(f"Sequence Search sebelum Bubble Sort: {preview}")
    return bubble_sort(data, key)

def sequence_selection(data, key, target):
    """Run a sequence search for ``target``, print it, then selection-sort.

    Args:
        data: List of dict-like records.
        key: Field used for both the search and the sort.
        target: Value to search for before sorting.

    Returns:
        The result of ``selection_sort(data, key)``.
    """
    found = sequence_search(data, key, target)
    preview = found[:5] if found else 'Tidak ditemukan'
    print(f"Sequence Search sebelum Selection Sort: {preview}")
    return selection_sort(data, key)

def binary_bubble(data, key, target):
    """Binary-search a sorted copy for ``target``, print it, then bubble-sort.

    The binary search runs on a copy ordered with the built-in ``sorted``
    (records lacking ``key`` sort as 0); the returned list comes from
    ``bubble_sort`` applied to the original ``data``.

    Args:
        data: List of dict-like records.
        key: Field used for both the search and the sort.
        target: Value to search for before sorting.

    Returns:
        The result of ``bubble_sort(data, key)``.
    """
    def sort_value(row):
        # A present value is used as-is, whatever its type; only a missing
        # key falls back to 0.
        return row.get(key, 0)

    ordered = sorted(data, key=sort_value)
    found = binary_search(ordered, key, target)
    preview = found[:5] if found else 'Tidak ditemukan'
    print(f"Binary Search sebelum Bubble Sort: {preview}")
    return bubble_sort(data, key)

def binary_selection(data, key, target):
    """Binary-search a sorted copy for ``target``, print it, then selection-sort.

    The binary search runs on a copy ordered with the built-in ``sorted``
    (records lacking ``key`` sort as 0); the returned list comes from
    ``selection_sort`` applied to the original ``data``.

    Args:
        data: List of dict-like records.
        key: Field used for both the search and the sort.
        target: Value to search for before sorting.

    Returns:
        The result of ``selection_sort(data, key)``.
    """
    def sort_value(row):
        # A present value is used as-is, whatever its type; only a missing
        # key falls back to 0.
        return row.get(key, 0)

    ordered = sorted(data, key=sort_value)
    found = binary_search(ordered, key, target)
    preview = found[:5] if found else 'Tidak ditemukan'
    print(f"Binary Search sebelum Selection Sort: {preview}")
    return selection_sort(data, key)

def memory_complexity(data):
    """Return the approximate memory footprint of ``data`` in bytes.

    ``sys.getsizeof`` alone is shallow: for a list it only measures the
    pointer array, so every list of the same length reports the same size no
    matter what its records contain. This walks dicts, lists, tuples and sets
    and sums the size of every distinct object reached.

    Args:
        data: Any object; typically the list of record dicts being benchmarked.

    Returns:
        Total size in bytes of ``data`` and the objects it contains, with each
        shared object counted once.
    """
    seen = set()
    pending = [data]
    total = 0
    while pending:
        obj = pending.pop()
        if id(obj) in seen:
            continue  # shared object: already counted
        seen.add(id(obj))
        total += sys.getsizeof(obj)
        if isinstance(obj, dict):
            pending.extend(obj.keys())
            pending.extend(obj.values())
        elif isinstance(obj, (list, tuple, set, frozenset)):
            pending.extend(obj)
    return total

# BENCHMARKING FUNCTION
def benchmark(transactions, dataset_name):
    """Time every search and sort algorithm on each benchmark column.

    For each of the four columns this runs, in order: sequence search, binary
    search, bubble sort, selection sort, sequence-bubble, sequence-selection,
    binary-bubble and binary-selection, printing the result preview, the
    elapsed time and the memory figure for each step. A ``TypeError`` raised
    by a step (incomparable values) is reported and the benchmark continues
    with the next step.

    Timings use ``time.perf_counter``, the high-resolution clock intended for
    measuring short durations.

    Args:
        transactions: List of record dicts containing the benchmark columns.
        dataset_name: Name used to pick the per-dataset search targets. For
            a dataset without predefined targets, the values of the first
            record are searched for instead.
    """
    print("\n==================== PENGUJIAN ALGORITMA PENCARIAN DAN PENGURUTAN ====================")

    keys = ['ID_Numeric', 'Jumlah Pembelian', 'Nama Pelanggan', 'Tanggal Pembelian']
    # Search targets chosen per dataset
    search_targets = {
        'penjualan barang.csv': {
            'ID_Numeric': 1,
            'Jumlah Pembelian': 9840000.0,  # taken from the data
            'Nama Pelanggan': 'TOKO HERUNIAWATI',
            'Tanggal Pembelian': '02/01/2020'
        },
        'data_transaksi.csv': {
            'ID_Numeric': 1,
            'Jumlah Pembelian': 809.47,  # taken from the data
            'Nama Pelanggan': 'QDFKU',
            'Tanggal Pembelian': '04/07/2023'
        },
        'synthetic (1).csv': {
            'ID_Numeric': 1,
            'Jumlah Pembelian': 275031,  # taken from the data
            'Nama Pelanggan': 'Udin Sari',
            'Tanggal Pembelian': '29/06/2024'
        },
        'Dataset Penjualan Buku.csv': {
            'ID_Numeric': 1,
            'Jumlah Pembelian': 6,  # taken from the data
            'Nama Pelanggan': 'Tira Palastri, S.Kom',
            'Tanggal Pembelian': '14/04/2025'
        },
        'dataset_tiruan.xlsx': {
            'ID_Numeric': 1,
            'Jumlah Pembelian': 880,  # taken from the data
            'Nama Pelanggan': 'Michael Bishop',
            'Tanggal Pembelian': '08/09/2022'
        }
    }

    targets = search_targets.get(dataset_name)
    if targets is None:
        # Unknown dataset: search for the first record's own values rather
        # than failing with a KeyError.
        first = transactions[0] if transactions else {}
        targets = {column: first.get(column) for column in keys}
        print(f"ℹ️ Tidak ada target pencarian khusus untuk {dataset_name}. Menggunakan nilai dari data pertama.")

    def timed(func, *args):
        """Call ``func(*args)`` and return ``(result, elapsed_seconds)``."""
        start = time.perf_counter()
        outcome = func(*args)
        return outcome, time.perf_counter() - start

    def run_sort(number, label, key, func, *args):
        """Run one sorting step and print its preview, time and memory."""
        print(f"\n[{number}] {label} (by {key})...")
        try:
            sorted_data, elapsed = timed(func, *args)
            print(f"Hasil (5 data pertama): {[item[key] for item in sorted_data[:5]]}")
            print(f"Waktu {label}: {elapsed:.6f} detik")
            print(f"Ruang Memori: {memory_complexity(sorted_data)} bytes")
        except TypeError as e:
            print(f"⚠️ {label} (by {key}) gagal karena masalah tipe data: {e}")

    for key in keys:
        print(f"\n--- Benchmarking untuk Kolom: {key} ---")
        target = targets[key]

        # Sequence Search
        print(f"\n[1] Sequence Search (by {key})...")
        result, elapsed = timed(sequence_search, transactions, key, target)
        print(f"Hasil: {result[:5]}")
        print(f"Waktu Sequence Search: {elapsed:.6f} detik")
        print(f"Ruang Memori: {memory_complexity(transactions)} bytes")

        # Binary Search
        print(f"\n[2] Binary Search (by {key})...")
        try:
            # Sorting is preparation, so it stays outside the timed call.
            # A present value is used as-is; a missing key sorts as 0.
            sorted_txn = sorted(transactions, key=lambda x: x.get(key, 0))
            result, elapsed = timed(binary_search, sorted_txn, key, target)
            print(f"Hasil: {result[:5]}")
            print(f"Waktu Binary Search: {elapsed:.6f} detik")
            print(f"Ruang Memori: {memory_complexity(sorted_txn)} bytes")
        except TypeError as e:
            print(f"⚠️ Binary Search (by {key}) gagal karena masalah tipe data: {e}")

        run_sort(3, 'Bubble Sort', key, bubble_sort, transactions, key)
        run_sort(4, 'Selection Sort', key, selection_sort, transactions, key)
        run_sort(5, 'Sequence-Bubble', key, sequence_bubble, transactions, key, target)
        run_sort(6, 'Sequence-Selection', key, sequence_selection, transactions, key, target)
        run_sort(7, 'Binary-Bubble', key, binary_bubble, transactions, key, target)
        run_sort(8, 'Binary-Selection', key, binary_selection, transactions, key, target)

def normalize_columns(df: pd.DataFrame, filename: str) -> pd.DataFrame:
    """Rename a dataset's columns to the shared schema and coerce its key columns.

    Each known file name has its own source column names. They are mapped to
    'ID', 'Nama Pelanggan', 'Tanggal Pembelian' and 'Jumlah Pembelian'. Before
    renaming, the ID column is converted (either with ``pd.to_numeric`` or with
    ``extract_numeric_id``, depending on the file) and the amount column is
    converted with ``pd.to_numeric``. Finally, when an 'ID' column exists,
    'ID_Numeric' is derived from it with ``extract_numeric_id``.

    Source columns that are absent from ``df`` are skipped instead of being
    created as all-missing columns, so the caller can still detect that a
    required column is missing. The input frame is not modified.

    Args:
        df: DataFrame as loaded from the dataset file.
        filename: Dataset file name used to select the column mapping.
            Unknown names keep the original columns.

    Returns:
        A new DataFrame with the normalized column names.
    """
    # filename -> (rename mapping, ID source column,
    #              True when the ID is parsed with extract_numeric_id,
    #              amount source column)
    schemas = {
        'penjualan barang.csv': (
            {
                'Unnamed: 0': 'ID',
                'nama.pembeli': 'Nama Pelanggan',
                'tanggal': 'Tanggal Pembelian',
                'nominal': 'Jumlah Pembelian',
            },
            'Unnamed: 0', False, 'nominal',
        ),
        'data_transaksi.csv': (
            {
                'id': 'ID',
                'nama': 'Nama Pelanggan',
                'tanggal': 'Tanggal Pembelian',
                'jumlah': 'Jumlah Pembelian',
            },
            'id', True, 'jumlah',
        ),
        'synthetic (1).csv': (
            {
                'ID Pelanggan': 'ID',
                'Nama Pelanggan': 'Nama Pelanggan',
                'Waktu Pembelian': 'Tanggal Pembelian',
                'Jumlah Pembelian': 'Jumlah Pembelian',
            },
            'ID Pelanggan', False, 'Jumlah Pembelian',
        ),
        'Dataset Penjualan Buku.csv': (
            {
                'ID': 'ID',
                'Nama_Pelanggan': 'Nama Pelanggan',
                'Tanggal_Pembelian': 'Tanggal Pembelian',
                'Jumlah_Pembelian': 'Jumlah Pembelian',
            },
            'ID', True, 'Jumlah_Pembelian',
        ),
        'dataset_tiruan.xlsx': (
            {
                'ID': 'ID',
                'Nama Pelanggan': 'Nama Pelanggan',
                'Tanggal Pembelian': 'Tanggal Pembelian',
                'Jumlah Pembelian': 'Jumlah Pembelian',
            },
            'ID', False, 'Jumlah Pembelian',
        ),
    }

    # Work on a copy so the caller's frame is never modified in place.
    df = df.copy()

    schema = schemas.get(filename)
    if schema is None:
        print(f"ℹ️ Tidak ada mapping kolom khusus untuk {filename}. Menggunakan kolom asli.")
    else:
        mapping, id_column, id_via_extract, amount_column = schema

        # Only convert columns that really exist; inventing an all-missing
        # column here would hide the problem from the caller's column check.
        if id_column in df.columns:
            if id_via_extract:
                df[id_column] = df[id_column].apply(extract_numeric_id)
            else:
                df[id_column] = pd.to_numeric(df[id_column], errors='coerce').astype('Int64')
        if amount_column in df.columns:
            df[amount_column] = pd.to_numeric(df[amount_column], errors='coerce')

        valid_mapping = {src: dst for src, dst in mapping.items() if src in df.columns}
        if valid_mapping:
            df = df.rename(columns=valid_mapping)
        else:
            print(f"⚠️ Tidak ada kolom yang cocok dengan mapping untuk {filename}. Kolom yang ada: {list(df.columns)}")

    if 'ID' in df.columns:
        df['ID_Numeric'] = df['ID'].apply(extract_numeric_id)
    return df

# MAIN
# Entry point: load every configured dataset, normalize it to the shared
# schema, clean it, and hand the resulting records to benchmark().
if __name__ == "__main__":
    print("Direktori kerja saat ini:", os.getcwd())
    print("File di direktori kerja:", os.listdir(os.getcwd()))
    drive_path = '/content/drive/MyDrive/DATASET'
    if os.path.exists(drive_path):
        print("File di /content/drive/MyDrive/DATASET:", os.listdir(drive_path))
    else:
        print("Folder /content/drive/MyDrive/DATASET tidak ditemukan")

    for filename, path in all_datasets.items():
        print(f"\n================== MEMUAT DATASET: {filename} ==================")
        try:
            # Only local paths can be checked up front; URLs are left to pandas.
            if not path.startswith('http') and not os.path.exists(path):
                print(f"❌ File {filename} tidak ditemukan di: {path}")
                print("Pastikan file ada di folder /content/drive/MyDrive/DATASET/")
                continue

            if path.endswith('.xlsx'):
                df = pd.read_excel(path)
            else:
                df = pd.read_csv(path)

            df.columns = [col.strip() for col in df.columns]
            print(f"Kolom asli: {list(df.columns)}")
            df = normalize_columns(df, filename)
            expected_columns = ['ID', 'Nama Pelanggan', 'Tanggal Pembelian', 'Jumlah Pembelian', 'ID_Numeric']

            # Every expected column is used below, so all of them are required.
            # Accepting a partial set would only fail later with a bare KeyError
            # or benchmark records that lack a field.
            missing_columns = [col for col in expected_columns if col not in df.columns]
            if missing_columns:
                print(f"❌ Dataset {filename} tidak memiliki kolom yang cukup. Kolom yang hilang: {missing_columns}")
                print(f"Kolom yang ada: {list(df.columns)}")
                continue
            df = df[expected_columns].copy()

            # Coerce key columns; values that cannot be parsed become missing
            # and are dropped further down.
            df['ID_Numeric'] = pd.to_numeric(df['ID_Numeric'], errors='coerce').astype('Int64')
            df['Jumlah Pembelian'] = pd.to_numeric(df['Jumlah Pembelian'], errors='coerce')
            if filename == 'data_transaksi.csv':
                # This file is parsed with an explicit ISO date format.
                df['Tanggal Pembelian'] = pd.to_datetime(df['Tanggal Pembelian'], format='%Y-%m-%d', errors='coerce').dt.strftime('%d/%m/%Y')
            else:
                df['Tanggal Pembelian'] = pd.to_datetime(df['Tanggal Pembelian'], errors='coerce').dt.strftime('%d/%m/%Y')

            # Report rows with missing key values before discarding them.
            missing_rows = df[df[expected_columns].isna().any(axis=1)]
            if not missing_rows.empty:
                print(f"⚠️ {len(missing_rows)} baris dengan nilai kosong di kolom kunci: {missing_rows.head().to_dict(orient='records')}")

            df = df.dropna(subset=expected_columns)

            transaksi_data = df.to_dict(orient='records')
            print(f"✅ Jumlah Data: {len(transaksi_data)}")
            print("Contoh Data (5 data pertama):")
            for txn in transaksi_data[:5]:
                print(txn)

            if transaksi_data:
                benchmark(transaksi_data, filename)
            else:
                print(f"⚠️ Tidak ada data transaksi yang valid di {filename} setelah pemrosesan. Melewati benchmark.")
        except FileNotFoundError:
            print(f"❌ Gagal memproses dataset {filename}: File tidak ditemukan di lokasi: {path}")
        except Exception as e:
            # Top-level boundary: one broken dataset must not stop the others.
            print(f"❌ Gagal memproses dataset {filename}: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Direktori kerja saat ini: /content
File di direktori kerja: ['.config', 'drive', 'sample_data']
File di /content/drive/MyDrive/DATASET: ['Dataset Penjualan Buku.csv', 'dataset_tiruan.xlsx', 'synthetic (1).csv', 'olist_products_dataset.csv', 'olist_orders_dataset.csv']

Kolom asli: ['Unnamed: 0', 'tanggal', 'nama.pembeli', 'nama.barang', 'kuantum', 'nominal']
✅ Jumlah Data: 1289
Contoh Data (5 data pertama):
{'ID': 1, 'Nama Pelanggan': 'TOKO HERUNIAWATI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 9840000.0, 'ID_Numeric': 1}
{'ID': 2, 'Nama Pelanggan': 'TOKO HERUNIAWATI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 8400000.0, 'ID_Numeric': 2}
{'ID': 3, 'Nama Pelanggan': 'TOKO APRILIA SUKRISNI', 'Tanggal Pembelian': '02/01/2020', 'Jumlah Pembelian': 62910000.0, 'ID_Numeric': 3}
{'ID': 4, 'Nama Pelanggan': 'TOKO APRILIA SUKRISNI', 'Tanggal

In [None]:
'olist_orders_dataset.csv': '/content/drive/MyDrive/DATASET/olist_orders_dataset.csv',
'olist_products_dataset.csv': '/content/drive/MyDrive/DATASET/olist_products_dataset.csv'
