## 1. Setup and Imports

In [1]:
import os
import math
import pandas as pd
import data_loading as dl
import plotly.graph_objects as go
import numpy as np

## 2. Data Loading

In [2]:
# Configuration: which dataset folder to read and where it lives on disk.
folder_name = 'duze'
base_path = f'dane/{folder_name}/1'

# Read every file through the project's loader module; it hands back the
# per-file row collections together with the attribute names.
print(f"Loading data from: {base_path}/{folder_name}")
file_data, all_attrs = dl.load_files(base_path, folder_name)

# Attribute names are normalised to strings so later lookups are uniform.
all_attrs = list(map(str, all_attrs))
D = len(all_attrs)  # Total number of attributes

# Short report of what was loaded.
print(f"Loaded {len(file_data)} files")
for i, rows in enumerate(file_data):
    print(f"  File {i}: {len(rows)} rows")
print(f"Total attributes: {D}")

Loading data from: dane/duze/1/duze
Loaded 10 files
  File 0: 7888 rows
  File 1: 7816 rows
  File 2: 6463 rows
  File 3: 5128 rows
  File 4: 5423 rows
  File 5: 6594 rows
  File 6: 6044 rows
  File 7: 6700 rows
  File 8: 7552 rows
  File 9: 6281 rows
Total attributes: 17


## 3. Data Preparation

In [3]:
# Per-file copies of the rows in which every key has been turned into a string,
# so attribute lookups are consistent with the string names in all_attrs.
preprocessed_files = [
    [{str(k): v for k, v in row_dict.items()} for row_dict in rows]
    for rows in file_data
]

# Flat list of every row, tagged with (file index, row index within the file).
all_rows = [
    (file_idx, row_idx, row_dict)
    for file_idx, rows in enumerate(preprocessed_files)
    for row_idx, row_dict in enumerate(rows)
]

# (file index, row index) -> row dict, for direct access to a neighbour's row.
row_lookup = {
    (file_idx, row_idx): row_dict
    for file_idx, row_idx, row_dict in all_rows
}

print(f"Preprocessed {len(all_rows)} rows across {len(preprocessed_files)} files")

Preprocessed 65889 rows across 10 files


## 4. Nearest Neighbors Calculation (this may take a while)

In [4]:
import numpy as np
import numba
import math

# Convert data to Numba-friendly format
def prepare_data(all_rows, all_attrs):
    """Pack the row dictionaries into a dense matrix plus a presence mask.

    Parameters
    ----------
    all_rows : sequence of 3-tuples
        Only the third element of each tuple is used; it must be a dict
        mapping attribute name -> value.
    all_attrs : sequence
        Attribute names; position in this sequence is the column index.

    Returns
    -------
    X : numpy.ndarray, shape (len(all_rows), len(all_attrs)), float
        Numeric values; cells without a usable value hold NaN.
    present_mask : numpy.ndarray of bool, same shape as X
        True exactly where X holds a usable (non-NaN) number.
    D : int
        Number of attributes, ``len(all_attrs)``.

    Notes
    -----
    A value is treated as missing when its key is not in ``all_attrs``, when
    ``float(value)`` raises ValueError/TypeError, or when it converts to NaN.
    NaN must not be flagged as present: a NaN in a shared column would turn
    every distance computed over that column into NaN.
    """
    n = len(all_rows)
    D = len(all_attrs)
    attr_to_idx = {attr: idx for idx, attr in enumerate(all_attrs)}

    # Start from "everything missing" and fill in what can be parsed.
    X = np.full((n, D), np.nan)
    present_mask = np.zeros((n, D), dtype=np.bool_)

    for i, (_, _, row_dict) in enumerate(all_rows):
        for col, val in row_dict.items():
            col_idx = attr_to_idx.get(col)
            if col_idx is None:
                continue  # attribute not part of the requested column set
            try:
                value = float(val)
            except (ValueError, TypeError):
                continue  # non-numeric value: leave the cell missing
            if math.isnan(value):
                continue  # NaN carries no information: leave the cell missing
            X[i, col_idx] = value
            present_mask[i, col_idx] = True
    return X, present_mask, D

@numba.njit(parallel=True)
def compute_neighbors(X, present_mask, D, k=5):
    """Brute-force k-nearest-neighbour search over rows with missing values.

    For every pair of distinct rows the distance is the Euclidean distance over
    the columns present in both rows, multiplied by ``sqrt(D / shared)`` where
    ``shared`` is the number of such columns. Pairs with no shared column are
    ignored.

    Parameters
    ----------
    X : 2-D float array
        One row per sample; only cells flagged in ``present_mask`` are read.
    present_mask : 2-D bool array
        True where the matching cell of ``X`` holds a value.
    D : int
        Number of columns to scan; also the numerator of the scaling factor.
        Expected to equal the column count of ``X`` and ``present_mask``.
    k : int, default 5
        Number of neighbours kept per row.

    Returns
    -------
    all_top_dists : float array, shape (n, k)
        Per row, the k smallest distances in ascending order; unused slots
        stay at ``inf``.
    all_top_indices : int32 array, shape (n, k)
        Row indices matching ``all_top_dists``; unused slots stay at ``-1``.
    """
    n = X.shape[0]
    all_top_dists = np.full((n, k), np.inf)
    all_top_indices = np.full((n, k), -1, dtype=np.int32)

    for i in numba.prange(n):
        # Ascending buffer of the best k candidates seen so far for row i
        # (a sorted array maintained by insertion, not a heap).
        top_dists = np.full(k, np.inf)
        top_indices = np.full(k, -1, dtype=np.int32)

        for j in range(n):
            if i == j:
                continue

            # One pass over the columns: accumulate the squared difference
            # and count the shared columns together, so no temporary mask
            # array is allocated for each (i, j) pair.
            sq_sum = 0.0
            valid_count = 0
            for col in range(D):
                if present_mask[i, col] and present_mask[j, col]:
                    diff = X[i, col] - X[j, col]
                    sq_sum += diff * diff
                    valid_count += 1

            if valid_count == 0:
                continue

            # Scale the partial distance up to the full attribute count.
            dist = math.sqrt(sq_sum) * math.sqrt(D / valid_count)

            # The last slot holds the current worst kept distance. The k > 0
            # check keeps the empty buffer from being indexed when k == 0.
            if k > 0 and dist < top_dists[k - 1]:
                top_dists[k - 1] = dist
                top_indices[k - 1] = j

                # Shift the new entry towards the front until the buffer is
                # ascending again.
                idx = k - 1
                while idx > 0 and top_dists[idx] < top_dists[idx - 1]:
                    top_dists[idx], top_dists[idx - 1] = top_dists[idx - 1], top_dists[idx]
                    top_indices[idx], top_indices[idx - 1] = top_indices[idx - 1], top_indices[idx]
                    idx -= 1

        # Each prange iteration writes only its own output row.
        all_top_dists[i] = top_dists
        all_top_indices[i] = top_indices

    return all_top_dists, all_top_indices

# Build the dense matrix and run the neighbour search for every row.
X, present_mask, D = prepare_data(all_rows, all_attrs)
all_top_dists, all_top_indices = compute_neighbors(X, present_mask, D, k=5)

# Translate flat row indices back into (file, row) identifiers.
# An index of -1 marks an unfilled neighbour slot and is dropped.
nearestNeighbors = []
for (file_i, row_i, _), row_dists, row_indices in zip(all_rows, all_top_dists, all_top_indices):
    neighbors = [
        (dist, all_rows[j][0], all_rows[j][1])
        for dist, j in zip(row_dists, row_indices)
        if j != -1
    ]
    nearestNeighbors.append({
        'source': (file_i, row_i),
        'neighbors': neighbors
    })

## 5. Data Imputation (using neighbor averages)

In [6]:
# Imputed copies of the rows, one list per file. The preprocessed rows are
# left untouched: every row is copied before anything is filled in.
imputed_files = []

# nearestNeighbors is ordered like all_rows (file by file, row by row), so a
# running offset gives each row's global position without re-summing the
# lengths of all earlier files for every single row.
row_offset = 0

for file_idx, rows in enumerate(preprocessed_files):
    print(f"Imputing file {file_idx}...")
    imputed_file_rows = []

    for row_idx, row_dict in enumerate(rows):
        neighbor_info = nearestNeighbors[row_offset + row_idx]
        source_row = row_dict.copy()  # Start with original row

        # Walk the attributes in their canonical order so the fill order is
        # deterministic; only attributes absent from the row are imputed.
        for key in all_attrs:
            if key in source_row:
                continue

            # Collect the numeric values the neighbours have for this attribute.
            values = []
            for dist, file_j, row_j in neighbor_info['neighbors']:
                neighbor_row = row_lookup.get((file_j, row_j), {})
                if key not in neighbor_row:
                    continue
                try:
                    val = float(neighbor_row[key])
                except (ValueError, TypeError):
                    continue  # Skip non-numeric values
                if math.isnan(val):
                    continue  # A NaN would turn the whole average into NaN
                values.append(val)

            if values:  # Only impute if we found valid values
                source_row[key] = sum(values) / len(values)

        imputed_file_rows.append(source_row)

    imputed_files.append(imputed_file_rows)
    row_offset += len(rows)
    print(f"  File {file_idx} imputed: {len(imputed_file_rows)} rows")

Imputing file 0...
  File 0 imputed: 7888 rows
Imputing file 1...
  File 1 imputed: 7816 rows
Imputing file 2...
  File 2 imputed: 6463 rows
Imputing file 3...
  File 3 imputed: 5128 rows
Imputing file 4...
  File 4 imputed: 5423 rows
Imputing file 5...
  File 5 imputed: 6594 rows
Imputing file 6...
  File 6 imputed: 6044 rows
Imputing file 7...
  File 7 imputed: 6700 rows
Imputing file 8...
  File 8 imputed: 7552 rows
Imputing file 9...
  File 9 imputed: 6281 rows


## 6. Create DataFrames (One Per File)

In [7]:
# One DataFrame per file, each with exactly the columns in all_attrs, in that order.
file_dfs = []

for file_idx, rows in enumerate(imputed_files):
    df_file = pd.DataFrame(rows)

    # Attributes that no row of this file carries still get a column (all NaN).
    absent_attrs = [attr for attr in all_attrs if attr not in df_file.columns]
    for attr in absent_attrs:
        df_file[attr] = np.nan

    # Select the columns in the canonical attribute order.
    df_file = df_file[all_attrs]
    file_dfs.append(df_file)

    print(f"File {file_idx} DataFrame created: {df_file.shape}")

File 0 DataFrame created: (7888, 17)
File 1 DataFrame created: (7816, 17)
File 2 DataFrame created: (6463, 17)
File 3 DataFrame created: (5128, 17)
File 4 DataFrame created: (5423, 17)
File 5 DataFrame created: (6594, 17)
File 6 DataFrame created: (6044, 17)
File 7 DataFrame created: (6700, 17)
File 8 DataFrame created: (7552, 17)
File 9 DataFrame created: (6281, 17)


## 7. Interactive Visualization
Displays an interactive table of the first 10 rows of each imputed file.

In [8]:
# Number of leading rows of each file shown in its table.
rows_to_show = 10

for file_idx, df_file in enumerate(file_dfs):
    df_display = df_file.head(rows_to_show)

    # Header row: white text on Matplotlib blue.
    header_style = dict(
        values=list(df_display.columns),
        fill_color='#1f77b4',
        align='left',
        font=dict(color='white', size=12)
    )

    # Body: one value series per column, two-tone grey fill.
    # NOTE(review): a flat colour list here looks like it is applied per
    # column rather than per row - confirm against the Plotly Table reference.
    cell_style = dict(
        values=[df_display[col] for col in df_display.columns],
        fill=dict(color=['rgb(245,245,245)', 'rgb(230,230,230)']),
        align='left',
        font=dict(color='black', size=11),
        height=30
    )

    fig = go.Figure(data=[go.Table(header=header_style, cells=cell_style)])

    # Figure height grows with the number of rows shown (plus the header row).
    fig.update_layout(
        title=f'Imputed Data - File {file_idx} - Shape:{df_file.shape}',
        title_font=dict(size=18),
        margin=dict(l=10, r=10, t=60, b=10),
        height=80 + 30 * (len(df_display) + 1)
    )

    # Display in notebook
    fig.show()