чтение и анализ логов бэнчмарка

# Импорты

In [133]:
import re
from pathlib import Path
from pprint import pprint
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly import colors
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from src.utils.read_logs import LogReader

In [134]:
plt.ioff()
%matplotlib notebook

# Чтение и преобразование логов

In [135]:
log_file_path = "logs/2025-06-25 method_logs.json"
logs = LogReader.load_logs_from_file(log_file_path)

In [136]:
len(logs)

10158

In [137]:
logs[0]

{'method_name': 'TensorLy_Tucker_image-0_pytorch_truncated_svd_svd',
 'method_args': {'rank': [252, 255, 3],
  'n_iter_max': 100,
  'init': 'svd',
  'svd': 'truncated_svd',
  'random_state': 42},
 'qualitative_metrics': {'Language': 'Python',
  'Library': 'TensorLy',
  'TensorLy backend': 'pytorch',
  'Tensor type': 'Dense',
  'Data type': 'image-0',
  'Platform': 'CPU, GPU',
  'Decomposition method': 'Tucker'},
 'quantitative_metrics': {'gpu_allocated_memory_used_mb': [23.048828125,
   19.67822265625,
   19.55224609375,
   19.67822265625,
   19.703125],
  'ram_mem_used_mb': [6175.27734375, 6184.375, 6184.375, 6184.375, 6184.375],
  'duration': [4.248620055000174,
   1.7748750900000232,
   1.7735690550000527,
   1.7700060310000936,
   1.9719900439999947],
  'frobenius_error': [0.6721850484609604,
   0.6721850484609604,
   0.6721850484609604,
   0.6721850484609604,
   0.6721850484609604],
  'compression_ratio': [50.166930737890446,
   50.166930737890446,
   50.166930737890446,
   50.166

In [138]:
clean_rows, anom_rows, error_rows = [], [], []
neg_stats = defaultdict(lambda: {"idx": [], "vals": []})  # статистика по аномалиям

metric_keys = [  # порядок важен для синхронного среза
    "gpu_allocated_memory_used_mb",
    "ram_mem_used_mb",
    "duration",
    "frobenius_error",
    "compression_ratio",
]

for log in logs:
    # --- базовая (qualitative) часть ---------------------------------
    base = {
        "method_name": log["method_name"],
        "decomposition_method": log["qualitative_metrics"]["Decomposition method"],
        "data_type": log["qualitative_metrics"]["Data type"],
        "language": log["qualitative_metrics"]["Language"],
        "library": log["qualitative_metrics"]["Library"],
        "tensor_type": log["qualitative_metrics"]["Tensor type"],
        "platform": log["qualitative_metrics"]["Platform"],
        "backend": log["qualitative_metrics"][
            next(k for k in log["qualitative_metrics"] if "backend" in k)
        ],
    }

    # ----- случаи с ошибкой выполнения --------------------------------
    if log["error_message"]:
        error_rows.append(
            {
                **base,
                "error_message": log["error_message"],
                "is_error": True,
            }
        )
        continue
    else:
        error_rows.append(
            {
                **base,
                "error_message": log["error_message"],
                "is_error": False,
            }
        )

    q = log["quantitative_metrics"]
    n_runs = max(len(q[k]) for k in metric_keys)  # обычно 5

    # 1. определяем, какие индексы аномальны
    bad_idx = {i for k in metric_keys
               for i, v in enumerate(q[k])
               if isinstance(v, (int, float)) and v < 0}

    # 2. «чистые» и «плохие» индексы
    good_idx = sorted(set(range(n_runs)) - bad_idx)
    bad_idx = sorted(bad_idx)


    # функция-хелпер: формирует агрегаты для выбранного подмножества индексов
    def make_row(idx_subset, tag):
        if not idx_subset:
            return None  # нечего добавлять
        row = {**base,
               "run_type": tag,  # clean | anomaly
               "original_run_indices": idx_subset}  # список позиций
        for k in metric_keys:
            vals = [q[k][i] for i in idx_subset]
            row[f"{k}_min"] = np.min(vals)
            row[f"{k}_max"] = np.max(vals)
            row[f"{k}_mean"] = np.mean(vals)
            row[f"{k}_median"] = np.median(vals)
        return row


    # 3. сохраняем строки
    r_clean = make_row(good_idx, "clean")
    r_anom = make_row(bad_idx, "anomaly")

    if r_clean: clean_rows.append(r_clean)
    if r_anom:
        anom_rows.append(r_anom)
        # для общей статистики: сколько раз каждая позиция «проваливается»
        for k in metric_keys:
            neg_stats[k]['idx'].extend([i for i in bad_idx
                                        if isinstance(q[k][i], (int, float)) and q[k][i] < 0])
            neg_stats[k]['vals'].extend([q[k][i] for i in bad_idx
                                         if isinstance(q[k][i], (int, float)) and q[k][i] < 0])

# --- превращаем во фреймы --------------------------------------------
df_clean_logs = pd.DataFrame(clean_rows)  # положительные значения прогонов
df_anomaly_logs = pd.DataFrame(anom_rows)  # отрицательные значения прогонов
df_logs_errors = pd.DataFrame(error_rows)  # все логи без количественных метрик

In [139]:
print("metric                         | count_neg |    min   |   mean  |   max   | median ")
print("-" * 100)

for metric, d in neg_stats.items():
    if not d["vals"]:
        continue

    print(
        f"{metric:<30} | "
        f"{len(d['vals']):>9} | "
        f"{np.min(d['vals']):^8.4f} | "
        f"{np.mean(d['vals']):>6.4f} | "
        f"{np.max(d['vals']):>6.4f} | "
        f"{np.median(d['vals']):>6.4f}"
    )


metric                         | count_neg |    min   |   mean  |   max   | median 
----------------------------------------------------------------------------------------------------


In [140]:
# log_data = []
# log_errors = []
# 
# negative_stats = defaultdict(lambda: {
#     "count": 0,
#     "min": float("inf"),
#     "max": float("-inf"),
#     "sum": 0.0
# })
# 
# for log in logs:
#     if log["error_message"] == "":
#         try:
#             data_entry = {
#                 # Method name
#                 "method_name": log["method_name"],
#                 # Some Qualitative Data
#                 "decomposition_method": log["qualitative_metrics"]["Decomposition method"],
#                 "data_type": log["qualitative_metrics"]["Data type"],
#                 "language": log["qualitative_metrics"]["Language"],
#                 "library": log["qualitative_metrics"]["Library"],
#                 "tensor_type": log["qualitative_metrics"]["Tensor type"],
#                 "platform": log["qualitative_metrics"]["Platform"],
#                 "backend": log["qualitative_metrics"].get(
#                     next(key for key in log["qualitative_metrics"] if "backend" in key)
#                 ),
#                 # GPU Allocated Memory
#                 "gpu_allocated_memory_used_min": np.min(
#                     [e for e in log["quantitative_metrics"]["gpu_allocated_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_allocated_memory_used_max": np.max(
#                     [e for e in log["quantitative_metrics"]["gpu_allocated_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_allocated_memory_used_mean": np.mean(
#                     [e for e in log["quantitative_metrics"]["gpu_allocated_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_allocated_memory_used": np.max(
#                     [e for e in log["quantitative_metrics"]["gpu_allocated_memory_used_mb"] if e >= 0.0]
#                 ),
#                 # GPU Cached Memory
#                 "gpu_cached_memory_used_min": np.min(
#                     [e for e in log["quantitative_metrics"]["gpu_cached_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_cached_memory_used_max": np.max(
#                     [e for e in log["quantitative_metrics"]["gpu_cached_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_cached_memory_used_mean": np.mean(
#                     [e for e in log["quantitative_metrics"]["gpu_cached_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_cached_memory_used": np.max(
#                     [e for e in log["quantitative_metrics"]["gpu_cached_memory_used_mb"] if e >= 0.0]
#                 ),
#                 # RAM Memory Usage
#                 "ram_mem_used_min": np.min([e for e in log["quantitative_metrics"]["ram_mem_used_mb"] if e >= 0.0]),
#                 "ram_mem_used_max": np.max([e for e in log["quantitative_metrics"]["ram_mem_used_mb"] if e >= 0.0]),
#                 "ram_mem_used_mean": np.mean([e for e in log["quantitative_metrics"]["ram_mem_used_mb"] if e >= 0.0]),
#                 "ram_mem_used": np.max([e for e in log["quantitative_metrics"]["ram_mem_used_mb"] if e >= 0.0]),
#                 # Duration
#                 "duration_min": np.min([e for e in log["quantitative_metrics"]["duration"] if e >= 0.0]),
#                 "duration_max": np.max([e for e in log["quantitative_metrics"]["duration"] if e >= 0.0]),
#                 "duration_mean": np.mean([e for e in log["quantitative_metrics"]["duration"] if e >= 0.0]),
#                 "duration": np.max([e for e in log["quantitative_metrics"]["duration"] if e >= 0.0]),
#                 # Frobenius Error
#                 "frobenius_error_min": np.min([e for e in log["quantitative_metrics"]["frobenius_error"] if e >= 0.0]),
#                 "frobenius_error_max": np.max([e for e in log["quantitative_metrics"]["frobenius_error"] if e >= 0.0]),
#                 "frobenius_error_mean": np.mean(
#                     [e for e in log["quantitative_metrics"]["frobenius_error"] if e >= 0.0]
#                 ),
#                 "frobenius_error": np.max([e for e in log["quantitative_metrics"]["frobenius_error"] if e >= 0.0]),
#                 # Compression Ratio
#                 "compression_ratio_min": np.min(
#                     [e for e in log["quantitative_metrics"]["compression_ratio"] if e >= 0.0]
#                 ),
#                 "compression_ratio_max": np.max(
#                     [e for e in log["quantitative_metrics"]["compression_ratio"] if e >= 0.0]
#                 ),
#                 "compression_ratio_mean": np.mean(
#                     [e for e in log["quantitative_metrics"]["compression_ratio"] if e >= 0.0]
#                 ),
#                 "compression_ratio": np.max([e for e in log["quantitative_metrics"]["compression_ratio"] if e >= 0.0]),
#             }
#             log_data.append(data_entry)
# 
#             for key, values in log["quantitative_metrics"].items():
#                 if not isinstance(values, list):
#                     continue
# 
#                 neg_values = [v for v in values if isinstance(v, (int, float)) and v < 0]
#                 if neg_values:
#                     print(f"[{log['method_name']}] Key '{key}' has negative values: {neg_values}")
# 
#                     negative_stats[key]["count"] += len(neg_values)
#                     negative_stats[key]["min"] = min(negative_stats[key]["min"], min(neg_values))
#                     negative_stats[key]["max"] = max(negative_stats[key]["max"], max(neg_values))
#                     negative_stats[key]["sum"] += sum(neg_values)
# 
# 
#         except Exception as e:
#             print(f"Exception with {log['method_name']}. Error: {e!s}")
#     else:
#         data_entry = {
#             # method name
#             "method_name": log["method_name"],
#             # qualitative metrics
#             "decomposition_method": log["qualitative_metrics"]["Decomposition method"],
#             "data_type": log["qualitative_metrics"]["Data type"],
#             "language": log["qualitative_metrics"]["Language"],
#             "library": log["qualitative_metrics"]["Library"],
#             "tensor_type": log["qualitative_metrics"]["Tensor type"],
#             "platform": log["qualitative_metrics"]["Platform"],
#             "backend": log["qualitative_metrics"].get(
#                 next(key for key in log["qualitative_metrics"] if "backend" in key)
#             ),
#             # Error data
#             "error_message": log["error_message"],
#         }
#         log_errors.append(data_entry)
# 
# df_logs = pd.DataFrame(log_data)
# df_logs_errors = pd.DataFrame(log_errors)

In [141]:
# print("\n=== Summary of Negative Values in Quantitative Metrics ===")
# for key, stats in negative_stats.items():
#     count = stats["count"]
#     if count > 0:
#         avg = stats["sum"] / count
#         print(f"Key: {key}")
#         print(f"  → Count   : {count}")
#         print(f"  → Min     : {stats['min']}")
#         print(f"  → Max     : {stats['max']}")
#         print(f"  → Mean    : {avg:.4f}\n")

In [142]:
df_clean_logs.shape

(4772, 30)

In [143]:
# for col in df_clean_logs.columns:
#     try:
#         unique_vals = df_clean_logs[col].unique()
#         print(f"Column: {col}")
#         print(f"Unique values ({len(unique_vals)}): {unique_vals}")
#         print("-" * 60)
#     except TypeError as e:
#         print(f"Skipped column '{col}' due to unhashable data type: {e}")

In [144]:
df_clean_logs

Unnamed: 0,method_name,decomposition_method,data_type,language,library,tensor_type,platform,backend,run_type,original_run_indices,...,duration_mean,duration_median,frobenius_error_min,frobenius_error_max,frobenius_error_mean,frobenius_error_median,compression_ratio_min,compression_ratio_max,compression_ratio_mean,compression_ratio_median
0,TensorLy_Tucker_image-0_pytorch_truncated_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.307812,1.774875,0.672185,0.672185,0.672185,0.672185,50.166931,50.166931,50.166931,50.166931
1,TensorLy_Tucker_image-0_pytorch_truncated_svd_...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.658001,2.638634,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
2,TensorLy_Tucker_image-0_pytorch_symeig_svd_random,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.753213,2.702102,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
3,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,1.373213,1.388822,0.672679,0.672679,0.672679,0.672679,50.166931,50.166931,50.166931,50.166931
4,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.717210,2.757429,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4767,T3F_TensorTrain_image-1,TensorTrain,image-1,Python,T3F,Dense,"CPU, GPU",tensorflow,clean,"[0, 1, 2, 3, 4]",...,0.408760,0.386177,3.427543,3.427543,3.427543,3.427543,50.229930,50.229930,50.229930,50.229930
4768,T3F_TensorTrain_image-2,TensorTrain,image-2,Python,T3F,Dense,"CPU, GPU",tensorflow,clean,"[0, 1, 2, 3, 4]",...,0.479950,0.481683,0.000883,0.000883,0.000883,0.000883,50.006579,50.006579,50.006579,50.006579
4769,T3F_TensorTrain_video-0,TensorTrain,video-0,Python,T3F,Dense,"CPU, GPU",tensorflow,clean,"[0, 1, 2, 3, 4]",...,0.603289,0.569417,0.338334,0.338334,0.338334,0.338334,50.340601,50.340601,50.340601,50.340601
4770,T3F_TensorTrain_video-1,TensorTrain,video-1,Python,T3F,Dense,"CPU, GPU",tensorflow,clean,"[0, 1, 2, 3, 4]",...,0.462598,0.436068,0.477660,0.477660,0.477660,0.477660,50.094968,50.094968,50.094968,50.094968


In [145]:
df_anomaly_logs.shape

(0, 0)

In [146]:
# for col in df_anomaly_logs.columns:
#     try:
#         unique_vals = df_anomaly_logs[col].unique()
#         print(f"Column: {col}")
#         print(f"Unique values ({len(unique_vals)}): {unique_vals}")
#         print("-" * 60)
#     except TypeError as e:
#         print(f"Skipped column '{col}' due to unhashable data type: {e}")

In [147]:
df_anomaly_logs

In [148]:
df_logs_errors.shape

(10158, 10)

In [149]:
# for col in df_logs_errors.columns:
#     try:
#         unique_vals = df_logs_errors[col].unique()
#         print(f"Column: {col}")
#         print(f"Unique values ({len(unique_vals)}): {unique_vals}")
#         print("-" * 60)
#     except TypeError as e:
#         print(f"Skipped column '{col}' due to unhashable data type: {e}")

In [150]:
df_logs_errors

Unnamed: 0,method_name,decomposition_method,data_type,language,library,tensor_type,platform,backend,error_message,is_error
0,TensorLy_Tucker_image-0_pytorch_truncated_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,,False
1,TensorLy_Tucker_image-0_pytorch_truncated_svd_...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,,False
2,TensorLy_Tucker_image-0_pytorch_symeig_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme...",True
3,TensorLy_Tucker_image-0_pytorch_symeig_svd_random,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,,False
4,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,,False
...,...,...,...,...,...,...,...,...,...,...
10153,T3F_TensorTrain_image-1,TensorTrain,image-1,Python,T3F,Dense,"CPU, GPU",tensorflow,,False
10154,T3F_TensorTrain_image-2,TensorTrain,image-2,Python,T3F,Dense,"CPU, GPU",tensorflow,,False
10155,T3F_TensorTrain_video-0,TensorTrain,video-0,Python,T3F,Dense,"CPU, GPU",tensorflow,,False
10156,T3F_TensorTrain_video-1,TensorTrain,video-1,Python,T3F,Dense,"CPU, GPU",tensorflow,,False


## Обработка ошибок

In [151]:
def extract_memory_info(error_message):
    pattern = r"(\d+\.?\d*)\s*(\w+iB)"
    match = re.search(pattern, error_message)

    if match:
        value = float(match.group(1))  # Память
        unit = match.group(2)  # Тип памяти (единица измерения)

        # Конвертация в MiB
        conversion_factors = {
            "KiB": 1 / 1024,  # Килобайты -> Мегабайты
            "MiB": 1,  # Мегабайты -> Мегабайты
            "GiB": 1024,  # Гигабайты -> Мегабайты
            "TiB": 1024 ** 2,  # Терабайты -> Мегабайты
            "PiB": 1024 ** 3,  # Петабайты -> Мегабайты
            "EiB": 1024 ** 4,  # Эксабайты -> Мегабайты
            "ZiB": 1024 ** 5,  # Зеттабайты -> Мегабайты
            "YiB": 1024 ** 6,  # Йоттабайты -> Мегабайты
        }

        if unit in conversion_factors:
            value_in_mib = value * conversion_factors[unit]
        else:
            raise ValueError(f"Неизвестная единица измерения памяти: {unit}")

        return value_in_mib, "MiB"

    return None, None

In [152]:
error_messages_count = 0

df_logs_errors_without_singular_matrix = pd.DataFrame()
df_logs_errors_only_singular_matrix = pd.DataFrame()

for index, row in df_logs_errors.iterrows():
    row_copy = row.copy()
    error_message = row_copy["error_message"]

    if "Tried to allocate" in error_message:  # pytorch
        value, unit = extract_memory_info(error_message)
        if not (value and unit):
            print("Не удалось извлечь информацию о памяти из GPU and RAM ошибки.")
    elif "Unable to allocate" in error_message:  # numpy
        value, unit = extract_memory_info(error_message)
        if not (value and unit):
            print("Не удалось извлечь информацию о памяти из RAM ошибки.")
    elif row_copy["error_message"] == "":
        value = 0.0
    else:
        value = None
        error_messages_count += 1
        print(error_message)

    if value is not None:
        row_copy["memory_tried_to_allocated_error_mb"] = value
        df_logs_errors_without_singular_matrix = pd.concat(
            [df_logs_errors_without_singular_matrix, pd.DataFrame([row_copy])], ignore_index=True)
    else:
        df_logs_errors_only_singular_matrix = pd.concat([df_logs_errors_only_singular_matrix, pd.DataFrame([row_copy])],
                                                        ignore_index=True)

print(error_messages_count)

Пропущена итерация из-за ошибки: pytorch, truncated_svd, random, False, False, 1e-08, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, symeig_svd, random, False, False, 1e-08, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, randomized_svd, random, False, False, 1e-08, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, truncated_svd, random, False, False, 1e-05, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, symeig_svd, random, False, False, 1e-05, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, randomized_svd, random,

In [153]:
print(
    df_logs_errors.shape,
    df_logs_errors.columns,
    df_logs_errors_without_singular_matrix.shape,
    df_logs_errors_without_singular_matrix.columns,
    df_logs_errors_only_singular_matrix.shape,
    df_logs_errors_only_singular_matrix.columns,
    sep="\n"
)

(10158, 10)
Index(['method_name', 'decomposition_method', 'data_type', 'language',
       'library', 'tensor_type', 'platform', 'backend', 'error_message',
       'is_error'],
      dtype='object')
(9828, 11)
Index(['method_name', 'decomposition_method', 'data_type', 'language',
       'library', 'tensor_type', 'platform', 'backend', 'error_message',
       'is_error', 'memory_tried_to_allocated_error_mb'],
      dtype='object')
(330, 10)
Index(['method_name', 'decomposition_method', 'data_type', 'language',
       'library', 'tensor_type', 'platform', 'backend', 'error_message',
       'is_error'],
      dtype='object')


# Функции для анализа логов

In [154]:
# Отрисовка горизонтальных бар-графиков для метрик
def plot_barh(ax, x_data, y_data, title, xlabel, best_value=None, best_label=None, color="green"):
    ax.barh(y_data, x_data, color=color)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel, fontsize=12)
    if best_value is not None and best_label is not None:
        ax.axvline(x=best_value, color="red", linestyle="--", label=best_label)
        ax.legend()
    for i in range(len(x_data)):
        ax.text(x_data.iloc[i], i, f"{x_data.iloc[i]:.6f}", va="center")

In [155]:
def get_metrics_data(filtered_df):
    filtered_df_unique_by_method_name = filtered_df["method_name"].unique()
    analysis_data = []

    for method in filtered_df_unique_by_method_name:
        method_specific_data = filtered_df[filtered_df["method_name"] == method]

        compression_ratio = method_specific_data["compression_ratio_median"].to_numpy()[0]
        duration = method_specific_data["duration_median"].to_numpy()[0]
        total_memory = (
                method_specific_data["ram_mem_used_mb_median"].to_numpy()[0]
                + method_specific_data["gpu_allocated_memory_used_mb_median"].to_numpy()[0]
        )
        frobenius_error = method_specific_data["frobenius_error_median"].to_numpy()[0]

        analysis_data.append(
            {
                "method_name": method,
                "duration_median": duration,
                "total_memory_median": total_memory,
                "frobenius_error_median": frobenius_error,
                "compression_ratio_median": compression_ratio,
            }
        )

    return pd.DataFrame(analysis_data)

In [156]:
# Анализ и визуализация методов по количественным показателям в разрезе качественного показателя
def analyze_data_dfs_best_metrics(df_logs, group_name: str = "decomposition_method"):
    groups = df_logs[group_name].unique()
    analysis_results = {}

    for group in groups:
        filtered_df = df_logs[df_logs[group_name] == group]

        analysis_df = get_metrics_data(filtered_df)

        best_time_method = analysis_df.loc[analysis_df["duration_median"].idxmin()]
        best_memory_method = analysis_df.loc[analysis_df["total_memory_median"].idxmin()]
        best_error_method = analysis_df.loc[analysis_df["frobenius_error_median"].idxmin()]

        analysis_results[group] = {
            "best_time_method": best_time_method.to_dict(),
            "best_memory_method": best_memory_method.to_dict(),
            "best_error_method": best_error_method.to_dict(),
        }

    return analysis_results

In [157]:
# Анализ и визуализация методов по количественным показателям в разрезе качественного показателя
def analyze_data_dfs_worst_metrics(df_logs, group_name: str = "decomposition_method"):
    groups = df_logs[group_name].unique()
    analysis_results = {}

    for group in groups:
        filtered_df = df_logs[df_logs[group_name] == group]

        analysis_df = get_metrics_data(filtered_df)

        best_time_method = analysis_df.loc[analysis_df["duration_median"].idxmax()]
        best_memory_method = analysis_df.loc[analysis_df["total_memory_median"].idxmax()]
        best_error_method = analysis_df.loc[analysis_df["frobenius_error_median"].idxmax()]

        analysis_results[group] = {
            "worst_time_method": best_time_method.to_dict(),
            "worst_memory_method": best_memory_method.to_dict(),
            "worst_error_method": best_error_method.to_dict(),
        }

    return analysis_results

# Расчет лучших метрик

## Сравнение методов по типам данных

In [158]:
analysis_results_data_type = analyze_data_dfs_best_metrics(df_clean_logs, group_name="data_type")

In [159]:
analysis_worst_results_data_type = analyze_data_dfs_worst_metrics(df_clean_logs, group_name="data_type")

## Сравнение метрик по методам

In [160]:
analysis_results_decomposition_method = analyze_data_dfs_best_metrics(df_clean_logs, group_name="decomposition_method")

## Сравнение метрик по методам и типам данных

In [161]:
data_types = df_clean_logs["data_type"].unique()

df_logs_by_data_types_dict = {data_type: df_clean_logs[df_clean_logs["data_type"] == data_type] for data_type in
                              data_types}

analysis_results_data_types_decompositions_methods = {
    data_type: analyze_data_dfs_best_metrics(df_logs_by_data_types_dict[data_type], group_name="decomposition_method")
    for data_type in data_types
}

In [162]:
data_types = df_clean_logs["data_type"].unique()

df_logs_by_data_types_dict_worst = {data_type: df_clean_logs[df_clean_logs["data_type"] == data_type] for data_type in
                              data_types}

analysis_results_data_types_decompositions_methods_worst = {
    data_type: analyze_data_dfs_worst_metrics(df_logs_by_data_types_dict_worst[data_type], group_name="decomposition_method")
    for data_type in data_types
}

# Сравнение метрик

## Метрики в разрезе типов данных

In [163]:
pprint(analysis_results_data_type, indent=4)

{   'eeg-0': {   'best_error_method': {   'compression_ratio_median': 52.237848577500216,
                                          'duration_median': 18.240202498000144,
                                          'frobenius_error_median': 0.023497804068028927,
                                          'method_name': 'TensorLy_TensorTrain_eeg-0_pytorch_randomized_svd',
                                          'total_memory_median': 5759.78515625},
                 'best_memory_method': {   'compression_ratio_median': 52.237848577500216,
                                           'duration_median': 2.4513157450001017,
                                           'frobenius_error_median': 0.02617962018121034,
                                           'method_name': 'TensorLy_TensorTrain_eeg-0_pytorch_truncated_svd',
                                           'total_memory_median': 5404.52197265625},
                 'best_time_method': {   'compression_ratio_median': 52.237848577500216,
 

In [164]:
pprint(analysis_worst_results_data_type, indent=4)

{   'eeg-0': {   'worst_error_method': {   'compression_ratio_median': 51.369509260614535,
                                           'duration_median': 6.294206604999999,
                                           'frobenius_error_median': 1.610640063881874,
                                           'method_name': 'TensorLy_Tucker_eeg-0_pytorch_truncated_svd_random',
                                           'total_memory_median': 9475.07666015625},
                 'worst_memory_method': {   'compression_ratio_median': 51.369509260614535,
                                            'duration_median': 6.294206604999999,
                                            'frobenius_error_median': 1.610640063881874,
                                            'method_name': 'TensorLy_Tucker_eeg-0_pytorch_truncated_svd_random',
                                            'total_memory_median': 9475.07666015625},
                 'worst_time_method': {   'compression_ratio_median': 51.36950926

## Метрики в разрезе методов

In [165]:
pprint(analysis_results_decomposition_method, indent=4)

{   'CP': {   'best_error_method': {   'compression_ratio_median': 50.06507925904784,
                                       'duration_median': 0.8362929030008672,
                                       'frobenius_error_median': 0.06767631857655942,
                                       'method_name': 'TensorLy_CP_image-2_pytorch_truncated_svd_random_False_True_abs_rec_error_1.0_1e-08',
                                       'total_memory_median': 7238.0615234375},
              'best_memory_method': {   'compression_ratio_median': 50.020879006159305,
                                        'duration_median': 1.1333491929690354,
                                        'frobenius_error_median': 0.8004575036466122,
                                        'method_name': 'TensorLy_CP_image-1_pytorch_truncated_svd_random_False_True_rec_error_0.01_1e-05',
                                        'total_memory_median': 3602.08349609375},
              'best_time_method': {   'compression_rati

## Метрики в разрезе типов данных и методов

In [166]:
for data_type, metrics in analysis_results_data_types_decompositions_methods.items():
    print(f"\n{data_type}\n")
    pprint(metrics, indent=4)


image-0

{   'CP': {   'best_error_method': {   'compression_ratio_median': 50.05868249417367,
                                       'duration_median': 0.8219312719884329,
                                       'frobenius_error_median': 0.3219344187527895,
                                       'method_name': 'TensorLy_CP_image-0_pytorch_randomized_svd_random_True_False_rec_error_0.001_1e-09',
                                       'total_memory_median': 4992.32763671875},
              'best_memory_method': {   'compression_ratio_median': 50.05868249417367,
                                        'duration_median': 0.5388988000049721,
                                        'frobenius_error_median': 0.4365097265690565,
                                        'method_name': 'TensorLy_CP_image-0_pytorch_truncated_svd_random_False_False_abs_rec_error_0_1e-08',
                                        'total_memory_median': 4157.3779296875},
              'best_time_method': {   'compres

In [167]:
for data_type, metrics in analysis_results_data_types_decompositions_methods_worst.items():
    print(f"\n{data_type}\n")
    pprint(metrics, indent=4)


image-0

{   'CP': {   'worst_error_method': {   'compression_ratio_median': 50.05868249417367,
                                        'duration_median': 0.09860344001208432,
                                        'frobenius_error_median': 13.67434561252594,
                                        'method_name': 'TensorLy_CP_image-0_pytorch_truncated_svd_random_True_True_rec_error_1.0_1e-06',
                                        'total_memory_median': 4799.37841796875},
              'worst_memory_method': {   'compression_ratio_median': 50.05868249417367,
                                         'duration_median': 1.5522369119998984,
                                         'frobenius_error_median': 4.176430404186249,
                                         'method_name': 'TensorLy_CP_image-0_pytorch_truncated_svd_random_True_False_abs_rec_error_0.5_1e-05',
                                         'total_memory_median': 7426.19677734375},
              'worst_time_method': {   

# Аналитика логов количественных и качественных метрик

## Обработка данных

In [168]:
df_clean_logs.shape

(4772, 30)

In [169]:
df_clean_logs.head()

Unnamed: 0,method_name,decomposition_method,data_type,language,library,tensor_type,platform,backend,run_type,original_run_indices,...,duration_mean,duration_median,frobenius_error_min,frobenius_error_max,frobenius_error_mean,frobenius_error_median,compression_ratio_min,compression_ratio_max,compression_ratio_mean,compression_ratio_median
0,TensorLy_Tucker_image-0_pytorch_truncated_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.307812,1.774875,0.672185,0.672185,0.672185,0.672185,50.166931,50.166931,50.166931,50.166931
1,TensorLy_Tucker_image-0_pytorch_truncated_svd_...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.658001,2.638634,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
2,TensorLy_Tucker_image-0_pytorch_symeig_svd_random,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.753213,2.702102,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
3,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,1.373213,1.388822,0.672679,0.672679,0.672679,0.672679,50.166931,50.166931,50.166931,50.166931
4,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.71721,2.757429,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931


In [170]:
df_clean_logs.columns

Index(['method_name', 'decomposition_method', 'data_type', 'language',
       'library', 'tensor_type', 'platform', 'backend', 'run_type',
       'original_run_indices', 'gpu_allocated_memory_used_mb_min',
       'gpu_allocated_memory_used_mb_max', 'gpu_allocated_memory_used_mb_mean',
       'gpu_allocated_memory_used_mb_median', 'ram_mem_used_mb_min',
       'ram_mem_used_mb_max', 'ram_mem_used_mb_mean', 'ram_mem_used_mb_median',
       'duration_min', 'duration_max', 'duration_mean', 'duration_median',
       'frobenius_error_min', 'frobenius_error_max', 'frobenius_error_mean',
       'frobenius_error_median', 'compression_ratio_min',
       'compression_ratio_max', 'compression_ratio_mean',
       'compression_ratio_median'],
      dtype='object')

In [171]:
df_filtered_for_analytics = df_clean_logs[
    [
        "method_name",
        "decomposition_method",
        "data_type",
        "backend",

        "gpu_allocated_memory_used_mb_min",
        "gpu_allocated_memory_used_mb_median",
        "gpu_allocated_memory_used_mb_mean",
        "gpu_allocated_memory_used_mb_max",

        "ram_mem_used_mb_min",
        "ram_mem_used_mb_median",
        "ram_mem_used_mb_mean",
        "ram_mem_used_mb_max",

        "duration_min",
        "duration_median",
        "duration_mean",
        "duration_max",

        "frobenius_error_min",
        "frobenius_error_median",
        "frobenius_error_mean",
        "frobenius_error_max",

        "compression_ratio_min",
        "compression_ratio_median",
        "compression_ratio_mean",
        "compression_ratio_max",
    ]
]


In [172]:
dfs_by_decomposition_method = {
    decomposition_method: df_filtered_for_analytics[
        df_filtered_for_analytics["decomposition_method"] == decomposition_method
        ]
    for decomposition_method in df_filtered_for_analytics["decomposition_method"].unique()
}

In [173]:
for decomposition_method, df_by_decomposition_method in dfs_by_decomposition_method.items():
    enriched_rows = []
    for _, row in df_by_decomposition_method.iterrows():
        matching_log = next((log for log in logs if row["method_name"] == log["method_name"]), None)

        enriched_row = row.copy()

        if row["decomposition_method"] == "Tucker":
            enriched_row["init"] = matching_log["method_args"].get("init")
            enriched_row["svd"] = matching_log["method_args"].get("svd")

            if enriched_row["svd"] is None or enriched_row["init"] is None:
                print(matching_log["method_args"])
        elif row["decomposition_method"] == "TensorTrain":
            if "T3F" in row["method_name"]:
                enriched_row["svd"] = "T3F implementation"
            else:
                enriched_row["svd"] = matching_log["method_args"].get("svd")

            if enriched_row["svd"] is None:
                print(matching_log["method_args"])
        elif row["decomposition_method"] == "CP":
            enriched_row["init"] = matching_log["method_args"].get("init")
            enriched_row["svd"] = matching_log["method_args"].get("svd")
            enriched_row["normalize_factors"] = matching_log["method_args"].get("normalize_factors")
            enriched_row["orthogonalise"] = matching_log["method_args"].get("orthogonalise")
            enriched_row["tol"] = matching_log["method_args"].get("tol")
            enriched_row["l2_reg"] = matching_log["method_args"].get("l2_reg")
            enriched_row["cvg_criterion"] = matching_log["method_args"].get("cvg_criterion")

            if (
                    enriched_row["svd"] is None or
                    enriched_row["init"] is None or
                    enriched_row["normalize_factors"] is None or
                    enriched_row["orthogonalise"] is None or
                    enriched_row["tol"] is None or
                    enriched_row["l2_reg"] is None or
                    enriched_row["cvg_criterion"] is None
            ):
                print(matching_log["method_args"])

        enriched_rows.append(enriched_row)

    dfs_by_decomposition_method[decomposition_method] = pd.DataFrame(enriched_rows)

In [174]:
dfs_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

## Обработка ошибок

In [175]:
df_logs_errors_without_singular_matrix.shape

(9828, 11)

In [176]:
df_logs_errors_without_singular_matrix.head()

Unnamed: 0,method_name,decomposition_method,data_type,language,library,tensor_type,platform,backend,error_message,is_error,memory_tried_to_allocated_error_mb
0,TensorLy_Tucker_image-0_pytorch_truncated_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,,False,0.0
1,TensorLy_Tucker_image-0_pytorch_truncated_svd_...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,,False,0.0
2,TensorLy_Tucker_image-0_pytorch_symeig_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme...",True,385996.8
3,TensorLy_Tucker_image-0_pytorch_symeig_svd_random,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,,False,0.0
4,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,,False,0.0


In [177]:
df_logs_errors_without_singular_matrix.columns

Index(['method_name', 'decomposition_method', 'data_type', 'language',
       'library', 'tensor_type', 'platform', 'backend', 'error_message',
       'is_error', 'memory_tried_to_allocated_error_mb'],
      dtype='object')

In [178]:
df_filtered_errors_for_analytics = df_logs_errors_without_singular_matrix[
    [
        "method_name",
        "decomposition_method",
        "data_type",
        # "backend",

        "memory_tried_to_allocated_error_mb",
        "is_error"
    ]
]


In [179]:
dfs_errors_by_decomposition_method = {
    decomposition_method: df_filtered_errors_for_analytics[
        df_filtered_errors_for_analytics["decomposition_method"] == decomposition_method
        ]
    for decomposition_method in df_filtered_errors_for_analytics["decomposition_method"].unique()
}

for decomposition_method, df_by_decomposition_method in dfs_errors_by_decomposition_method.items():
    enriched_rows = []
    for _, row in df_by_decomposition_method.iterrows():
        matching_log = next((log for log in logs if row["method_name"] == log["method_name"]), None)

        enriched_row = row.copy()

        if row["decomposition_method"] == "Tucker":
            enriched_row["init"] = matching_log["method_args"].get("init")
            enriched_row["svd"] = matching_log["method_args"].get("svd")

            if enriched_row["svd"] is None or enriched_row["init"] is None:
                print(matching_log["method_args"])
        elif row["decomposition_method"] == "TensorTrain":
            if "T3F" in row["method_name"]:
                enriched_row["svd"] = "T3F implementation"
            else:
                enriched_row["svd"] = matching_log["method_args"].get("svd")

            if enriched_row["svd"] is None:
                print(matching_log["method_args"])
        elif row["decomposition_method"] == "CP":
            enriched_row["init"] = matching_log["method_args"].get("init")
            enriched_row["svd"] = matching_log["method_args"].get("svd")
            enriched_row["normalize_factors"] = matching_log["method_args"].get("normalize_factors")
            enriched_row["orthogonalise"] = matching_log["method_args"].get("orthogonalise")
            enriched_row["tol"] = matching_log["method_args"].get("tol")
            enriched_row["l2_reg"] = matching_log["method_args"].get("l2_reg")
            enriched_row["cvg_criterion"] = matching_log["method_args"].get("cvg_criterion")

            if (
                    enriched_row["svd"] is None or
                    enriched_row["init"] is None or
                    enriched_row["normalize_factors"] is None or
                    enriched_row["orthogonalise"] is None or
                    enriched_row["tol"] is None or
                    enriched_row["l2_reg"] is None or
                    enriched_row["cvg_criterion"] is None
            ):
                print(matching_log["method_args"])

        enriched_rows.append(enriched_row)

    dfs_errors_by_decomposition_method[decomposition_method] = pd.DataFrame(enriched_rows)

In [180]:
dfs_errors_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

## Методы для графиков

## Обработка данных

In [181]:
def preprocess_dataframe(df):
    df_encoded = df.copy()
    for column in df_encoded.columns:
        if df_encoded[column].dtype == "object" and column != "method_name":
            df_encoded[column] = LabelEncoder().fit_transform(df_encoded[column])

    scaler = MinMaxScaler()
    df_encoded[df_encoded.columns.difference(["method_name"])] = scaler.fit_transform(
        df_encoded[df_encoded.columns.difference(["method_name"])]
    )
    return df_encoded

In [182]:
def compute_categorical_numeric_contributions(df, cat_col, num_col, top_n=5):
    results = {}
    
    dummies = pd.get_dummies(df[cat_col], prefix=cat_col)
    y = df[num_col].astype(float)
    μy, σy = y.mean(), y.std(ddof=0)

    for dummy_col in dummies.columns:
        d = dummies[dummy_col].astype(int)
        μd, σd = d.mean(), d.std(ddof=0)

        φ = d.corr(y)

        contrib = ((d - μd) * (y - μy)) / ((len(df)-1) * σd * σy)

        df_contrib = df[[cat_col, num_col]].copy()
        df_contrib['dummy']   = d
        df_contrib['contrib'] = contrib

        top_pos = df_contrib.nlargest(top_n, 'contrib')
        top_neg = df_contrib.nsmallest(top_n, 'contrib')

        category = dummy_col.split(f"{cat_col}_", 1)[1]
        results[category] = {
            'phi': φ,
            'top_positive': top_pos,
            'top_negative': top_neg
        }

    return results

In [183]:
def compute_numeric_numeric_contributions(df, col_x, col_y, top_n=5):
    x = df[col_x].astype(float)
    y = df[col_y].astype(float)
    μx, σx = x.mean(), x.std(ddof=0)
    μy, σy = y.mean(), y.std(ddof=0)

    r = x.corr(y)

    contrib = ((x - μx) * (y - μy)) / ((len(df)-1) * σx * σy)

    df_contrib = df[[col_x, col_y]].copy()
    df_contrib['contrib'] = contrib

    return {
        'corr': r,
        'top_positive': df_contrib.nlargest(top_n, 'contrib'),
        'top_negative': df_contrib.nsmallest(top_n, 'contrib')
    }


## Вывод графиков

In [184]:
def plot_heatmap(df, title):
    folder_path = Path("../.cache/data_analyze/")

    corr_matrix = df.drop(columns=["method_name"]).corr()

    fig = go.Figure(
        data=go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.index,
            colorscale=colors.sequential.RdBu,
            colorbar={
                "title": "Correlation",
                "titlefont": {"color": "black"},
                "tickfont": {"color": "black"},
            },
            text=corr_matrix.values,
            texttemplate="%{text:.2f}",
            textfont={"size": 12, "color": "black"},
            hovertemplate="<b>%{x}</b> vs. <b>%{y}</b> <b>Correlation:</b> %{z:.8f}<extra></extra>",
        )
    )

    fig.update_layout(
        title=f"Heatmap: {title}",
        title_font={"color": "black"},
        xaxis={"tickangle": -45, "tickfont": {"size": 12, "color": "black"}},
        yaxis={"tickfont": {"size": 12, "color": "black"}},
        width=1400,
        height=1400,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    folder_path.mkdir(parents=True, exist_ok=True)

    fig.write_html(folder_path / f"heatmap_{title}.html")

In [185]:
def plot_pca(df, title):
    folder_path = Path("../.cache/data_analyze/")
    folder_path.mkdir(parents=True, exist_ok=True)

    pca_df = df.drop(columns=["method_name"])
    pca = PCA(n_components=3)
    pca_components = pca.fit_transform(pca_df)

    pca_df = pd.DataFrame(
        {
            "PCA1": pca_components[:, 0],
            "PCA2": pca_components[:, 1],
            "PCA3": pca_components[:, 2],
            "method_name": df["method_name"].to_numpy(),
        }
    )

    fig1 = px.scatter(
        pca_df, x="PCA1", y="PCA2", color="method_name", title=f"PCA 2D Projection: {title}", hover_name="method_name"
    )
    fig1.update_traces(marker={"size": 10})
    fig1.update_layout(
        width=1800,
        height=1400,
        plot_bgcolor="white",
        paper_bgcolor="white",
        title_font={"color": "black"},
        xaxis={
            "titlefont": {"size": 12, "color": "black"},
            "tickfont": {"size": 12, "color": "black"},
        },
        yaxis={
            "titlefont": {"size": 12, "color": "black"},
            "tickfont": {"size": 12, "color": "black"},
        },
        showlegend=False,
    )

    fig1.write_html(folder_path / f"pca_2d_{title}.html")

    fig2 = px.scatter_3d(
        pca_df,
        x="PCA1",
        y="PCA2",
        z="PCA3",
        color="method_name",
        title=f"PCA 3D Projection: {title}",
        hover_name="method_name",
    )
    fig2.update_traces(marker={"size": 5})
    fig2.update_layout(
        scene={
            "xaxis_title": "PCA1",
            "yaxis_title": "PCA2",
            "zaxis_title": "PCA3",
            "xaxis_backgroundcolor": "white",
            "yaxis_backgroundcolor": "white",
            "zaxis_backgroundcolor": "white",
            "xaxis": {
                "titlefont": {"size": 12, "color": "black"},
                "tickfont": {"size": 12, "color": "black"},
            },
            "yaxis": {
                "titlefont": {"size": 12, "color": "black"},
                "tickfont": {"size": 12, "color": "black"},
            },
            "zaxis": {
                "titlefont": {"size": 12, "color": "black"},
                "tickfont": {"size": 12, "color": "black"},
            },
            "bgcolor": "white",
        },
        width=1800,
        height=1400,
        plot_bgcolor="white",
        paper_bgcolor="white",
        title_font={"color": "black"},
        showlegend=False,
    )

    fig2.write_html(folder_path / f"pca_3d_{title}.html")

In [186]:
def plot_pairplot(df, title, sample_size=1000, num_axes=100):
    folder_path = Path("../.cache/data_analyze/")
    folder_path.mkdir(parents=True, exist_ok=True)

    pairplot_df = df.drop(columns=["method_name"])

    if len(pairplot_df) > sample_size:
        pairplot_df = pairplot_df.sample(n=sample_size, random_state=42)

    selected_axes = pairplot_df.columns[:num_axes]

    sns.set(style="white")
    g = sns.pairplot(pairplot_df[selected_axes], diag_kind="kde", markers="+", plot_kws={"s": 50})
    g.fig.suptitle(f"Pairplot: {title}", y=1.02, fontsize=16, color="black")

    for ax in g.axes.flatten():
        ax.tick_params(axis="x", labelsize=12, labelcolor="black")
        ax.tick_params(axis="y", labelsize=12, labelcolor="black")
        ax.set_facecolor("white")
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

    plt.savefig(folder_path / f"pairplot_{title}.png", dpi=300, bbox_inches="tight")
    plt.close("all")

## Энкодинг и нормализация датафреймов

In [187]:
dfs_by_decomposition_method_encoded = {}

for key, df_by_decomposition_method in dfs_by_decomposition_method.items():
    dfs_by_decomposition_method_encoded[key] = preprocess_dataframe(
        df_by_decomposition_method.drop("decomposition_method", axis=1)
    )

In [188]:
dfs_errors_by_decomposition_method_encoded = {}

for key, df_by_decomposition_method in dfs_errors_by_decomposition_method.items():
    dfs_errors_by_decomposition_method_encoded[key] = preprocess_dataframe(
        df_by_decomposition_method.drop("decomposition_method", axis=1)
    )

# Графики для данных

## Heatmap

### по методам декомпозиции

In [189]:
# method_name = "Tucker_data"
# plot_heatmap(dfs_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# 
# method_name = "TensorTrain_data"
# plot_heatmap(dfs_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# 
# method_name = "CP_data"
# plot_heatmap(dfs_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)

### Heatmaps по разным типам тензоров и отдельным тензорам

#### Tucker

##### images

In [190]:
# df_tucker_image_0 = dfs_by_decomposition_method['Tucker'].loc[dfs_by_decomposition_method['Tucker']['data_type'] == 'image-0']
# df_tucker_image_0_encoded = preprocess_dataframe(
#         df_tucker_image_0.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_image_0_encoded, title="Tucker_data_image-0")
# 
# 
# df_tucker_image_1 = dfs_by_decomposition_method['Tucker'].loc[dfs_by_decomposition_method['Tucker']['data_type'] == 'image-1']
# df_tucker_image_1_encoded = preprocess_dataframe(
#         df_tucker_image_1.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_image_1_encoded, title="Tucker_data_image-1")
# 
# 
# df_tucker_image_2 = dfs_by_decomposition_method['Tucker'].loc[dfs_by_decomposition_method['Tucker']['data_type'] == 'image-2']
# df_tucker_image_2_encoded = preprocess_dataframe(
#         df_tucker_image_2.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_image_2_encoded, title="Tucker_data_image-2")
# 
# 
# mask = dfs_by_decomposition_method['Tucker']['data_type'].isin(['image-0', 'image-1', 'image-2'])
# df_tucker_images = dfs_by_decomposition_method['Tucker'].loc[mask]
# df_tucker_images_encoded = preprocess_dataframe(
#         df_tucker_images.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_images_encoded, title="Tucker_data_images")

##### videos

In [191]:
# df_tucker_video_0 = dfs_by_decomposition_method['Tucker'].loc[dfs_by_decomposition_method['Tucker']['data_type'] == 'video-0']
# df_tucker_video_0_encoded = preprocess_dataframe(
#         df_tucker_video_0.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_video_0_encoded, title="Tucker_data_video-0")
# 
# 
# df_tucker_video_1 = dfs_by_decomposition_method['Tucker'].loc[dfs_by_decomposition_method['Tucker']['data_type'] == 'video-1']
# df_tucker_video_1_encoded = preprocess_dataframe(
#         df_tucker_video_1.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_video_1_encoded, title="Tucker_data_video-1")
# 
# 
# df_tucker_video_2 = dfs_by_decomposition_method['Tucker'].loc[dfs_by_decomposition_method['Tucker']['data_type'] == 'video-2']
# df_tucker_video_2_encoded = preprocess_dataframe(
#         df_tucker_video_2.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_video_2_encoded, title="Tucker_data_video-2")
# 
# 
# mask = dfs_by_decomposition_method['Tucker']['data_type'].isin(['video-0', 'video-1', 'video-2'])
# df_tucker_videos = dfs_by_decomposition_method['Tucker'].loc[mask]
# df_tucker_videos_encoded = preprocess_dataframe(
#         df_tucker_videos.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_videos_encoded, title="Tucker_data_videos")

##### EEGs

In [192]:
# df_tucker_eeg_0 = dfs_by_decomposition_method['Tucker'].loc[dfs_by_decomposition_method['Tucker']['data_type'] == 'eeg-0']
# df_tucker_eeg_0_encoded = preprocess_dataframe(
#         df_tucker_eeg_0.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_eeg_0_encoded, title="Tucker_data_eeg-0")
# 
# 
# df_tucker_eeg_1 = dfs_by_decomposition_method['Tucker'].loc[dfs_by_decomposition_method['Tucker']['data_type'] == 'eeg-1']
# df_tucker_eeg_1_encoded = preprocess_dataframe(
#         df_tucker_eeg_1.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_eeg_1_encoded, title="Tucker_data_eeg-1")
# 
# 
# mask = dfs_by_decomposition_method['Tucker']['data_type'].isin(['eeg-0', 'eeg-1'])
# df_tucker_eegs = dfs_by_decomposition_method['Tucker'].loc[mask]
# df_tucker_eegs_encoded = preprocess_dataframe(
#         df_tucker_eegs.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tucker_eegs_encoded, title="Tucker_data_eegs")

#### CP

##### images

In [193]:
# df_CP_image_0 = dfs_by_decomposition_method['CP'].loc[dfs_by_decomposition_method['CP']['data_type'] == 'image-0']
# df_CP_image_0_encoded = preprocess_dataframe(
#         df_CP_image_0.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_CP_image_0_encoded, title="CP_data_image-0")
# 
# 
# df_CP_image_1 = dfs_by_decomposition_method['CP'].loc[dfs_by_decomposition_method['CP']['data_type'] == 'image-1']
# df_CP_image_1_encoded = preprocess_dataframe(
#         df_CP_image_1.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_CP_image_1_encoded, title="CP_data_image-1")
# 
# 
# df_CP_image_2 = dfs_by_decomposition_method['CP'].loc[dfs_by_decomposition_method['CP']['data_type'] == 'image-2']
# df_CP_image_2_encoded = preprocess_dataframe(
#         df_CP_image_2.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_CP_image_2_encoded, title="CP_data_image-2")
# 
# 
# mask = dfs_by_decomposition_method['CP']['data_type'].isin(['image-0', 'image-1', 'image-2'])
# df_CP_images = dfs_by_decomposition_method['CP'].loc[mask]
# df_CP_images_encoded = preprocess_dataframe(
#         df_CP_images.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_CP_images_encoded, title="CP_data_images")

#### TensorTrain

##### images

In [194]:
# df_tensor_train_image_0 = dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'] == 'image-0']
# df_tensor_train_image_0_encoded = preprocess_dataframe(
#         df_tensor_train_image_0.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_image_0_encoded, title="TensorTrain_data_image-0")
# 
# 
# df_tensor_train_image_1 = dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'] == 'image-1']
# df_tensor_train_image_1_encoded = preprocess_dataframe(
#         df_tensor_train_image_1.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_image_1_encoded, title="TensorTrain_data_image-1")
# 
# 
# df_tensor_train_image_2 = dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'] == 'image-2']
# df_tensor_train_image_2_encoded = preprocess_dataframe(
#         df_tensor_train_image_2.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_image_2_encoded, title="TensorTrain_data_image-2")
# 
# 
# mask = dfs_by_decomposition_method['TensorTrain']['data_type'].isin(['image-0', 'image-1', 'image-2'])
# df_tensor_train_images = dfs_by_decomposition_method['TensorTrain'].loc[mask]
# df_tensor_train_images_encoded = preprocess_dataframe(
#         df_tensor_train_images.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_images_encoded, title="TensorTrain_data_images")

##### videos

In [195]:
# df_tensor_train_video_0 = dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'] == 'video-0']
# df_tensor_train_video_0_encoded = preprocess_dataframe(
#         df_tensor_train_video_0.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_video_0_encoded, title="TensorTrain_data_video-0")
# 
# 
# df_tensor_train_video_1 = dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'] == 'video-1']
# df_tensor_train_video_1_encoded = preprocess_dataframe(
#         df_tensor_train_video_1.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_video_1_encoded, title="TensorTrain_data_video-1")
# 
# 
# df_tensor_train_video_2 = dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'] == 'video-2']
# df_tensor_train_video_2_encoded = preprocess_dataframe(
#         df_tensor_train_video_2.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_video_2_encoded, title="TensorTrain_data_video-2")
# 
# 
# mask = dfs_by_decomposition_method['TensorTrain']['data_type'].isin(['video-0', 'video-1', 'video-2'])
# df_tensor_train_videos = dfs_by_decomposition_method['TensorTrain'].loc[mask]
# df_tensor_train_videos_encoded = preprocess_dataframe(
#         df_tensor_train_videos.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_videos_encoded, title="TensorTrain_data_videos")

##### EEGs

In [196]:
# df_tensor_train_eeg_0 = dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'] == 'eeg-0']
# df_tensor_train_eeg_0_encoded = preprocess_dataframe(
#         df_tensor_train_eeg_0.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_eeg_0_encoded, title="TensorTrain_data_eeg-0")
# 
# 
# df_tensor_train_eeg_1 = dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'] == 'eeg-1']
# df_tensor_train_eeg_1_encoded = preprocess_dataframe(
#         df_tensor_train_eeg_1.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_eeg_1_encoded, title="TensorTrain_data_eeg-1")
# 
# 
# mask = dfs_by_decomposition_method['TensorTrain']['data_type'].isin(['eeg-0', 'eeg-1'])
# df_tensor_train_eegs = dfs_by_decomposition_method['TensorTrain'].loc[mask]
# df_tensor_train_eegs_encoded = preprocess_dataframe(
#         df_tensor_train_eegs.drop("decomposition_method", axis=1)
# )
# plot_heatmap(df_tensor_train_eegs_encoded, title="TensorTrain_data_eegs")

## PCA

In [197]:
# method_name = "Tucker_data"
# plot_pca(dfs_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# 
# method_name = "TensorTrain_data"
# plot_pca(dfs_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# 
# method_name = "CP_data"
# plot_pca(dfs_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)

## Pairplot

In [198]:
# method_name = "Tucker_data"
# plot_pairplot(dfs_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# None
# 
# method_name = "TensorTrain_data"
# plot_pairplot(dfs_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# None
# 
# method_name = "CP_data"
# plot_pairplot(dfs_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# None

# Графики для ошибок

## Heatmap

In [199]:
# method_name = "Tucker_errors"
# plot_heatmap(dfs_errors_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# 
# method_name = "TensorTrain_errors"
# plot_heatmap(dfs_errors_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# 
# method_name = "CP_errors"
# plot_heatmap(dfs_errors_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)

## PCA

In [200]:
# method_name = "Tucker_errors"
# plot_pca(dfs_errors_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# 
# method_name = "TensorTrain_errors"
# plot_pca(dfs_errors_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# 
# method_name = "CP_errors"
# plot_pca(dfs_errors_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)

## Pairplot

In [201]:
# method_name = "Tucker_errors"
# plot_pairplot(dfs_errors_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# None
# 
# method_name = "TensorTrain_errors"
# plot_pairplot(dfs_errors_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# None
# 
# method_name = "CP_errors"
# plot_pairplot(dfs_errors_by_decomposition_method_encoded[method_name.split("_")[0]], title=method_name)
# None

# Анализ на основе выводов из графиков

In [202]:
def get_n_rows_near_quantile(df, column, quantile_value, n=10):
    diffs = (df[column] - quantile_value).abs()
    nearest_indices = diffs.nsmallest(n).index
    return df.loc[nearest_indices].copy()

## ошибки

In [203]:
dfs_errors_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

In [204]:
dfs_errors_by_decomposition_method_encoded.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

### CP

In [205]:
# квантили по памяти
mask = (
        (
                dfs_errors_by_decomposition_method['CP']['is_error'] == True
        )
        &
        (
                dfs_errors_by_decomposition_method['CP']['memory_tried_to_allocated_error_mb'] > 0.0
        )
)
errors_cp = dfs_errors_by_decomposition_method['CP'].loc[
    mask
]

errors_cp_sorted = errors_cp.sort_values(by='memory_tried_to_allocated_error_mb')

quantiles = [0.0, 0.5, 0.95, 0.98, 1.0]
quantile_values = errors_cp_sorted['memory_tried_to_allocated_error_mb'].quantile(quantiles)

rows_by_quantiles = {
    f"{int(q * 100)}%": get_n_rows_near_quantile(errors_cp_sorted, 'memory_tried_to_allocated_error_mb', quantile_values[q], n=10)
    for q in quantiles
}
df_logs_cp_errors_rows_near_all_quantiles = pd.concat(rows_by_quantiles.values(), keys=rows_by_quantiles.keys())

In [206]:
df_logs_cp_errors_rows_near_all_quantiles

Unnamed: 0,Unnamed: 1,method_name,decomposition_method,data_type,memory_tried_to_allocated_error_mb,is_error,init,svd,normalize_factors,orthogonalise,tol,l2_reg,cvg_criterion
0%,9821,TensorLy_CP_image-1_pytorch_randomized_svd_svd...,CP,image-1,248903.68,True,svd,randomized_svd,True,True,1e-09,1.0,rec_error
0%,7343,TensorLy_CP_image-1_pytorch_randomized_svd_svd...,CP,image-1,248903.68,True,svd,randomized_svd,False,True,1e-08,0.0,rec_error
0%,7347,TensorLy_CP_image-1_pytorch_truncated_svd_svd_...,CP,image-1,248903.68,True,svd,truncated_svd,False,True,1e-05,0.0,rec_error
0%,7348,TensorLy_CP_image-1_pytorch_symeig_svd_svd_Fal...,CP,image-1,248903.68,True,svd,symeig_svd,False,True,1e-05,0.0,rec_error
0%,7349,TensorLy_CP_image-1_pytorch_randomized_svd_svd...,CP,image-1,248903.68,True,svd,randomized_svd,False,True,1e-05,0.0,rec_error
0%,7353,TensorLy_CP_image-1_pytorch_truncated_svd_svd_...,CP,image-1,248903.68,True,svd,truncated_svd,False,True,1e-06,0.0,rec_error
0%,7354,TensorLy_CP_image-1_pytorch_symeig_svd_svd_Fal...,CP,image-1,248903.68,True,svd,symeig_svd,False,True,1e-06,0.0,rec_error
0%,7355,TensorLy_CP_image-1_pytorch_randomized_svd_svd...,CP,image-1,248903.68,True,svd,randomized_svd,False,True,1e-06,0.0,rec_error
0%,7359,TensorLy_CP_image-1_pytorch_truncated_svd_svd_...,CP,image-1,248903.68,True,svd,truncated_svd,False,True,1e-07,0.0,rec_error
0%,7360,TensorLy_CP_image-1_pytorch_symeig_svd_svd_Fal...,CP,image-1,248903.68,True,svd,symeig_svd,False,True,1e-07,0.0,rec_error


In [207]:
dfs_errors_by_decomposition_method['CP'].loc[
    dfs_errors_by_decomposition_method['CP']['is_error'] == False,
    'init'
].value_counts(dropna=False)

init
random    4710
Name: count, dtype: int64

In [208]:
dfs_errors_by_decomposition_method['CP'].loc[
    dfs_errors_by_decomposition_method['CP']['is_error'] == True,
    'init'
].value_counts(dropna=False)

init
svd    5040
Name: count, dtype: int64

значение для init = svd приводит к резкому увеличению требуемой памяти, требуемая память зависит от начального тензора

### Tensor-Train

In [209]:
dfs_errors_by_decomposition_method['TensorTrain'][
    'svd'
].value_counts(dropna=False)

svd
randomized_svd        8
truncated_svd         8
symeig_svd            8
T3F implementation    6
Name: count, dtype: int64

In [210]:
dfs_errors_by_decomposition_method['TensorTrain'].loc[
    dfs_errors_by_decomposition_method['TensorTrain']['is_error'] == False,
    'svd'
].value_counts(dropna=False)

svd
randomized_svd        8
truncated_svd         8
T3F implementation    6
Name: count, dtype: int64

In [211]:
dfs_errors_by_decomposition_method['TensorTrain'].loc[
    dfs_errors_by_decomposition_method['TensorTrain']['is_error'] == True
]

Unnamed: 0,method_name,decomposition_method,data_type,memory_tried_to_allocated_error_mb,is_error,svd
50,TensorLy_TensorTrain_image-0_pytorch_symeig_svd,TensorTrain,image-0,54538.24,True,symeig_svd
53,TensorLy_TensorTrain_image-1_pytorch_symeig_svd,TensorTrain,image-1,79605.76,True,symeig_svd
56,TensorLy_TensorTrain_image-2_pytorch_symeig_svd,TensorTrain,image-2,454983.7,True,symeig_svd
59,TensorLy_TensorTrain_video-0_pytorch_symeig_svd,TensorTrain,video-0,46653.44,True,symeig_svd
62,TensorLy_TensorTrain_video-1_pytorch_symeig_svd,TensorTrain,video-1,26245.12,True,symeig_svd
65,TensorLy_TensorTrain_video-2_pytorch_symeig_svd,TensorTrain,video-2,46653.44,True,symeig_svd
68,TensorLy_TensorTrain_eeg-0_pytorch_symeig_svd,TensorTrain,eeg-0,3322946000.0,True,symeig_svd
71,TensorLy_TensorTrain_eeg-1_pytorch_symeig_svd,TensorTrain,eeg-1,11842370000.0,True,symeig_svd


symeig_svd в tensorly в tucker вызывает ошибку выделения памяти

### Tucker

In [212]:
dfs_errors_by_decomposition_method['Tucker'][
    'init'
].value_counts(dropna=False)

init
svd       24
random    24
Name: count, dtype: int64

In [213]:
dfs_errors_by_decomposition_method['Tucker'][
    'svd'
].value_counts(dropna=False)

svd
truncated_svd     16
symeig_svd        16
randomized_svd    16
Name: count, dtype: int64

In [214]:
dfs_errors_by_decomposition_method['Tucker'].loc[
    dfs_errors_by_decomposition_method['Tucker']['is_error'] == False,
    'init'
].value_counts(dropna=False)

init
random    24
svd       16
Name: count, dtype: int64

In [215]:
dfs_errors_by_decomposition_method['Tucker'].loc[
    dfs_errors_by_decomposition_method['Tucker']['is_error'] == True,
    'init'
].value_counts(dropna=False)

init
svd    8
Name: count, dtype: int64

In [216]:
dfs_errors_by_decomposition_method['Tucker'].loc[
    dfs_errors_by_decomposition_method['Tucker']['is_error'] == True
]

Unnamed: 0,method_name,decomposition_method,data_type,memory_tried_to_allocated_error_mb,is_error,init,svd
2,TensorLy_Tucker_image-0_pytorch_symeig_svd_svd,Tucker,image-0,385996.8,True,svd,symeig_svd
8,TensorLy_Tucker_image-1_pytorch_symeig_svd_svd,Tucker,image-1,248903.7,True,svd,symeig_svd
14,TensorLy_Tucker_image-2_pytorch_symeig_svd_svd,Tucker,image-2,2586040.0,True,svd,symeig_svd
20,TensorLy_Tucker_video-0_pytorch_symeig_svd_svd,Tucker,video-0,46653.44,True,svd,symeig_svd
26,TensorLy_Tucker_video-1_pytorch_symeig_svd_svd,Tucker,video-1,26245.12,True,svd,symeig_svd
32,TensorLy_Tucker_video-2_pytorch_symeig_svd_svd,Tucker,video-2,46653.44,True,svd,symeig_svd
38,TensorLy_Tucker_eeg-0_pytorch_symeig_svd_svd,Tucker,eeg-0,3322946000.0,True,svd,symeig_svd
44,TensorLy_Tucker_eeg-1_pytorch_symeig_svd_svd,Tucker,eeg-1,11842370000.0,True,svd,symeig_svd


init = svd и svd = symeig_svd когда ставятся одновременно приводят к взрыву требуемой памяти

In [217]:
mask = (
        (
                dfs_errors_by_decomposition_method['Tucker']['svd'] == 'symeig_svd'
        )
)

dfs_errors_by_decomposition_method['Tucker'].loc[
    mask
].value_counts(dropna=False)

method_name                                        decomposition_method  data_type  memory_tried_to_allocated_error_mb  is_error  init    svd       
TensorLy_Tucker_eeg-0_pytorch_symeig_svd_random    Tucker                eeg-0      0.000000e+00                        False     random  symeig_svd    1
TensorLy_Tucker_eeg-0_pytorch_symeig_svd_svd       Tucker                eeg-0      3.322946e+09                        True      svd     symeig_svd    1
TensorLy_Tucker_eeg-1_pytorch_symeig_svd_random    Tucker                eeg-1      0.000000e+00                        False     random  symeig_svd    1
TensorLy_Tucker_eeg-1_pytorch_symeig_svd_svd       Tucker                eeg-1      1.184237e+10                        True      svd     symeig_svd    1
TensorLy_Tucker_image-0_pytorch_symeig_svd_random  Tucker                image-0    0.000000e+00                        False     random  symeig_svd    1
TensorLy_Tucker_image-0_pytorch_symeig_svd_svd     Tucker                image-0 

In [218]:
mask = (
        (
                dfs_errors_by_decomposition_method['Tucker']['init'] == 'svd'
        )
)

dfs_errors_by_decomposition_method['Tucker'].loc[
    mask
].value_counts(dropna=False)

method_name                                         decomposition_method  data_type  memory_tried_to_allocated_error_mb  is_error  init  svd           
TensorLy_Tucker_eeg-0_pytorch_randomized_svd_svd    Tucker                eeg-0      0.000000e+00                        False     svd   randomized_svd    1
TensorLy_Tucker_eeg-0_pytorch_symeig_svd_svd        Tucker                eeg-0      3.322946e+09                        True      svd   symeig_svd        1
TensorLy_Tucker_video-2_pytorch_symeig_svd_svd      Tucker                video-2    4.665344e+04                        True      svd   symeig_svd        1
TensorLy_Tucker_video-2_pytorch_randomized_svd_svd  Tucker                video-2    0.000000e+00                        False     svd   randomized_svd    1
TensorLy_Tucker_video-1_pytorch_truncated_svd_svd   Tucker                video-1    0.000000e+00                        False     svd   truncated_svd     1
TensorLy_Tucker_video-1_pytorch_symeig_svd_svd      Tucker     

## данные

In [219]:
dfs_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

In [220]:
dfs_by_decomposition_method_encoded.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

### Tucker

In [260]:
dfs_by_decomposition_method['Tucker']

Unnamed: 0,method_name,decomposition_method,data_type,backend,gpu_allocated_memory_used_mb_min,gpu_allocated_memory_used_mb_median,gpu_allocated_memory_used_mb_mean,gpu_allocated_memory_used_mb_max,ram_mem_used_mb_min,ram_mem_used_mb_median,...,frobenius_error_min,frobenius_error_median,frobenius_error_mean,frobenius_error_max,compression_ratio_min,compression_ratio_median,compression_ratio_mean,compression_ratio_max,init,svd
0,TensorLy_Tucker_image-0_pytorch_truncated_svd_svd,Tucker,image-0,pytorch,19.552246,19.678223,20.332129,23.048828,6175.277344,6184.375,...,0.672185,0.672185,0.672185,0.672185,50.166931,50.166931,50.166931,50.166931,svd,truncated_svd
1,TensorLy_Tucker_image-0_pytorch_truncated_svd_...,Tucker,image-0,pytorch,14.450195,14.621094,14.709375,14.927246,6191.238281,6200.539062,...,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931,random,truncated_svd
2,TensorLy_Tucker_image-0_pytorch_symeig_svd_random,Tucker,image-0,pytorch,14.450195,14.621094,14.709375,14.927246,6222.480469,6222.605469,...,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931,random,symeig_svd
3,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,pytorch,18.64209,18.64209,18.95127,19.415039,6271.574219,6271.574219,...,0.672679,0.672679,0.672679,0.672679,50.166931,50.166931,50.166931,50.166931,svd,randomized_svd
4,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,pytorch,14.578613,14.927246,14.796289,14.927246,6273.535156,6273.535156,...,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931,random,randomized_svd
5,TensorLy_Tucker_image-1_pytorch_truncated_svd_svd,Tucker,image-1,pytorch,16.817383,16.817383,16.817383,16.817383,6273.597656,6273.597656,...,2.446034,2.446034,2.446034,2.446034,50.413535,50.413535,50.413535,50.413535,svd,truncated_svd
6,TensorLy_Tucker_image-1_pytorch_truncated_svd_...,Tucker,image-1,pytorch,14.342773,14.342773,14.349512,14.376465,6273.597656,6273.597656,...,2.452591,2.452591,2.452591,2.452591,50.413535,50.413535,50.413535,50.413535,random,truncated_svd
7,TensorLy_Tucker_image-1_pytorch_symeig_svd_random,Tucker,image-1,pytorch,14.342773,14.342773,14.349512,14.376465,6273.597656,6273.601562,...,2.452591,2.452591,2.452591,2.452591,50.413535,50.413535,50.413535,50.413535,random,symeig_svd
8,TensorLy_Tucker_image-1_pytorch_randomized_svd...,Tucker,image-1,pytorch,14.822266,14.822266,14.860449,15.013184,6273.601562,6273.601562,...,2.449736,2.449736,2.449736,2.449736,50.413535,50.413535,50.413535,50.413535,svd,randomized_svd
9,TensorLy_Tucker_image-1_pytorch_randomized_svd...,Tucker,image-1,pytorch,14.342773,14.342773,14.342773,14.342773,6273.601562,6273.601562,...,2.452591,2.452591,2.452591,2.452591,50.413535,50.413535,50.413535,50.413535,random,randomized_svd


#### Tucker - init

In [259]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['Tucker'],
    cat_col='init',
    num_col='duration_median'
)

{'random': {'phi': -0.25384535484628484,
  'top_positive':       init  duration_median  dummy   contrib
  20     svd         0.479412      0  0.016234
  8      svd         1.048175      0  0.014342
  10     svd         1.149111      0  0.014006
  37  random        11.426135      1  0.013458
  36  random        11.421377      1  0.013447,
  'top_negative':       init  duration_median  dummy   contrib
  38     svd        50.102457      0 -0.148865
  33     svd        36.834584      0 -0.104722
  35     svd        10.625262      0 -0.017522
  30     svd         9.623638      0 -0.014190
  22  random         0.367322      1 -0.011071},
 'svd': {'phi': 0.25384535484628484,
  'top_positive':       init  duration_median  dummy   contrib
  38     svd        50.102457      1  0.148865
  33     svd        36.834584      1  0.104722
  35     svd        10.625262      1  0.017522
  30     svd         9.623638      1  0.014190
  22  random         0.367322      0  0.011071,
  'top_negative':       

### CP

In [222]:
dfs_by_decomposition_method['CP']['init'].value_counts(dropna=False)

init
random    4710
Name: count, dtype: int64

In [223]:
dfs_by_decomposition_method['CP']

Unnamed: 0,method_name,decomposition_method,data_type,backend,gpu_allocated_memory_used_mb_min,gpu_allocated_memory_used_mb_median,gpu_allocated_memory_used_mb_mean,gpu_allocated_memory_used_mb_max,ram_mem_used_mb_min,ram_mem_used_mb_median,...,compression_ratio_median,compression_ratio_mean,compression_ratio_max,init,svd,normalize_factors,orthogonalise,tol,l2_reg,cvg_criterion
56,TensorLy_CP_image-2_pytorch_truncated_svd_rand...,CP,image-2,pytorch,4129.627441,4130.000488,4130.073926,4130.608887,3094.875000,3094.875000,...,50.065079,50.065079,50.065079,random,truncated_svd,False,False,1.000000e-08,0.0001,abs_rec_error
57,TensorLy_CP_image-2_pytorch_symeig_svd_random_...,CP,image-2,pytorch,4129.637695,4130.000488,4129.963770,4130.134277,3094.882812,3094.882812,...,50.065079,50.065079,50.065079,random,symeig_svd,False,False,1.000000e-08,0.0001,abs_rec_error
58,TensorLy_CP_image-2_pytorch_randomized_svd_ran...,CP,image-2,pytorch,4129.158691,4130.169434,4129.939746,4130.490234,3094.886719,3094.886719,...,50.065079,50.065079,50.065079,random,randomized_svd,False,False,1.000000e-08,0.0001,abs_rec_error
59,TensorLy_CP_image-2_pytorch_truncated_svd_rand...,CP,image-2,pytorch,4129.627441,4130.000488,4130.073926,4130.608887,3094.914062,3094.921875,...,50.065079,50.065079,50.065079,random,truncated_svd,False,False,1.000000e-05,0.0001,abs_rec_error
60,TensorLy_CP_image-2_pytorch_symeig_svd_random_...,CP,image-2,pytorch,4129.637695,4130.000488,4129.963770,4130.134277,3094.933594,3094.933594,...,50.065079,50.065079,50.065079,random,symeig_svd,False,False,1.000000e-05,0.0001,abs_rec_error
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4761,TensorLy_CP_image-1_pytorch_symeig_svd_random_...,CP,image-1,pytorch,726.505371,726.505371,726.505371,726.505371,6362.707031,6367.085938,...,50.020879,50.020879,50.020879,random,symeig_svd,True,True,1.000000e-07,1.0000,rec_error
4762,TensorLy_CP_image-1_pytorch_randomized_svd_ran...,CP,image-1,pytorch,726.505371,726.505371,726.505371,726.505371,6360.949219,6366.464844,...,50.020879,50.020879,50.020879,random,randomized_svd,True,True,1.000000e-07,1.0000,rec_error
4763,TensorLy_CP_image-1_pytorch_truncated_svd_rand...,CP,image-1,pytorch,726.505371,726.505371,726.505371,726.505371,6368.906250,6372.285156,...,50.020879,50.020879,50.020879,random,truncated_svd,True,True,1.000000e-09,1.0000,rec_error
4764,TensorLy_CP_image-1_pytorch_symeig_svd_random_...,CP,image-1,pytorch,726.505371,726.505371,726.505371,726.505371,6364.132812,6369.382812,...,50.020879,50.020879,50.020879,random,symeig_svd,True,True,1.000000e-09,1.0000,rec_error


#### CP - cvg_criterion

In [224]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='cvg_criterion',
    num_col='duration_median'
)

{'abs_rec_error': {'phi': 0.505091903297362,
  'top_positive':       cvg_criterion  duration_median  dummy   contrib
  1287  abs_rec_error         4.746648      1  0.000795
  1290  abs_rec_error         4.726023      1  0.000791
  1297  abs_rec_error         4.714066      1  0.000788
  1289  abs_rec_error         4.701860      1  0.000786
  1300  abs_rec_error         4.697951      1  0.000785,
  'top_negative':      cvg_criterion  duration_median  dummy   contrib
  3250     rec_error         2.531623      0 -0.000295
  3249     rec_error         2.426358      0 -0.000276
  3435     rec_error         2.407998      0 -0.000272
  3087     rec_error         2.371996      0 -0.000265
  3260     rec_error         2.355038      0 -0.000262},
 'rec_error': {'phi': -0.505091903297362,
  'top_positive':      cvg_criterion  duration_median  dummy   contrib
  3250     rec_error         2.531623      1  0.000295
  3249     rec_error         2.426358      1  0.000276
  3435     rec_error         2.

In [225]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='cvg_criterion',
    num_col='frobenius_error_median'
)

{'abs_rec_error': {'phi': 0.08771571590779309,
  'top_positive':       cvg_criterion  frobenius_error_median  dummy   contrib
  1166  abs_rec_error             9198.693085      1  0.002777
  1167  abs_rec_error             9198.693085      1  0.002777
  1168  abs_rec_error             9198.693085      1  0.002777
  1169  abs_rec_error             9198.693085      1  0.002777
  1170  abs_rec_error             9198.693085      1  0.002777,
  'top_negative':     cvg_criterion  frobenius_error_median  dummy   contrib
  221     rec_error              753.678226      0 -0.000182
  222     rec_error              753.678226      0 -0.000182
  223     rec_error              753.678226      0 -0.000182
  224     rec_error              753.678226      0 -0.000182
  225     rec_error              753.678226      0 -0.000182},
 'rec_error': {'phi': -0.08771571590779309,
  'top_positive':     cvg_criterion  frobenius_error_median  dummy   contrib
  221     rec_error              753.678226      1  0

In [226]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='cvg_criterion',
    num_col='gpu_allocated_memory_used_mb_median'
)

{'abs_rec_error': {'phi': -0.08985678389809379,
  'top_positive':      cvg_criterion  gpu_allocated_memory_used_mb_median  dummy   contrib
  311  abs_rec_error                          4141.236816      1  0.000355
  316  abs_rec_error                          4141.236816      1  0.000355
  317  abs_rec_error                          4141.236816      1  0.000355
  322  abs_rec_error                          4141.236816      1  0.000355
  323  abs_rec_error                          4141.236816      1  0.000355,
  'top_negative':     cvg_criterion  gpu_allocated_memory_used_mb_median  dummy   contrib
  825     rec_error                          4139.271973      0 -0.000316
  826     rec_error                          4139.271973      0 -0.000316
  829     rec_error                          4139.271973      0 -0.000316
  834     rec_error                          4139.271973      0 -0.000316
  835     rec_error                          4139.271973      0 -0.000316},
 'rec_error': {'phi': 0

In [227]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='cvg_criterion',
    num_col='ram_mem_used_mb_median'
)

{'abs_rec_error': {'phi': 0.03370519856757869,
  'top_positive':       cvg_criterion  ram_mem_used_mb_median  dummy   contrib
  3670  abs_rec_error             6397.003906      1  0.000295
  3669  abs_rec_error             6396.601562      1  0.000295
  4570  abs_rec_error             6389.765625      1  0.000294
  4561  abs_rec_error             6389.722656      1  0.000294
  4564  abs_rec_error             6389.613281      1  0.000293,
  'top_negative':      cvg_criterion  ram_mem_used_mb_median  dummy   contrib
  3721     rec_error             6397.957031      0 -0.000263
  3665     rec_error             6395.203125      0 -0.000262
  4523     rec_error             6390.851562      0 -0.000262
  4554     rec_error             6390.457031      0 -0.000262
  4534     rec_error             6390.125000      0 -0.000262},
 'rec_error': {'phi': -0.03370519856757869,
  'top_positive':      cvg_criterion  ram_mem_used_mb_median  dummy   contrib
  3721     rec_error             6397.957031  

#### CP - normalize_factors


In [228]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='normalize_factors',
    num_col='ram_mem_used_mb_median'
)

{'False': {'phi': -0.33230547850463893,
  'top_positive':       normalize_factors  ram_mem_used_mb_median  dummy   contrib
  3665              False             6395.203125      1  0.000266
  3923              False             6369.175781      1  0.000262
  3925              False             6368.210938      1  0.000262
  3914              False             6367.945312      1  0.000262
  3924              False             6367.183594      1  0.000262,
  'top_negative':       normalize_factors  ram_mem_used_mb_median  dummy   contrib
  3721               True             6397.957031      0 -0.000291
  3670               True             6397.003906      0 -0.000291
  3669               True             6396.601562      0 -0.000291
  4523               True             6390.851562      0 -0.000290
  4554               True             6390.457031      0 -0.000290},
 'True': {'phi': 0.33230547850463893,
  'top_positive':       normalize_factors  ram_mem_used_mb_median  dummy   contrib


In [229]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='normalize_factors',
    num_col='gpu_allocated_memory_used_mb_median'
)

{'False': {'phi': 0.07026679597754777,
  'top_positive':      normalize_factors  gpu_allocated_memory_used_mb_median  dummy  contrib
  311              False                          4141.236816      1  0.00032
  316              False                          4141.236816      1  0.00032
  317              False                          4141.236816      1  0.00032
  322              False                          4141.236816      1  0.00032
  323              False                          4141.236816      1  0.00032,
  'top_negative':       normalize_factors  gpu_allocated_memory_used_mb_median  dummy  contrib
  1048               True                          4140.916016      0 -0.00035
  1054               True                          4140.916016      0 -0.00035
  1060               True                          4140.916016      0 -0.00035
  1047               True                          4140.550293      0 -0.00035
  1051               True                          4140.550293   

In [230]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='normalize_factors',
    num_col='frobenius_error_median'
)

{'False': {'phi': -0.060443867796126184,
  'top_positive':      normalize_factors  frobenius_error_median  dummy   contrib
  476              False             1149.252892      1  0.000294
  477              False             1149.252892      1  0.000294
  478              False             1149.252892      1  0.000294
  479              False             1149.252892      1  0.000294
  480              False             1149.252892      1  0.000294,
  'top_negative':       normalize_factors  frobenius_error_median  dummy   contrib
  1166               True             9198.693085      0 -0.002741
  1167               True             9198.693085      0 -0.002741
  1168               True             9198.693085      0 -0.002741
  1169               True             9198.693085      0 -0.002741
  1170               True             9198.693085      0 -0.002741},
 'True': {'phi': 0.060443867796126184,
  'top_positive':       normalize_factors  frobenius_error_median  dummy   contrib
  11

In [231]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='normalize_factors',
    num_col='duration_median'
)

{'False': {'phi': 0.13624378539214194,
  'top_positive':      normalize_factors  duration_median  dummy   contrib
  678              False         4.583791      1  0.000687
  694              False         4.574972      1  0.000686
  700              False         4.569792      1  0.000685
  651              False         4.567359      1  0.000684
  648              False         4.567166      1  0.000684,
  'top_negative':       normalize_factors  duration_median  dummy   contrib
  1287               True         4.746648      0 -0.000785
  1290               True         4.726023      0 -0.000781
  1297               True         4.714066      0 -0.000778
  1289               True         4.701860      0 -0.000776
  1300               True         4.697951      0 -0.000775},
 'True': {'phi': -0.13624378539214194,
  'top_positive':       normalize_factors  duration_median  dummy   contrib
  1287               True         4.746648      1  0.000785
  1290               True         4.7

In [232]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='normalize_factors',
    num_col='duration_median'
)

{'False': {'phi': 0.13624378539214194,
  'top_positive':      normalize_factors  duration_median  dummy   contrib
  678              False         4.583791      1  0.000687
  694              False         4.574972      1  0.000686
  700              False         4.569792      1  0.000685
  651              False         4.567359      1  0.000684
  648              False         4.567166      1  0.000684,
  'top_negative':       normalize_factors  duration_median  dummy   contrib
  1287               True         4.746648      0 -0.000785
  1290               True         4.726023      0 -0.000781
  1297               True         4.714066      0 -0.000778
  1289               True         4.701860      0 -0.000776
  1300               True         4.697951      0 -0.000775},
 'True': {'phi': -0.13624378539214194,
  'top_positive':       normalize_factors  duration_median  dummy   contrib
  1287               True         4.746648      1  0.000785
  1290               True         4.7

#### CP - svd

In [233]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='svd',
    num_col='duration_median'
)

{'randomized_svd': {'phi': -0.00037891278969516833,
  'top_positive':                  svd  duration_median  dummy   contrib
  1297  randomized_svd         4.714066      1  0.001053
  1300  randomized_svd         4.697951      1  0.001048
  1294  randomized_svd         4.695558      1  0.001048
  1288  randomized_svd         4.693097      1  0.001047
  1291  randomized_svd         4.685614      1  0.001045,
  'top_negative':                 svd  duration_median  dummy   contrib
  1287     symeig_svd         4.746648      0 -0.000531
  1290     symeig_svd         4.726023      0 -0.000528
  1289  truncated_svd         4.701860      0 -0.000525
  1293     symeig_svd         4.686834      0 -0.000523
  1296     symeig_svd         4.673222      0 -0.000521},
 'symeig_svd': {'phi': 0.0007453236872385556,
  'top_positive':              svd  duration_median  dummy   contrib
  1287  symeig_svd         4.746648      1  0.001062
  1290  symeig_svd         4.726023      1  0.001056
  1293  symeig

In [234]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='svd',
    num_col='frobenius_error_median'
)

{'randomized_svd': {'phi': 1.718198711412557e-18,
  'top_positive':                  svd  frobenius_error_median  dummy   contrib
  1168  randomized_svd             9198.693085      1  0.003708
  1171  randomized_svd             9198.693085      1  0.003708
  1174  randomized_svd             9198.693085      1  0.003708
  1177  randomized_svd             9198.693085      1  0.003708
  1180  randomized_svd             9198.693085      1  0.003708,
  'top_negative':                 svd  frobenius_error_median  dummy   contrib
  1166  truncated_svd             9198.693085      0 -0.001854
  1167     symeig_svd             9198.693085      0 -0.001854
  1169  truncated_svd             9198.693085      0 -0.001854
  1170     symeig_svd             9198.693085      0 -0.001854
  1172  truncated_svd             9198.693085      0 -0.001854},
 'symeig_svd': {'phi': -6.608456582355988e-19,
  'top_positive':              svd  frobenius_error_median  dummy   contrib
  1167  symeig_svd            

In [235]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='svd',
    num_col='ram_mem_used_mb_median'
)

{'randomized_svd': {'phi': -0.004351577867201845,
  'top_positive':                  svd  ram_mem_used_mb_median  dummy   contrib
  4534  randomized_svd             6390.125000      1  0.000392
  4570  randomized_svd             6389.765625      1  0.000392
  4561  randomized_svd             6389.722656      1  0.000392
  4537  randomized_svd             6389.710938      1  0.000392
  4564  randomized_svd             6389.613281      1  0.000392,
  'top_negative':                  svd  ram_mem_used_mb_median  dummy   contrib
  3649  randomized_svd             2801.968750      1 -0.000352
  3648  randomized_svd             2816.699219      1 -0.000349
  3620  randomized_svd             2826.957031      1 -0.000347
  3614  randomized_svd             2828.257812      1 -0.000346
  3617  randomized_svd             2845.109375      1 -0.000343},
 'symeig_svd': {'phi': 0.003708689838305212,
  'top_positive':              svd  ram_mem_used_mb_median  dummy   contrib
  3721  symeig_svd        

In [236]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    cat_col='svd',
    num_col='gpu_allocated_memory_used_mb_median'
)

{'randomized_svd': {'phi': 3.5429948862531016e-06,
  'top_positive':                  svd  gpu_allocated_memory_used_mb_median  dummy   contrib
  316   randomized_svd                          4141.236816      1  0.000474
  322   randomized_svd                          4141.236816      1  0.000474
  1048  randomized_svd                          4140.916016      1  0.000474
  1054  randomized_svd                          4140.916016      1  0.000474
  1060  randomized_svd                          4140.916016      1  0.000474,
  'top_negative':                svd  gpu_allocated_memory_used_mb_median  dummy   contrib
  311  truncated_svd                          4141.236816      0 -0.000237
  317  truncated_svd                          4141.236816      0 -0.000237
  323  truncated_svd                          4141.236816      0 -0.000237
  252     symeig_svd                          4141.236328      0 -0.000237
  258     symeig_svd                          4141.236328      0 -0.000237},
 '

#### CP - tol

In [237]:
compute_numeric_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    col_x='tol',
    col_y='ram_mem_used_mb_median'
)

{'corr': 0.004554488372871987,
 'top_positive':           tol  ram_mem_used_mb_median   contrib
 3670  0.00001             6397.003906  0.000554
 4561  0.00001             6389.722656  0.000552
 4560  0.00001             6389.546875  0.000552
 4559  0.00001             6389.476562  0.000552
 4529  0.00001             6389.195312  0.000552,
 'top_negative':           tol  ram_mem_used_mb_median   contrib
 3616  0.00001             2828.273438 -0.000487
 3615  0.00001             2830.304688 -0.000487
 3617  0.00001             2845.109375 -0.000483
 3600  0.00001             2874.109375 -0.000474
 3601  0.00001             2874.804688 -0.000474}

In [238]:
compute_numeric_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    col_x='tol',
    col_y='duration_median'
)

{'corr': -0.01847774360372451,
 'top_positive':           tol  duration_median   contrib
 1290  0.00001         4.726023  0.001487
 1289  0.00001         4.701860  0.001477
 1291  0.00001         4.685614  0.001471
 1169  0.00001         4.636090  0.001451
 1006  0.00001         4.633627  0.001450,
 'top_negative':                tol  duration_median   contrib
 1287  1.000000e-08         4.746648 -0.000425
 1300  1.000000e-09         4.697951 -0.000421
 1288  1.000000e-08         4.693097 -0.000419
 1298  1.000000e-09         4.671779 -0.000418
 1299  1.000000e-09         4.656636 -0.000417}

In [239]:
compute_numeric_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    col_x='tol',
    col_y='gpu_allocated_memory_used_mb_median'
)

{'corr': -6.692841417084175e-07,
 'top_positive':          tol  gpu_allocated_memory_used_mb_median   contrib
 316  0.00001                          4141.236816  0.000667
 270  0.00001                          4141.236328  0.000667
 330  0.00001                          4141.236328  0.000667
 644  0.00001                          4140.926758  0.000667
 659  0.00001                          4140.926758  0.000667,
 'top_negative':           tol  gpu_allocated_memory_used_mb_median   contrib
 4349  0.00001                           724.539062 -0.000311
 4350  0.00001                           724.539062 -0.000311
 4351  0.00001                           724.539062 -0.000311
 4454  0.00001                           724.539062 -0.000311
 4455  0.00001                           724.539062 -0.000311}

In [240]:
compute_numeric_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    col_x='tol',
    col_y='frobenius_error_median'
)

{'corr': 2.5412779620081836e-06,
 'top_positive':           tol  frobenius_error_median   contrib
 1169  0.00001             9198.693085  0.005219
 1170  0.00001             9198.693085  0.005219
 1171  0.00001             9198.693085  0.005219
 1289  0.00001             9198.693085  0.005219
 1290  0.00001             9198.693085  0.005219,
 'top_negative':                tol  frobenius_error_median   contrib
 1178  1.000000e-09             9198.693085 -0.001491
 1179  1.000000e-09             9198.693085 -0.001491
 1180  1.000000e-09             9198.693085 -0.001491
 1298  1.000000e-09             9198.693085 -0.001491
 1299  1.000000e-09             9198.693085 -0.001491}

#### CP - l2_reg

In [241]:
compute_numeric_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    col_x='l2_reg',
    col_y='gpu_allocated_memory_used_mb_median'
)

{'corr': -0.019136460084788014,
 'top_positive':      l2_reg  gpu_allocated_memory_used_mb_median   contrib
 330     1.0                          4141.236328  0.000732
 336     1.0                          4141.236328  0.000732
 327     1.0                          4140.901855  0.000732
 333     1.0                          4140.901855  0.000732
 339     1.0                          4140.901855  0.000732,
 'top_negative':       l2_reg  gpu_allocated_memory_used_mb_median   contrib
 4121     1.0                            725.04248 -0.000341
 4122     1.0                            725.04248 -0.000341
 4123     1.0                            725.04248 -0.000341
 4124     1.0                            725.04248 -0.000341
 4125     1.0                            725.04248 -0.000341}

In [242]:
compute_numeric_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    col_x='l2_reg',
    col_y='ram_mem_used_mb_median'
)

{'corr': 0.03339135120375764,
 'top_positive':       l2_reg  ram_mem_used_mb_median   contrib
 4554     1.0             6390.457031  0.000606
 4555     1.0             6389.386719  0.000605
 4541     1.0             6388.761719  0.000605
 4553     1.0             6388.359375  0.000605
 4549     1.0             6388.273438  0.000605,
 'top_negative':       l2_reg  ram_mem_used_mb_median   contrib
 3649     1.0             2801.968750 -0.000543
 3647     1.0             2809.804688 -0.000541
 3648     1.0             2816.699219 -0.000539
 3551     1.0             2921.820312 -0.000505
 3552     1.0             2921.859375 -0.000505}

In [243]:
compute_numeric_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    col_x='l2_reg',
    col_y='duration_median'
)

{'corr': -0.041413140447219596,
 'top_positive':      l2_reg  duration_median   contrib
 338     1.0         4.513532  0.001540
 337     1.0         4.485594  0.001527
 332     1.0         4.483313  0.001526
 340     1.0         4.482220  0.001526
 336     1.0         4.480413  0.001525,
 'top_negative':       l2_reg  duration_median   contrib
 1287     0.0         4.746648 -0.000478
 1290     0.0         4.726023 -0.000476
 1297     0.0         4.714066 -0.000474
 1289     0.0         4.701860 -0.000473
 1300     0.0         4.697951 -0.000472}

In [244]:
compute_numeric_numeric_contributions(
    df=dfs_by_decomposition_method['CP'],
    col_x='l2_reg',
    col_y='frobenius_error_median'
)

{'corr': -0.054308640044765516,
 'top_positive':      l2_reg  frobenius_error_median  contrib
 221     0.5              753.678226  0.00015
 222     0.5              753.678226  0.00015
 223     0.5              753.678226  0.00015
 224     0.5              753.678226  0.00015
 225     0.5              753.678226  0.00015,
 'top_negative':       l2_reg  frobenius_error_median   contrib
 1166     0.0             9198.693085 -0.001671
 1167     0.0             9198.693085 -0.001671
 1168     0.0             9198.693085 -0.001671
 1169     0.0             9198.693085 -0.001671
 1170     0.0             9198.693085 -0.001671}

### Tensor Train

In [245]:
mask = dfs_by_decomposition_method['TensorTrain']['data_type'].isin(['eeg-0', 'eeg-1'])
df_tensortrain_eegs = dfs_by_decomposition_method['TensorTrain'].loc[mask]
df_tensortrain_eegs

Unnamed: 0,method_name,decomposition_method,data_type,backend,gpu_allocated_memory_used_mb_min,gpu_allocated_memory_used_mb_median,gpu_allocated_memory_used_mb_mean,gpu_allocated_memory_used_mb_max,ram_mem_used_mb_min,ram_mem_used_mb_median,...,duration_max,frobenius_error_min,frobenius_error_median,frobenius_error_mean,frobenius_error_max,compression_ratio_min,compression_ratio_median,compression_ratio_mean,compression_ratio_max,svd
52,TensorLy_TensorTrain_eeg-0_pytorch_randomized_svd,TensorTrain,eeg-0,pytorch,2711.777344,2711.777344,2711.777344,2711.777344,3048.007812,3048.007812,...,18.317027,0.0204,0.023498,0.023272,0.025882,52.237849,52.237849,52.237849,52.237849,randomized_svd
53,TensorLy_TensorTrain_eeg-0_pytorch_truncated_svd,TensorTrain,eeg-0,pytorch,2267.629395,2268.627441,2268.228223,2268.627441,3135.894531,3135.894531,...,2.495926,0.02618,0.02618,0.02618,0.02618,52.237849,52.237849,52.237849,52.237849,truncated_svd
54,TensorLy_TensorTrain_eeg-1_pytorch_randomized_svd,TensorTrain,eeg-1,pytorch,3940.248047,3940.615234,3940.541797,3940.615234,3192.355469,3192.421875,...,32.316716,0.027143,0.027377,0.027353,0.027484,49.941666,49.941666,49.941666,49.941666,randomized_svd
55,TensorLy_TensorTrain_eeg-1_pytorch_truncated_svd,TensorTrain,eeg-1,pytorch,3529.966309,3529.966309,3530.094727,3530.560547,3239.15625,3239.15625,...,49.53344,0.049401,0.049401,0.049401,0.049401,49.941666,49.941666,49.941666,49.941666,truncated_svd


#### TT - svd

In [252]:
dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'].isin(['image-0', 'image-1', 'image-2'])]

Unnamed: 0,method_name,decomposition_method,data_type,backend,gpu_allocated_memory_used_mb_min,gpu_allocated_memory_used_mb_median,gpu_allocated_memory_used_mb_mean,gpu_allocated_memory_used_mb_max,ram_mem_used_mb_min,ram_mem_used_mb_median,...,duration_max,frobenius_error_min,frobenius_error_median,frobenius_error_mean,frobenius_error_max,compression_ratio_min,compression_ratio_median,compression_ratio_mean,compression_ratio_max,svd
40,TensorLy_TensorTrain_image-0_pytorch_randomize...,TensorTrain,image-0,pytorch,14.269531,14.499023,14.412988,14.527832,2853.414062,2853.496094,...,0.120884,0.91591,0.916665,0.917253,0.919649,50.119146,50.119146,50.119146,50.119146,randomized_svd
41,TensorLy_TensorTrain_image-0_pytorch_truncated...,TensorTrain,image-0,pytorch,13.896973,14.126465,14.046191,14.155273,2854.957031,2854.957031,...,0.186796,0.893109,0.893109,0.893109,0.893109,50.119146,50.119146,50.119146,50.119146,truncated_svd
42,TensorLy_TensorTrain_image-1_pytorch_randomize...,TensorTrain,image-1,pytorch,15.375488,15.492188,15.589648,15.998047,2863.441406,2863.441406,...,0.073096,3.460481,3.461798,3.462046,3.463684,50.22993,50.22993,50.22993,50.22993,randomized_svd
43,TensorLy_TensorTrain_image-1_pytorch_truncated...,TensorTrain,image-1,pytorch,10.727051,10.727051,10.821094,11.153809,2863.460938,2863.464844,...,0.062781,3.427547,3.427547,3.427547,3.427547,50.22993,50.22993,50.22993,50.22993,truncated_svd
44,TensorLy_TensorTrain_image-2_pytorch_randomize...,TensorTrain,image-2,pytorch,40.413086,40.555176,40.670508,41.416016,2871.261719,2871.261719,...,0.10743,0.000179,0.000226,0.000248,0.000401,50.006579,50.006579,50.006579,50.006579,randomized_svd
45,TensorLy_TensorTrain_image-2_pytorch_truncated...,TensorTrain,image-2,pytorch,33.42627,33.42627,33.42627,33.42627,2871.28125,2871.28125,...,0.14634,0.010288,0.010288,0.010288,0.010288,50.006579,50.006579,50.006579,50.006579,truncated_svd
4766,T3F_TensorTrain_image-0,TensorTrain,image-0,tensorflow,0.0,1.341797,15.037891,70.207275,6682.027344,6682.605469,...,1.311289,0.89307,0.89307,0.89307,0.89307,50.119146,50.119146,50.119146,50.119146,T3F implementation
4767,T3F_TensorTrain_image-1,TensorTrain,image-1,tensorflow,0.0,0.0,2.919287,11.025391,6684.035156,6684.046875,...,0.521229,3.427543,3.427543,3.427543,3.427543,50.22993,50.22993,50.22993,50.22993,T3F implementation
4768,T3F_TensorTrain_image-2,TensorTrain,image-2,tensorflow,0.0,0.0,23.701709,109.085938,6698.445312,6705.917969,...,0.505828,0.000883,0.000883,0.000883,0.000883,50.006579,50.006579,50.006579,50.006579,T3F implementation


In [253]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'].isin(['image-0', 'image-1', 'image-2'])],
    cat_col='svd',
    num_col='duration_median'
)

{'T3F implementation': {'phi': 0.9785047618392354,
  'top_positive':                      svd  duration_median  dummy   contrib
  4766  T3F implementation         0.487804      1  0.281536
  4768  T3F implementation         0.481683      1  0.275246
  4767  T3F implementation         0.386177      1  0.177097
  42        randomized_svd         0.061440      0  0.078315
  43         truncated_svd         0.061929      0  0.078064,
  'top_negative':                svd  duration_median  dummy   contrib
  45   truncated_svd         0.133861      0  0.041102
  41   truncated_svd         0.126492      0  0.044889
  44  randomized_svd         0.099602      0  0.058706
  40  randomized_svd         0.085671      0  0.065864
  43   truncated_svd         0.061929      0  0.078064},
 'randomized_svd': {'phi': -0.5410258179355107,
  'top_positive':                      svd  duration_median  dummy   contrib
  43         truncated_svd         0.061929      0  0.078064
  41         truncated_svd      

In [254]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'].isin(['image-0', 'image-1', 'image-2'])],
    cat_col='svd',
    num_col='ram_mem_used_mb_median'
)

{'T3F implementation': {'phi': 0.9999891905606481,
  'top_positive':                      svd  ram_mem_used_mb_median  dummy   contrib
  4768  T3F implementation             6705.917969      1  0.251473
  4767  T3F implementation             6684.046875      1  0.249330
  4766  T3F implementation             6682.605469      1  0.249189
  40        randomized_svd             2853.496094      0  0.062964
  41         truncated_svd             2854.957031      0  0.062892,
  'top_negative':                svd  ram_mem_used_mb_median  dummy   contrib
  45   truncated_svd             2871.281250      0  0.062093
  44  randomized_svd             2871.261719      0  0.062094
  43   truncated_svd             2863.464844      0  0.062476
  42  randomized_svd             2863.441406      0  0.062477
  41   truncated_svd             2854.957031      0  0.062892},
 'randomized_svd': {'phi': -0.500092814968856,
  'top_positive':                      svd  ram_mem_used_mb_median  dummy   contrib
  4

In [256]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'].isin(['image-0', 'image-1', 'image-2'])],
    cat_col='svd',
    num_col='frobenius_error_median'
)

{'T3F implementation': {'phi': -0.0035990446561479494,
  'top_positive':                      svd  frobenius_error_median  dummy   contrib
  4767  T3F implementation                3.427543      1  0.240552
  44        randomized_svd                0.000226      0  0.087956
  45         truncated_svd                0.010288      0  0.087344
  41         truncated_svd                0.893109      0  0.033707
  40        randomized_svd                0.916665      0  0.032276,
  'top_negative':                      svd  frobenius_error_median  dummy   contrib
  4768  T3F implementation                0.000883      1 -0.175831
  42        randomized_svd                3.461798      0 -0.122357
  43         truncated_svd                3.427547      0 -0.120276
  4766  T3F implementation                0.893070      1 -0.067419
  40        randomized_svd                0.916665      0  0.032276},
 'randomized_svd': {'phi': 0.005667238353589703,
  'top_positive':                      svd  f

In [257]:
compute_categorical_numeric_contributions(
    df=dfs_by_decomposition_method['TensorTrain'].loc[dfs_by_decomposition_method['TensorTrain']['data_type'].isin(['image-0', 'image-1', 'image-2'])],
    cat_col='svd',
    num_col='gpu_allocated_memory_used_mb_median'
)

{'T3F implementation': {'phi': -0.7328287231764762,
  'top_positive':                svd  gpu_allocated_memory_used_mb_median  dummy   contrib
  43   truncated_svd                            10.727051      0  0.024418
  41   truncated_svd                            14.126465      0  0.002200
  40  randomized_svd                            14.499023      0 -0.000235
  42  randomized_svd                            15.492188      0 -0.006726
  45   truncated_svd                            33.426270      0 -0.123938,
  'top_negative':                      svd  gpu_allocated_memory_used_mb_median  dummy   contrib
  4767  T3F implementation                             0.000000      1 -0.189054
  4768  T3F implementation                             0.000000      1 -0.189054
  4766  T3F implementation                             1.341797      1 -0.171514
  44        randomized_svd                            40.555176      0 -0.170530
  45         truncated_svd                            33.426