Чтение и анализ логов бенчмарка

# Импорты

In [1]:
import re
from pathlib import Path
from pprint import pprint
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly import colors
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from src.utils.read_logs import LogReader

In [2]:
# Turn off matplotlib's interactive mode so figures are not drawn implicitly.
plt.ioff()
# IPython magic (not plain Python): select the "notebook" matplotlib backend.
%matplotlib notebook

# Чтение логов

In [3]:
# Path of the benchmark log file that is analysed in this notebook.
log_file_path = "logs/2025-03-10 method_logs.json"
# Load all log records; the cells below treat `logs` as a sequence of dicts.
logs = LogReader.load_logs_from_file(log_file_path)

In [4]:
len(logs)

10158

In [5]:
logs[0]

{'method_name': 'TensorLy_Tucker_image-0_pytorch_truncated_svd_svd',
 'method_args': {'rank': [252, 255, 3],
  'n_iter_max': 100,
  'init': 'svd',
  'svd': 'truncated_svd',
  'random_state': 42},
 'qualitative_metrics': {'Language': 'Python',
  'Library': 'TensorLy',
  'TensorLy backend': 'pytorch',
  'Tensor type': 'Dense',
  'Data type': 'image-0',
  'Platform': 'CPU, GPU',
  'Decomposition method': 'Tucker'},
 'quantitative_metrics': {'gpu_allocated_memory_used_mb': [9.95263671875,
   0.0,
   0.0,
   0.0,
   0.0],
  'gpu_cached_memory_used_mb': [30.0, 6.0, 8.0, 26.0, 4.0],
  'ram_mem_used_mb': [6237.703125,
   6238.25,
   6238.25,
   6238.25390625,
   6238.25390625],
  'duration': [4.4722535610198975,
   2.7524054050445557,
   1.7934534549713135,
   1.7476089000701904,
   2.4861092567443848],
  'frobenius_error': [0.6721850484609604,
   0.6721850484609604,
   0.6721850484609604,
   0.6721850484609604,
   0.6721850484609604],
  'compression_ratio': [50.166930737890446,
   50.16693073

In [6]:
# Split every log record into aggregated "clean" runs, aggregated "anomaly"
# runs (runs in which at least one tracked metric is negative) and failed
# executions, then turn the three collections into data frames.
clean_rows, anom_rows, error_rows = [], [], []
# Per metric: positions ("idx") and values ("vals") of all negative readings.
neg_stats = defaultdict(lambda: {"idx": [], "vals": []})

# Metrics that are aggregated and checked for negative readings. The order
# fixes the column order of the resulting rows; all lists are sliced with the
# same run indices. Metrics absent from this list are neither checked nor
# aggregated.
metric_keys = [
    "gpu_allocated_memory_used_mb",
    "ram_mem_used_mb",
    "duration",
    "frobenius_error",
    "compression_ratio",
]


def _is_negative(value):
    """Return True when ``value`` is a number below zero."""
    return isinstance(value, (int, float)) and value < 0


def make_row(base, metrics, idx_subset, tag):
    """Aggregate the selected runs of one log record into a single row.

    Args:
        base: Qualitative columns shared by every row of the record.
        metrics: The record's ``quantitative_metrics`` mapping
            (metric name -> list with one value per run).
        idx_subset: Run positions to aggregate.
        tag: Value stored in ``run_type`` (``"clean"`` or ``"anomaly"``).

    Returns:
        A dict with the base columns, ``run_type``, ``original_run_indices``
        and min/max/mean/median of every metric in ``metric_keys``, or
        ``None`` when ``idx_subset`` is empty.
    """
    if not idx_subset:
        return None  # nothing to aggregate
    row = {**base, "run_type": tag, "original_run_indices": idx_subset}
    for key in metric_keys:
        vals = [metrics[key][i] for i in idx_subset]
        row[f"{key}_min"] = np.min(vals)
        row[f"{key}_max"] = np.max(vals)
        row[f"{key}_mean"] = np.mean(vals)
        row[f"{key}_median"] = np.median(vals)
    return row


for log in logs:
    # --- qualitative (descriptive) part of the record -----------------
    qualitative = log["qualitative_metrics"]
    # The backend is stored under a key that merely contains "backend"
    # (e.g. 'TensorLy backend'); records without such a key get None instead
    # of aborting the whole loop with StopIteration.
    backend_key = next((k for k in qualitative if "backend" in k), None)
    base = {
        "method_name": log["method_name"],
        "decomposition_method": qualitative["Decomposition method"],
        "data_type": qualitative["Data type"],
        "language": qualitative["Language"],
        "library": qualitative["Library"],
        "tensor_type": qualitative["Tensor type"],
        "platform": qualitative["Platform"],
        "backend": qualitative.get(backend_key),
    }

    # --- executions that ended with an error --------------------------
    if log["error_message"]:
        error_rows.append({**base, "error_message": log["error_message"]})
        continue

    q = log["quantitative_metrics"]
    # NOTE(review): assumes every metric list has the same length; a shorter
    # list would raise IndexError during aggregation - confirm with the logger.
    n_runs = max(len(q[k]) for k in metric_keys)

    # 1. A run is anomalous when any tracked metric is negative in it.
    bad_set = {i for k in metric_keys for i, v in enumerate(q[k]) if _is_negative(v)}

    # 2. Sorted positions of the clean and of the anomalous runs.
    good_idx = [i for i in range(n_runs) if i not in bad_set]
    bad_idx = sorted(bad_set)

    # 3. Store the aggregated rows.
    r_clean = make_row(base, q, good_idx, "clean")
    r_anom = make_row(base, q, bad_idx, "anomaly")

    if r_clean:
        clean_rows.append(r_clean)
    if r_anom:
        anom_rows.append(r_anom)
        # Global statistics: which positions go negative, and by how much.
        # Positions and values are collected in one pass so they stay paired.
        for k in metric_keys:
            stats = neg_stats[k]
            for i in bad_idx:
                v = q[k][i]
                if _is_negative(v):
                    stats["idx"].append(i)
                    stats["vals"].append(v)

# --- convert the collected rows into data frames ----------------------
df_clean_logs = pd.DataFrame(clean_rows)
df_anomaly_logs = pd.DataFrame(anom_rows)
df_logs_errors = pd.DataFrame(error_rows)

In [7]:
# Fixed-width summary of the negative readings collected per metric.
print("metric                         | count_neg |    min   |   mean  |   max   | median ")
print("-" * 100)

for metric_name, stats in neg_stats.items():
    negatives = stats["vals"]
    if negatives:  # metrics without negative readings are not listed
        cells = (
            f"{metric_name:<30}",
            f"{len(negatives):>9}",
            f"{np.min(negatives):^8.4f}",
            f"{np.mean(negatives):>6.4f}",
            f"{np.max(negatives):>6.4f}",
            f"{np.median(negatives):>6.4f}",
        )
        print(" | ".join(cells))


metric                         | count_neg |    min   |   mean  |   max   | median 
----------------------------------------------------------------------------------------------------
gpu_allocated_memory_used_mb   |      1516 | -43.6816 | -0.5823 | -0.0068 | -0.4434
duration                       |        68 | -1.0781  | -0.4767 | -0.0128 | -0.4800


In [8]:
# log_data = []
# log_errors = []
# 
# negative_stats = defaultdict(lambda: {
#     "count": 0,
#     "min": float("inf"),
#     "max": float("-inf"),
#     "sum": 0.0
# })
# 
# for log in logs:
#     if log["error_message"] == "":
#         try:
#             data_entry = {
#                 # Method name
#                 "method_name": log["method_name"],
#                 # Some Qualitative Data
#                 "decomposition_method": log["qualitative_metrics"]["Decomposition method"],
#                 "data_type": log["qualitative_metrics"]["Data type"],
#                 "language": log["qualitative_metrics"]["Language"],
#                 "library": log["qualitative_metrics"]["Library"],
#                 "tensor_type": log["qualitative_metrics"]["Tensor type"],
#                 "platform": log["qualitative_metrics"]["Platform"],
#                 "backend": log["qualitative_metrics"].get(
#                     next(key for key in log["qualitative_metrics"] if "backend" in key)
#                 ),
#                 # GPU Allocated Memory
#                 "gpu_allocated_memory_used_min": np.min(
#                     [e for e in log["quantitative_metrics"]["gpu_allocated_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_allocated_memory_used_max": np.max(
#                     [e for e in log["quantitative_metrics"]["gpu_allocated_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_allocated_memory_used_mean": np.mean(
#                     [e for e in log["quantitative_metrics"]["gpu_allocated_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_allocated_memory_used": np.max(
#                     [e for e in log["quantitative_metrics"]["gpu_allocated_memory_used_mb"] if e >= 0.0]
#                 ),
#                 # GPU Cached Memory
#                 "gpu_cached_memory_used_min": np.min(
#                     [e for e in log["quantitative_metrics"]["gpu_cached_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_cached_memory_used_max": np.max(
#                     [e for e in log["quantitative_metrics"]["gpu_cached_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_cached_memory_used_mean": np.mean(
#                     [e for e in log["quantitative_metrics"]["gpu_cached_memory_used_mb"] if e >= 0.0]
#                 ),
#                 "gpu_cached_memory_used": np.max(
#                     [e for e in log["quantitative_metrics"]["gpu_cached_memory_used_mb"] if e >= 0.0]
#                 ),
#                 # RAM Memory Usage
#                 "ram_mem_used_min": np.min([e for e in log["quantitative_metrics"]["ram_mem_used_mb"] if e >= 0.0]),
#                 "ram_mem_used_max": np.max([e for e in log["quantitative_metrics"]["ram_mem_used_mb"] if e >= 0.0]),
#                 "ram_mem_used_mean": np.mean([e for e in log["quantitative_metrics"]["ram_mem_used_mb"] if e >= 0.0]),
#                 "ram_mem_used": np.max([e for e in log["quantitative_metrics"]["ram_mem_used_mb"] if e >= 0.0]),
#                 # Duration
#                 "duration_min": np.min([e for e in log["quantitative_metrics"]["duration"] if e >= 0.0]),
#                 "duration_max": np.max([e for e in log["quantitative_metrics"]["duration"] if e >= 0.0]),
#                 "duration_mean": np.mean([e for e in log["quantitative_metrics"]["duration"] if e >= 0.0]),
#                 "duration": np.max([e for e in log["quantitative_metrics"]["duration"] if e >= 0.0]),
#                 # Frobenius Error
#                 "frobenius_error_min": np.min([e for e in log["quantitative_metrics"]["frobenius_error"] if e >= 0.0]),
#                 "frobenius_error_max": np.max([e for e in log["quantitative_metrics"]["frobenius_error"] if e >= 0.0]),
#                 "frobenius_error_mean": np.mean(
#                     [e for e in log["quantitative_metrics"]["frobenius_error"] if e >= 0.0]
#                 ),
#                 "frobenius_error": np.max([e for e in log["quantitative_metrics"]["frobenius_error"] if e >= 0.0]),
#                 # Compression Ratio
#                 "compression_ratio_min": np.min(
#                     [e for e in log["quantitative_metrics"]["compression_ratio"] if e >= 0.0]
#                 ),
#                 "compression_ratio_max": np.max(
#                     [e for e in log["quantitative_metrics"]["compression_ratio"] if e >= 0.0]
#                 ),
#                 "compression_ratio_mean": np.mean(
#                     [e for e in log["quantitative_metrics"]["compression_ratio"] if e >= 0.0]
#                 ),
#                 "compression_ratio": np.max([e for e in log["quantitative_metrics"]["compression_ratio"] if e >= 0.0]),
#             }
#             log_data.append(data_entry)
# 
#             for key, values in log["quantitative_metrics"].items():
#                 if not isinstance(values, list):
#                     continue
# 
#                 neg_values = [v for v in values if isinstance(v, (int, float)) and v < 0]
#                 if neg_values:
#                     print(f"[{log['method_name']}] Key '{key}' has negative values: {neg_values}")
# 
#                     negative_stats[key]["count"] += len(neg_values)
#                     negative_stats[key]["min"] = min(negative_stats[key]["min"], min(neg_values))
#                     negative_stats[key]["max"] = max(negative_stats[key]["max"], max(neg_values))
#                     negative_stats[key]["sum"] += sum(neg_values)
# 
# 
#         except Exception as e:
#             print(f"Exception with {log['method_name']}. Error: {e!s}")
#     else:
#         data_entry = {
#             # method name
#             "method_name": log["method_name"],
#             # qualitative metrics
#             "decomposition_method": log["qualitative_metrics"]["Decomposition method"],
#             "data_type": log["qualitative_metrics"]["Data type"],
#             "language": log["qualitative_metrics"]["Language"],
#             "library": log["qualitative_metrics"]["Library"],
#             "tensor_type": log["qualitative_metrics"]["Tensor type"],
#             "platform": log["qualitative_metrics"]["Platform"],
#             "backend": log["qualitative_metrics"].get(
#                 next(key for key in log["qualitative_metrics"] if "backend" in key)
#             ),
#             # Error data
#             "error_message": log["error_message"],
#         }
#         log_errors.append(data_entry)
# 
# df_logs = pd.DataFrame(log_data)
# df_logs_errors = pd.DataFrame(log_errors)

In [9]:
# print("\n=== Summary of Negative Values in Quantitative Metrics ===")
# for key, stats in negative_stats.items():
#     count = stats["count"]
#     if count > 0:
#         avg = stats["sum"] / count
#         print(f"Key: {key}")
#         print(f"  → Count   : {count}")
#         print(f"  → Min     : {stats['min']}")
#         print(f"  → Max     : {stats['max']}")
#         print(f"  → Mean    : {avg:.4f}\n")

In [10]:
df_clean_logs.shape

(4772, 30)

In [11]:
# Show the distinct values of every column of the clean-runs frame.
separator = "-" * 60
for column_name in df_clean_logs.columns:
    try:
        distinct = df_clean_logs[column_name].unique()
        print(f"Column: {column_name}")
        print(f"Unique values ({len(distinct)}): {distinct}")
        print(separator)
    except TypeError as exc:
        # Columns holding unhashable objects (e.g. lists) cannot be uniqued.
        print(f"Skipped column '{column_name}' due to unhashable data type: {exc}")

Column: method_name
Unique values (2417): ['TensorLy_Tucker_image-0_pytorch_truncated_svd_svd'
 'TensorLy_Tucker_image-0_pytorch_truncated_svd_random'
 'TensorLy_Tucker_image-0_pytorch_symeig_svd_random' ...
 'TensorLy_CP_image-1_pytorch_truncated_svd_random_True_True_rec_error_1.0_1e-09'
 'TensorLy_CP_image-1_pytorch_symeig_svd_random_True_True_rec_error_1.0_1e-09'
 'TensorLy_CP_image-1_pytorch_randomized_svd_random_True_True_rec_error_1.0_1e-09']
------------------------------------------------------------
Column: decomposition_method
Unique values (3): ['Tucker' 'TensorTrain' 'CP']
------------------------------------------------------------
Column: data_type
Unique values (8): ['image-0' 'image-1' 'image-2' 'video-0' 'video-1' 'video-2' 'eeg-0'
 'eeg-1']
------------------------------------------------------------
Column: language
Unique values (1): ['Python']
------------------------------------------------------------
Column: library
Unique values (2): ['TensorLy' 'T3F']
--------

In [12]:
df_clean_logs

Unnamed: 0,method_name,decomposition_method,data_type,language,library,tensor_type,platform,backend,run_type,original_run_indices,...,duration_mean,duration_median,frobenius_error_min,frobenius_error_max,frobenius_error_mean,frobenius_error_median,compression_ratio_min,compression_ratio_max,compression_ratio_mean,compression_ratio_median
0,TensorLy_Tucker_image-0_pytorch_truncated_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.650366,2.486109,0.672185,0.672185,0.672185,0.672185,50.166931,50.166931,50.166931,50.166931
1,TensorLy_Tucker_image-0_pytorch_truncated_svd_...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.577740,2.555097,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
2,TensorLy_Tucker_image-0_pytorch_symeig_svd_random,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.695470,2.656554,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
3,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,1.336961,1.337399,0.672679,0.672679,0.672679,0.672679,50.166931,50.166931,50.166931,50.166931
4,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.675934,2.599889,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4767,TensorLy_CP_image-1_pytorch_symeig_svd_random_...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,0.103234,0.070808,21.682340,21.682340,21.682340,21.682340,50.020879,50.020879,50.020879,50.020879
4768,TensorLy_CP_image-1_pytorch_randomized_svd_ran...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,0.091954,0.071715,21.682340,21.682340,21.682340,21.682340,50.020879,50.020879,50.020879,50.020879
4769,TensorLy_CP_image-1_pytorch_truncated_svd_rand...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,0.097102,0.069679,21.682340,21.682340,21.682340,21.682340,50.020879,50.020879,50.020879,50.020879
4770,TensorLy_CP_image-1_pytorch_symeig_svd_random_...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,0.110875,0.072343,21.682340,21.682340,21.682340,21.682340,50.020879,50.020879,50.020879,50.020879


In [13]:
df_anomaly_logs.shape

(1173, 30)

In [14]:
# Show the distinct values of every column of the anomalous-runs frame.
separator = "-" * 60
for column_name in df_anomaly_logs.columns:
    try:
        distinct = df_anomaly_logs[column_name].unique()
        print(f"Column: {column_name}")
        print(f"Unique values ({len(distinct)}): {distinct}")
        print(separator)
    except TypeError as exc:
        # Columns holding unhashable objects (e.g. lists) cannot be uniqued.
        print(f"Skipped column '{column_name}' due to unhashable data type: {exc}")

Column: method_name
Unique values (736): ['TensorLy_Tucker_image-2_pytorch_truncated_svd_svd'
 'TensorLy_Tucker_image-2_pytorch_symeig_svd_random'
 'TensorLy_Tucker_image-2_pytorch_randomized_svd_svd'
 'TensorLy_Tucker_image-2_pytorch_randomized_svd_random'
 'TensorLy_Tucker_video-1_pytorch_truncated_svd_svd'
 'TensorLy_Tucker_video-1_pytorch_truncated_svd_random'
 'TensorLy_Tucker_video-1_pytorch_symeig_svd_random'
 'TensorLy_Tucker_video-1_pytorch_randomized_svd_svd'
 'TensorLy_Tucker_video-1_pytorch_randomized_svd_random'
 'TensorLy_Tucker_video-2_pytorch_truncated_svd_svd'
 'TensorLy_TensorTrain_image-0_pytorch_randomized_svd'
 'TensorLy_TensorTrain_image-0_pytorch_truncated_svd'
 'TensorLy_TensorTrain_image-1_pytorch_truncated_svd'
 'TensorLy_TensorTrain_video-2_pytorch_randomized_svd'
 'TensorLy_TensorTrain_eeg-0_pytorch_randomized_svd'
 'TensorLy_TensorTrain_eeg-0_pytorch_truncated_svd'
 'TensorLy_TensorTrain_eeg-1_pytorch_randomized_svd'
 'T3F_TensorTrain_image-0' 'T3F_TensorTr

In [15]:
df_anomaly_logs

Unnamed: 0,method_name,decomposition_method,data_type,language,library,tensor_type,platform,backend,run_type,original_run_indices,...,duration_mean,duration_median,frobenius_error_min,frobenius_error_max,frobenius_error_mean,frobenius_error_median,compression_ratio_min,compression_ratio_max,compression_ratio_mean,compression_ratio_median
0,TensorLy_Tucker_image-2_pytorch_truncated_svd_svd,Tucker,image-2,Python,TensorLy,Dense,"CPU, GPU",pytorch,anomaly,"[1, 3]",...,1.144363,1.144363,0.003966,0.003966,0.003966,0.003966,50.019817,50.019817,50.019817,50.019817
1,TensorLy_Tucker_image-2_pytorch_symeig_svd_random,Tucker,image-2,Python,TensorLy,Dense,"CPU, GPU",pytorch,anomaly,[3],...,2.124535,2.124535,0.004880,0.004880,0.004880,0.004880,50.019817,50.019817,50.019817,50.019817
2,TensorLy_Tucker_image-2_pytorch_randomized_svd...,Tucker,image-2,Python,TensorLy,Dense,"CPU, GPU",pytorch,anomaly,"[1, 2, 4]",...,1.385243,1.390793,0.004437,0.004437,0.004437,0.004437,50.019817,50.019817,50.019817,50.019817
3,TensorLy_Tucker_image-2_pytorch_randomized_svd...,Tucker,image-2,Python,TensorLy,Dense,"CPU, GPU",pytorch,anomaly,"[1, 3, 4]",...,2.113445,2.100399,0.004880,0.004880,0.004880,0.004880,50.019817,50.019817,50.019817,50.019817
4,TensorLy_Tucker_video-1_pytorch_truncated_svd_svd,Tucker,video-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,anomaly,"[2, 3]",...,0.447350,0.447350,1.242346,1.242346,1.242346,1.242346,50.148401,50.148401,50.148401,50.148401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168,TensorLy_CP_image-1_pytorch_symeig_svd_random_...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,anomaly,[0],...,-0.743756,-0.743756,2.538131,2.538131,2.538131,2.538131,50.020879,50.020879,50.020879,50.020879
1169,TensorLy_CP_image-1_pytorch_randomized_svd_ran...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,anomaly,[4],...,-0.673038,-0.673038,2.526722,2.526722,2.526722,2.526722,50.020879,50.020879,50.020879,50.020879
1170,TensorLy_CP_image-1_pytorch_randomized_svd_ran...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,anomaly,[2],...,-0.657496,-0.657496,2.526722,2.526722,2.526722,2.526722,50.020879,50.020879,50.020879,50.020879
1171,TensorLy_CP_image-1_pytorch_truncated_svd_rand...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,anomaly,[1],...,-0.499068,-0.499068,9.004544,9.004544,9.004544,9.004544,50.020879,50.020879,50.020879,50.020879


In [16]:
df_logs_errors.shape

(5386, 9)

In [17]:
# Show the distinct values of every column of the failed-executions frame.
separator = "-" * 60
for column_name in df_logs_errors.columns:
    try:
        distinct = df_logs_errors[column_name].unique()
        print(f"Column: {column_name}")
        print(f"Unique values ({len(distinct)}): {distinct}")
        print(separator)
    except TypeError as exc:
        # Columns holding unhashable objects (e.g. lists) cannot be uniqued.
        print(f"Skipped column '{column_name}' due to unhashable data type: {exc}")

Column: method_name
Unique values (2701): ['TensorLy_Tucker_image-0_pytorch_symeig_svd_svd'
 'TensorLy_Tucker_image-1_pytorch_symeig_svd_svd'
 'TensorLy_Tucker_image-2_pytorch_symeig_svd_svd' ...
 'TensorLy_CP_image-1_pytorch_truncated_svd_svd_True_True_rec_error_1.0_1e-09'
 'TensorLy_CP_image-1_pytorch_symeig_svd_svd_True_True_rec_error_1.0_1e-09'
 'TensorLy_CP_image-1_pytorch_randomized_svd_svd_True_True_rec_error_1.0_1e-09']
------------------------------------------------------------
Column: decomposition_method
Unique values (3): ['Tucker' 'TensorTrain' 'CP']
------------------------------------------------------------
Column: data_type
Unique values (8): ['image-0' 'image-1' 'image-2' 'video-0' 'video-1' 'video-2' 'eeg-0'
 'eeg-1']
------------------------------------------------------------
Column: language
Unique values (1): ['Python']
------------------------------------------------------------
Column: library
Unique values (1): ['TensorLy']
-----------------------------------

In [18]:
df_logs_errors

Unnamed: 0,method_name,decomposition_method,data_type,language,library,tensor_type,platform,backend,error_message
0,TensorLy_Tucker_image-0_pytorch_symeig_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme..."
1,TensorLy_Tucker_image-1_pytorch_symeig_svd_svd,Tucker,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme..."
2,TensorLy_Tucker_image-2_pytorch_symeig_svd_svd,Tucker,image-2,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme..."
3,TensorLy_Tucker_video-0_pytorch_symeig_svd_svd,Tucker,video-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme..."
4,TensorLy_Tucker_video-1_pytorch_symeig_svd_svd,Tucker,video-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme..."
...,...,...,...,...,...,...,...,...,...
5381,TensorLy_CP_image-1_pytorch_symeig_svd_svd_Tru...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme..."
5382,TensorLy_CP_image-1_pytorch_randomized_svd_svd...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, rand..."
5383,TensorLy_CP_image-1_pytorch_truncated_svd_svd_...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, trun..."
5384,TensorLy_CP_image-1_pytorch_symeig_svd_svd_Tru...,CP,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme..."


## Обработка ошибок

In [19]:
def extract_memory_info(error_message):
    """Parse the first "<number> <X>iB" memory size found in an error message.

    Args:
        error_message: Text to search.

    Returns:
        ``(size_in_mib, "MiB")`` for the first match, or ``(None, None)``
        when the message contains no such size.

    Raises:
        ValueError: If the matched unit is not one of KiB ... YiB.
    """
    found = re.search(r"(\d+\.?\d*)\s*(\w+iB)", error_message)
    if found is None:
        return None, None

    amount_text, unit = found.groups()

    # MiB per unit: KiB is 1024**-1, MiB is 1024**0, GiB is 1024**1, and so on.
    binary_units = ("KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")
    mib_per_unit = {name: 1024 ** power for power, name in enumerate(binary_units, start=-1)}

    if unit not in mib_per_unit:
        raise ValueError(f"Неизвестная единица измерения памяти: {unit}")

    return float(amount_text) * mib_per_unit[unit], "MiB"

In [20]:
# Extract the requested allocation size (in MiB) from every out-of-memory
# error message; print and count the errors that are not memory-related.
error_messages_count = 0
requested_memory_mb = []

for error_message in df_logs_errors["error_message"]:
    # Start from "unknown" on every row, so that a non-memory error never
    # inherits the size parsed from an earlier row.
    value = None

    if "Tried to allocate" in error_message:  # pytorch
        value, _ = extract_memory_info(error_message)
        if value is None:
            print("Не удалось извлечь информацию о памяти из GPU and RAM ошибки.")
    elif "Unable to allocate" in error_message:  # numpy
        value, _ = extract_memory_info(error_message)
        if value is None:
            print("Не удалось извлечь информацию о памяти из RAM ошибки.")
    else:
        error_messages_count += 1
        print(error_message)

    requested_memory_mb.append(value)

# Assign the whole column at once; rows without a parsed size become NaN.
df_logs_errors["memory_tried_to_allocated_error_mb"] = pd.Series(
    requested_memory_mb, index=df_logs_errors.index, dtype="float64"
)
print(error_messages_count)

Пропущена итерация из-за ошибки: pytorch, truncated_svd, random, False, False, 1e-08, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, symeig_svd, random, False, False, 1e-08, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, randomized_svd, random, False, False, 1e-08, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, truncated_svd, random, False, False, 1e-05, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, symeig_svd, random, False, False, 1e-05, 0, abs_rec_error. Ошибка: torch.linalg.solve: The solver failed because the input matrix is singular.
Пропущена итерация из-за ошибки: pytorch, randomized_svd, random,

# Функции для анализа логов

In [21]:
# Draw horizontal bar charts for metrics
def plot_barh(ax, x_data, y_data, title, xlabel, best_value=None, best_label=None, color="green"):
    """Draw a horizontal bar chart on ``ax`` and label every bar with its value.

    Args:
        ax: Axes-like object providing ``barh``, ``set_title``, ``set_xlabel``,
            ``axvline``, ``legend`` and ``text``.
        x_data: Bar lengths; any iterable of numbers (Series, array, list).
        y_data: Bar labels/positions passed to ``ax.barh``.
        title: Chart title.
        xlabel: Label of the x axis.
        best_value: Optional x position of a dashed red reference line.
        best_label: Legend label of the reference line; the line is drawn
            only when both ``best_value`` and ``best_label`` are given.
        color: Bar colour.
    """
    ax.barh(y_data, x_data, color=color)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel, fontsize=12)
    if best_value is not None and best_label is not None:
        ax.axvline(x=best_value, color="red", linestyle="--", label=best_label)
        ax.legend()
    # Label each bar by its position in x_data, not by its index label, so
    # lists and arrays work as well as pandas Series.
    # NOTE(review): assumes the i-th bar is drawn at y == i - confirm for
    # non-categorical y_data.
    for position, value in enumerate(x_data):
        ax.text(value, position, f"{value:.6f}", va="center")

In [22]:
def get_metrics_data(filtered_df):
    """Build a one-row-per-method table of median metrics.

    For every distinct ``method_name`` the first matching row of
    ``filtered_df`` is used, in order of first appearance.

    Args:
        filtered_df: Frame with the columns ``method_name``,
            ``duration_median``, ``ram_mem_used_mb_median``,
            ``gpu_allocated_memory_used_mb_median``,
            ``frobenius_error_median`` and ``compression_ratio_median``.

    Returns:
        A DataFrame indexed 0..n-1 with the columns ``method_name``,
        ``duration_median``, ``total_memory_median`` (RAM + GPU allocated),
        ``frobenius_error_median`` and ``compression_ratio_median``. The
        columns are present even when the input is empty.
    """
    # One pass instead of re-filtering the whole frame for every method.
    first_rows = filtered_df.drop_duplicates(subset="method_name", keep="first").reset_index(drop=True)

    total_memory = first_rows["ram_mem_used_mb_median"] + first_rows["gpu_allocated_memory_used_mb_median"]

    return pd.DataFrame(
        {
            "method_name": first_rows["method_name"],
            "duration_median": first_rows["duration_median"],
            "total_memory_median": total_memory,
            "frobenius_error_median": first_rows["frobenius_error_median"],
            "compression_ratio_median": first_rows["compression_ratio_median"],
        }
    )

In [23]:
# Analyse the quantitative metrics of the methods, split by one qualitative column.
def plot_barhs_and_analyze_dfs(df_logs, group_name: str = "decomposition_method"):
    """Find the best method configuration inside every group of ``df_logs``.

    The rows are grouped by the ``group_name`` column. Within each group the
    per-method medians are collected with ``get_metrics_data`` and the method
    with the smallest median duration, total memory and Frobenius error is
    selected.

    NOTE: despite the name, no chart is drawn at the moment; only the
    matplotlib font-size settings are applied and the analysis is returned.

    Args:
        df_logs: DataFrame of aggregated benchmark logs; must contain
            ``group_name`` and the columns required by ``get_metrics_data``.
        group_name: Column to split the analysis by.

    Returns:
        Dict mapping each group value to a dict with the keys
        ``best_time_method``, ``best_memory_method`` and ``best_error_method``;
        each value is the winning row of the metrics table as a dict.
    """
    plt.rcParams.update(
        {
            "axes.titlesize": 16,
            "axes.labelsize": 14,
            "xtick.labelsize": 12,
            "ytick.labelsize": 12,
            "legend.fontsize": 12,
        }
    )
    analysis_results = {}

    # sort=False keeps the groups in order of first appearance in df_logs.
    for group, filtered_df in df_logs.groupby(group_name, sort=False):
        analysis_df = get_metrics_data(filtered_df)

        best_time_method = analysis_df.loc[analysis_df["duration_median"].idxmin()]
        best_memory_method = analysis_df.loc[analysis_df["total_memory_median"].idxmin()]
        best_error_method = analysis_df.loc[analysis_df["frobenius_error_median"].idxmin()]

        analysis_results[group] = {
            "best_time_method": best_time_method.to_dict(),
            "best_memory_method": best_memory_method.to_dict(),
            "best_error_method": best_error_method.to_dict(),
        }

    return analysis_results

In [24]:
# # Отрисовка графика с планками погрешностей (min / mean / max) для одной метрики
# def plot_error_bar(ax, data_mean, data_min, data_max, title, ylabel, label):
#     yerr_lower = data_mean - data_min
#     yerr_upper = data_max - data_mean
#
#     yerr_lower = max(yerr_lower, 0)
#     yerr_upper = max(yerr_upper, 0)
#
#     ax.errorbar(
#         [0], [data_mean], yerr=[[yerr_lower], [yerr_upper]], fmt="o",
#         markersize=10, capsize=10, capthick=3, elinewidth=3, color="lightgreen", label=label
#     )
#     ax.set_title(title)
#     ax.set_ylabel(ylabel)
#     ax.set_xticks([0])
#     ax.set_xticklabels([label])
#
#     for val, _lbl in zip([data_mean, data_min, data_max], ["Mean", "Min", "Max"], strict=False):
#         ax.text(0, val, f"{val:.2f}", ha="center", va="bottom", fontsize=10, fontweight="bold")
#
#     ax.axhline(y=data_mean, color="grey", linestyle="--")
#     ax.axhline(y=data_min, color="lightgrey", linestyle=":")
#     ax.axhline(y=data_max, color="lightgrey", linestyle=":")

In [25]:
# # Анализ и визуализация методов по количественным показателям по минимальным, максимальным и средним значениям
# def plot_error_bars(method_data, method):
#     fig, axes = plt.subplots(3, 2, figsize=(16, 16))
#     fig.suptitle(f"Графики для метода: {method}", fontsize=16, fontweight="bold", color="green")
#
#     plot_error_bar(
#         axes[0, 0],
#         method_data["gpu_allocated_memory_used"].to_numpy()[0],
#         method_data["gpu_allocated_memory_used_min"].to_numpy()[0],
#         method_data["gpu_allocated_memory_used_max"].to_numpy()[0],
#         "GPU Allocated Memory",
#         "Memory (MB)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[0, 1],
#         method_data["gpu_cached_memory_used"].to_numpy()[0],
#         method_data["gpu_cached_memory_used_min"].to_numpy()[0],
#         method_data["gpu_cached_memory_used_max"].to_numpy()[0],
#         "GPU Cached Memory",
#         "Memory (MB)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[1, 0],
#         method_data["ram_mem_used"].to_numpy()[0],
#         method_data["ram_mem_used_min"].to_numpy()[0],
#         method_data["ram_mem_used_max"].to_numpy()[0],
#         "RAM Memory Usage",
#         "Memory (MB)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[1, 1],
#         method_data["duration"].to_numpy()[0],
#         method_data["duration_min"].to_numpy()[0],
#         method_data["duration_max"].to_numpy()[0],
#         "Duration",
#         "Time (s)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[2, 0],
#         method_data["frobenius_error"].to_numpy()[0],
#         method_data["frobenius_error_min"].to_numpy()[0],
#         method_data["frobenius_error_max"].to_numpy()[0],
#         "Frobenius Error",
#         "Error (%)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[2, 1],
#         method_data["compression_ratio"].to_numpy()[0],
#         method_data["compression_ratio_min"].to_numpy()[0],
#         method_data["compression_ratio_max"].to_numpy()[0],
#         "Compression Ratio",
#         "Ratio",
#         method,
#     )
#
#     plt.tight_layout(rect=[0, 0, 1, 0.95])
#     plt.show()

# Расчет лучших метрик

## Сравнение методов по типам данных

In [26]:
# Best configurations by duration, memory and Frobenius error within each data type.
analysis_results_data_type = plot_barhs_and_analyze_dfs(df_clean_logs, group_name="data_type")

## Сравнение метрик по методам

In [27]:
# Best configurations by duration, memory and Frobenius error within each decomposition method.
analysis_results_decomposition_method = plot_barhs_and_analyze_dfs(df_clean_logs, group_name="decomposition_method")

## Сравнение метрик по методам и типам данных

In [28]:
# Split the clean logs by data type, then analyse each slice per decomposition method.
data_types = df_clean_logs["data_type"].unique()

df_logs_by_data_types_dict = {}
for data_type in data_types:
    belongs_to_data_type = df_clean_logs["data_type"] == data_type
    df_logs_by_data_types_dict[data_type] = df_clean_logs[belongs_to_data_type]

analysis_results_data_types_decompositions_methods = {}
for data_type in data_types:
    analysis_results_data_types_decompositions_methods[data_type] = plot_barhs_and_analyze_dfs(
        df_logs_by_data_types_dict[data_type],
        group_name="decomposition_method",
    )

# Сравнение метрик

## Метрики в разрезе типов данных

In [29]:
# Pretty-print the best configurations found for each data type.
pprint(analysis_results_data_type, indent=4)

{   'eeg-0': {   'best_error_method': {   'compression_ratio_median': 52.237848577500216,
                                          'duration_median': 18.23839807510376,
                                          'frobenius_error_median': 0.024129144730977714,
                                          'method_name': 'TensorLy_TensorTrain_eeg-0_pytorch_randomized_svd',
                                          'total_memory_median': 2795.3369140625},
                 'best_memory_method': {   'compression_ratio_median': 52.237848577500216,
                                           'duration_median': 18.23839807510376,
                                           'frobenius_error_median': 0.024129144730977714,
                                           'method_name': 'TensorLy_TensorTrain_eeg-0_pytorch_randomized_svd',
                                           'total_memory_median': 2795.3369140625},
                 'best_time_method': {   'compression_ratio_median': 52.237848577500216,


## Метрики в разрезе методов

In [30]:
# Pretty-print the best configurations found for each decomposition method.
pprint(analysis_results_decomposition_method, indent=4)

{   'CP': {   'best_error_method': {   'compression_ratio_median': 50.06507925904784,
                                       'duration_median': 0.7971055507659912,
                                       'frobenius_error_median': 0.06767631857655942,
                                       'method_name': 'TensorLy_CP_image-2_pytorch_truncated_svd_random_False_True_abs_rec_error_1.0_1e-08',
                                       'total_memory_median': 4579.92041015625},
              'best_memory_method': {   'compression_ratio_median': 50.05868249417367,
                                        'duration_median': 0.5024111270904541,
                                        'frobenius_error_median': 0.4365097265690565,
                                        'method_name': 'TensorLy_CP_image-0_pytorch_symeig_svd_random_False_False_abs_rec_error_0_1e-08',
                                        'total_memory_median': 4437.20703125},
              'best_time_method': {   'compression_ratio_me

## Метрики в разрезе типов данных и методов

In [31]:
# Print, per data type, the best configurations of every decomposition method.
for data_type, metrics in analysis_results_data_types_decompositions_methods.items():
    print(f"\n{data_type}\n")
    pprint(metrics, indent=4)


image-0

{   'CP': {   'best_error_method': {   'compression_ratio_median': 50.05868249417367,
                                       'duration_median': 0.6631524562835693,
                                       'frobenius_error_median': 0.3912588581442833,
                                       'method_name': 'TensorLy_CP_image-0_pytorch_truncated_svd_random_False_True_abs_rec_error_0.01_1e-08',
                                       'total_memory_median': 6312.28515625},
              'best_memory_method': {   'compression_ratio_median': 50.05868249417367,
                                        'duration_median': 0.5024111270904541,
                                        'frobenius_error_median': 0.4365097265690565,
                                        'method_name': 'TensorLy_CP_image-0_pytorch_symeig_svd_random_False_False_abs_rec_error_0_1e-08',
                                        'total_memory_median': 4437.20703125},
              'best_time_method': {   'compression_r

# Аналитика логов количественных и качественных метрик

## Обработка данных

In [32]:
# Size of the clean-run table as (rows, columns).
df_clean_logs.shape

(4772, 30)

In [33]:
# Preview the first rows of the clean-run table.
df_clean_logs.head()

Unnamed: 0,method_name,decomposition_method,data_type,language,library,tensor_type,platform,backend,run_type,original_run_indices,...,duration_mean,duration_median,frobenius_error_min,frobenius_error_max,frobenius_error_mean,frobenius_error_median,compression_ratio_min,compression_ratio_max,compression_ratio_mean,compression_ratio_median
0,TensorLy_Tucker_image-0_pytorch_truncated_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.650366,2.486109,0.672185,0.672185,0.672185,0.672185,50.166931,50.166931,50.166931,50.166931
1,TensorLy_Tucker_image-0_pytorch_truncated_svd_...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.57774,2.555097,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
2,TensorLy_Tucker_image-0_pytorch_symeig_svd_random,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.69547,2.656554,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931
3,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,1.336961,1.337399,0.672679,0.672679,0.672679,0.672679,50.166931,50.166931,50.166931,50.166931
4,TensorLy_Tucker_image-0_pytorch_randomized_svd...,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,clean,"[0, 1, 2, 3, 4]",...,2.675934,2.599889,0.672281,0.672281,0.672281,0.672281,50.166931,50.166931,50.166931,50.166931


In [34]:
# List the available columns before selecting the ones used for analytics.
df_clean_logs.columns

Index(['method_name', 'decomposition_method', 'data_type', 'language',
       'library', 'tensor_type', 'platform', 'backend', 'run_type',
       'original_run_indices', 'gpu_allocated_memory_used_mb_min',
       'gpu_allocated_memory_used_mb_max', 'gpu_allocated_memory_used_mb_mean',
       'gpu_allocated_memory_used_mb_median', 'ram_mem_used_mb_min',
       'ram_mem_used_mb_max', 'ram_mem_used_mb_mean', 'ram_mem_used_mb_median',
       'duration_min', 'duration_max', 'duration_mean', 'duration_median',
       'frobenius_error_min', 'frobenius_error_max', 'frobenius_error_mean',
       'frobenius_error_median', 'compression_ratio_min',
       'compression_ratio_max', 'compression_ratio_mean',
       'compression_ratio_median'],
      dtype='object')

In [35]:
# Columns kept for analytics: identifiers first, then the min / median / mean / max
# aggregates of every quantitative metric.
analytics_columns = [
    # identifiers
    "method_name",
    "decomposition_method",
    "data_type",
    "backend",
    # allocated GPU memory
    "gpu_allocated_memory_used_mb_min",
    "gpu_allocated_memory_used_mb_median",
    "gpu_allocated_memory_used_mb_mean",
    "gpu_allocated_memory_used_mb_max",
    # RAM
    "ram_mem_used_mb_min",
    "ram_mem_used_mb_median",
    "ram_mem_used_mb_mean",
    "ram_mem_used_mb_max",
    # duration
    "duration_min",
    "duration_median",
    "duration_mean",
    "duration_max",
    # Frobenius error
    "frobenius_error_min",
    "frobenius_error_median",
    "frobenius_error_mean",
    "frobenius_error_max",
    # compression ratio
    "compression_ratio_min",
    "compression_ratio_median",
    "compression_ratio_mean",
    "compression_ratio_max",
]
df_filtered_for_analytics = df_clean_logs[analytics_columns]


In [36]:
# One sub-frame of the analytics table per decomposition method.
dfs_by_decomposition_method = {}
for decomposition_method in df_filtered_for_analytics["decomposition_method"].unique():
    uses_this_method = df_filtered_for_analytics["decomposition_method"] == decomposition_method
    dfs_by_decomposition_method[decomposition_method] = df_filtered_for_analytics[uses_this_method]

In [37]:
# Hyper-parameters copied from the raw logs into the analytics frames, per decomposition method.
method_arg_names = {
    "Tucker": ("init", "svd"),
    "TensorTrain": ("svd",),
    "CP": ("init", "svd", "normalize_factors", "orthogonalise", "tol", "l2_reg", "cvg_criterion"),
}

# Index the logs by method name once, so each row is an O(1) lookup instead of
# a scan over all logs. setdefault keeps the first log seen for a given name.
method_args_by_name = {}
for log in logs:
    method_args_by_name.setdefault(log["method_name"], log.get("method_args") or {})

for decomposition_method, df_by_decomposition_method in dfs_by_decomposition_method.items():
    enriched_df = df_by_decomposition_method.copy()

    # A method name without a matching log falls back to {} and yields None values
    # in the new columns.
    args_per_row = [method_args_by_name.get(name, {}) for name in enriched_df["method_name"]]
    for arg_name in method_arg_names.get(decomposition_method, ()):
        enriched_df[arg_name] = [method_args.get(arg_name) for method_args in args_per_row]

    dfs_by_decomposition_method[decomposition_method] = enriched_df

In [38]:
# Decomposition methods present in the clean-run data.
dfs_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

## Обработка ошибок

In [39]:
# Size of the failed-run table as (rows, columns).
df_logs_errors.shape

(5386, 10)

In [40]:
# Preview the first rows of the failed-run table.
df_logs_errors.head()

Unnamed: 0,method_name,decomposition_method,data_type,language,library,tensor_type,platform,backend,error_message,memory_tried_to_allocated_error_mb
0,TensorLy_Tucker_image-0_pytorch_symeig_svd_svd,Tucker,image-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme...",385996.8
1,TensorLy_Tucker_image-1_pytorch_symeig_svd_svd,Tucker,image-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme...",248903.68
2,TensorLy_Tucker_image-2_pytorch_symeig_svd_svd,Tucker,image-2,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme...",2586040.32
3,TensorLy_Tucker_video-0_pytorch_symeig_svd_svd,Tucker,video-0,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme...",46653.44
4,TensorLy_Tucker_video-1_pytorch_symeig_svd_svd,Tucker,video-1,Python,TensorLy,Dense,"CPU, GPU",pytorch,"Пропущена итерация из-за ошибки: pytorch, syme...",26245.12


In [41]:
# List the available columns before selecting the ones used for error analytics.
df_logs_errors.columns

Index(['method_name', 'decomposition_method', 'data_type', 'language',
       'library', 'tensor_type', 'platform', 'backend', 'error_message',
       'memory_tried_to_allocated_error_mb'],
      dtype='object')

In [42]:
# Columns kept for error analytics: identifiers plus the size of the failed allocation.
error_analytics_columns = [
    "method_name",
    "decomposition_method",
    "data_type",
    "backend",
    "memory_tried_to_allocated_error_mb",
]
df_filtered_errors_for_analytics = df_logs_errors[error_analytics_columns]


In [43]:
# One sub-frame of the error table per decomposition method.
dfs_errors_by_decomposition_method = {}
for decomposition_method in df_filtered_errors_for_analytics["decomposition_method"].unique():
    uses_this_method = df_filtered_errors_for_analytics["decomposition_method"] == decomposition_method
    dfs_errors_by_decomposition_method[decomposition_method] = df_filtered_errors_for_analytics[uses_this_method]

In [44]:
# Hyper-parameters copied from the raw logs into the error frames, per decomposition method.
error_method_arg_names = {
    "Tucker": ("init", "svd"),
    "TensorTrain": ("svd",),
    "CP": ("init", "svd", "normalize_factors", "orthogonalise", "tol", "l2_reg", "cvg_criterion"),
}

# Index the logs by method name once, so each row is an O(1) lookup instead of
# a scan over all logs. setdefault keeps the first log seen for a given name.
error_method_args_by_name = {}
for log in logs:
    error_method_args_by_name.setdefault(log["method_name"], log.get("method_args") or {})

for decomposition_method, df_by_decomposition_method in dfs_errors_by_decomposition_method.items():
    enriched_df = df_by_decomposition_method.copy()

    # A method name without a matching log falls back to {} and yields None values
    # in the new columns.
    args_per_row = [error_method_args_by_name.get(name, {}) for name in enriched_df["method_name"]]
    for arg_name in error_method_arg_names.get(decomposition_method, ()):
        enriched_df[arg_name] = [method_args.get(arg_name) for method_args in args_per_row]

    dfs_errors_by_decomposition_method[decomposition_method] = enriched_df

In [45]:
# Decomposition methods present in the failed-run data.
dfs_errors_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

## Методы для графиков

In [46]:
def preprocess_dataframe(df):
    """Return a numeric, min-max scaled copy of ``df``.

    Every object-dtype column except ``method_name`` is label-encoded with a
    fresh ``LabelEncoder``; afterwards all columns except ``method_name`` are
    rescaled together with one ``MinMaxScaler``. The input frame is not modified.
    """
    encoded = df.copy()

    categorical_columns = [
        name for name in encoded.columns if name != "method_name" and encoded[name].dtype == "object"
    ]
    for name in categorical_columns:
        encoded[name] = LabelEncoder().fit_transform(encoded[name])

    # `method_name` stays untouched: it is an identifier, not a feature.
    feature_columns = encoded.columns.difference(["method_name"])
    encoded[feature_columns] = MinMaxScaler().fit_transform(encoded[feature_columns])
    return encoded

In [47]:
def plot_heatmap(df, title):
    """Save an annotated correlation heatmap of ``df`` as an HTML file.

    The ``method_name`` column is dropped, the pairwise correlation of the
    remaining columns is computed and rendered with Plotly. The figure is
    written to ``../.cache/data_analyze/heatmap_{title}.html``; the folder is
    created when missing. Nothing is returned or displayed.

    Args:
        df: DataFrame with a ``method_name`` column and numeric feature columns.
        title: Used in the figure title and in the output file name.
    """
    folder_path = Path("../.cache/data_analyze/")

    corr_matrix = df.drop(columns=["method_name"]).corr()

    fig = go.Figure(
        data=go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.index,
            colorscale=colors.sequential.RdBu,
            colorbar={
                # Nested title/font form: the `titlefont` shorthand is deprecated
                # and no longer accepted by newer Plotly releases.
                "title": {"text": "Correlation", "font": {"color": "black"}},
                "tickfont": {"color": "black"},
            },
            text=corr_matrix.values,
            texttemplate="%{text:.2f}",
            textfont={"size": 12, "color": "black"},
            hovertemplate="<b>%{x}</b> vs. <b>%{y}</b> <b>Correlation:</b> %{z:.8f}<extra></extra>",
        )
    )

    fig.update_layout(
        title=f"Heatmap: {title}",
        title_font={"color": "black"},
        xaxis={"tickangle": -45, "tickfont": {"size": 12, "color": "black"}},
        yaxis={"tickfont": {"size": 12, "color": "black"}},
        width=1400,
        height=1400,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    folder_path.mkdir(parents=True, exist_ok=True)

    fig.write_html(folder_path / f"heatmap_{title}.html")

In [48]:
def plot_pca(df, title):
    """Save 2D and 3D PCA scatter plots of ``df`` as HTML files.

    The ``method_name`` column is dropped, the remaining columns are projected
    onto three principal components and the points are coloured by method name.
    The figures are written to ``../.cache/data_analyze/pca_2d_{title}.html``
    and ``pca_3d_{title}.html``; the folder is created when missing. Nothing is
    returned or displayed.

    Args:
        df: DataFrame with a ``method_name`` column and numeric feature columns.
        title: Used in the figure titles and in the output file names.
    """
    folder_path = Path("../.cache/data_analyze/")
    folder_path.mkdir(parents=True, exist_ok=True)

    pca_df = df.drop(columns=["method_name"])
    pca = PCA(n_components=3)
    pca_components = pca.fit_transform(pca_df)

    pca_df = pd.DataFrame(
        {
            "PCA1": pca_components[:, 0],
            "PCA2": pca_components[:, 1],
            "PCA3": pca_components[:, 2],
            "method_name": df["method_name"].to_numpy(),
        }
    )

    # Axis fonts use the nested `title.font` form: the `titlefont` shorthand is
    # deprecated and no longer accepted by newer Plotly releases.
    axis_title_font = {"size": 12, "color": "black"}
    axis_tick_font = {"size": 12, "color": "black"}

    fig1 = px.scatter(
        pca_df, x="PCA1", y="PCA2", color="method_name", title=f"PCA 2D Projection: {title}", hover_name="method_name"
    )
    fig1.update_traces(marker={"size": 10})
    fig1.update_layout(
        width=1800,
        height=1400,
        plot_bgcolor="white",
        paper_bgcolor="white",
        title_font={"color": "black"},
        xaxis={
            "title": {"font": axis_title_font},
            "tickfont": axis_tick_font,
        },
        yaxis={
            "title": {"font": axis_title_font},
            "tickfont": axis_tick_font,
        },
        showlegend=False,
    )

    fig1.write_html(folder_path / f"pca_2d_{title}.html")

    fig2 = px.scatter_3d(
        pca_df,
        x="PCA1",
        y="PCA2",
        z="PCA3",
        color="method_name",
        title=f"PCA 3D Projection: {title}",
        hover_name="method_name",
    )
    fig2.update_traces(marker={"size": 5})
    fig2.update_layout(
        scene={
            "xaxis": {
                "title": {"text": "PCA1", "font": axis_title_font},
                "tickfont": axis_tick_font,
                "backgroundcolor": "white",
            },
            "yaxis": {
                "title": {"text": "PCA2", "font": axis_title_font},
                "tickfont": axis_tick_font,
                "backgroundcolor": "white",
            },
            "zaxis": {
                "title": {"text": "PCA3", "font": axis_title_font},
                "tickfont": axis_tick_font,
                "backgroundcolor": "white",
            },
            "bgcolor": "white",
        },
        width=1800,
        height=1400,
        plot_bgcolor="white",
        paper_bgcolor="white",
        title_font={"color": "black"},
        showlegend=False,
    )

    fig2.write_html(folder_path / f"pca_3d_{title}.html")

In [49]:
def plot_pairplot(df, title, sample_size=1000, num_axes=100):
    """Save a seaborn pairplot of ``df`` as a PNG file.

    The ``method_name`` column is dropped. Frames longer than ``sample_size``
    are down-sampled with a fixed seed, and only the first ``num_axes`` columns
    are plotted. The image is written to
    ``../.cache/data_analyze/pairplot_{title}.png`` (folder created when
    missing) and all open matplotlib figures are closed afterwards.

    Args:
        df: DataFrame with a ``method_name`` column and numeric feature columns.
        title: Used in the figure title and in the output file name.
        sample_size: Maximum number of rows to plot.
        num_axes: Maximum number of columns to plot.
    """
    output_dir = Path("../.cache/data_analyze/")
    output_dir.mkdir(parents=True, exist_ok=True)

    features = df.drop(columns=["method_name"])
    if len(features) > sample_size:
        features = features.sample(n=sample_size, random_state=42)
    features = features[features.columns[:num_axes]]

    sns.set(style="white")
    grid = sns.pairplot(features, diag_kind="kde", markers="+", plot_kws={"s": 50})
    grid.fig.suptitle(f"Pairplot: {title}", y=1.02, fontsize=16, color="black")

    for axis in grid.axes.ravel():
        axis.tick_params(axis="x", labelsize=12, labelcolor="black")
        axis.tick_params(axis="y", labelsize=12, labelcolor="black")
        axis.set_facecolor("white")
        # Re-apply the current tick positions before rotating their labels.
        axis.set_xticks(axis.get_xticks())
        axis.set_xticklabels(axis.get_xticklabels(), rotation=45, ha="right")

    plt.savefig(output_dir / f"pairplot_{title}.png", dpi=300, bbox_inches="tight")
    plt.close("all")

## Энкодинг и нормализация датафреймов

In [50]:
# Containers for the encoded and scaled frames, keyed by decomposition method:
# one for the clean-run data, one for the failed runs.
dfs_by_decomposition_method_encoded = {}
dfs_errors_by_decomposition_method_encoded = {}

In [51]:
# Keys of the clean-run frames that are about to be encoded.
dfs_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

In [52]:
# Keys of the failed-run frames that are about to be encoded.
dfs_errors_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

In [53]:
# Encode and scale every clean-run frame; the constant `decomposition_method`
# column is dropped first.
dfs_by_decomposition_method_encoded.update(
    {
        method: preprocess_dataframe(frame.drop(columns="decomposition_method"))
        for method, frame in dfs_by_decomposition_method.items()
    }
)

In [54]:
# Encode and scale every failed-run frame; the constant `decomposition_method`
# column is dropped first.
dfs_errors_by_decomposition_method_encoded.update(
    {
        method: preprocess_dataframe(frame.drop(columns="decomposition_method"))
        for method, frame in dfs_errors_by_decomposition_method.items()
    }
)

In [55]:
# dfs_by_decomposition_method['CP']

In [56]:
# dfs_by_decomposition_method_encoded['CP']

In [57]:
# Keys under which the encoded clean-run frames are stored.
dfs_by_decomposition_method_encoded.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

# Графики для данных

## Heatmap

In [58]:
# Encoded frames are keyed by the bare method name; "_data" only labels the output file.
method_name = "Tucker"
plot_heatmap(dfs_by_decomposition_method_encoded[method_name], title=f"{method_name}_data")

In [59]:
# Encoded frames are keyed by the bare method name; "_data" only labels the output file.
method_name = "TensorTrain"
plot_heatmap(dfs_by_decomposition_method_encoded[method_name], title=f"{method_name}_data")

In [60]:
# Encoded frames are keyed by the bare method name; "_data" only labels the output file.
method_name = "CP"
plot_heatmap(dfs_by_decomposition_method_encoded[method_name], title=f"{method_name}_data")

## PCA

In [61]:
# Encoded frames are keyed by the bare method name; "_data" only labels the output files.
method_name = "Tucker"
plot_pca(dfs_by_decomposition_method_encoded[method_name], title=f"{method_name}_data")

In [62]:
# Encoded frames are keyed by the bare method name; "_data" only labels the output files.
method_name = "TensorTrain"
plot_pca(dfs_by_decomposition_method_encoded[method_name], title=f"{method_name}_data")

In [63]:
# Encoded frames are keyed by the bare method name; "_data" only labels the output files.
method_name = "CP"
plot_pca(dfs_by_decomposition_method_encoded[method_name], title=f"{method_name}_data")

## Pairplot

In [64]:
# Encoded frames are keyed by the bare method name; "_data" only labels the output file.
method_name = "Tucker"
plot_pairplot(dfs_by_decomposition_method_encoded[method_name], title=f"{method_name}_data")
None

<IPython.core.display.Javascript object>

In [65]:
# Encoded frames are keyed by the bare method name; "_data" only labels the output file.
method_name = "TensorTrain"
plot_pairplot(dfs_by_decomposition_method_encoded[method_name], title=f"{method_name}_data")
None

<IPython.core.display.Javascript object>

In [66]:
# Encoded frames are keyed by the bare method name; "_data" only labels the output file.
method_name = "CP"
plot_pairplot(dfs_by_decomposition_method_encoded[method_name], title=f"{method_name}_data")
None

<IPython.core.display.Javascript object>

# Графики для ошибок

## Heatmap

In [67]:
# Failed-run frames live in the *errors* dict, keyed by the bare method name;
# "_errors" only labels the output file.
method_name = "Tucker"
plot_heatmap(dfs_errors_by_decomposition_method_encoded[method_name], title=f"{method_name}_errors")

In [68]:
# Failed-run frames live in the *errors* dict, keyed by the bare method name;
# "_errors" only labels the output file.
method_name = "TensorTrain"
plot_heatmap(dfs_errors_by_decomposition_method_encoded[method_name], title=f"{method_name}_errors")

In [69]:
# Failed-run frames live in the *errors* dict, keyed by the bare method name;
# "_errors" only labels the output file.
method_name = "CP"
plot_heatmap(dfs_errors_by_decomposition_method_encoded[method_name], title=f"{method_name}_errors")

## PCA

In [70]:
# Failed-run frames live in the *errors* dict, keyed by the bare method name;
# "_errors" only labels the output files.
method_name = "Tucker"
plot_pca(dfs_errors_by_decomposition_method_encoded[method_name], title=f"{method_name}_errors")

In [71]:
# Failed-run frames live in the *errors* dict, keyed by the bare method name;
# "_errors" only labels the output files.
method_name = "TensorTrain"
plot_pca(dfs_errors_by_decomposition_method_encoded[method_name], title=f"{method_name}_errors")

In [72]:
# Failed-run frames live in the *errors* dict, keyed by the bare method name;
# "_errors" only labels the output files.
method_name = "CP"
plot_pca(dfs_errors_by_decomposition_method_encoded[method_name], title=f"{method_name}_errors")

## Pairplot

In [73]:
# Failed-run frames live in the *errors* dict, keyed by the bare method name;
# "_errors" only labels the output file.
method_name = "Tucker"
plot_pairplot(dfs_errors_by_decomposition_method_encoded[method_name], title=f"{method_name}_errors")
None

<IPython.core.display.Javascript object>

In [74]:
# Failed-run frames live in the *errors* dict, keyed by the bare method name;
# "_errors" only labels the output file.
method_name = "TensorTrain"
plot_pairplot(dfs_errors_by_decomposition_method_encoded[method_name], title=f"{method_name}_errors")
None

<IPython.core.display.Javascript object>

In [75]:
# Failed-run frames live in the *errors* dict, keyed by the bare method name;
# "_errors" only labels the output file.
method_name = "CP"
plot_pairplot(dfs_errors_by_decomposition_method_encoded[method_name], title=f"{method_name}_errors")
None

<IPython.core.display.Javascript object>


KeyboardInterrupt

