# Импорты

In [249]:
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly import colors
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from src.utils.read_logs import LogReader

In [250]:
%matplotlib notebook

# Чтение логов

In [251]:
# Benchmark log file analysed by this notebook (relative path).
log_file_path = "logs/2024-10-25 method_logs.json"
# NOTE(review): later cells index every entry with "method_name", "quantitative_metrics",
# "qualitative_metrics" and "method_args", so `logs` is presumably a sequence of dicts with
# those keys — confirm against LogReader.
logs = LogReader.load_logs_from_file(log_file_path)

In [252]:
# Output column prefix -> key of the raw sample list inside log["quantitative_metrics"].
# The insertion order defines the column order of df_logs.
QUANTITATIVE_METRIC_KEYS = {
    "gpu_allocated_memory_used": "gpu_allocated_memory_used_mb",
    "gpu_cached_memory_used": "gpu_cached_memory_used_mb",
    "ram_mem_used": "ram_mem_used_mb",
    "duration": "duration",
    "frobenius_error": "frobenius_error",
    "compression_ratio": "compression_ratio",
}

# Output column name -> key inside log["qualitative_metrics"].
QUALITATIVE_METRIC_KEYS = {
    "decomposition_method": "Decomposition method",
    "data_type": "Data type",
    "language": "Language",
    "library": "Library",
    "tensor_type": "Tensor type",
    "platform": "Platform",
}


def _summarize_metric(column_prefix, samples):
    """Collapse the raw samples of one metric into four summary columns.

    Negative samples are dropped before aggregating. The returned dict holds
    ``<prefix>_min``, ``<prefix>_max``, ``<prefix>_mean`` and ``<prefix>``;
    the un-suffixed entry repeats the maximum. If no non-negative sample is
    left, all four values are NaN (``np.min``/``np.max`` would raise on an
    empty list).
    """
    valid_samples = [sample for sample in samples if sample >= 0.0]
    if valid_samples:
        lowest = np.min(valid_samples)
        highest = np.max(valid_samples)
        average = np.mean(valid_samples)
    else:
        lowest = highest = average = np.nan

    return {
        f"{column_prefix}_min": lowest,
        f"{column_prefix}_max": highest,
        f"{column_prefix}_mean": average,
        column_prefix: highest,
    }


log_data = []

for log in logs:
    # One flat record per log: name, summarised quantitative metrics, then qualitative labels.
    data_entry = {"method_name": log["method_name"]}
    for column_prefix, metric_key in QUANTITATIVE_METRIC_KEYS.items():
        data_entry.update(_summarize_metric(column_prefix, log["quantitative_metrics"][metric_key]))
    for column_name, metric_key in QUALITATIVE_METRIC_KEYS.items():
        data_entry[column_name] = log["qualitative_metrics"][metric_key]
    log_data.append(data_entry)

df_logs = pd.DataFrame(log_data)

In [253]:
# Distinct method names found in df_logs, in order of first appearance.
methods = df_logs["method_name"].unique()

In [254]:
methods

array(['TensorLy_Tucker_video_pytorch_truncated_svd_svd',
       'TensorLy_Tucker_video_pytorch_truncated_svd_random',
       'TensorLy_Tucker_video_pytorch_symeig_svd_random',
       'TensorLy_Tucker_video_pytorch_randomized_svd_svd',
       'TensorLy_Tucker_video_pytorch_randomized_svd_random',
       'TensorLy_Tucker_video_numpy_truncated_svd_svd',
       'TensorLy_Tucker_video_numpy_truncated_svd_random',
       'TensorLy_Tucker_video_numpy_symeig_svd_random',
       'TensorLy_Tucker_video_numpy_randomized_svd_svd',
       'TensorLy_Tucker_video_numpy_randomized_svd_random',
       'TensorLy_Tucker_image_pytorch_truncated_svd_svd',
       'TensorLy_Tucker_image_pytorch_truncated_svd_random',
       'TensorLy_Tucker_image_pytorch_symeig_svd_random',
       'TensorLy_Tucker_image_pytorch_randomized_svd_svd',
       'TensorLy_Tucker_image_pytorch_randomized_svd_random',
       'TensorLy_Tucker_image_numpy_truncated_svd_svd',
       'TensorLy_Tucker_image_numpy_truncated_svd_random',
 

# Функции для анализа логов

In [255]:
def plot_barh(ax, x_data, y_data, title, xlabel, best_value=None, best_label=None, color="green"):
    """Draw a horizontal bar chart of one metric on ``ax``.

    Args:
        ax: Axes-like object to draw on.
        x_data: Bar lengths; indexed positionally through ``.iloc``.
        y_data: Bar labels, passed to ``ax.barh`` as the y values.
        title: Axes title.
        xlabel: Label of the x axis.
        best_value: Optional x position of a dashed red reference line.
        best_label: Legend label of the reference line. The line and the
            legend are drawn only when both ``best_value`` and ``best_label``
            are given.
        color: Bar colour.
    """
    ax.barh(y_data, x_data, color=color)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel, fontsize=12)

    draw_reference_line = best_value is not None and best_label is not None
    if draw_reference_line:
        ax.axvline(x=best_value, color="red", linestyle="--", label=best_label)
        ax.legend()

    # Annotate each bar with its value at the bar's end.
    for bar_position in range(len(x_data)):
        bar_value = x_data.iloc[bar_position]
        ax.text(bar_value, bar_position, f"{bar_value:.6f}", va="center")

In [256]:
def get_metrics_data(filtered_df):
    """Build a one-row-per-method table of the metrics used for ranking.

    For every distinct ``method_name`` the first matching row of
    ``filtered_df`` is used. ``total_memory`` is the sum of ``ram_mem_used``
    and ``gpu_cached_memory_used``.

    Args:
        filtered_df: DataFrame with the columns ``method_name``,
            ``compression_ratio``, ``duration``, ``ram_mem_used``,
            ``gpu_cached_memory_used`` and ``frobenius_error``.

    Returns:
        DataFrame with a fresh 0..n-1 index and the columns ``method_name``,
        ``duration``, ``total_memory``, ``frobenius_error`` and
        ``compression_ratio``, ordered by first appearance of each method.
        An empty input yields an empty frame that still has these columns.
    """
    # One pass instead of re-filtering the whole frame for every method name.
    first_rows = filtered_df.drop_duplicates(subset="method_name", keep="first")
    total_memory = first_rows["ram_mem_used"] + first_rows["gpu_cached_memory_used"]

    # to_numpy() drops the source index so the result gets a clean RangeIndex.
    return pd.DataFrame(
        {
            "method_name": first_rows["method_name"].to_numpy(),
            "duration": first_rows["duration"].to_numpy(),
            "total_memory": total_memory.to_numpy(),
            "frobenius_error": first_rows["frobenius_error"].to_numpy(),
            "compression_ratio": first_rows["compression_ratio"].to_numpy(),
        }
    )

In [257]:
def plot_barhs_and_analyze_dfs(df_logs, group_name: str = "decomposition_method"):
    """Pick the best method per group for time, memory and reconstruction error.

    Rows of ``df_logs`` are split by the distinct values of the ``group_name``
    column. Each group is reduced with ``get_metrics_data`` and the row with
    the lowest ``duration``, ``total_memory`` and ``frobenius_error`` is
    selected.

    Note: despite the name, no chart is rendered here. The only
    plotting-related effect is that matplotlib's global font-size rcParams
    are updated.

    Args:
        df_logs: DataFrame of per-method metrics.
        group_name: Column whose values define the groups.

    Returns:
        Dict mapping each group value to a dict with the keys
        ``best_time_method``, ``best_memory_method`` and ``best_error_method``;
        every value is the winning row converted to a plain dict.
    """
    font_sizes = {
        "axes.titlesize": 16,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 12,
        "legend.fontsize": 12,
    }
    plt.rcParams.update(font_sizes)

    # Result key -> metric column that is minimised to find the winner.
    ranking_criteria = {
        "best_time_method": "duration",
        "best_memory_method": "total_memory",
        "best_error_method": "frobenius_error",
    }

    analysis_results = {}
    for group in df_logs[group_name].unique():
        belongs_to_group = df_logs[group_name] == group
        analysis_df = get_metrics_data(df_logs[belongs_to_group])

        analysis_results[group] = {
            result_key: analysis_df.loc[analysis_df[metric_column].idxmin()].to_dict()
            for result_key, metric_column in ranking_criteria.items()
        }

    return analysis_results

In [258]:
# # Отрисовка горизонтальных бар-графиков для метрик
# def plot_error_bar(ax, data_mean, data_min, data_max, title, ylabel, label):
#     yerr_lower = data_mean - data_min
#     yerr_upper = data_max - data_mean
#
#     yerr_lower = max(yerr_lower, 0)
#     yerr_upper = max(yerr_upper, 0)
#
#     ax.errorbar(
#         [0], [data_mean], yerr=[[yerr_lower], [yerr_upper]], fmt="o",
#         markersize=10, capsize=10, capthick=3, elinewidth=3, color="lightgreen", label=label
#     )
#     ax.set_title(title)
#     ax.set_ylabel(ylabel)
#     ax.set_xticks([0])
#     ax.set_xticklabels([label])
#
#     for val, _lbl in zip([data_mean, data_min, data_max], ["Mean", "Min", "Max"], strict=False):
#         ax.text(0, val, f"{val:.2f}", ha="center", va="bottom", fontsize=10, fontweight="bold")
#
#     ax.axhline(y=data_mean, color="grey", linestyle="--")
#     ax.axhline(y=data_min, color="lightgrey", linestyle=":")
#     ax.axhline(y=data_max, color="lightgrey", linestyle=":")

In [259]:
# # Анализ и визуализация методов по количественным показателям по минимальным, максимальным и средним значениям
# def plot_error_bars(method_data, method):
#     fig, axes = plt.subplots(3, 2, figsize=(16, 16))
#     fig.suptitle(f"Графики для метода: {method}", fontsize=16, fontweight="bold", color="green")
#
#     plot_error_bar(
#         axes[0, 0],
#         method_data["gpu_allocated_memory_used"].to_numpy()[0],
#         method_data["gpu_allocated_memory_used_min"].to_numpy()[0],
#         method_data["gpu_allocated_memory_used_max"].to_numpy()[0],
#         "GPU Allocated Memory",
#         "Memory (MB)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[0, 1],
#         method_data["gpu_cached_memory_used"].to_numpy()[0],
#         method_data["gpu_cached_memory_used_min"].to_numpy()[0],
#         method_data["gpu_cached_memory_used_max"].to_numpy()[0],
#         "GPU Cached Memory",
#         "Memory (MB)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[1, 0],
#         method_data["ram_mem_used"].to_numpy()[0],
#         method_data["ram_mem_used_min"].to_numpy()[0],
#         method_data["ram_mem_used_max"].to_numpy()[0],
#         "RAM Memory Usage",
#         "Memory (MB)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[1, 1],
#         method_data["duration"].to_numpy()[0],
#         method_data["duration_min"].to_numpy()[0],
#         method_data["duration_max"].to_numpy()[0],
#         "Duration",
#         "Time (s)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[2, 0],
#         method_data["frobenius_error"].to_numpy()[0],
#         method_data["frobenius_error_min"].to_numpy()[0],
#         method_data["frobenius_error_max"].to_numpy()[0],
#         "Frobenius Error",
#         "Error (%)",
#         method,
#     )
#
#     plot_error_bar(
#         axes[2, 1],
#         method_data["compression_ratio"].to_numpy()[0],
#         method_data["compression_ratio_min"].to_numpy()[0],
#         method_data["compression_ratio_max"].to_numpy()[0],
#         "Compression Ratio",
#         "Ratio",
#         method,
#     )
#
#     plt.tight_layout(rect=[0, 0, 1, 0.95])
#     plt.show()

# Расчет лучших метрик

## Сравнение методов по типам данных

In [260]:
# Best methods (time / memory / Frobenius error) within each value of the data_type column.
analysis_results_data_type = plot_barhs_and_analyze_dfs(df_logs, group_name="data_type")

## Сравнение метрик по методам

In [261]:
# Best methods (time / memory / Frobenius error) within each decomposition method.
analysis_results_decomposition_method = plot_barhs_and_analyze_dfs(df_logs, group_name="decomposition_method")

## Сравнение метрик по методам и типам данных

In [262]:
# Two-level breakdown: split df_logs by data type first, then rank the methods
# per decomposition method inside each data-type slice.
data_types = df_logs["data_type"].unique()

# data type -> rows of df_logs with that data type
df_logs_by_data_types_dict = {data_type: df_logs[df_logs["data_type"] == data_type] for data_type in data_types}

# data type -> {decomposition method -> best time / memory / error rows}
analysis_results_data_types_decompositions_methods = {
    data_type: plot_barhs_and_analyze_dfs(df_logs_by_data_types_dict[data_type], group_name="decomposition_method") for data_type in data_types
}

# Сравнение метрик

## Метрики в разрезе типов данных

In [263]:
pprint(analysis_results_data_type, indent=4)

{   'image': {   'best_error_method': {   'compression_ratio': 50.177304964539005,
                                          'duration': 440.7263283729553,
                                          'frobenius_error': 0.3439680049630961,
                                          'method_name': 'TensorLy_CP_image_numpy_truncated_svd_random_False_False_abs_rec_error_0_1e-08',
                                          'total_memory': 2577.8046875},
                 'best_memory_method': {   'compression_ratio': 50.177304964539005,
                                           'duration': 6.756335496902466,
                                           'frobenius_error': 0.46598981134593487,
                                           'method_name': 'TensorLy_CP_image_pytorch_truncated_svd_random_False_False_abs_rec_error_0_1e-08',
                                           'total_memory': 528.0234375},
                 'best_time_method': {   'compression_ratio': 50.119146421206175,
             

## Метрики в разрезе методов

In [264]:
pprint(analysis_results_decomposition_method, indent=4)

{   'CP': {   'best_error_method': {   'compression_ratio': 50.177304964539005,
                                       'duration': 440.7263283729553,
                                       'frobenius_error': 0.3439680049630961,
                                       'method_name': 'TensorLy_CP_image_numpy_truncated_svd_random_False_False_abs_rec_error_0_1e-08',
                                       'total_memory': 2577.8046875},
              'best_memory_method': {   'compression_ratio': 50.177304964539005,
                                        'duration': 6.756335496902466,
                                        'frobenius_error': 0.46598981134593487,
                                        'method_name': 'TensorLy_CP_image_pytorch_truncated_svd_random_False_False_abs_rec_error_0_1e-08',
                                        'total_memory': 528.0234375},
              'best_time_method': {   'compression_ratio': 50.177304964539005,
                                      'duratio

## Метрики в разрезе типов данных и методов

In [265]:
# Print one pretty-printed block of best-method results per data type.
for data_type, metrics in analysis_results_data_types_decompositions_methods.items():
    print(f"\n{data_type}\n")
    pprint(metrics, indent=4)


video

{   'TensorTrain': {   'best_error_method': {   'compression_ratio': 50.335521440935175,
                                                'duration': 2.6868460178375244,
                                                'frobenius_error': 0.22329173516482115,
                                                'method_name': 'T3F_TensorTrain_video',
                                                'total_memory': 7386.12158203125},
                       'best_memory_method': {   'compression_ratio': 50.335521440935175,
                                                 'duration': 5.771303653717041,
                                                 'frobenius_error': 0.22822143509984016,
                                                 'method_name': 'TensorLy_TensorTrain_video_pytorch_randomized_svd',
                                                 'total_memory': 5226.3984375},
                       'best_time_method': {   'compression_ratio': 50.335521440935175,
                    

# Аналитика

## Обработка данных

In [266]:
df_logs.shape

(871, 31)

In [267]:
df_logs.head()

Unnamed: 0,method_name,gpu_allocated_memory_used_min,gpu_allocated_memory_used_max,gpu_allocated_memory_used_mean,gpu_allocated_memory_used,gpu_cached_memory_used_min,gpu_cached_memory_used_max,gpu_cached_memory_used_mean,gpu_cached_memory_used,ram_mem_used_min,...,compression_ratio_min,compression_ratio_max,compression_ratio_mean,compression_ratio,decomposition_method,data_type,language,library,tensor_type,platform
0,TensorLy_Tucker_video_pytorch_truncated_svd_svd,0.0,266.816895,53.363379,266.816895,3792.0,4312.0,3897.2,4312.0,2554.921875,...,50.248883,50.248883,50.248883,50.248883,Tucker,video,Python,TensorLy,Dense,"CPU, GPU"
1,TensorLy_Tucker_video_pytorch_truncated_svd_ra...,0.0,258.691895,51.738379,258.691895,3616.0,3618.0,3617.2,3618.0,3227.910156,...,50.248883,50.248883,50.248883,50.248883,Tucker,video,Python,TensorLy,Dense,"CPU, GPU"
2,TensorLy_Tucker_video_pytorch_symeig_svd_random,0.0,258.691895,51.738379,258.691895,3616.0,3622.0,3618.4,3622.0,3233.460938,...,50.248883,50.248883,50.248883,50.248883,Tucker,video,Python,TensorLy,Dense,"CPU, GPU"
3,TensorLy_Tucker_video_pytorch_randomized_svd_svd,0.0,258.691895,51.738379,258.691895,5016.0,5020.0,5017.2,5020.0,3233.9375,...,50.248883,50.248883,50.248883,50.248883,Tucker,video,Python,TensorLy,Dense,"CPU, GPU"
4,TensorLy_Tucker_video_pytorch_randomized_svd_r...,0.0,258.691895,51.738379,258.691895,3616.0,3618.0,3617.6,3618.0,3234.621094,...,50.248883,50.248883,50.248883,50.248883,Tucker,video,Python,TensorLy,Dense,"CPU, GPU"


In [268]:
# Keep only the columns used by the analytics below: for each quantitative metric the
# un-suffixed column and its mean, plus the method name and the two grouping columns.
# The *_min / *_max columns and the remaining qualitative columns are left out.
df_filtered_for_analytics = df_logs[
    [
        "method_name",
        "gpu_allocated_memory_used",
        "gpu_allocated_memory_used_mean",
        "gpu_cached_memory_used",
        "gpu_cached_memory_used_mean",
        "ram_mem_used",
        "ram_mem_used_mean",
        "duration",
        "duration_mean",
        "frobenius_error",
        "frobenius_error_mean",
        "compression_ratio",
        "compression_ratio_mean",
        "data_type",
        "decomposition_method",
    ]
]

In [269]:
# decomposition method -> rows of df_filtered_for_analytics that use that method
dfs_by_decomposition_method = {
    decomposition_method: df_filtered_for_analytics[df_filtered_for_analytics["decomposition_method"] == decomposition_method]
    for decomposition_method in df_filtered_for_analytics["decomposition_method"].unique()
}

In [270]:
# Decomposition method -> hyper-parameters copied from log["method_args"] into new
# columns, in this order. Methods that are not listed get no extra columns.
METHOD_ARGS_BY_DECOMPOSITION = {
    "Tucker": ("init", "svd"),
    "TensorTrain": ("svd",),
    "CP": ("init", "svd", "normalize_factors", "orthogonalise", "tol", "l2_reg", "cvg_criterion"),
}

# Index the logs once instead of scanning the whole list for every row.
# setdefault keeps the first log seen for a method name, as the linear scan did.
logs_by_method_name = {}
for log in logs:
    logs_by_method_name.setdefault(log["method_name"], log)

# Iterate over a snapshot because the dict values are replaced inside the loop.
for decomposition_method, df_by_decomposition_method in list(dfs_by_decomposition_method.items()):
    enriched_rows = []
    for _, row in df_by_decomposition_method.iterrows():
        matching_log = logs_by_method_name.get(row["method_name"])

        enriched_row = row.copy()
        # A hyper-parameter missing from method_args is stored as None.
        for arg_name in METHOD_ARGS_BY_DECOMPOSITION.get(row["decomposition_method"], ()):
            enriched_row[arg_name] = matching_log["method_args"].get(arg_name)

        enriched_rows.append(enriched_row)

    dfs_by_decomposition_method[decomposition_method] = pd.DataFrame(enriched_rows)

In [271]:
dfs_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

## Методы для графиков

In [272]:
def preprocess_dataframe(df):
    """Return a copy of ``df`` with every feature column encoded and min-max scaled.

    ``method_name`` is left untouched. Every other column that is not numeric
    (object, string, category, ...) is replaced by integer codes from a fresh
    ``LabelEncoder``; boolean columns count as numeric and are not encoded.
    All feature columns are then scaled together with ``MinMaxScaler``.

    Args:
        df: DataFrame containing a ``method_name`` column plus feature columns.

    Returns:
        A new DataFrame with the same columns; the input is not modified.
    """
    df_encoded = df.copy()
    feature_columns = df_encoded.columns.difference(["method_name"])

    for column in feature_columns:
        # Checking for "non-numeric" rather than dtype == "object" also catches
        # text columns stored with a dedicated string or categorical dtype,
        # which would otherwise reach the scaler un-encoded.
        if not pd.api.types.is_numeric_dtype(df_encoded[column]):
            df_encoded[column] = LabelEncoder().fit_transform(df_encoded[column])

    df_encoded[feature_columns] = MinMaxScaler().fit_transform(df_encoded[feature_columns])
    return df_encoded

In [273]:
def plot_heatmap(df, title):
    """Write an interactive correlation heatmap of ``df`` to an HTML file.

    The ``method_name`` column is dropped, the pairwise correlation of the
    remaining columns is computed with ``DataFrame.corr`` and rendered as a
    plotly heatmap annotated with two-decimal values. The figure is saved as
    ``heatmap_<title>.html`` inside ``../.cache/data_analyze/`` (created if
    missing). Nothing is returned or displayed.

    Args:
        df: DataFrame with a ``method_name`` column plus the columns to correlate.
        title: Used in the figure title and in the output file name.
    """
    folder_path = Path("../.cache/data_analyze/")

    corr_matrix = df.drop(columns=["method_name"]).corr()

    fig = go.Figure(
        data=go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.index,
            colorscale=colors.sequential.RdBu,
            colorbar={
                # `title.font` replaces the deprecated `titlefont` shorthand,
                # which recent plotly releases reject as an invalid property.
                "title": {"text": "Correlation", "font": {"color": "black"}},
                "tickfont": {"color": "black"},
            },
            text=corr_matrix.values,
            texttemplate="%{text:.2f}",
            textfont={"size": 12, "color": "black"},
            hovertemplate="<b>Correlation:</b> %{z:.8f}<extra></extra>",
        )
    )

    fig.update_layout(
        title=f"Heatmap: {title}",
        title_font={"color": "black"},
        xaxis={"tickangle": -45, "tickfont": {"size": 12, "color": "black"}},
        yaxis={"tickfont": {"size": 12, "color": "black"}},
        width=1400,
        height=1400,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    folder_path.mkdir(parents=True, exist_ok=True)

    fig.write_html(folder_path / f"heatmap_{title}.html")

In [274]:
def plot_pca(df, title):
    """Project ``df`` onto three principal components and save 2D and 3D scatter plots.

    The ``method_name`` column is dropped before fitting ``PCA(n_components=3)``
    and re-attached afterwards to colour and label the points. Two HTML files
    are written to ``../.cache/data_analyze/`` (created if missing):
    ``pca_2d_<title>.html`` (PCA1 vs PCA2) and ``pca_3d_<title>.html``.
    Nothing is returned or displayed.

    Args:
        df: DataFrame with a ``method_name`` column plus numeric feature columns.
        title: Used in the figure titles and in the output file names.
    """
    folder_path = Path("../.cache/data_analyze/")
    folder_path.mkdir(parents=True, exist_ok=True)

    pca_df = df.drop(columns=["method_name"])
    pca = PCA(n_components=3)
    pca_components = pca.fit_transform(pca_df)

    pca_df = pd.DataFrame(
        {"PCA1": pca_components[:, 0], "PCA2": pca_components[:, 1], "PCA3": pca_components[:, 2], "method_name": df["method_name"].to_numpy()}
    )

    # Shared font for axis titles and tick labels.
    axis_font = {"size": 12, "color": "black"}

    fig1 = px.scatter(pca_df, x="PCA1", y="PCA2", color="method_name", title=f"PCA 2D Projection: {title}", hover_name="method_name")
    fig1.update_traces(marker={"size": 10})
    fig1.update_layout(
        width=1800,
        height=1400,
        plot_bgcolor="white",
        paper_bgcolor="white",
        title_font={"color": "black"},
        # `title.font` replaces the deprecated `titlefont` shorthand, which
        # recent plotly releases reject. Only the font is given, so the axis
        # title text set by plotly express is kept.
        xaxis={
            "title": {"font": dict(axis_font)},
            "tickfont": dict(axis_font),
        },
        yaxis={
            "title": {"font": dict(axis_font)},
            "tickfont": dict(axis_font),
        },
        showlegend=False,
    )

    fig1.write_html(folder_path / f"pca_2d_{title}.html")

    fig2 = px.scatter_3d(
        pca_df,
        x="PCA1",
        y="PCA2",
        z="PCA3",
        color="method_name",
        title=f"PCA 3D Projection: {title}",
        hover_name="method_name",
    )
    fig2.update_traces(marker={"size": 5})

    # All settings of one scene axis live in a single dict, so nothing depends
    # on how separate keys for the same axis would be merged.
    scene_axes = {
        axis_key: {
            "title": {"text": component_name, "font": dict(axis_font)},
            "tickfont": dict(axis_font),
            "backgroundcolor": "white",
        }
        for axis_key, component_name in (("xaxis", "PCA1"), ("yaxis", "PCA2"), ("zaxis", "PCA3"))
    }
    fig2.update_layout(
        scene={**scene_axes, "bgcolor": "white"},
        width=1800,
        height=1400,
        plot_bgcolor="white",
        paper_bgcolor="white",
        title_font={"color": "black"},
        showlegend=False,
    )

    fig2.write_html(folder_path / f"pca_3d_{title}.html")

In [275]:
def plot_pairplot(df, title, sample_size=1000, num_axes=100):
    """Save a seaborn pairplot of the feature columns of ``df`` as a PNG.

    The ``method_name`` column is dropped. If more than ``sample_size`` rows
    remain, a reproducible random sample (``random_state=42``) of that size is
    plotted instead. Only the first ``num_axes`` columns are used. The image
    is written to ``../.cache/data_analyze/pairplot_<title>.png`` (directory
    created if missing) and the figure is closed afterwards.

    Args:
        df: DataFrame with a ``method_name`` column plus the columns to plot.
        title: Used in the figure title and in the output file name.
        sample_size: Maximum number of rows to plot.
        num_axes: Maximum number of columns to plot.
    """
    output_dir = Path("../.cache/data_analyze/")
    output_dir.mkdir(parents=True, exist_ok=True)

    features = df.drop(columns=["method_name"])
    too_many_rows = len(features) > sample_size
    if too_many_rows:
        features = features.sample(n=sample_size, random_state=42)

    plotted_columns = features.columns[:num_axes]

    sns.set(style="white")
    grid = sns.pairplot(features[plotted_columns], diag_kind="kde", markers="+", plot_kws={"s": 50})
    grid.fig.suptitle(f"Pairplot: {title}", y=1.02, fontsize=16, color="black")

    for panel in grid.axes.flatten():
        for axis_name in ("x", "y"):
            panel.tick_params(axis=axis_name, labelsize=12, labelcolor="black")
        panel.set_facecolor("white")
        # Pin the tick positions before assigning labels to them.
        panel.set_xticks(panel.get_xticks())
        panel.set_xticklabels(panel.get_xticklabels(), rotation=45, ha="right")

    output_file = output_dir / f"pairplot_{title}.png"
    plt.savefig(output_file, dpi=300, bbox_inches="tight")
    plt.close()

In [276]:
# decomposition method -> encoded and scaled frame; filled by a later cell.
dfs_by_decomposition_method_encoded = {}

In [277]:
dfs_by_decomposition_method.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

In [278]:
# Encode and scale each per-method frame. The decomposition_method column is dropped
# first because each frame was built by filtering on a single value of it.
for key, df_by_decomposition_method in dfs_by_decomposition_method.items():
    dfs_by_decomposition_method_encoded[key] = preprocess_dataframe(df_by_decomposition_method.drop("decomposition_method", axis=1))

In [237]:
# dfs_by_decomposition_method['CP']

In [238]:
# dfs_by_decomposition_method_encoded['CP']

In [239]:
dfs_by_decomposition_method_encoded.keys()

dict_keys(['Tucker', 'TensorTrain', 'CP'])

## Heatmap

In [240]:
# Correlation heatmap for the Tucker runs; plot_heatmap shows nothing inline and
# writes ../.cache/data_analyze/heatmap_Tucker.html.
# Note: method_name holds a decomposition-method key here, not a value of the method_name column.
method_name = "Tucker"
plot_heatmap(dfs_by_decomposition_method_encoded[method_name], title=method_name)

In [241]:
# Correlation heatmap for the TensorTrain runs; written to ../.cache/data_analyze/heatmap_TensorTrain.html.
method_name = "TensorTrain"
plot_heatmap(dfs_by_decomposition_method_encoded[method_name], title=method_name)

In [242]:
# Correlation heatmap for the CP runs; written to ../.cache/data_analyze/heatmap_CP.html.
method_name = "CP"
plot_heatmap(dfs_by_decomposition_method_encoded[method_name], title=method_name)

## PCA

In [243]:
# PCA projections for the Tucker runs; plot_pca shows nothing inline and writes
# pca_2d_Tucker.html and pca_3d_Tucker.html to ../.cache/data_analyze/.
method_name = "Tucker"
plot_pca(dfs_by_decomposition_method_encoded[method_name], title=method_name)

In [244]:
# PCA projections for the TensorTrain runs; written as pca_2d_TensorTrain.html and
# pca_3d_TensorTrain.html to ../.cache/data_analyze/.
method_name = "TensorTrain"
plot_pca(dfs_by_decomposition_method_encoded[method_name], title=method_name)

In [245]:
# PCA projections for the CP runs; written as pca_2d_CP.html and pca_3d_CP.html
# to ../.cache/data_analyze/.
method_name = "CP"
plot_pca(dfs_by_decomposition_method_encoded[method_name], title=method_name)

## Pairplot

In [246]:
# Pairplot for the Tucker runs; plot_pairplot saves ../.cache/data_analyze/pairplot_Tucker.png
# and closes the figure.
method_name = "Tucker"
plot_pairplot(dfs_by_decomposition_method_encoded[method_name], title=method_name)

<IPython.core.display.Javascript object>

In [247]:
# Pairplot for the TensorTrain runs; saved as ../.cache/data_analyze/pairplot_TensorTrain.png.
method_name = "TensorTrain"
plot_pairplot(dfs_by_decomposition_method_encoded[method_name], title=method_name)

<IPython.core.display.Javascript object>

In [248]:
# Pairplot for the CP runs; saved as ../.cache/data_analyze/pairplot_CP.png.
method_name = "CP"
plot_pairplot(dfs_by_decomposition_method_encoded[method_name], title=method_name)

<IPython.core.display.Javascript object>