# Feature Representation

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap.umap_ as umap
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
def show_plots(features, original_and_transformed_dataframe, x, y, grid_shape_x, grid_shape_y, figsize=(20, 10)):
    """Draw one scatter plot per feature over a shared (x, y) projection.

    Every subplot shows the same points, positioned by the columns ``x`` and
    ``y`` of ``original_and_transformed_dataframe`` and coloured by the values
    of one entry of ``features``. Grid cells that are not needed are removed
    from the figure before it is shown.

    Parameters
    ----------
    features : iterable
        Column labels used, one per subplot, to colour the points.
    original_and_transformed_dataframe : mapping of column label -> values
        Table holding both the feature columns and the ``x`` / ``y`` columns.
    x, y : hashable
        Column labels plotted on the horizontal and vertical axes.
    grid_shape_x, grid_shape_y : int
        Number of rows and columns of the subplot grid.
    figsize : tuple, optional
        Figure size forwarded to ``plt.subplots``.

    Raises
    ------
    ValueError
        If the grid has fewer cells than there are features to plot.
    """
    features = list(features)
    n_cells = grid_shape_x * grid_shape_y
    if len(features) > n_cells:
        raise ValueError(
            f"Cannot draw {len(features)} features on a "
            f"{grid_shape_x}x{grid_shape_y} grid ({n_cells} subplots)"
        )

    # squeeze=False always yields a 2-D array of axes, so flattening also
    # works for single-row, single-column and 1x1 grids.
    fig, axes = plt.subplots(grid_shape_x, grid_shape_y, figsize=figsize, squeeze=False)
    axes = axes.ravel()

    for feature, ax in zip(features, axes):
        # Same projection in every subplot; only the colouring changes.
        scatter = ax.scatter(
            original_and_transformed_dataframe[x],
            original_and_transformed_dataframe[y],
            c=original_and_transformed_dataframe[feature],
            cmap='viridis',
            s=60,
            edgecolor='k'
        )

        ax.set_xlabel(x)
        ax.set_ylabel(y)

        # The colour bar label tells the reader which feature is encoded.
        fig.colorbar(scatter, ax=ax, label=feature)

    # Drop every grid cell that did not receive a feature. The count of
    # spare cells depends on the grid shape, not on the parity of
    # len(features), so it may be zero, one or many.
    for unused_ax in axes[len(features):]:
        fig.delaxes(unused_ax)

    # Avoid overlapping labels and colour bars.
    plt.tight_layout()
    plt.show()

In [4]:
def mae_for_umap_components(data_scaled, components_range, random_state=42):
    """Plot the UMAP reconstruction error for several embedding sizes.

    For each value in ``components_range`` a UMAP reducer is fitted on
    ``data_scaled``, the embedding is mapped back with ``inverse_transform``
    and the mean absolute error against ``data_scaled`` is recorded. The
    errors are then plotted against the number of components.

    Parameters
    ----------
    data_scaled : array-like
        Data passed to ``fit_transform`` and used as the reconstruction target.
    components_range : iterable of int
        Values of ``n_components`` to evaluate.
    random_state : int or None, optional
        Seed forwarded to ``umap.UMAP`` so the curve is the same on every
        run; pass ``None`` for an unseeded run.
    """
    # Materialise once: the values are iterated here and reused as the
    # x-axis of the plot, which a one-shot iterable could not support.
    components_list = list(components_range)
    reconstruction_errors = []

    for components in components_list:
        # Announce the step before the fit so progress is visible while it runs.
        print("\nstarting - ", components)
        umap_reducer = umap.UMAP(n_components=components, random_state=random_state)
        umap_embeddings = umap_reducer.fit_transform(data_scaled)

        # Map the embedding back to the input space.
        reconstructed_data = umap_reducer.inverse_transform(umap_embeddings)
        print("inverse done - ", components)

        mae = mean_absolute_error(data_scaled, reconstructed_data)
        print("mae done - ", components)
        reconstruction_errors.append(mae)

    # Reconstruction error as a function of the embedding size.
    plt.plot(components_list, reconstruction_errors, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('Reconstruction Error (MAE)')
    plt.title('UMAP Reconstruction Error vs Components')
    plt.show()

## Races

In [None]:
# Load the races table, discard the two unused columns, keep only the numeric
# features and drop every row that still has a missing value.
races_dataset = (
    pd.read_csv('../data/races_dataset_no_outliers.csv')
    .drop(columns=['uci_points', 'profile'])
    .select_dtypes(include=['number'])
    .dropna()
    .reset_index(drop=True)  # contiguous index after the rows were dropped
)

races_dataset.info()

# Standardise the features with StandardScaler before PCA / UMAP.
scaler = StandardScaler()
races_dataset_scaled = scaler.fit_transform(races_dataset)

### PCA

In [16]:
# Fit PCA on the standardised races features and project the data onto the
# fitted components. No n_components is passed to PCA() here.
pca_instance = PCA()
pca_transformation = pca_instance.fit_transform(races_dataset_scaled)

In [None]:
# Variance carried by each component, the same values scaled to sum to 1,
# and the eigenvectors (the axes of the new reference frame).
eigenvalues = pca_instance.explained_variance_
scaled_eigenvalues = pca_instance.explained_variance_ratio_
eigenvectors = pca_instance.components_

# Sum of squared projected coordinates for each component, followed by the
# running total obtained when components are added one at a time.
transformation_norm_per_column = np.square(pca_transformation).sum(axis=0)
cumulative_norm_per_reduction = transformation_norm_per_column.cumsum()

# One row per principal component.
pca_dataframe = pd.DataFrame(
    {
        "eigenvalues": eigenvalues,
        "scaled_eigenvalues": scaled_eigenvalues,
        "transformation_norm": cumulative_norm_per_reduction,
    }
)
pca_dataframe

In [None]:
# Scree plot: share of the total variance explained by each principal
# component, used to look for an elbow.
scree_ax = sb.lineplot(
    data=pca_dataframe,
    x=pca_dataframe.index,
    y="scaled_eigenvalues",
)
# Label the figure so it can be read on its own when the notebook is skimmed.
scree_ax.set(
    xlabel="Principal component (0-based index)",
    ylabel="Explained variance ratio",
    title="PCA scree plot",
)
plt.show()

L'analisi del plot non mostra un gomito (elbow) chiaro: secondo la PCA è quindi possibile che nessuna delle 6 componenti analizzate sia nettamente più informativa delle altre.

In [19]:
# NOTE: from here on `pca_dataframe` no longer holds the per-component
# variance table built earlier; it is rebound to the coordinates of every
# row on the first four principal components.
pca_dataframe = pd.DataFrame(
    data=pca_transformation[:, :4],
    columns=["pca_1", "pca_2", "pca_3", "pca_4"],
)

# Original features and their PCA coordinates side by side, matched on the row index.
original_and_transformed_dataframe = pd.concat(
    (races_dataset, pca_dataframe),
    axis=1,
)

In [None]:
# Every races feature coloured over the pca_1 / pca_2 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=original_and_transformed_dataframe,
    x="pca_1",
    y="pca_2",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 30),
)

PCA 1-2 — features showing poor alignment with these components:
- startlist-quality
- race_year

In [None]:
# Every races feature coloured over the pca_1 / pca_3 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=original_and_transformed_dataframe,
    x="pca_1",
    y="pca_3",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 30),
)

PCA 1-3 — features showing poor alignment with these components:
- startlist-quality
- race_year

In [None]:
# Every races feature coloured over the pca_1 / pca_4 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=original_and_transformed_dataframe,
    x="pca_1",
    y="pca_4",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 30),
)

PCA 1-4 — features showing poor alignment with these components:
- startlist-quality
- race_year

In [None]:
# Every races feature coloured over the pca_2 / pca_3 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=original_and_transformed_dataframe,
    x="pca_2",
    y="pca_3",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 30),
)

PCA 2-3 — features showing poor alignment with these components:
- startlist-quality
- race_year

In [None]:
# Every races feature coloured over the pca_2 / pca_4 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=original_and_transformed_dataframe,
    x="pca_2",
    y="pca_4",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 30),
)

PCA 2-4 — features showing poor alignment with these components:
- startlist-quality
- race_year

In [None]:
# Every races feature coloured over the pca_3 / pca_4 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=original_and_transformed_dataframe,
    x="pca_3",
    y="pca_4",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 30),
)

PCA 3-4 — features showing poor alignment with these components:
- startlist-quality
- race_year

Summary:
Considering all the pairwise projections of the features onto the principal components found by PCA, we notice that both "startlist-quality" and "race_year" show poor alignment overall. In our analysis these are therefore considered redundant features and will be removed during the clustering of cyclists data.

### UMAP

UMAP può essere valutato misurando quanto bene i dati ridotti permettono di ricostruire quelli originali.
Possiamo provare diverse configurazioni di `n_components` e confrontare la qualità della ricostruzione utilizzando la trasformazione inversa (`inverse_transform`) o altre metriche di distanza.

In [None]:
# Reconstruction MAE of UMAP for 2 to 6 embedding dimensions.
mae_for_umap_components(
    data_scaled=races_dataset_scaled,
    components_range=range(2, 7),
)

In [None]:
# Fit a 5-component UMAP on the standardised races features, with the seed
# fixed through random_state=42.
# NOTE(review): presumably 5 was picked from the MAE-vs-components curve
# computed above — confirm.
umap_reducer = umap.UMAP(n_components=5, random_state=42)
umap_embeddings = umap_reducer.fit_transform(races_dataset_scaled)

In [None]:
# Preview only the first rows of the embedding instead of displaying the whole array.
umap_embeddings[:5]

In [None]:
# Coordinates of every row on the five UMAP dimensions.
umap_dataframe = pd.DataFrame(
    data=umap_embeddings[:, :5],
    columns=["umap_1", "umap_2", "umap_3", "umap_4", "umap_5"],
)

# Original features and their UMAP coordinates side by side, matched on the row index.
umap_original_and_transformed_dataframe = pd.concat(
    (races_dataset, umap_dataframe),
    axis=1,
)

In [None]:
# Every races feature coloured over the umap_1 / umap_2 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_1",
    y="umap_2",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)

In [None]:
# Every races feature coloured over the umap_1 / umap_3 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_1",
    y="umap_3",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)

In [None]:
# Every races feature coloured over the umap_1 / umap_4 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_1",
    y="umap_4",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)

In [None]:
# Every races feature coloured over the umap_1 / umap_5 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_1",
    y="umap_5",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)

In [None]:
# Every races feature coloured over the umap_2 / umap_3 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_2",
    y="umap_3",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)

In [None]:
# Every races feature coloured over the umap_2 / umap_4 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_2",
    y="umap_4",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)

In [None]:
# Every races feature coloured over the umap_2 / umap_5 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_2",
    y="umap_5",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)

In [None]:
# Every races feature coloured over the umap_3 / umap_4 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_3",
    y="umap_4",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)

In [None]:
# Every races feature coloured over the umap_3 / umap_5 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_3",
    y="umap_5",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)

In [None]:
# Every races feature coloured over the umap_4 / umap_5 projection.
show_plots(
    features=races_dataset.columns,
    original_and_transformed_dataframe=umap_original_and_transformed_dataframe,
    x="umap_4",
    y="umap_5",
    grid_shape_x=4,
    grid_shape_y=2,
    figsize=(20, 20),
)