# Feature Representation

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
from sklearn.decomposition import PCA

In [None]:
# Read the complete dataset from disk and turn the `date` column
# (stored as year-month-day text) into proper datetime values.
dataset = pd.read_csv('../data/complete_dataset.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d')
)

In [None]:
# Store `profile` as a categorical column; a categorical dtype is not picked
# up by the numeric selection performed right below.
dataset['profile'] = dataset['profile'].astype('category')

# PCA input: every numeric column, with missing values replaced by 0.
numeric_dataset = dataset.select_dtypes(include="number").fillna(0)

# Fit a default-configured PCA on the numeric matrix, then project the same
# matrix onto the fitted components.
pca_instance = PCA().fit(numeric_dataset.values)
pca_transformation = pca_instance.transform(numeric_dataset.values)

In [None]:
# Variance captured by each principal component.
eigenvalues = pca_instance.explained_variance_
# The same variances expressed as a fraction of the total (they sum to 1).
scaled_eigenvalues = pca_instance.explained_variance_ratio_
# Principal axes, i.e. the reference axes of the transformed space.
eigenvectors = pca_instance.components_


# Sum of squares of each projected column, then accumulated across components
# so that row k covers the first k + 1 components.
transformation_norm_per_column = np.square(pca_transformation).sum(axis=0)
cumulative_norm_per_reduction = transformation_norm_per_column.cumsum()

# One row per component, summarising the quantities computed above.
pca_dataframe = pd.DataFrame(
    {
        "eigenvalues": eigenvalues,
        "scaled_eigenvalues": scaled_eigenvalues,
        "transformation_norm": cumulative_norm_per_reduction,
    }
)
pca_dataframe

In [None]:
# Scree plot: fraction of variance explained by each component, plotted
# against the component's position in the summary table.
sb.lineplot(
    data=pca_dataframe,
    x=pca_dataframe.index,
    y="scaled_eigenvalues",
)

In [None]:
# Keep only the first two principal components as plotting coordinates.
# NOTE: this rebinds `pca_dataframe`, replacing the per-component variance
# summary that was stored under the same name earlier in the notebook.
# TODO: if more than 2 components are needed, change the slice and the column names
pca_dataframe = pd.DataFrame(
    data=pca_transformation[:, :2],
    columns=["pca_x", "pca_y"],
)

# Place the projected coordinates next to the original columns (aligned on index).
original_and_transformed_dataframe = pd.concat([dataset, pca_dataframe], axis=1)

In [None]:
# NOTE(review): this scatter plot of the two PCA coordinates (coloured by
# `race_name`) is currently disabled: the call is wrapped in a string literal,
# so the cell only evaluates to that string and draws nothing.
'''sb.scatterplot(
    original_and_transformed_dataframe,
    x="pca_x",
    y="pca_y",
    hue="race_name"
)'''