# Técnicas para la reducción de la dimensionalidad 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import make_classification
# Build a synthetic classification dataset: 1000 samples with 20 features,
# of which 10 are informative and 10 are redundant. The seed is fixed so
# the generated data is reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=10,
    random_state=7,
)
# Show the dimensions of the feature matrix and of the label vector.
print(X.shape, y.shape)

(1000, 20) (1000,)


In [3]:
# Wrap the feature matrix in a DataFrame whose column labels are the
# stringified positional indices ("0", "1", ...), then preview the first rows.
df = pd.DataFrame(X).rename(columns=str)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.080548,0.822733,-1.211753,2.421184,3.304243,-6.343576,-0.369886,0.064277,0.094521,-4.36443,-2.943058,0.502341,-2.339147,0.153031,0.543728,-0.420523,1.456127,2.882609,1.7916,-4.297088
1,-2.3303,-4.866086,-3.882913,-2.232483,1.445153,2.597391,3.689269,-1.651189,-2.47866,-1.719449,1.739934,-3.885896,-1.440399,3.128698,-5.370488,3.881865,0.759844,-0.145616,-0.554894,0.614208
2,-1.19715,1.555631,-0.618716,7.193674,-3.300375,-0.033224,4.182462,0.290963,0.886022,-0.685233,2.631218,1.015341,2.441568,0.932073,-1.899996,-3.049732,-3.174851,1.734818,0.130674,-3.133515
3,1.535769,-0.733499,0.204541,0.907992,-1.1428,-2.342064,2.533897,-1.140052,4.231472,0.032415,0.610521,2.093248,-2.800467,-2.09334,1.10282,1.38599,-0.476395,3.055135,1.764456,-1.132424
4,1.947908,3.409123,3.574408,-4.579258,3.181229,-2.982798,-3.618861,1.490834,3.125231,-1.153063,-2.563104,0.668844,-3.032168,-2.448262,2.317296,5.087334,1.568465,-0.429839,1.992009,2.669929


- Prueba de esfericidad de Bartlett: Esta prueba verifica si la matriz de correlación es significativamente diferente de la matriz de identidad. Un resultado significativo sugiere que hay suficientes correlaciones entre las variables para proceder con el análisis.

- Prueba de KMO (Kaiser-Meyer-Olkin): Mide la adecuación muestral. Un valor de KMO mayor a 0.6 generalmente se considera adecuado para realizar PCA o análisis factorial. Valores más cercanos a 1 indican que la factorización es apropiada.

```mermaid
flowchart TD
    cat?(Categorical data?) --> |"✅"| num_too?(Numerical data too?)
    num_too? --> |"✅"| FAMD
    num_too? --> |"❌"| multiple_cat?(More than two columns?)
    multiple_cat? --> |"✅"| MCA
    multiple_cat? --> |"❌"| CA
    cat? --> |"❌"| groups?(Groups of columns?)
    groups? --> |"✅"| MFA
    groups? --> |"❌"| shapes?(Analysing shapes?)
    shapes? --> |"✅"| GPA
    shapes? --> |"❌"| PCA
```

In [10]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def PCA_Data_Exploration(df, n_components=2):
    """Standardize ``df`` and project it onto its principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. Every column is treated as a feature (no target column
        is separated). The frame itself is not modified.
    n_components : default 2
        Forwarded unchanged to ``sklearn.decomposition.PCA``.

    Returns
    -------
    var_explained : numpy.ndarray
        ``explained_variance_ratio_`` of the fitted PCA, one entry per component.
    pcaDf : pandas.DataFrame
        Component scores, one row per row of ``df`` (same index), with columns
        ``PC1 .. PCk``.
    coefs : pandas.DataFrame
        Transposed ``components_`` matrix (loadings): one row per column of
        ``df``, one column per component, labelled like ``pcaDf``.
    """
    # Standardize features. fit_transform returns a new array (StandardScaler
    # copies by default), so the caller's frame is left untouched.
    scaler = StandardScaler()
    X = scaler.fit_transform(df)

    # Fit the PCA and obtain the component scores.
    pca = PCA(n_components=n_components)
    components = pca.fit_transform(X)

    # Share of the total variance captured by each component.
    var_explained = pca.explained_variance_ratio_

    # Derive the labels from the number of components actually produced
    # instead of hard-coding ['PC1', 'PC2'], which failed for any
    # n_components other than 2.
    pc_names = [f"PC{k + 1}" for k in range(components.shape[1])]

    # DataFrame of principal component scores, aligned with the input rows.
    pcaDf = pd.DataFrame(data=components, columns=pc_names, index=df.index)

    # DataFrame of PCA coefficients: features as rows, components as columns.
    coefs = pd.DataFrame(np.transpose(pca.components_), columns=pc_names, index=df.columns)

    return var_explained, pcaDf, coefs

In [11]:
# Run the exploration on the feature frame with the default two components.
# The cell displays the returned tuple: explained-variance ratios,
# component scores and loadings.
PCA_Data_Exploration(df)

(array([0.25657675, 0.1944439 ]),
           PC1       PC2
 0   -1.217545  0.026503
 1    3.268246  0.974665
 2   -0.023231 -0.915713
 3    0.134642 -1.545401
 4   -0.824325  1.352947
 ..        ...       ...
 995  0.806855 -0.195492
 996  0.537893 -0.761208
 997  0.577504 -1.547846
 998 -1.516509  0.562249
 999 -3.747064  0.939686
 
 [1000 rows x 2 columns],
          PC1       PC2
 0  -0.084542 -0.319162
 1  -0.221663  0.278080
 2  -0.040809  0.191524
 3  -0.330766 -0.034293
 4  -0.108151  0.355053
 5   0.278527  0.131885
 6   0.302254 -0.296494
 7  -0.186415  0.189129
 8   0.053782 -0.307644
 9  -0.131574 -0.185510
 10  0.223346  0.143625
 11 -0.261034  0.031500
 12 -0.200400  0.008890
 13  0.310076  0.081747
 14 -0.287154 -0.311857
 15  0.347202  0.150044
 16 -0.122211  0.328121
 17  0.202605 -0.294922
 18 -0.155265 -0.102951
 19  0.244200  0.182661)

In [None]:
# PCA exploration: fit a 2-component PCA on the complete-case rows of `df`
# and on each of the other data frames, then compare the score clouds.
# NOTE(review): df_grou_med, df_mice and df_knn are not defined in the visible
# part of this notebook, and the month colouring below needs `df` (and every
# score frame) to carry a DatetimeIndex - confirm both before running.
var_explained_Df, pcaDf, components_Df = PCA_Data_Exploration(df.dropna(), n_components=2)
var_explained_Df_grou_med, pcaDf_grou_med, components_Df_grou_med = PCA_Data_Exploration(df_grou_med, n_components=2)
var_explained_Df_mice, pcaDf_mice, components_Df_mice = PCA_Data_Exploration(df_mice, n_components=2)
var_explained_Df_knn, pcaDf_knn, components_Df_knn = PCA_Data_Exploration(df_knn, n_components=2)

# Display labels, kept in the same order as the score frames below
# ("pcaDf_mice" previously carried a stray underscore).
names_df = ["pcaDf", "pcaDf_grou_med", "pcaDf_mice", "pcaDf_knn"]
pca_frames = [pcaDf, pcaDf_grou_med, pcaDf_mice, pcaDf_knn]

# One row with one panel per data frame.
fig, axs = plt.subplots(1, len(pca_frames), figsize=(16, 5.5))

# The month list comes from `df` so every panel iterates the same months in
# the same order (and therefore gets the same colour per month).
months = df.index.month.unique().sort_values()

for ax, name, df_pca in zip(axs, names_df, pca_frames):
    for month in months:
        df_month = df_pca[df_pca.index.month == month]
        ax.scatter(df_month.PC1, df_month.PC2, label=f"month {month}", alpha=.5, marker=".")
    ax.set_title(f"PCA - {name}")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    # Every panel uses the limits of the reference (complete-case) scores so
    # the clouds are directly comparable.
    ax.set_xlim(pcaDf.PC1.min(), pcaDf.PC1.max())
    ax.set_ylim(pcaDf.PC2.min(), pcaDf.PC2.max())
    ax.axhline(0)
    ax.axvline(0)

# The legend is attached to the current (last) axes and placed outside it.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()  # adjust the layout to avoid overlaps
plt.show()

In [None]:
# Overlay the PCA loadings of every data frame in a single figure: one
# scatter point and one arrow from the origin per feature.
colors = ["royalblue","orange", "limegreen", "tomato"]
alphas = [1,.5,.5,.5]
plt.figure(figsize=[9,5])
for i, comps in enumerate([components_Df, components_Df_grou_med, components_Df_mice, components_Df_knn]):
    plt.scatter(comps["PC1"], comps["PC2"], label=names_df[i])
    for j in range(comps.shape[0]):
        # Positional access throughout: the loadings are indexed by feature
        # label, so an integer key on the Series would rely on pandas'
        # deprecated positional fallback.
        pc1 = comps["PC1"].iloc[j]
        pc2 = comps["PC2"].iloc[j]
        plt.quiver(0, 0, pc1, pc2, angles='xy', scale_units='xy', scale=1, linewidth=.2, color= colors[i], alpha= alphas[i])
        # Only the first data frame's features are labelled, to avoid
        # stacking one text label per data frame on the same feature.
        if i == 0:
            plt.annotate(str(comps.index[j]), (pc1, pc2))

# Axes decoration does not depend on the loop, so it is issued once.
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.axhline(0, color='grey', linestyle='-', linewidth=0.8)
plt.axvline(0, color='grey', linestyle='-', linewidth=0.8)
plt.legend()
plt.title("Loadings de datos reales vs imputados")
plt.show()

### PCA (Principal Component Analysis)

In [12]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression


# Synthetic classification dataset: 1000 samples, 20 features
# (10 informative, 10 redundant), generated with a fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=10,
    random_state=7,
)

# Two-stage pipeline: project onto 10 principal components, then classify
# with logistic regression.
steps = [
    ('pca', PCA(n_components=10)),
    ('m', LogisticRegression()),
]
model = Pipeline(steps=steps)

# Score the pipeline by accuracy under repeated stratified 10-fold
# cross-validation (3 repeats, seeded).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Print the mean and the standard deviation of the fold accuracies.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

Accuracy: 0.824 (0.034)


### Singular Value Decomposition

In [9]:
from sklearn.decomposition import TruncatedSVD

# Synthetic classification dataset: 1000 samples, 20 features
# (10 informative, 10 redundant), generated with a fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=10,
    random_state=7,
)

# Two-stage pipeline: truncated SVD down to 10 components, then classify
# with logistic regression.
steps = [
    ('svd', TruncatedSVD(n_components=10)),
    ('m', LogisticRegression()),
]
model = Pipeline(steps=steps)

# Score the pipeline by accuracy under repeated stratified 10-fold
# cross-validation (3 repeats, seeded).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Print the mean and the standard deviation of the fold accuracies.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

Accuracy: 0.824 (0.034)
