# Comparando diferentes implementaciones de PCA

## 💫 Spark - MLlib

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA, VectorAssembler
from pyspark.ml.linalg import Vectors

In [10]:
# 1. Create (or reuse) the SparkSession
spark = (
    SparkSession.builder
    .appName("PCA Example")
    .getOrCreate()
)

# Everything that uses the session runs inside try/finally so that the
# session is stopped even when loading, fitting or transforming fails.
try:
    # 2. Load the data from a comma-separated text file
    data_path = "./input/datos_1.txt"

    # Parse every non-blank line into a list of floats. Blank lines are
    # skipped because float("") would raise ValueError inside the Spark job.
    rdd = (
        spark.sparkContext.textFile(data_path)
        .filter(lambda line: line.strip())
        .map(lambda line: [float(x) for x in line.strip().split(",")])
    )

    # Build a DataFrame with one generated column name per field of the first row
    num_cols = len(rdd.first())
    columns = [f"feature_{i}" for i in range(num_cols)]
    df = rdd.toDF(columns)

    # 3. Assemble the individual columns into a single vector column 'features'
    assembler = VectorAssembler(inputCols=columns, outputCol="features")
    assembled = assembler.transform(df)

    # 4. Fit PCA and project the data onto the first 2 components.
    # NOTE(review): Spark's PCAModel.transform appears to project the raw
    # (un-centred, un-scaled) vectors - confirm before comparing these values
    # with implementations that standardise their input first.
    pca = PCA(k=2, inputCol="features", outputCol="pca_features")
    model = pca.fit(assembled)
    result = model.transform(assembled)

    # 5. Show the projected vectors without truncating the column
    result.select("pca_features").show(truncate=False)
finally:
    # 6. Stop Spark and release its resources
    spark.stop()


25/05/08 18:55:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+------------------------------------------+
|pca_features                              |
+------------------------------------------+
|[-0.047992098823377724,0.9988477153453011]|
|[-0.14397629647013316,2.9965431460359033] |
|[-0.9988477153453011,-0.04799209882337772]|
|[-1.0468398141686788,0.9508556165219234]  |
|[-1.1428240118154342,2.9485510472125256]  |
|[-2.0936796283373575,1.9017112330438468]  |
|[-2.1416717271607353,2.900558948389148]   |
|[-2.9965431460359033,-0.14397629647013316]|
|[-3.092527343682659,1.853719134220469]    |
|[-3.1405194425060365,2.85256684956577]    |
|[-4.043382960204582,0.8068793200517903]   |
|[-4.139367157851337,2.8045747507423924]   |
|[-4.994238576726506,-0.23996049411688858] |
|[-5.042230675549884,0.7588872212284126]   |
|[-5.138214873196639,2.7565826519190146]   |
|[-6.089070489718562,1.709742837750336]    |
|[-6.13706258854194,2.708590553095637]     |
+------------------------------------------+



In [15]:
from ppca import PPCA
import numpy as np

pathcsv = "./input/datos_1.txt"
Y_p = np.genfromtxt(pathcsv, delimiter=",")  # parse the comma-separated text file into a NumPy array

# Seed NumPy's global RNG so repeated runs of this cell are comparable.
# NOTE(review): this assumes PPCA draws its initial loadings from np.random -
# confirm against the installed ppca version.
SEED = 42
np.random.seed(SEED)

# Fit PPCA with a 2-dimensional latent space.
# NOTE(review): the stderr recorded for this cell points at PPCA's
# standardisation step `(X - self.means) / self.stds`; presumably a column
# with zero standard deviation triggers a divide warning there - confirm, and
# consider dropping constant columns before fitting.
model = PPCA()
model.fit(Y_p, d=2)

# Get the transformed latent space
X_p = model.transform()


  return (X - self.means) / self.stds


In [18]:
# Display the `C` attribute of the fitted PPCA model.
# NOTE(review): presumably the loading matrix (input features x latent
# dimensions) - confirm against the ppca documentation.
model.C

array([[-7.07106806e-01,  7.07106742e-01],
       [-7.07106737e-01, -7.07106819e-01],
       [-1.66741348e-04,  3.27693236e-05]])

In [16]:
# Display the latent representation returned by model.transform() for the
# PPCA model fitted with d=2.
X_p

array([[ 1.59980576, -0.58680118],
       [ 0.36972898, -1.81687807],
       [ 1.85040963,  0.39267174],
       [ 1.23537124, -0.2223667 ],
       [ 0.00529446, -1.45244359],
       [ 0.25589833, -0.47297067],
       [-0.35914006, -1.08800912],
       [ 1.12154059,  1.12154069],
       [-0.10853619, -0.1085362 ],
       [-0.72357458, -0.72357464],
       [ 0.14206768,  0.87093672],
       [-1.08800909, -0.35914017],
       [ 0.39267156,  1.85040965],
       [-0.22236683,  1.2353712 ],
       [-1.45244361,  0.00529431],
       [-1.20183974,  0.98476723],
       [-1.81687813,  0.36972879]])