# Transformaciones de Datos
https://spark.apache.org/docs/latest/ml-features.html

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name below.
spark = (
    SparkSession.builder
    .appName('transformacion')
    .getOrCreate()
)

In [0]:
# Path of the input CSV; the "dbfs:" scheme points at the Databricks file system.
ruta = 'dbfs:/FileStore/shared_uploads/jgamarramoreno@gmail.com/fake_customers.csv'

In [0]:
# Load the CSV into a DataFrame: header=True takes the column names from the
# first line, inferSchema=True asks Spark to deduce each column's type.
df = spark.read.csv(
    ruta,
    header=True,
    inferSchema=True,
)

In [0]:
# Print the loaded DataFrame as a text table (show() lists at most 20 rows by default).
df.show()

+-------+----------+-----+
|   Name|     Phone|Group|
+-------+----------+-----+
|   John|4085552424|    A|
|   Mike|3105552738|    B|
| Cassie|4085552424|    B|
|  Laura|3105552438|    B|
|  Sarah|4085551234|    A|
|  David|3105557463|    C|
|   Zach|4085553987|    C|
|  Kiera|3105552938|    A|
|  Alexa|4085559467|    C|
|Karissa|3105553475|    A|
+-------+----------+-----+



# Características

#### Indización de cadenas
Frecuentemente necesitamos convertir cadenas en información numérica.

In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
# Small in-memory example: one (id_usua, categoria) pair per row.
# NOTE: this rebinds `df`, replacing the DataFrame read from the CSV above.
df = spark.createDataFrame(
    data=[
        (0, "a"),
        (1, "b"),
        (2, "c"),
        (3, "a"),
        (4, "a"),
        (5, "c"),
    ],
    schema=["id_usua", "categoria"],
)

In [0]:
# Print the example DataFrame to check its two columns before indexing.
df.show()

+-------+---------+
|id_usua|categoria|
+-------+---------+
|      0|        a|
|      1|        b|
|      2|        c|
|      3|        a|
|      4|        a|
|      5|        c|
+-------+---------+



In [0]:
# Estimator that encodes the strings of "categoria" as numeric indices,
# written to the new column "indiceCategoria".
indexador = StringIndexer(
    inputCol="categoria",
    outputCol="indiceCategoria",
)

In [0]:
# fit() learns the string-to-index mapping from df; transform() then returns
# df with the index column appended.
indexado = (
    indexador
    .fit(df)
    .transform(df)
)

In [0]:
# Print the indexed DataFrame. By default StringIndexer orders labels by
# descending frequency, so "a" (3 rows) -> 0.0, "c" (2 rows) -> 1.0 and
# "b" (1 row) -> 2.0.
indexado.show()

+-------+---------+---------------+
|id_usua|categoria|indiceCategoria|
+-------+---------+---------------+
|      0|        a|            0.0|
|      1|        b|            2.0|
|      2|        c|            1.0|
|      3|        a|            0.0|
|      4|        a|            0.0|
|      5|        c|            1.0|
+-------+---------+---------------+



El siguiente paso sería transformar los índices de categoría en variables "dummy" (codificación one-hot, por ejemplo con OneHotEncoder).

#### Ensamblador de Vectores

La clase VectorAssembler permite combinar una lista de columnas en una sola columna de tipo vector.

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# Single-row example mixing an integer column, float columns and a dense
# vector column, used below as input for VectorAssembler.
data01 = spark.createDataFrame(
    data=[
        (0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
    ],
    schema=["id", "hora", "movil", "caracteristicas_usua", "clic"],
)

In [0]:
# Print the single-row input DataFrame before assembling its columns.
data01.show()

+---+----+-----+--------------------+----+
| id|hora|movil|caracteristicas_usua|clic|
+---+----+-----+--------------------+----+
|  0|  18|  1.0|      [0.0,10.0,0.5]| 1.0|
+---+----+-----+--------------------+----+



In [0]:
# Transformer that concatenates the listed columns, in the order given, into
# a single vector column named "caracteristicas".
ensamblador = VectorAssembler(
    outputCol="caracteristicas",
    inputCols=[
        "hora",
        "movil",
        "caracteristicas_usua",
    ],
)

In [0]:
# Apply the assembler directly: it is used here without a prior fit() step,
# and the result is data01 plus the assembled vector column.
salida = ensamblador.transform(data01)

In [0]:
# Print the assembled DataFrame. show() truncates cell text longer than 20
# characters by default, which is why the vector is cut off with "...".
salida.show()

+---+----+-----+--------------------+----+--------------------+
| id|hora|movil|caracteristicas_usua|clic|     caracteristicas|
+---+----+-----+--------------------+----+--------------------+
|  0|  18|  1.0|      [0.0,10.0,0.5]| 1.0|[18.0,1.0,0.0,10....|
+---+----+-----+--------------------+----+--------------------+



In [0]:
# Print only the assembled vector column next to the "clic" column.
(
    salida
    .select("caracteristicas", "clic")
    .show()
)

+--------------------+----+
|     caracteristicas|clic|
+--------------------+----+
|[18.0,1.0,0.0,10....| 1.0|
+--------------------+----+

