##### Crear un SparkSession

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Build a session named "Fundamentos", or reuse the one that already exists.
spark = (
    SparkSession.builder
    .appName("Fundamentos")
    .getOrCreate()
)

##### Leer los datos

In [0]:
# Load the JSON file and let Spark infer the schema from the data.
# NOTE(review): the path is a user-specific DBFS upload location — adjust it
# when running in another workspace.
df = spark.read.format("json").load(
    "dbfs:/FileStore/shared_uploads/jgamarramoreno@gmail.com/people.json"
)

In [0]:
# Print the rows as a text table; a missing age is displayed as null.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [0]:
# Print the inferred schema; with no explicit schema, age is read as long.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [0]:
# Column names as a plain Python list.
df.columns

Out[7]: ['age', 'name']

In [0]:
# describe() does not print anything: it returns a new DataFrame whose columns
# (summary, age, name) are all strings. Chain .show() to display its contents.
df.describe()

Out[8]: DataFrame[summary: string, age: string, name: string]

Estructuras para el DataFrame

In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType 

In [0]:
# Field definitions for an explicit schema: age as integer, name as string,
# both nullable.
esquema_campos = [
    StructField(name="age", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
]

In [0]:
# Wrap the field list in a StructType so it can be passed to the reader.
estructura = StructType(esquema_campos)

In [0]:
# Re-read the same file, this time applying the explicit schema instead of
# relying on inference (this rebinds df).
df = (
    spark.read
    .schema(estructura)
    .json("dbfs:/FileStore/shared_uploads/jgamarramoreno@gmail.com/people.json")
)

In [0]:
# Check that the explicit schema was applied: age is now integer, not long.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



Manipulando los datos

In [0]:
# Attribute access yields the same Column object as df['age']; no data is
# fetched.
df.age

Out[14]: Column<'age'>

In [0]:
# A single column reference is a Column, not a DataFrame.
type(df.age)

Out[15]: pyspark.sql.column.Column

In [0]:
# select() returns a new DataFrame containing only the requested column.
df.select(df.age)

Out[16]: DataFrame[age: int]

In [0]:
# Unlike column indexing, select() produces a DataFrame.
type(df.select(df.age))

Out[17]: pyspark.sql.dataframe.DataFrame

In [0]:
# Because the selection is a DataFrame, it can be displayed with show().
df.select(df.age).show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [0]:
# Fetch the first row as a single Row object.
df.first()

Out[21]: Row(age=None, name='Michael')

In [0]:
# Fetch the first two rows as a list of Row objects.
df.take(2)

Out[22]: [Row(age=None, name='Michael'), Row(age=30, name='Andy')]

Múltiples columnas

In [0]:
# Several columns can be selected at once; the result is still a DataFrame.
df.select('age', 'name')

Out[23]: DataFrame[age: int, name: string]

In [0]:
# Display the two selected columns.
df.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



Agregar nuevas columnas

In [0]:
# Add a column 'nuevaedad' that is a copy of 'age'. The result is a new
# DataFrame that is only displayed here, not assigned.
df.withColumn('nuevaedad', df.age).show()

+----+-------+---------+
| age|   name|nuevaedad|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|       30|
|  19| Justin|       19|
+----+-------+---------+



In [0]:
# df itself still has only its original two columns: the withColumn() result
# above was displayed but never assigned back.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



Columna calculada

In [0]:
# Computed column: twice the age. Null ages stay null.
df.withColumn("edaddoble", df.age * 2).show()

+----+-------+---------+
| age|   name|edaddoble|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|       60|
|  19| Justin|       38|
+----+-------+---------+



In [0]:
# Computed column: age plus one. Null ages stay null.
df.withColumn("edadmasuno", df.age + 1).show()

+----+-------+----------+
| age|   name|edadmasuno|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        31|
|  19| Justin|        20|
+----+-------+----------+



In [0]:
# Computed column: half the age. Division yields fractional values (e.g. 9.5)
# even though age is an integer column.
df.withColumn("mitadedad", df.age / 2).show()

+----+-------+---------+
| age|   name|mitadedad|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|     15.0|
|  19| Justin|      9.5|
+----+-------+---------+



## Uso de SQL

Para realizar consultas SQL con un DataFrame es necesario registrarlo como una vista temporal.

In [0]:
# Register df under the name "personas" so it can be referenced from SQL.
df.createOrReplaceTempView("personas")

In [0]:
# spark.sql() returns the query result as a DataFrame.
consulta01 = spark.sql("SELECT * FROM personas")

In [0]:
# Evaluating the variable shows only the DataFrame's schema summary, not its rows.
consulta01

Out[35]: DataFrame[age: int, name: string]

In [0]:
# Display the rows returned by the query.
consulta01.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [0]:
# Filter with a SQL WHERE clause; the row whose age is null is not returned.
spark.sql("SELECT * FROM personas WHERE age=30").show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

