##### Crear un SparkSession

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession used by every cell below. getOrCreate() hands back
# the already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("Fundamentos")
    .getOrCreate()
)

##### Leer los datos

In [4]:
# Load the JSON file into a DataFrame. No schema is passed here; the
# printSchema() output further down reports the age column as long.
df = spark.read.json("gs://analitica_bucket_20220926/datos/people.json")

In [5]:
# Print the rows of the DataFrame as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [6]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [7]:
# Column names as a plain Python list.
df.columns

['age', 'name']

In [8]:
# describe() returns another DataFrame (summary, age, name - all strings),
# so this cell only displays its repr. Call .show() on the result to print
# the actual summary statistics.
df.describe()

DataFrame[summary: string, age: string, name: string]

Estructuras para el DataFrame

In [9]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType 

In [10]:
# Explicit column definitions: each StructField states the column name,
# its Spark data type, and whether null values are allowed.
esquema_campos = [
    StructField(name="age", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
]

In [11]:
# Wrap the field list in a StructType so it can be passed as the schema
# when the file is read again below.
estructura = StructType(fields=esquema_campos)

In [12]:
# Read the same file again, this time applying the explicit schema to the
# reader instead of letting Spark infer the column types.
df = (
    spark.read
    .schema(estructura)
    .json("gs://analitica_bucket_20220926/datos/people.json")
)

In [13]:
# Confirm the explicit schema was applied: age is now reported as integer
# rather than the long seen after the first read.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



Manipulando los datos

In [14]:
# Indexing with a column name yields a Column object, not the column's values.
df['age']

Column<b'age'>

In [15]:
# Check the Python type of a column reference.
type(df['age'])

pyspark.sql.column.Column

In [16]:
# select() returns a new DataFrame containing only the requested column.
df.select('age')

DataFrame[age: int]

In [17]:
# Unlike df['age'], the result of select() is a DataFrame.
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [18]:
# Because the selection is a DataFrame, show() can print its rows.
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [19]:
# With no argument, head() returns the first row as a single Row object.
df.head()

Row(age=None, name='Michael')

In [20]:
# With a count, head() returns a Python list holding that many Row objects.
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

Múltiples columnas

In [21]:
# select() also accepts a list of column names.
df.select(['age','name'])

DataFrame[age: int, name: string]

In [22]:
# Print the rows of the two-column selection.
df.select(['age','name']).show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



Agregar nuevas columnas

In [23]:
# withColumn() returns a new DataFrame with the extra column; the result is
# only shown here, not assigned, so df keeps its original two columns
# (see the df.show() in the next cell).
df.withColumn('nuevaedad',df['age']).show()

+----+-------+---------+
| age|   name|nuevaedad|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|       30|
|  19| Justin|       19|
+----+-------+---------+



In [24]:
# The original DataFrame still has only the age and name columns.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



Columna calculada

In [25]:
# Derive a column from an arithmetic expression on an existing column.
# Rows whose age is null produce null in the derived column.
df.withColumn("edaddoble",df['age']*2).show()

+----+-------+---------+
| age|   name|edaddoble|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|       60|
|  19| Justin|       38|
+----+-------+---------+



In [26]:
# Add a constant to every non-null age.
df.withColumn("edadmasuno",df['age']+1).show()

+----+-------+----------+
| age|   name|edadmasuno|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        31|
|  19| Justin|        20|
+----+-------+----------+



In [27]:
# Division yields fractional values (15.0, 9.5) even though age is an
# integer column.
df.withColumn("mitadedad",df['age']/2).show()

+----+-------+---------+
| age|   name|mitadedad|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|     15.0|
|  19| Justin|      9.5|
+----+-------+---------+



## Uso SQL

Para realizar consultas SQL sobre un DataFrame es necesario registrarlo primero como una vista temporal; el nombre de la vista es el que se usa en la cláusula FROM.

In [28]:
# Register df as a temporary view named "personas" so spark.sql() queries
# can refer to it; an existing view with that name is replaced.
df.createOrReplaceTempView("personas")

In [29]:
# spark.sql() returns the query result as a DataFrame.
consulta01 = spark.sql("SELECT * FROM personas")

In [30]:
# Displaying the variable shows only the DataFrame repr (column names and
# types), not the rows.
consulta01

DataFrame[age: int, name: string]

In [31]:
# Print the rows returned by the query.
consulta01.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [32]:
# Filter rows with a SQL WHERE clause and print the result directly.
spark.sql("SELECT * FROM personas WHERE age=30").show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

