In [1]:
from pyspark.sql import SparkSession

# Build the notebook's SparkSession, or reuse one that is already running.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# copied from an example — confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [2]:
# Load gente.json into a DataFrame. No schema is passed, so the column
# names and types come from the JSON data itself.
df = spark.read.json("gente.json")

In [3]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  30|   Mary|
|  19| Justin|
+----+-------+



In [4]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [5]:
# Project only the "name" column and print it.
df.select(df["name"]).show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
|   Mary|
| Justin|
+-------+



In [6]:
# Project the name next to the age incremented by one; a null age stays null.
df.select(
    df["name"],
    df["age"] + 1,
).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
|   Mary|       31|
| Justin|       20|
+-------+---------+



In [7]:
# Keep only the rows whose age is greater than 21. Rows with a null age do not
# satisfy the comparison, so they are left out of the result as well.
df.filter(df.age > 21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
| 30|Mary|
+---+----+



In [8]:
# Count the rows for each distinct age; null ages form a group of their own.
(
    df
    .groupBy("age")
    .count()
    .show()
)

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    2|
+----+-----+



In [9]:
# Register the DataFrame as the temporary view "people" so it can be queried
# through spark.sql(); an existing view with the same name is replaced.
df.createOrReplaceTempView("people")

In [10]:
# Query the "people" temporary view with SQL; the result is itself a DataFrame.
sqlDF = spark.sql("SELECT * FROM people")
# Print the query result as a text table.
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  30|   Mary|
|  19| Justin|
+----+-------+



In [11]:
from pyspark.sql import Row

sc = spark.sparkContext

# Load the text file and turn each "name,age" line into a Row.
lines = sc.textFile("gente.txt")
# Drop blank lines before splitting: a blank line yields a single field, so
# p[1] would raise IndexError inside the Spark job and abort the action.
parts = lines.filter(lambda line: line.strip()).map(lambda line: line.split(","))
# int() tolerates whitespace around the number, so "Andy, 30" parses fine.
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# Create the DataFrame, letting Spark infer the schema from the Row fields.
dfPeople = spark.createDataFrame(people)

# Register the DataFrame as a temporary view. The name "people" is also used
# for the JSON-backed view registered earlier in the notebook, which this
# call replaces.
dfPeople.createOrReplaceTempView("people")

# Run SQL against the view: names of people aged 13 to 19 inclusive.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

teenagers.show()

+------+
|  name|
+------+
|Justin|
+------+



In [12]:
# Import the Spark SQL data types
from pyspark.sql.types import *

sc = spark.sparkContext

# Load the text file; each record is expected to be one "name,age" line.
lines = sc.textFile("gente.txt")
# Drop blank lines before splitting: a blank line yields a single field, so
# p[1] would raise IndexError inside the Spark job and abort the action.
parts = lines.filter(lambda line: line.strip()).map(lambda line: line.split(","))
# Turn each split line into a (name, age) tuple matching the schema below.
people = parts.map(lambda p: (p[0], int(p[1])))

# Build the schema explicitly: a nullable string "name" and a nullable
# integer "age".
fields = [
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
]
schema = StructType(fields)

# Create the DataFrame by applying the schema to the RDD of tuples.
dfPeople = spark.createDataFrame(people, schema)

# Register the DataFrame as the temporary view "people", replacing any view
# already registered under that name.
dfPeople.createOrReplaceTempView("people")

# Run SQL against the view.
results = spark.sql("SELECT name,age FROM people")

results.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+

