### Bibliotecas

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

### Setup

In [20]:
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Two-row sample frame; the DDL schema declares both columns as strings.
df = spark.createDataFrame(
    [('Pedro', '4'), ('João', '5')],
    schema='name STRING, id STRING',
)

In [21]:
# Print the DataFrame rows as a text table.
df.show()

+-----+---+
| name| id|
+-----+---+
|Pedro|  4|
| João|  5|
+-----+---+



### Acessando os tipos Spark

In [22]:
# Import the Spark SQL types by name rather than with a wildcard, so the
# origin of every type used in this notebook is explicit.
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

In [23]:
# Spark types are instantiated as objects; this one represents an integer column type.
int_type = IntegerType()

In [24]:
# Display the repr of the type instance.
int_type

IntegerType

In [25]:
# Composite type: an array whose elements are integers (null elements allowed).
array_integer = ArrayType(elementType=IntegerType(), containsNull=True)

In [26]:
# Display the repr of the array type, which includes its element type.
array_integer

ArrayType(IntegerType,true)

### Convertendo o tipo de Colunas

In [27]:
# Show the current rows before converting any column type.
df.show()

+-----+---+
| name| id|
+-----+---+
|Pedro|  4|
| João|  5|
+-----+---+



In [28]:
# List (column name, type name) pairs; both columns are strings at this point.
df.dtypes

[('name', 'string'), ('id', 'string')]

In [30]:
# Cast 'id' to integer using the type's string name. The result is a new
# DataFrame that is only displayed here; it is not assigned back to `df`.
df.select('name', col('id').cast('int'))

DataFrame[name: string, id: int]

In [31]:
# Same cast, this time passing a type object instead of the string name;
# the resulting schema shown below is the same (id: int).
df.select('name', col('id').cast(IntegerType()))

DataFrame[name: string, id: int]

### Schema e criação de DataFrames

In [33]:
# No schema is passed here, so the columns receive the generated names
# `_1` and `_2` (see the dtypes output that follows).
df = spark.createDataFrame([('Pedro','4'),('João','5')])

In [37]:
# Inspect the column names and types produced when no schema is given.
df.dtypes

[('_1', 'string'), ('_2', 'string')]

In [36]:
# Show the rows; the header displays the generated column names.
df.show()

+-----+---+
|   _1| _2|
+-----+---+
|Pedro|  4|
| João|  5|
+-----+---+



### Criando schemas programaticamente

In [39]:
# Build the schema field by field: a string column 'nome' and an integer column 'id'.
schema = (
    StructType()
    .add('nome', StringType())
    .add('id', IntegerType())
)

In [41]:
# Create the frame with the explicit schema; the ids are Python ints this time.
df = spark.createDataFrame(
    [('Pedro', 4), ('João', 4)],
    schema=schema,
)

In [42]:
# Show the rows; the header now uses the column names from the schema.
df.show()

+-----+---+
| nome| id|
+-----+---+
|Pedro|  4|
| João|  4|
+-----+---+



In [43]:
# Check that the column types follow the schema (string and int).
df.dtypes

[('nome', 'string'), ('id', 'int')]

### Criando schemas com DDL

In [45]:
# The same two-column schema written as a DDL string: "<column> <TYPE>" pairs
# separated by commas. This rebinds `schema`, replacing the previous value.
schema = 'nome STRING,id INT'

In [46]:
# createDataFrame also accepts the schema as a DDL string.
df = spark.createDataFrame([('Pedro',4),('João',4)], schema=schema)

In [47]:
# Check the resulting column names and types for the DDL-defined schema.
df.dtypes

[('nome', 'string'), ('id', 'int')]

### Criando DataFrames

In [49]:
# Row data kept separate from the schema: one tuple per row.
data = [
    ('Pedro', 4),
    ('João', 4),
]

In [50]:
# Combine the row list with the schema currently bound to `schema`.
df = spark.createDataFrame(data=data, schema=schema)

In [51]:
# Show the rows of the frame built from `data` and `schema`.
df.show()

+-----+---+
| nome| id|
+-----+---+
|Pedro|  4|
| João|  4|
+-----+---+



In [52]:
# Print the schema as a tree, including each column's type and nullability.
df.printSchema()

root
 |-- nome: string (nullable = true)
 |-- id: integer (nullable = true)

