### Instalação das bibliotecas

In [1]:
%%sh
pip install spark
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spark
  Downloading spark-0.2.1.tar.gz (41 kB)
Building wheels for collected packages: spark
  Building wheel for spark (setup.py): started
  Building wheel for spark (setup.py): finished with status 'done'
  Created wheel for spark: filename=spark-0.2.1-py3-none-any.whl size=58762 sha256=4c482bfeefa37c8e512b9e5f21f9fc7e60ca3a265a2082a6c4de7eeb6062bc5e
  Stored in directory: /root/.cache/pip/wheels/4e/0e/f1/164619f9920fb447d294afaae11a7715bd442ded7225953d72
Successfully built spark
Installing collected packages: spark
Successfully installed spark-0.2.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup

### Importação das bibliotecas

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

### Criar / Iniciar uma Sessão PySpark

In [3]:
spark = (
    SparkSession\
    .builder\
    .master('local')\
    .appName('particionamento')\
    .getOrCreate()
)

### Criando um banco de dados

In [4]:
spark.sql('SHOW DATABASES').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [5]:
spark.sql('CREATE DATABASE wc_players').show()

++
||
++
++



In [6]:
spark.sql('USE wc_players').show()

++
||
++
++



### Importando o arquivo CSV

In [9]:
wc_players = spark.read.csv('/content/wc2018-players.csv', header=True, inferSchema=True, sep=',')

In [10]:
wc_players.show(1)

+---------+---+----+------------------+----------+----------+--------------+------+------+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|          Club|Height|Weight|
+---------+---+----+------------------+----------+----------+--------------+------+------+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|AFC Ajax (NED)|   169|    65|
+---------+---+----+------------------+----------+----------+--------------+------+------+
only showing top 1 row



### Criando uma partição<br>Criando e convertendo o arquivo .csv em uma tabela Spark SQL
https://prnt.sc/LM6Z7CGrSaHj

In [11]:
wc_players.write.partitionBy('Team').saveAsTable('wc_players')

In [14]:
# OBS: A coluna que foi particionada sempre ficará em último
spark.sql('SELECT * FROM wc_players').show(1, truncate=False)

+---+----+-------------------+----------+-----------+-------------------------+------+------+-------+
|#  |Pos.|FIFA Popular Name  |Birth Date|Shirt Name |Club                     |Height|Weight|Team   |
+---+----+-------------------+----------+-----------+-------------------------+------+------+-------+
|2  |MF  |KROHN-DEHLI Michael|06.06.1983|KROHN-DEHLI|Deportivo La Coruña (ESP)|171   |69    |Denmark|
+---+----+-------------------+----------+-----------+-------------------------+------+------+-------+
only showing top 1 row



### bucketBy()
https://prnt.sc/1XRk7y8l5ts6

In [15]:
# bucketBy(numero_particionamento, coluna)
wc_players.write.bucketBy(5, 'Team')\
          .saveAsTable('wc_players2')

In [16]:
spark.sql('SELECT * FROM wc_players').show(1)

+---+----+-------------------+----------+-----------+--------------------+------+------+-------+
|  #|Pos.|  FIFA Popular Name|Birth Date| Shirt Name|                Club|Height|Weight|   Team|
+---+----+-------------------+----------+-----------+--------------------+------+------+-------+
|  2|  MF|KROHN-DEHLI Michael|06.06.1983|KROHN-DEHLI|Deportivo La Coru...|   171|    69|Denmark|
+---+----+-------------------+----------+-----------+--------------------+------+------+-------+
only showing top 1 row



### Cache e Persistência
- Cache - padrão em memória e disco
- Persist - Definido pelo usuário
- StorageLevel
    - MEMORY_ONLY: padrão para RDD, porém se não caber na memória será reprocessado a cada consulta

    - MEMORY_AND_DISK: Padrão para DataFrame. Aramzena as partições que não cabem em memória em disco

In [17]:
# disco 
# memória  
# offheap
# serializado
# replicação

wc_players.storageLevel

StorageLevel(False, False, False, False, 1)

In [18]:
# DISCO
# MEMÓRIA  
# offheap
# serializado
# replicação
wc_players.cache()

DataFrame[Team: string, #: int, Pos.: string, FIFA Popular Name: string, Birth Date: string, Shirt Name: string, Club: string, Height: int, Weight: int]

In [19]:
wc_players.storageLevel

StorageLevel(True, True, False, True, 1)

### Criando um tipo de storageLevel

In [24]:
from pyspark.storagelevel import StorageLevel

In [25]:
wc_players.persist(StorageLevel.DISK_ONLY)

DataFrame[Team: string, #: int, Pos.: string, FIFA Popular Name: string, Birth Date: string, Shirt Name: string, Club: string, Height: int, Weight: int]

In [26]:
# tirar do cache
wc_players.unpersist()

DataFrame[Team: string, #: int, Pos.: string, FIFA Popular Name: string, Birth Date: string, Shirt Name: string, Club: string, Height: int, Weight: int]

In [27]:
wc_players.persist(StorageLevel.DISK_ONLY)

DataFrame[Team: string, #: int, Pos.: string, FIFA Popular Name: string, Birth Date: string, Shirt Name: string, Club: string, Height: int, Weight: int]

In [28]:
wc_players.storageLevel

StorageLevel(True, False, False, False, 1)