In [None]:
# Install dependencies
!apt-get update -qq
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
!pip install -q findspark

In [3]:
# Register the Java and Spark install locations as environment variables.
# SPARK_HOME assumes the Spark archive was extracted under /content.
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.2-bin-hadoop2.7",
    }
)

In [4]:
# Initialize findspark so the Spark installation can be used from Python.
# NOTE(review): presumably this relies on the SPARK_HOME variable set in the
# previous cell and must run before `pyspark` is imported below — confirm.
import findspark
findspark.init()

In [5]:
# Create the Spark session for this notebook (or reuse one that already exists)
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master('local[*]')                # local mode
    .appName("Iniciando com Spark")
    .config('spark.ui.port', '4050')   # same port that ngrok tunnels to later on
    .getOrCreate()
)

In [None]:
# Download the ngrok files to use in the spark session
!wget -q https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

In [7]:
# Launch ngrok in the background (trailing `&`) as an HTTP tunnel to local
# port 4050 — the port configured as `spark.ui.port` for the Spark session.
get_ipython().system_raw('./ngrok http 4050 &')

In [None]:
# Create the `desp` database; IF NOT EXISTS keeps this cell safe to re-run
spark.sql('CREATE DATABASE IF NOT EXISTS  desp')

In [None]:
# Switch the session to the `desp` database so that the unqualified table
# names used in the following cells resolve inside it
spark.sql('USE desp')

In [None]:
# Load the semicolon-separated churn CSV into a DataFrame (first row is the
# header, column types are inferred) and print a preview of it
churn_df = spark.read.csv(
    path='/content/Churn.csv',
    sep=';',
    header=True,
    inferSchema=True,
)
churn_df.show()

In [13]:
# Partition churn_df by the Geography column and save it in the current database as the table `churn`.
#
# After the command completes, the table's directory under spark-warehouse contains one sub-folder
# per distinct Geography value (France, Germany, Spain); each sub-folder holds the rows for that value.
#
# mode('overwrite') replaces an existing `churn` table. With the default save mode (error if the
# table exists), running this cell a second time raises an AnalysisException.
churn_df.write.mode('overwrite').partitionBy('Geography').saveAsTable('churn')

In [None]:
# Query the saved table (first 100 rows). The data itself is unchanged, but two differences
# from churn_df are visible:
#   - the partition column (Geography) is now positioned last;
#   - the rows appear in ascending order (presumably grouped by partition value — verify against the output).
spark.sql('select * from churn').show(100)

## Analysis considerations

 In the Files tab of Colab, you will see that a `spark-warehouse` folder was created. Inside it (under the `desp.db` folder, since the `desp` database is in use) there is a directory called `churn`, which holds the partitioned files.

### Observation

 By default, Spark saves table files in the Parquet format.