In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("spark_dataframe_api").master("local[*]").getOrCreate()

24/05/13 11:43:00 WARN Utils: Your hostname, pc-jailton resolves to a loopback address: 127.0.1.1; using 10.0.1.3 instead (on interface enp3s0)
24/05/13 11:43:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/13 11:43:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/13 11:43:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
df = (
    spark
    .read
    .option(key="delimiter", value=";")
    .option(key="header", value="true")
    .option(key="inferSchema", value="true")
    .option(key="encoding", value="ISO-8859-1")
    .csv("./data/precos-gasolina-etanol-3.csv")
)

In [4]:
df.printSchema()

root
 |-- Regiao - Sigla: string (nullable = true)
 |-- Estado - Sigla: string (nullable = true)
 |-- Municipio: string (nullable = true)
 |-- Revenda: string (nullable = true)
 |-- CNPJ da Revenda: string (nullable = true)
 |-- Nome da Rua: string (nullable = true)
 |-- Numero Rua: string (nullable = true)
 |-- Complemento: string (nullable = true)
 |-- Bairro: string (nullable = true)
 |-- Cep: string (nullable = true)
 |-- Produto: string (nullable = true)
 |-- Data da Coleta: string (nullable = true)
 |-- Valor de Venda: string (nullable = true)
 |-- Valor de Compra: string (nullable = true)
 |-- Unidade de Medida: string (nullable = true)
 |-- Bandeira: string (nullable = true)



In [6]:
# creating our temporary view
df.createOrReplaceTempView("combustiveis")

In [14]:
spark.sql("""
    select `Estado - Sigla`, `Produto`, `Valor de Compra`, `Valor de Venda`, `Unidade de Medida` from combustiveis;
""").show()

+--------------+------------------+---------------+--------------+-----------------+
|Estado - Sigla|           Produto|Valor de Compra|Valor de Venda|Unidade de Medida|
+--------------+------------------+---------------+--------------+-----------------+
|            PB|          GASOLINA|           NULL|          5,59|       R$ / litro|
|            PB|GASOLINA ADITIVADA|           NULL|          5,63|       R$ / litro|
|            PB|            ETANOL|           NULL|          3,99|       R$ / litro|
|            PB|          GASOLINA|           NULL|          5,59|       R$ / litro|
|            PB|GASOLINA ADITIVADA|           NULL|          5,79|       R$ / litro|
|            PB|            ETANOL|           NULL|          3,89|       R$ / litro|
|            PB|GASOLINA ADITIVADA|           NULL|          5,59|       R$ / litro|
|            PB|          GASOLINA|           NULL|          5,49|       R$ / litro|
|            PB|GASOLINA ADITIVADA|           NULL|          5,65

In [15]:
spark.sql("""
    select * from combustiveis where `Valor de Compra` is not null;
""").show()

+--------------+--------------+---------+-------+---------------+-----------+----------+-----------+------+---+-------+--------------+--------------+---------------+-----------------+--------+
|Regiao - Sigla|Estado - Sigla|Municipio|Revenda|CNPJ da Revenda|Nome da Rua|Numero Rua|Complemento|Bairro|Cep|Produto|Data da Coleta|Valor de Venda|Valor de Compra|Unidade de Medida|Bandeira|
+--------------+--------------+---------+-------+---------------+-----------+----------+-----------+------+---+-------+--------------+--------------+---------------+-----------------+--------+
+--------------+--------------+---------+-------+---------------+-----------+----------+-----------+------+---+-------+--------------+--------------+---------------+-----------------+--------+



In [17]:
view_prices = spark.sql("""
    select
        `Estado - Sigla`,
        `Produto`,
        regexp_replace(`Valor de Venda`, ",", ".") as `Valor de Venda`,
        `Unidade de Medida`
    from combustiveis;
""")

In [19]:
view_prices.createOrReplaceTempView("view_prices")
view_prices.show()

+--------------+------------------+--------------+-----------------+
|Estado - Sigla|           Produto|Valor de Venda|Unidade de Medida|
+--------------+------------------+--------------+-----------------+
|            PB|          GASOLINA|          5.59|       R$ / litro|
|            PB|GASOLINA ADITIVADA|          5.63|       R$ / litro|
|            PB|            ETANOL|          3.99|       R$ / litro|
|            PB|          GASOLINA|          5.59|       R$ / litro|
|            PB|GASOLINA ADITIVADA|          5.79|       R$ / litro|
|            PB|            ETANOL|          3.89|       R$ / litro|
|            PB|GASOLINA ADITIVADA|          5.59|       R$ / litro|
|            PB|          GASOLINA|          5.49|       R$ / litro|
|            PB|GASOLINA ADITIVADA|          5.65|       R$ / litro|
|            PB|            ETANOL|          4.09|       R$ / litro|
|            PR|          GASOLINA|          6.15|       R$ / litro|
|            PR|GASOLINA ADITIVADA

In [24]:
view_prices_difference = spark.sql("""
    select
        `Estado - Sigla`,
        `Produto`,
        `Unidade de Medida`,
        MAX(`Valor de Venda`) as max_value,
        MIN(`Valor de Venda`) as min_value,
        MAX(`Valor de Venda`) - MIN(`Valor de Venda`) as difference
    from view_prices
    group by all
    order by difference DESC;
""")

In [26]:
view_prices_difference.show(5)

+--------------+------------------+-----------------+---------+---------+------------------+
|Estado - Sigla|           Produto|Unidade de Medida|max_value|min_value|        difference|
+--------------+------------------+-----------------+---------+---------+------------------+
|            SP|          GASOLINA|       R$ / litro|     7.97|     4.59|              3.38|
|            SP|GASOLINA ADITIVADA|       R$ / litro|     7.99|     4.79|               3.2|
|            SP|            ETANOL|       R$ / litro|     5.69|     2.69|3.0000000000000004|
|            RJ|          GASOLINA|       R$ / litro|     7.49|     4.99|               2.5|
|            PA|            ETANOL|       R$ / litro|      5.9|     3.45|              2.45|
+--------------+------------------+-----------------+---------+---------+------------------+
only showing top 5 rows

