In [None]:
%%bash

# Instal Java
apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install PySpark
pip install -q pyspark

In [None]:
import os
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

In [8]:
# CREATE DATALAKE
#mkdir = make directory

%%bash
mkdir landing
mkdir processing
mkdir curated

In [14]:
# Extraction Process

%%bash
cd landing 
wget http://repositorio.dados.gov.br/seges/comprasnet_contratos/anual/comprasnet-contratos-anual-cronogramas-latest.csv

--2022-03-25 12:13:27--  http://repositorio.dados.gov.br/seges/comprasnet_contratos/anual/comprasnet-contratos-anual-cronogramas-latest.csv
Resolving repositorio.dados.gov.br (repositorio.dados.gov.br)... 189.9.7.16
Connecting to repositorio.dados.gov.br (repositorio.dados.gov.br)|189.9.7.16|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 217078190 (207M) [application/octet-stream]
Saving to: ‘comprasnet-contratos-anual-cronogramas-latest.csv’

     0K .......... .......... .......... .......... ..........  0%  182K 19m26s
    50K .......... .......... .......... .......... ..........  0%  369K 14m30s
   100K .......... .......... .......... .......... ..........  0% 15.8M 9m44s
   150K .......... .......... .......... .......... ..........  0%  373K 9m40s
   200K .......... .......... .......... .......... ..........  0% 25.2M 7m45s
   250K .......... .......... .......... .......... ..........  0% 30.7M 6m29s
   300K .......... .......... .......... .........

In [18]:
# READING DATA WITH SPARK

df = spark.read.option('header',True).csv('/content/landing/comprasnet-contratos-anual-cronogramas-latest.csv') 
df.show(truncate=False)

+--------+-----------+----------------------+----------+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+------+----------+----------+---------+
|id      |contrato_id|tipo                  |numero    |receita_despesa|observacao                                                                                                                                                                                                                                     |mesref|anoref|vencimento|retroativo|valor    |
+--------+-----------+----------------------+----------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
# UNDERSTANDING THE SCHEMA

df.printSchema()

root
 |-- id: string (nullable = true)
 |-- contrato_id: string (nullable = true)
 |-- tipo: string (nullable = true)
 |-- numero: string (nullable = true)
 |-- receita_despesa: string (nullable = true)
 |-- observacao: string (nullable = true)
 |-- mesref: string (nullable = true)
 |-- anoref: string (nullable = true)
 |-- vencimento: string (nullable = true)
 |-- retroativo: string (nullable = true)
 |-- valor: string (nullable = true)



In [22]:
# UPLOADING THE OPTIMIZED DATA TO PROCESSING ZONE

df.write.mode('overwrite').format('parquet').save('/content/processing')


In [58]:
# GETTING THE DATA INFORMATION JUST FOR YEAR = 2021 AND YEAR = 2022

from pyspark.sql.functions import col, year, month, dayofmonth

df = spark.read.format('parquet').load('/content/processing')

df2 = (
    df.withColumn('id', col('id').cast('integer'))
      .withColumn('contrato_id', col('contrato_id').cast('integer'))
      .withColumn('vencimento', col('vencimento').cast('date'))
      .withColumn('valor', col('valor').cast('decimal(10,2)'))
      .withColumn('year', year('vencimento'))
      .withColumn('month', month('vencimento'))
      .withColumn('day', dayofmonth('vencimento'))
      .where('year=2021 or year=2022')
      
)

(
    df2.orderBy('year', ascending=False)
    .orderBy('month', ascending=False)
    .orderBy('day', ascending=False)
    .write.partitionBy('year','month','day')
    .mode('overwrite')
    .format('parquet')
    .save('/content/curated')
)
