In [23]:
!pip install pyspark



In [24]:
from pyspark.sql import SparkSession

In [25]:
spark = SparkSession.builder.appName('Lendo CSV').getOrCreate()

In [26]:
caminho_csv = "./base_de_dados.csv"

df = spark.read.csv(
    path=caminho_csv,
    sep=";",
    header=True
)

df.show()

+---+--------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+--------------+---------------+----------------+
| id|   valor| parte_debitada_nome|parte_debitada_conta|parte_debitada_banco|parte_creditada_nome|parte_creditada_conta|parte_creditada_banco|chave_pix_tipo|chave_pix_valor|  data_transacao|
+---+--------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+--------------+---------------+----------------+
|  1|    9.93|Dra. Ana Carolina...|            79470453|              Nubank|       Maysa da Cruz|             67162333|                 Itau|           cpf|     8439752610|18/02/2022 13:28|
|  2|   15.38|        Ana Caldeira|            19689668|                Itau|        Evelyn Sales|             60005091|             Bradesco|           cpf|    27145380617|08/04/2022 01:47|
|  3|   57.58|    Arthur Goncalves|          

In [27]:
df.schema.fieldNames()

['id',
 'valor',
 'parte_debitada_nome',
 'parte_debitada_conta',
 'parte_debitada_banco',
 'parte_creditada_nome',
 'parte_creditada_conta',
 'parte_creditada_banco',
 'chave_pix_tipo',
 'chave_pix_valor',
 'data_transacao']

In [28]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

In [29]:
schema_pix = StructType([
    StructField("id", IntegerType()),
    StructField("valor", DoubleType()),
])

df = spark.read.csv(
    path=caminho_csv,
    header=True,
    sep=";",
    schema=schema_pix
)

df.show()

+---+--------+
| id|   valor|
+---+--------+
|  1|    9.93|
|  2|   15.38|
|  3|   57.58|
|  4|53705.13|
|  5|25299.69|
|  6| 7165.06|
|  7|    6.16|
|  8|  136.36|
|  9|  574.39|
| 10|   42.88|
| 11|33629.97|
| 12| 4374.56|
| 13|  507.18|
| 14|67758.87|
| 15|  815.53|
| 16|    2.73|
| 17|    0.54|
| 18|49836.72|
| 19|    9.68|
| 20| 9837.22|
+---+--------+
only showing top 20 rows


In [30]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- valor: double (nullable = true)



In [31]:
schema_pix = StructType([
    StructField("id", IntegerType()),
    StructField("valor", DoubleType()),
    StructField("parte_debitada_nome", StringType()),
    StructField("parte_debitada_cpf", StringType()),
    StructField("parte_creditada_nome", StringType()),
    StructField("parte_creditada_cpf", StringType()),
])

df = spark.read.csv(
    path=caminho_csv,
    header=True,
    sep=";",
    schema=schema_pix,
)

df.show()

+---+--------+--------------------+------------------+--------------------+--------------------+
| id|   valor| parte_debitada_nome|parte_debitada_cpf|parte_creditada_nome| parte_creditada_cpf|
+---+--------+--------------------+------------------+--------------------+--------------------+
|  1|    9.93|Dra. Ana Carolina...|          79470453|              Nubank|       Maysa da Cruz|
|  2|   15.38|        Ana Caldeira|          19689668|                Itau|        Evelyn Sales|
|  3|   57.58|    Arthur Goncalves|          18856899|            Bradesco|          Maria Melo|
|  4|53705.13|  Ana Julia Caldeira|          22834741|                Itau|   Ana Livia Almeida|
|  5|25299.69|  Srta. Nicole Pinto|           3715882|              Nubank|Srta. Ana Laura d...|
|  6| 7165.06|   Gabriela Ferreira|           2243037|              Nubank|       Larissa Souza|
|  7|    6.16|    Heloisa da Rocha|          59778949|                 BTG|Dra. Vitoria Silv...|
|  8|  136.36|Srta. Isadora Co

In [32]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- valor: double (nullable = true)
 |-- parte_debitada_nome: string (nullable = true)
 |-- parte_debitada_cpf: string (nullable = true)
 |-- parte_creditada_nome: string (nullable = true)
 |-- parte_creditada_cpf: string (nullable = true)



In [33]:
from pyspark.sql.functions import col

df_cast = df.withColumn('id', col('id').cast('int')).withColumn('valor', col('valor').cast('double'))

In [34]:
df_cast.show()

+---+--------+--------------------+------------------+--------------------+--------------------+
| id|   valor| parte_debitada_nome|parte_debitada_cpf|parte_creditada_nome| parte_creditada_cpf|
+---+--------+--------------------+------------------+--------------------+--------------------+
|  1|    9.93|Dra. Ana Carolina...|          79470453|              Nubank|       Maysa da Cruz|
|  2|   15.38|        Ana Caldeira|          19689668|                Itau|        Evelyn Sales|
|  3|   57.58|    Arthur Goncalves|          18856899|            Bradesco|          Maria Melo|
|  4|53705.13|  Ana Julia Caldeira|          22834741|                Itau|   Ana Livia Almeida|
|  5|25299.69|  Srta. Nicole Pinto|           3715882|              Nubank|Srta. Ana Laura d...|
|  6| 7165.06|   Gabriela Ferreira|           2243037|              Nubank|       Larissa Souza|
|  7|    6.16|    Heloisa da Rocha|          59778949|                 BTG|Dra. Vitoria Silv...|
|  8|  136.36|Srta. Isadora Co

### Manipulação de dados II

In [35]:
df_cast.printSchema()

root
 |-- id: integer (nullable = true)
 |-- valor: double (nullable = true)
 |-- parte_debitada_nome: string (nullable = true)
 |-- parte_debitada_cpf: string (nullable = true)
 |-- parte_creditada_nome: string (nullable = true)
 |-- parte_creditada_cpf: string (nullable = true)



In [36]:
df_cast.select('id', 'valor').show() # seleciona as colunas desejadas

+---+--------+
| id|   valor|
+---+--------+
|  1|    9.93|
|  2|   15.38|
|  3|   57.58|
|  4|53705.13|
|  5|25299.69|
|  6| 7165.06|
|  7|    6.16|
|  8|  136.36|
|  9|  574.39|
| 10|   42.88|
| 11|33629.97|
| 12| 4374.56|
| 13|  507.18|
| 14|67758.87|
| 15|  815.53|
| 16|    2.73|
| 17|    0.54|
| 18|49836.72|
| 19|    9.68|
| 20| 9837.22|
+---+--------+
only showing top 20 rows


In [43]:
from pyspark.sql.functions import round
df_dolar = df_cast.select('id', 'valor').withColumn('valor_dolar', round(col('valor') * 5, 2))

In [44]:
df_dolar.show()

+---+--------+-----------+
| id|   valor|valor_dolar|
+---+--------+-----------+
|  1|    9.93|      49.65|
|  2|   15.38|       76.9|
|  3|   57.58|      287.9|
|  4|53705.13|  268525.65|
|  5|25299.69|  126498.45|
|  6| 7165.06|    35825.3|
|  7|    6.16|       30.8|
|  8|  136.36|      681.8|
|  9|  574.39|    2871.95|
| 10|   42.88|      214.4|
| 11|33629.97|  168149.85|
| 12| 4374.56|    21872.8|
| 13|  507.18|     2535.9|
| 14|67758.87|  338794.35|
| 15|  815.53|    4077.65|
| 16|    2.73|      13.65|
| 17|    0.54|        2.7|
| 18|49836.72|   249183.6|
| 19|    9.68|       48.4|
| 20| 9837.22|    49186.1|
+---+--------+-----------+
only showing top 20 rows


In [45]:
# Dropando colunas
df_dolar.drop('valor_dolar').show()

+---+--------+
| id|   valor|
+---+--------+
|  1|    9.93|
|  2|   15.38|
|  3|   57.58|
|  4|53705.13|
|  5|25299.69|
|  6| 7165.06|
|  7|    6.16|
|  8|  136.36|
|  9|  574.39|
| 10|   42.88|
| 11|33629.97|
| 12| 4374.56|
| 13|  507.18|
| 14|67758.87|
| 15|  815.53|
| 16|    2.73|
| 17|    0.54|
| 18|49836.72|
| 19|    9.68|
| 20| 9837.22|
+---+--------+
only showing top 20 rows


In [46]:
df_dolar.withColumnRenamed('valor_dolar', 'dolar_valor').show()

+---+--------+-----------+
| id|   valor|dolar_valor|
+---+--------+-----------+
|  1|    9.93|      49.65|
|  2|   15.38|       76.9|
|  3|   57.58|      287.9|
|  4|53705.13|  268525.65|
|  5|25299.69|  126498.45|
|  6| 7165.06|    35825.3|
|  7|    6.16|       30.8|
|  8|  136.36|      681.8|
|  9|  574.39|    2871.95|
| 10|   42.88|      214.4|
| 11|33629.97|  168149.85|
| 12| 4374.56|    21872.8|
| 13|  507.18|     2535.9|
| 14|67758.87|  338794.35|
| 15|  815.53|    4077.65|
| 16|    2.73|      13.65|
| 17|    0.54|        2.7|
| 18|49836.72|   249183.6|
| 19|    9.68|       48.4|
| 20| 9837.22|    49186.1|
+---+--------+-----------+
only showing top 20 rows


In [49]:
df_cast.select('id', 'valor').filter(col('valor') > 30000).show()

+---+--------+
| id|   valor|
+---+--------+
|  4|53705.13|
| 11|33629.97|
| 14|67758.87|
| 18|49836.72|
| 27|35859.11|
| 34|58083.62|
| 36|48714.95|
| 47|38219.08|
| 52|60139.23|
| 54|95977.62|
| 55|35409.61|
| 62|57433.69|
| 71|80083.34|
| 78|81977.98|
| 79| 78559.4|
| 83|35095.43|
| 86|94736.79|
| 88|78347.58|
| 97|94586.45|
+---+--------+

