In [1]:
!pip install pyspark



In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, to_date

In [37]:
# Build the Spark session used by every later cell (getOrCreate returns the
# already-running session if one exists).
spark = (
    SparkSession.builder
    .appName("Tratamento de Dados")
    .getOrCreate()
)

In [38]:
# Load the employee CSV: first line is the header, fields are separated by ';'.
# No schema or inferSchema option is set here.
df = (
    spark.read.format("csv")
    .options(header="true", delimiter=";")
    .load("employee_data_100.csv")
)

In [39]:
# Preview the raw rows as loaded (show()'s defaults spelled out).
df.show(n=20, truncate=True)

+-----+--------+---------+----+----------+----+----+------+
|EMPNO|   ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|
+-----+--------+---------+----+----------+----+----+------+
| 7369|    LOWE|  ANALYST|7566|12/01/1986|3501|NULL|    50|
| 7370|   SCOTT|PRESIDENT|7902|14/07/1999|4738|1400|    20|
| 7371|   CHANG|  ANALYST|7902|04/09/1985|4908| 500|    40|
| 7372|  ROGERS|  MANAGER|7698|05/02/1994|1787|1400|    40|
| 7373|  WAGNER| SALESMAN|7839|29/06/2014|2156| 300|    50|
| 7374| JOHNSON| SALESMAN|7839|21/01/2008|4803|NULL|    20|
| 7375|     RAY| SALESMAN|7566|22/04/1990|4374|NULL|    50|
| 7376|   BROCK|  ANALYST|7902|09/10/2024|3612| 300|    50|
| 7377|   LEWIS| SALESMAN|7566|27/02/1993|4291|1400|    50|
| 7378|WILLIAMS| SALESMAN|7566|04/01/2002|2404| 500|    50|
| 7379|   SMITH|    CLERK|7566|09/03/1989|3698|NULL|    20|
| 7380|ARELLANO|    CLERK|7566|18/02/2009|3154|NULL|    50|
| 7381| LINDSEY|  ANALYST|7839|16/10/2000|3884|NULL|    30|
| 7382|ESPINOZA|  ANALYST|7839|13/07/201

In [40]:
# Convert HIREDATE from dd/MM/yyyy text into a date column, then replace
# null HIREDATE values with the 9999-12-31 sentinel.
# NOTE(review): this reassigns `df` in place, so the cell is not safe to
# re-run on its own -- re-run from the CSV load cell instead.
df = (
    df.withColumn("HIREDATE", to_date("HIREDATE", "dd/MM/yyyy"))
    .na.fill({"HIREDATE": "9999-12-31"})
)

In [41]:
# Preview the frame after the HIREDATE conversion (show()'s defaults spelled out).
df.show(n=20, truncate=True)

+-----+--------+---------+----+----------+----+----+------+
|EMPNO|   ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|
+-----+--------+---------+----+----------+----+----+------+
| 7369|    LOWE|  ANALYST|7566|1986-01-12|3501|NULL|    50|
| 7370|   SCOTT|PRESIDENT|7902|1999-07-14|4738|1400|    20|
| 7371|   CHANG|  ANALYST|7902|1985-09-04|4908| 500|    40|
| 7372|  ROGERS|  MANAGER|7698|1994-02-05|1787|1400|    40|
| 7373|  WAGNER| SALESMAN|7839|2014-06-29|2156| 300|    50|
| 7374| JOHNSON| SALESMAN|7839|2008-01-21|4803|NULL|    20|
| 7375|     RAY| SALESMAN|7566|1990-04-22|4374|NULL|    50|
| 7376|   BROCK|  ANALYST|7902|2024-10-09|3612| 300|    50|
| 7377|   LEWIS| SALESMAN|7566|1993-02-27|4291|1400|    50|
| 7378|WILLIAMS| SALESMAN|7566|2002-01-04|2404| 500|    50|
| 7379|   SMITH|    CLERK|7566|1989-03-09|3698|NULL|    20|
| 7380|ARELLANO|    CLERK|7566|2009-02-18|3154|NULL|    50|
| 7381| LINDSEY|  ANALYST|7839|2000-10-16|3884|NULL|    30|
| 7382|ESPINOZA|  ANALYST|7839|2013-07-1

In [42]:
# Derive the YEAR ("yyyy") and MONTH ("MM") columns from HIREDATE; the write
# cell partitions the output by these two columns.
df = (
    df
    .withColumn("YEAR", date_format("HIREDATE", "yyyy"))
    .withColumn("MONTH", date_format("HIREDATE", "MM"))
)
df.show()

+-----+--------+---------+----+----------+----+----+------+----+-----+
|EMPNO|   ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|YEAR|MONTH|
+-----+--------+---------+----+----------+----+----+------+----+-----+
| 7369|    LOWE|  ANALYST|7566|1986-01-12|3501|NULL|    50|1986|   01|
| 7370|   SCOTT|PRESIDENT|7902|1999-07-14|4738|1400|    20|1999|   07|
| 7371|   CHANG|  ANALYST|7902|1985-09-04|4908| 500|    40|1985|   09|
| 7372|  ROGERS|  MANAGER|7698|1994-02-05|1787|1400|    40|1994|   02|
| 7373|  WAGNER| SALESMAN|7839|2014-06-29|2156| 300|    50|2014|   06|
| 7374| JOHNSON| SALESMAN|7839|2008-01-21|4803|NULL|    20|2008|   01|
| 7375|     RAY| SALESMAN|7566|1990-04-22|4374|NULL|    50|1990|   04|
| 7376|   BROCK|  ANALYST|7902|2024-10-09|3612| 300|    50|2024|   10|
| 7377|   LEWIS| SALESMAN|7566|1993-02-27|4291|1400|    50|1993|   02|
| 7378|WILLIAMS| SALESMAN|7566|2002-01-04|2404| 500|    50|2002|   01|
| 7379|   SMITH|    CLERK|7566|1989-03-09|3698|NULL|    20|1989|   03|
| 7380

In [46]:
!pip install delta-spark

Collecting delta-spark
  Downloading delta_spark-3.3.0-py3-none-any.whl.metadata (2.0 kB)
Downloading delta_spark-3.3.0-py3-none-any.whl (21 kB)
Installing collected packages: delta-spark
Successfully installed delta-spark-3.3.0


In [49]:
from google.colab import drive
from delta import *

In [50]:
# Write the frame as Parquet, one directory level per YEAR and MONTH value,
# replacing whatever already exists at the target path.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("YEAR", "MONTH")
    .save("/content/sample_data/test_partitions")
)