In [0]:
# Databricks have in-build spark session calles spark
# we are using that spark session
spark

In [0]:
# creating a dataframe using data and columns

data = [
    ("emp1", "17-12-1980", 29),
    ("emp2","null", 30)
]
columns = ["name", "DOJ", "age"]
df = spark.createDataFrame(data=data, schema=columns)
df.display()

name,DOJ,age
emp1,17-12-1980,29
emp2,,30


In [0]:
# here we change date format (dd-mm-yyyy to yyyy-MM-dd(spark date format))
from pyspark.sql.functions import to_date, date_format
df = df.withColumn("DOJ", to_date("DOJ", "dd-mm-yyyy")).fillna({"DOJ":"1999-02-19"})
df.display()

name,DOJ,age
emp1,1980-01-17,29
emp2,1999-02-19,30


In [0]:
# here we separate date, month and year from date
df = df.withColumn("year", date_format("DOJ", "yyyy"))\
    .withColumn("month", date_format("DOJ", "MM"))\
    .withColumn("date", date_format("DOJ", "dd"))

df.display()

name,DOJ,age,year,month,date
emp1,1980-01-17,29,1980,1,17
emp2,1999-02-19,30,1999,2,19


In [0]:
# creating a partiting by year and month
df.write.format("csv").partitionBy("year", "month").mode("overwrite").saveAsTable("emp_tb")

In [0]:
%fs
ls /user/hive/warehouse/emp_tb/year=1980/month=01/part-00003-tid-7167485234245974727-084e93b3-4cd0-44af-b150-3ee2e405de48-27-1.c000.csv

path,name,size,modificationTime
dbfs:/user/hive/warehouse/emp_tb/year=1980/month=01/part-00003-tid-7167485234245974727-084e93b3-4cd0-44af-b150-3ee2e405de48-27-1.c000.csv,part-00003-tid-7167485234245974727-084e93b3-4cd0-44af-b150-3ee2e405de48-27-1.c000.csv,22,1701781360000


In [0]:
new_df = spark.read.csv("/user/hive/warehouse/emp_tb/year=1980/month=01/part-00003-tid-7167485234245974727-084e93b3-4cd0-44af-b150-3ee2e405de48-27-1.c000.csv", inferSchema=True, header=True)
new_df.display()

emp1,1980-01-17,29,17


In [0]:
%sql select * from emp_tb where year=1980

name,DOJ,age,date,year,month
emp1,1980-01-17,29,17,1980,1
