## Pyspark Programs - Focusing DF

#### Sample Program - 01 :  createDataFrame() 

In [1]:
"""
    createDataFrame() -> With ( with & without types ) & Without schema
"""

from pyspark.sql import SparkSession
from pyspark.sql.types import *

spark = SparkSession.builder \
        .appName("createDF") \
        .getOrCreate()

data_one = [(1, "one"), (2, "two"), (3, "three")]
data_two = [(1, "Swami"), (2, "Nathan"), (3, "Rahul")]
data_three = [{'id': 1, 'name': "spark"},
            {'id': 2, 'name': "linux"},
            {'id': 3, 'name': "sql"},
            {'id': 4, 'name': "databricks"},
            {'id': 5, 'name': "python"}]

schema_one = StructType([StructField(name = 'id', dataType = IntegerType()),
                         StructField(name = 'name', dataType = StringType())])

schema_two = ["id", "name"]

df_one = spark.createDataFrame(data = data_one, schema = schema_one)
df_two = spark.createDataFrame(data = data_two, schema = schema_two)
df_three = spark.createDataFrame(data = data_three)

df_one.printSchema()
df_one.show(5, truncate = False)
df_two.printSchema()
df_two.show(5, truncate = False)
df_three.printSchema()
df_three.show(5, truncate = False)

spark.stop()

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


24/08/23 11:50:17 WARN Utils: Your hostname, neon-HP-Pavilion-Gaming-Laptop-15-ec1xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.6 instead (on interface wlo1)
24/08/23 11:50:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/23 11:50:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



                                                                                

+---+-----+
|id |name |
+---+-----+
|1  |one  |
|2  |two  |
|3  |three|
+---+-----+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---+------+
|id |name  |
+---+------+
|1  |Swami |
|2  |Nathan|
|3  |Rahul |
+---+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---+----------+
|id |name      |
+---+----------+
|1  |spark     |
|2  |linux     |
|3  |sql       |
|4  |databricks|
|5  |python    |
+---+----------+



#### Sample Program - 02 : Read from external source (CSV)

In [None]:
"""
    Read CSV file with csv() and format() methods

    -> DataFrameReader = spark.read
"""

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType

spark = SparkSession.builder \
        .appName("csv") \
        .getOrCreate()

# Extending (incrementally we can add)
schema = StructType().add(field='name', data_type=StringType()) \
                     .add(field='age', data_type=IntegerType()) \
                     .add(field='gender', data_type=StringType()) \
                     .add(field='salary', data_type=IntegerType())

# The path variable can get array also.  So, we can give multiple files at same time and load into single DF.
# If there is a folder with csv files, then specifying the folder itself fine to load all the csv with in it.
df_one = spark.read.csv(path = "./Dependencies/persons.csv", header = True, schema = schema)

display(df_one)
df_one.show(5, truncate=False)
df_one.printSchema()

df_two = spark.read.format('csv').option('header', True).option('inferSchema', True).load(path = "persons.csv")

display(df_two)
df_two.show(5, truncate=False)
df_two.printSchema()

df_three = spark.read.format('csv').option('header', True).option('inferSchema', True).load(path = "persons.csv")

display(df_three)
df_three.show(5, truncate=False)
df_three.printSchema()

spark.stop()

#### Sample Program - 03 : Write to external destination (CSV)

In [25]:
"""
    Write into CSV file

    -> DataFrameWriter = spark.write
"""

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType

spark = SparkSession.builder \
        .appName("csv") \
        .getOrCreate()

data = [{'id': 1, 'name': "spark"},
        {'id': 2, 'name': "linux"},
        {'id': 3, 'name': "sql"},
        {'id': 4, 'name': "databricks"},
        {'id': 5, 'name': "python"}]

df = spark.createDataFrame(data = data)

df.show(truncate=False)

# If you does not want more partitional then we can use repartition function and give 1 as a input
df.write.csv(path = "./Dependencies/lang.csv", header = True, mode="ignore")

# Modes : append | overwrite | error | ignore
# Another way to attain the same functionality
# df.write.format("csv").mode('overwrite').save("lang.csv")

spark.stop()

+---+----------+
|id |name      |
+---+----------+
|1  |spark     |
|2  |linux     |
|3  |sql       |
|4  |databricks|
|5  |python    |
+---+----------+



#### Sample Program - 04 : Read from external source (JSON)

In [37]:
"""
    Read JSON file with read.json() and format() methods
"""

from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("csv") \
        .getOrCreate()

# Single Line Json file
df = spark.read.json(path = "./Dependencies/products_singleline.json")
df.show(5, truncate=False)
df.printSchema()

# Multi Line Json file (with pretty print)
df1 = spark.read.json(path = "./Dependencies/products_multiline.json", multiLine = True)
df.show(5, truncate=False)
df.printSchema()

# Wild card (all json files inside the folder)
# Multi json files as an array loading at once

spark.stop()

24/08/09 11:15:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+---------------+---+----------------------+------+--------+
|category       |id |name                  |price |quantity|
+---------------+---+----------------------+------+--------+
|Electronics    |1  |iPhone 12             |899.99|10      |
|Clothing       |2  |Nike Air Max 90       |119.99|25      |
|Home Appliances|3  |KitchenAid Stand Mixer|299.99|5       |
|Books          |4  |The Great Gatsby      |12.99 |50      |
|Beauty         |5  |L'Oreal Paris Mascara |9.99  |100     |
+---------------+---+----------------------+------+--------+
only showing top 5 rows

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

+---------------+---+----------------------+------+--------+
|category       |id |name                  |price |quantity|
+---------------+---+----------------------+------+--------+
|Electronics    |1  |iPhone 12             |899.99|10 

#### Sample Program - 05 : Write to external destination (JSON)

In [40]:
"""
    Write result into JSON file with writer object
"""

from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("csv") \
        .getOrCreate()

data = [(1, 'swami'), (2, 'nathan')]
schema = ['id', 'name']

df = spark.createDataFrame(data = data, schema = schema)
display(df)
df.show()

df.write.json("./Dependencies/json_output.json", mode = 'ignore')

spark.stop()

24/08/09 11:21:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


DataFrame[id: bigint, name: string]

                                                                                

+---+------+
| id|  name|
+---+------+
|  1| swami|
|  2|nathan|
+---+------+



#### Sample Program - 06 : Read from external source (Parquet)

In [41]:
"""
"""

from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("csv") \
        .getOrCreate()

# Wildcard reading of the parquet file also we can do
df = spark.read.parquet('./Dependenciesproducts_parquet.parquet/part-00000-1f07d3ad-5b1e-4437-add7-979e91115b3c-c000.snappy.parquet')
df.show()
df.printSchema()
display(df)

spark.stop()

24/08/09 11:27:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
|         Sports|  6|            Yoga Mat| 29.99|      30|
|    Electronics|  7| Samsung 4K Smart TV|799.99|       8|
|       Clothing|  8|        Levi's Jeans| 49.99|      15|
|Home Appliances|  9|Dyson Vacuum Cleaner|399.99|       3|
|          Books| 10| Harry Potter Series| 15.99|      20|
|         Beauty| 11|        MAC Lipstick| 16.99|      75|
|         Sports| 12|Adidas Running Shoes| 59.99|      22|
|    Electronics| 13|       PlayStation 5|499.99|      12|
|       Clothing| 14|   Hooded Sweatshirt| 34.99|      1

DataFrame[category: string, id: bigint, name: string, price: double, quantity: bigint]

#### Sample Program - 07 : Write to external destination (Parquet)

In [None]:
"""
    Write result into Parquet file with writer object
"""

from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("csv") \
        .getOrCreate()

data = [(1, 'swami'), (2, 'nathan')]
schema = ['id', 'name']

df = spark.createDataFrame(data = data, schema = schema)
display(df)
df.show()


df.write.parquet("json_output.parquet", mode = 'ignore')

spark.stop()

#### Sample Program - 08 : show()

In [48]:
"""
    Operations of show()
"""

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType

spark = SparkSession.builder \
        .appName("csv") \
        .getOrCreate()

data = [{'id': 1, 'name': "Apache spark is alternate to the MapReduce in Hadoop"},
        {'id': 2, 'name': "linux is a kernal not an Operating System"},
        {'id': 3, 'name': "SQL stands for Structured Query Language"},
        {'id': 4, 'name': "databricks company started by the one who invented Spark"},
        {'id': 5, 'name': "python is a programming language written before java"}]

df = spark.createDataFrame(data=data)

# show prints 20 character as and 20 rows as a default
# show can get integers character and boolean character for the truncate argument
# show have n property to get no of rows to print
# show can able to print the table vertically using vertical = True parameter

df.show(n = 3, truncate = 8)
df.show(n = 3, truncate = False)
df.show(vertical = True)

spark.stop()

24/08/09 12:48:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+---+--------+
| id|    name|
+---+--------+
|  1|Apach...|
|  2|linux...|
|  3|SQL s...|
+---+--------+
only showing top 3 rows

+---+----------------------------------------------------+
|id |name                                                |
+---+----------------------------------------------------+
|1  |Apache spark is alternate to the MapReduce in Hadoop|
|2  |linux is a kernal not an Operating System           |
|3  |SQL stands for Structured Query Language            |
+---+----------------------------------------------------+
only showing top 3 rows

-RECORD 0--------------------
 id   | 1                    
 name | Apache spark is a... 
-RECORD 1--------------------
 id   | 2                    
 name | linux is a kernal... 
-RECORD 2--------------------
 id   | 3                    
 name | SQL stands for St... 
-RECORD 3--------------------
 id   | 4                    
 name | databricks compan... 
-RECORD 4--------------------
 id   | 5                    
 name | pyth

#### Sample Program - 09 : withColumn()

In [62]:
"""
    Transformation
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder \
        .appName("createDF") \
        .getOrCreate()

data = [{'id': 1, 'name': "spark", 'salary': 30_000},
        {'id': 2, 'name': "linux", 'salary': 25_000},
        {'id': 3, 'name': "sql", 'salary': 20_000},
        {'id': 4, 'name': "databricks", 'salary': 15_000},
        {'id': 5, 'name': "python", 'salary': 50_000}]

df = spark.createDataFrame(data = data)

# Taking an RDD using withColumn and changing the long data type to Integer
df1 = df.withColumn(colName = 'id', col=col('id').cast('Integer')) \
        .withColumn(colName = 'salary', col=col('salary').cast('Integer')) \
        .withColumn(colName = 'salary', col=col('salary') * 2) 

# If colName is already exists it will update the value from the expression given in col parameter else it will create new
df1 = df1.withColumn(colName = 'country', col = lit('india'))

df1 = df1.withColumn(colName = 'this_month_salary', col = col('salary'))

df1.printSchema()
df1.show(5)

spark.stop()

24/08/09 13:14:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- country: string (nullable = false)
 |-- this_month_salary: integer (nullable = true)



                                                                                

+---+----------+------+-------+-----------------+
| id|      name|salary|country|this_month_salary|
+---+----------+------+-------+-----------------+
|  1|     spark| 60000|  india|            60000|
|  2|     linux| 50000|  india|            50000|
|  3|       sql| 40000|  india|            40000|
|  4|databricks| 30000|  india|            30000|
|  5|    python|100000|  india|           100000|
+---+----------+------+-------+-----------------+



#### Sample Program - 10 : withColumnRenamed()

In [63]:
"""
    Transformation
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder \
        .appName("createDF") \
        .getOrCreate()

data = [{'id': 1, 'name': "spark", 'salary': 30_000},
        {'id': 2, 'name': "linux", 'salary': 25_000},
        {'id': 3, 'name': "sql", 'salary': 20_000},
        {'id': 4, 'name': "databricks", 'salary': 15_000},
        {'id': 5, 'name': "python", 'salary': 50_000}]

df = spark.createDataFrame(data = data)

df1 = df.withColumnRenamed('salary', 'month_salary')

df1.show()

spark.stop()

24/08/09 14:04:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+---+----------+------------+
| id|      name|month_salary|
+---+----------+------------+
|  1|     spark|       30000|
|  2|     linux|       25000|
|  3|       sql|       20000|
|  4|databricks|       15000|
|  5|    python|       50000|
+---+----------+------------+



#### Sample Program - 11 : Schema Structures

In [67]:
"""
    Schema Structures
"""

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder \
        .appName("createDF") \
        .getOrCreate()

data = [{'id': 1, 'name': "spark", 'salary': 30_000},
        {'id': 2, 'name': "linux", 'salary': 25_000},
        {'id': 3, 'name': "sql", 'salary': 20_000},
        {'id': 4, 'name': "databricks", 'salary': 15_000},
        {'id': 5, 'name': "python", 'salary': 50_000}]

schema = StructType([
    StructField(name = 'id', dataType = IntegerType()),
    StructField(name = 'name', dataType = StringType()),
    StructField(name = 'salary', dataType = IntegerType())
])

df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()

# Complex data types (Multivalued data in SQL)

data_one = [{'id': 1, 'name': ('apache', 'spark'), 'salary': 30_000},
            {'id': 2, 'name': ('os', 'linux'), 'salary': 25_000},
            {'id': 3, 'name': ('sql', 'Mysql'), 'salary': 20_000},
            {'id': 4, 'name': ('UI', 'Databricks'), 'salary': 15_000},
            {'id': 5, 'name': ('lang', 'python'), 'salary': 50_000}]

structName = StructType([StructField('First', StringType()), StructField('Last', StringType())])

schema_one = StructType([
    StructField(name = 'id', dataType = IntegerType()),
    StructField(name = 'name', dataType = structName),
    StructField(name = 'salary', dataType = IntegerType())
])

df_one = spark.createDataFrame(data = data_one, schema = schema_one)
df_one.printSchema()

spark.stop()

24/08/09 14:15:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- First: string (nullable = true)
 |    |-- Last: string (nullable = true)
 |-- salary: integer (nullable = true)



#### Sample Program - 12 : ArrayType

In [81]:
"""
    ArrayType
"""

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType
from pyspark.sql.functions import col, array

spark = SparkSession.builder \
        .appName("ArrayType") \
        .getOrCreate()

data = [{'id': 1, 'name': ('apache', 'spark'), 'salary': 30_000, 'extra': [1,2,3]},
        {'id': 2, 'name': ('os', 'linux'), 'salary': 25_000, 'extra': [3,4,5]},
        {'id': 3, 'name': ('sql', 'Mysql'), 'salary': 20_000, 'extra': [6,7,8]},
        {'id': 4, 'name': ('UI', 'Databricks'), 'salary': 15_000, 'extra': [9,1,0]},
        {'id': 5, 'name': ('lang', 'python'), 'salary': 50_000, 'extra': [1,1,1]}]

structName = StructType([StructField('First', StringType()), StructField('Last', StringType())])

schema = StructType([
    StructField(name = 'id', dataType = IntegerType()),
    StructField(name = 'name', dataType = structName),
    StructField(name = 'salary', dataType = IntegerType()),
    StructField(name = 'extra', dataType = ArrayType(IntegerType()))])

df = spark.createDataFrame(data = data)

df1= df.withColumn(colName = 'first_sector', col=col('extra')[0])

# To combine two separate column to single multivalued column
df1 = df1.withColumn(colName = 'altered', col=array(col('id'), col('first_sector')))

df1.show()
df1.printSchema()

spark.stop()

                                                                                

+---------+---+----------------+------+------------+-------+
|    extra| id|            name|salary|first_sector|altered|
+---------+---+----------------+------+------------+-------+
|[1, 2, 3]|  1| {apache, spark}| 30000|           1| [1, 1]|
|[3, 4, 5]|  2|     {os, linux}| 25000|           3| [2, 3]|
|[6, 7, 8]|  3|    {sql, Mysql}| 20000|           6| [3, 6]|
|[9, 1, 0]|  4|{UI, Databricks}| 15000|           9| [4, 9]|
|[1, 1, 1]|  5|  {lang, python}| 50000|           1| [5, 1]|
+---------+---+----------------+------+------------+-------+

root
 |-- extra: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- id: long (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- _1: string (nullable = true)
 |    |-- _2: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- first_sector: long (nullable = true)
 |-- altered: array (nullable = false)
 |    |-- element: long (containsNull = true)



#### Sample Program - 13 : Other functions of ArrayType

In [None]:
"""
    -> explode()
    -> split()
    -> arry()
    -> array_contains()
"""



#### Sample Program - 14 : MapType and other funtions of MapType

In [None]:
"""
    MapType Column
"""

"""
    -> map_keys()
    -> map_values()
    -> explode()
"""

#### Sample Program - 15 : Row and Column Class (col) and how to access it

In [94]:
"""
"""

from pyspark.sql import SparkSession
from pyspark.sql import Row

spark = SparkSession.builder \
        .appName("Row") \
        .getOrCreate()

# Way 1 of using Row
row1 = Row(name = 'naruto', salary = 3000, age = 23)
row2 = Row(name = 'sasuke', salary = 2000, age = 24)

data = [row1, row2]

df = spark.createDataFrame(data)

df.show()
df.printSchema()

# Another Way (2)
person = Row('name', 'age')

row1 = person("itachi", 23)
row2 = person("nagato", 22)

df_one = spark.createDataFrame(data = [row1, row2])
df_one.show()

# Another way (3)

data_last = [
    Row(name = "Yellow Flash", prop = Row(age = 30, power = "teleportation")),
    Row(name = "White Fang", prop = Row(age = 40, power = "Unknown"))
]

df_last = spark.createDataFrame(data = data_last)
df_last.show()
df_last.printSchema()

spark.stop()

24/08/09 18:08:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+------+------+---+
|  name|salary|age|
+------+------+---+
|naruto|  3000| 23|
|sasuke|  2000| 24|
+------+------+---+

root
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)

+------+---+
|  name|age|
+------+---+
|itachi| 23|
|nagato| 22|
+------+---+

+------------+-------------------+
|        name|               prop|
+------------+-------------------+
|Yellow Flash|{30, teleportation}|
|  White Fang|      {40, Unknown}|
+------------+-------------------+

root
 |-- name: string (nullable = true)
 |-- prop: struct (nullable = true)
 |    |-- age: long (nullable = true)
 |    |-- power: string (nullable = true)



#### Sample Program - 16 : when() and otherwise() : Similar to CASE statement in SQL

In [5]:
"""
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import when

spark = SparkSession.builder \
        .appName("when_otherwise") \
        .getOrCreate()

data = [{'id': 1, 'name': "spark", 'salary': 30_000, "gender": 'M'},
        {'id': 2, 'name': "linux", 'salary': 25_000, "gender": 'F'},
        {'id': 3, 'name': "sql", 'salary': 20_000, "gender": ''},
        {'id': 4, 'name': "databricks", 'salary': 15_000, "gender": 'F'},
        {'id': 5, 'name': "python", 'salary': 50_000, "gender": 'M'}]

df = spark.createDataFrame(data = data)

df.show()

df1 = df.select(df.id, df.name, df.salary, 
                when(df.gender=='M', 'Male')
               .when(df.gender=='F', 'Female')
               .otherwise('Unknown').alias('Gender'))
df1.show()

spark.stop()


                                                                                

+------+---+----------+------+
|gender| id|      name|salary|
+------+---+----------+------+
|     M|  1|     spark| 30000|
|     F|  2|     linux| 25000|
|      |  3|       sql| 20000|
|     F|  4|databricks| 15000|
|     M|  5|    python| 50000|
+------+---+----------+------+

+---+----------+------+-------+
| id|      name|salary| Gender|
+---+----------+------+-------+
|  1|     spark| 30000|   Male|
|  2|     linux| 25000| Female|
|  3|       sql| 20000|Unknown|
|  4|databricks| 15000| Female|
|  5|    python| 50000|   Male|
+---+----------+------+-------+



#### Sample Program - 17 : Other column functions focusing DF

In [None]:
"""
    df.columnname.method()
    
    -> alias()
    -> asc()
    -> desc()
    -> cast()
    -> like()
"""

"""
    -> where() / filter()
    -> distinct() / dropDuplicates()
    -> orderBy() / sort()  --- default ascending order ---- df.sort(df.col_one, df.col_two.desc())
    -> union() / unionAll() / unionByName()
    -> groupBy()  --- have many things, max(), min(), count(), agg()
    -> select() --- you can pass column name as list, df.columnname, df['columnname'] or col() | select * | [ col for col in df.columns ]
    -> join() --- join(another table, on condition, what join)
    -> pivot() --- ungrouping 
    -> stack() --- unpivoting
    -> fill() / fillna() --- changes null values to some other mentioned value
    -> sample()
    -> collect() --- returns a array / list not df or rdd (so it was an action)
    -> transform() / pyspark.sql.functions.transform() ---
    -> createOrReplaceTempView() / createOrReplaceGlobalTempView() [used across all section as global]
    -> from_json()  --- we can convert the json data column to MapType() or Structure Type()
    -> to_json() --- 
    -> json_tuple()
    -> get_json_object()
    -> current_date() / date_format() / to_date() 
    -> datediff() / months_between() / add_months() / date_add() / year() / month()
    -> current_timestamp(), to_timestamp(), hour(), minute(), second()
    -> row_number() --- adds the row
    -> rank() / dense_rank() --- based upon the column specified it assign the value (dense rank don't skip)
    -> over()
    
    Aggregate methods:
    -----------------
    -> approx_count_distinct() / avg() / collect_list() / collect_set() / count() / countDistinct()
    

    -> spark.catalog.currentDatabase()
    -> spark.catalog.listTables()
    
"""