In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
# Reuse the active SparkSession if there is one; otherwise create a new one
# whose application name is "sparkbyexamples".
spark = SparkSession.builder.appName("sparkbyexamples").getOrCreate()

# Sample rows in the order (firstname, middlename, lastname, dob).
# dob is kept as a plain 'yyyy-MM-dd' string so it can be split on '-'.
data = [
    ('James', '', 'Smith', '1991-04-01'),
    ('Michael', 'Rose', '', '2000-05-19'),
    ('Robert', '', 'Williams', '1978-09-05'),
    ('Maria', 'Anne', 'Jones', '1967-12-01'),
    ('Jen', 'Mary', 'Brown', '1980-02-17'),
]

columns = ["firstname", "middlename", "lastname", "dob"]
df = spark.createDataFrame(data, columns)
df.printSchema()
df.show(truncate=False)

# Approach 1: call split() inline for every new column.
# split() yields an array column; getItem(i) selects its i-th element, so the
# new year/month/day columns are strings (see the printed schema), not numbers.
df1 = (
    df.withColumn('year', split(df['dob'], '-').getItem(0))
      .withColumn('month', split(df['dob'], '-').getItem(1))
      .withColumn('day', split(df['dob'], '-').getItem(2))
)
df1.printSchema()
df1.show(truncate=False)

# Alternatively, build the split expression once and reuse it for each new column.
# Approach 2: create the split expression a single time and index into it.
split_col = split(df['dob'], '-')
df2 = (
    df.withColumn('year', split_col.getItem(0))
      .withColumn('month', split_col.getItem(1))
      .withColumn('day', split_col.getItem(2))
)
df2.show(truncate=False)

# Same result via select(): keep the four original columns and append the
# pieces of the split 'dob' array as aliased columns. split() here is the
# function from pyspark.sql.functions; the Column methods used are getItem()
# and alias().
split_col = split(df['dob'], '-')
df3 = df.select(
    "firstname", "middlename", "lastname", "dob",
    split_col.getItem(0).alias('year'),
    split_col.getItem(1).alias('month'),
    split_col.getItem(2).alias('day'),
)
df3.show(truncate=False)

# Disabled experiment, kept for reference. It is written as comments rather
# than a bare triple-quoted string because '\d' inside a non-raw string literal
# is an invalid escape sequence; the patterns below therefore use raw strings.
# NOTE: regexp_replace is not among the imports visible at the top of this
# cell; import it from pyspark.sql.functions before enabling this code.
#
# df4=spark.createDataFrame([("20-13-2012-monday",)], ['date',])
#
# df4.select(split(df4.date,r'^([\d]+-[\d]+-[\d])').alias('date'),
#     regexp_replace(split(df4.date,r'^([\d]+-[\d]+-[\d]+)').getItem(1),'-','').alias('day')).show()
# The pattern given to split() is a regular expression: '[AB]' matches a
# single 'A' or a single 'B'.
df4 = spark.createDataFrame([('oneAtwoBthree',)], ['str',])

# No limit argument: the pattern is applied as many times as it matches.
unlimited_parts = split(df4.str, '[AB]')
df4.select(unlimited_parts.alias('str')).show()

# A positive limit caps the length of the resulting array; the last element
# keeps whatever remains of the input unsplit.
for limit in (2, 1):
    df4.select(split(df4.str, '[AB]', limit).alias('str')).show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)

+---------+----------+--------+----------+
|firstname|middlename|lastname|dob       |
+---------+----------+--------+----------+
|James    |          |Smith   |1991-04-01|
|Michael  |Rose      |        |2000-05-19|
|Robert   |          |Williams|1978-09-05|
|Maria    |Anne      |Jones   |1967-12-01|
|Jen      |Mary      |Brown   |1980-02-17|
+---------+----------+--------+----------+

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)

+---------+----------+--------+----------+----+-----+---+
|firstname|middlename|lastname|dob       |year|month|day|
+---------+----------+--------+----------+----+-----+---+
|J

In [0]:
#The code starts by importing pyspark, SparkSession, and the split and col functions from pyspark.sql.functions. SparkSession is needed to create the session and split() is used to break string columns apart; col is imported but not used in this example.

#A SparkSession is created using the SparkSession.builder API. The appName() call sets the name of the Spark application. getOrCreate() returns the existing SparkSession if one is already active; otherwise it creates a new one using the builder's options, including that application name.

#Sample data is defined as a list of tuples. Each tuple represents a row of data, with elements corresponding to the columns: firstname, middlename, lastname, and dob (date of birth).

#A Spark DataFrame is created using spark.createDataFrame(data, columns), where data is the sample data and columns is the list of column names.

#The DataFrame schema is printed using df.printSchema(), which displays each column's name, data type, and nullability.

#The contents of the DataFrame are displayed using df.show(truncate=False), which shows all the rows and columns without truncation.

#The split() function from pyspark.sql.functions is used to split the dob column into three new columns: year, month, and day. Each new column is added with the withColumn() method, taking one element of the split array via getItem(). This is done in two different ways: first by calling split() inline in each withColumn() call, and then by building the split expression once, storing it in split_col, and reusing it for each new column.

#The resulting DataFrames with the split columns are printed using df1.show(truncate=False) and df2.show(truncate=False).

#Another approach to splitting the dob column is demonstrated using the select() method: the same split expression is indexed with getItem() and each piece is named with alias(). This creates a new DataFrame df3 with the original columns along with the split columns year, month, and day. The resulting DataFrame is printed using df3.show(truncate=False).

#The code snippet also includes some disabled (non-executing) code that sketches how split() and regexp_replace() could be combined for more complex splitting and data manipulation scenarios; regexp_replace would need to be imported from pyspark.sql.functions before that code could run.

#Finally, the code uses split() on a different DataFrame, df4, to split a string column on the regular expression [AB], which matches a single 'A' or 'B'. The result is printed using df4.select(split(df4.str, '[AB]').alias('str')).show(), followed by examples that pass the optional limit argument (2 and 1), which caps the number of elements in the resulting array.