In [0]:
# Demonstrates splitting a comma-delimited string column into an array column,
# first with the DataFrame API and then with the equivalent Spark SQL query.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split

# getOrCreate() hands back an already-existing session when there is one,
# otherwise it builds a new session using the options set on the builder.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows. The first two names have a space after each comma while the
# remaining ones do not; this difference is visible in the split results.
data = [
    ("James, A, Smith", "2018", "M", 3000),
    ("Michael, Rose, Jones", "2010", "M", 4000),
    ("Robert,K,Williams", "2010", "M", 4000),
    ("Maria,Anne,Jones", "2005", "F", 4000),
    ("Jen,Mary,Brown", "2010", "", -1),
]

columns = ["name", "dob_year", "gender", "salary"]
df = spark.createDataFrame(data, columns)
df.printSchema()
df.show(truncate=False)

# DataFrame API: split() takes a regex pattern as its delimiter. A bare ","
# keeps whitespace around the delimiter, so elements such as " A" retain
# their leading space.
# select() returns only the expressions passed to it, so the result already
# lacks the "name" column; the previously chained .drop("name") was a no-op
# and has been removed.
df2 = df.select(split(col("name"), ",").alias("NameArray"))
df2.printSchema()
df2.show()

# Spark SQL: the same split, run against a temporary view of df.
df.createOrReplaceTempView("PERSON")
spark.sql("select SPLIT(name,',') as NameArray from PERSON").show()
    

root
 |-- name: string (nullable = true)
 |-- dob_year: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+--------------------+--------+------+------+
|name                |dob_year|gender|salary|
+--------------------+--------+------+------+
|James, A, Smith     |2018    |M     |3000  |
|Michael, Rose, Jones|2010    |M     |4000  |
|Robert,K,Williams   |2010    |M     |4000  |
|Maria,Anne,Jones    |2005    |F     |4000  |
|Jen,Mary,Brown      |2010    |      |-1    |
+--------------------+--------+------+------+

root
 |-- NameArray: array (nullable = true)
 |    |-- element: string (containsNull = false)

+--------------------+
|           NameArray|
+--------------------+
| [James,  A,  Smith]|
|[Michael,  Rose, ...|
|[Robert, K, Willi...|
|[Maria, Anne, Jones]|
|  [Jen, Mary, Brown]|
+--------------------+

+--------------------+
|           NameArray|
+--------------------+
| [James,  A,  Smith]|
|[Michael,  Rose, ...|
|[Robert, K

In [0]:
#The code begins by importing the SparkSession class from the pyspark.sql module and creating a SparkSession using the SparkSession.builder API. The appName parameter sets the name of the Spark application. getOrCreate() returns the SparkSession that already exists in this process, if there is one; otherwise, a new SparkSession is created. The lookup is not keyed by application name.

#A list of tuples called data is defined, representing the rows of the DataFrame. Each tuple contains values for the columns "name", "dob_year", "gender", and "salary". This data is used to create the DataFrame.

#The spark.createDataFrame() method is used to create a DataFrame named df from the data list and the specified column names in the columns list.

#The schema of the DataFrame is printed using the printSchema() method.

#The content of the DataFrame is displayed using the show() method with truncate=False, which ensures that all column values are fully displayed.

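#The split() and col() functions are imported from pyspark.sql.functions. split() splits the values in the "name" column on the comma delimiter and returns an array of substrings. Whitespace around the delimiter is kept, so "James, A, Smith" produces the elements "James", " A", and " Smith".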

#The df.select() method applies the split() function to the "name" column and names the resulting column "NameArray". Because select() returns only the expressions passed to it, the new DataFrame contains just "NameArray"; the original "name" column is not carried over, so no separate drop() is needed.

#The schema of the new DataFrame df2 is printed using the printSchema() method.

#The content of the new DataFrame df2 is displayed using the show() method.

#The original DataFrame df is registered as a temporary view named "PERSON" using the createOrReplaceTempView() method.

#A Spark SQL query is executed using the spark.sql() method. The query selects the "name" column and applies the SPLIT() function to it. The resulting column is named "NameArray". The query results are displayed using the show() method.

#Overall, this code showcases how to split a string column into an array of substrings using the split() function in PySpark, both with DataFrame transformations and Spark SQL queries.
