In [0]:
"""
PySpark SQL provides the split() function to convert a delimiter-separated string column into an array column (StringType to ArrayType) on a DataFrame. The string is split on a delimiter pattern such as a space, comma, or pipe, and the resulting pieces become the elements of the ArrayType column.

pyspark.sql.functions.split(str, pattern, limit=-1)
"""

In [0]:

from pyspark.sql import SparkSession
# Get the Spark session used by every cell below (getOrCreate returns the
# existing session if one is already running).
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample records: comma-delimited full name, birth year, gender, salary.
# The first two names have a space after each comma; the others do not.
data = [
    ("James, A, Smith", "2018", "M", 3000),
    ("Michael, Rose, Jones", "2010", "M", 4000),
    ("Robert,K,Williams", "2010", "M", 4000),
    ("Maria,Anne,Jones", "2005", "F", 4000),
    ("Jen,Mary,Brown", "2010", "", -1),
]

# Column names, listed in the same order as the tuple fields above.
columns = ["name", "dob_year", "gender", "salary"]

df = spark.createDataFrame(data, columns)

# Print the inferred schema, then all rows without truncating long values.
df.printSchema()
df.show(truncate=False)



root
 |-- name: string (nullable = true)
 |-- dob_year: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+--------------------+--------+------+------+
|name                |dob_year|gender|salary|
+--------------------+--------+------+------+
|James, A, Smith     |2018    |M     |3000  |
|Michael, Rose, Jones|2010    |M     |4000  |
|Robert,K,Williams   |2010    |M     |4000  |
|Maria,Anne,Jones    |2005    |F     |4000  |
|Jen,Mary,Brown      |2010    |      |-1    |
+--------------------+--------+------+------+



In [0]:

from pyspark.sql.functions import split, col

# Turn the comma-delimited "name" string column into an array column.
# select() projects only the listed columns, so "name" is already absent from
# the result and no follow-up drop("name") is required.
# Note: split() does not trim its pieces, so an element keeps any space that
# followed the comma in the input (e.g. " A" for "James, A, Smith").
df2 = df.select(
    'dob_year',
    split(col("name"), ",").alias("NameArray"),
    'salary',
)
df2.printSchema()
df2.show()


root
 |-- dob_year: string (nullable = true)
 |-- NameArray: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- salary: long (nullable = true)

+--------+--------------------+------+
|dob_year|           NameArray|salary|
+--------+--------------------+------+
|    2018| [James,  A,  Smith]|  3000|
|    2010|[Michael,  Rose, ...|  4000|
|    2010|[Robert, K, Willi...|  4000|
|    2005|[Maria, Anne, Jones]|  4000|
|    2010|  [Jen, Mary, Brown]|    -1|
+--------+--------------------+------+



In [0]:

# Register the DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("PERSON")

# SQL counterpart of the DataFrame-API split(): same comma delimiter, with the
# result exposed as the NameArray column.
spark.sql("select SPLIT(name,',') as NameArray from PERSON").show()


+--------------------+
|           NameArray|
+--------------------+
| [James,  A,  Smith]|
|[Michael,  Rose, ...|
|[Robert, K, Willi...|
|[Maria, Anne, Jones]|
|  [Jen, Mary, Brown]|
+--------------------+

