In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, StructType, StructField


In [3]:
# Create the SparkSession (or reuse one that is already running) that the
# remaining cells use to build and query DataFrames.
spark = (
    SparkSession.builder
    .appName("Name")
    .getOrCreate()
)

In [4]:
# Array-of-strings column type whose elements are declared non-null.
# NOTE(review): not referenced by any of the cells shown in this notebook.
arrayCol = ArrayType(elementType=StringType(), containsNull=False)

In [5]:
# Sample rows. Field order matches the schema cell:
# (Name, languagesAtSchool, languagesAtWork, currentState, previousState)
data = [
    (
        "James, Smith",
        ["Java", "Scala", "C++"],
        ["Spark", "Java"],
        "OH",
        "CA",
    ),
    (
        "Michael, Rose",
        ["Spark", "Java", "C++"],
        ["Spark", "Java"],
        "NY",
        "NJ",
    ),
    (
        "Robert, Williams",
        ["Csharp", "VB"],
        ["Spark", "Python"],
        "UT",
        "NV",
    ),
]

In [6]:
# Explicit schema for the sample rows: a name, two string arrays and two
# state codes. Every column is declared nullable.
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("languagesAtSchool", ArrayType(StringType()), True)
    .add("languagesAtWork", ArrayType(StringType()), True)
    .add("currentState", StringType(), True)
    .add("previousState", StringType(), True)
)

In [7]:
# Build the DataFrame from the in-memory rows using the explicit schema.
df = spark.createDataFrame(data, schema)

In [8]:
# Print the DataFrame's columns, their types and nullability as a tree.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)



In [10]:
from pyspark.sql.functions import explode

# Produce one output row per element of languagesAtSchool, alongside the
# person's name.
# The column is declared as "Name" in the schema. DataFrame attribute access
# matches column names exactly, so `df.name` raises AttributeError.
df.select(df.Name, explode(df.languagesAtSchool)).show()

In [None]:
from pyspark.sql.functions import split

# Turn the "Last, First"-style Name string into an array column.
# The column is declared as "Name" in the schema. DataFrame attribute access
# matches column names exactly, so `df.name` raises AttributeError.
# NOTE(review): the sample names contain ", " (comma + space), so splitting on
# "," alone leaves a leading space on the second element; confirm whether that
# is intended.
df.select(split(df.Name, ",").alias("nameAsArray")).show()

In [None]:
from pyspark.sql.functions import array

# Combine the two state columns into a single array column named "States".
# The column is declared as "Name" in the schema. DataFrame attribute access
# matches column names exactly, so `df.name` raises AttributeError.
df.select(df.Name, array(df.currentState, df.previousState).alias("States")).show()

In [None]:
from pyspark.sql.functions import array_contains

# Flag, per person, whether "Java" appears in languagesAtSchool.
# The column is declared as "Name" in the schema. DataFrame attribute access
# matches column names exactly, so `df.name` raises AttributeError.
df.select(df.Name, array_contains(df.languagesAtSchool, "Java").alias("array_contains")).show()