In [29]:
import findspark

# findspark must be initialised before anything from pyspark is imported.
findspark.init()

from pyspark.sql import SparkSession

# Build the session (or pick up the already-running one) for this notebook.
spark = (
    SparkSession.builder
    .appName("PySpark in Jupyter")
    .getOrCreate()
)

# Sanity check: print the session object to confirm it exists.
print(spark)

<pyspark.sql.session.SparkSession object at 0x71ce3c2171c0>


In [30]:
from pyspark.sql.functions import col

In [31]:
from pyspark.sql.types import *

In [32]:
from pyspark.sql.functions import array

In [33]:
# Sample rows: a string id paired with a list of two integers.
data = [
    ("abc", [1, 2]),
    ("xyz", [4, 5]),
    ("mno", [7, 8]),
]

# Explicit schema: nullable string column "id" and nullable
# array-of-integers column "numbers".
schema = StructType(
    [
        StructField(name="id", dataType=StringType(), nullable=True),
        StructField(name="numbers", dataType=ArrayType(IntegerType()), nullable=True),
    ]
)

In [34]:
# Materialise the sample rows as a DataFrame using the explicit schema.
df = spark.createDataFrame(data=data, schema=schema)

In [35]:
# Print the rows of df as a text table.
df.show()

+---+-------+
| id|numbers|
+---+-------+
|abc| [1, 2]|
|xyz| [4, 5]|
|mno| [7, 8]|
+---+-------+



In [36]:
# Print the column names, types and nullability of df as a tree.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [37]:
# Add a column "FirstValue" holding element 0 of each row's "numbers" array.
df2 = df.withColumn(
    "FirstValue",
    col("numbers")[0],
)

In [41]:
# Project only the first array element, exposed under the name "num1".
df.select(
    col("numbers")[0].alias("num1")
).show()

+----+
|num1|
+----+
|   1|
|   4|
|   7|
+----+



### Creating an array column from existing columns

In [42]:
# Two rows of plain integers; column names are supplied as a list,
# so the column types are inferred from the data.
data = [
    (1, 2),
    (3, 4),
]
schema = ["num1", "num2"]

# NOTE: this rebinds `df`; cells below operate on this two-column frame.
df = spark.createDataFrame(data, schema)
df.show()

+----+----+
|num1|num2|
+----+----+
|   1|   2|
|   3|   4|
+----+----+



In [43]:
# array() packs the two numeric columns into one array column per row.
df.withColumn(
    "numbers",
    array(df.num1, df.num2),
).show()

+----+----+-------+
|num1|num2|numbers|
+----+----+-------+
|   1|   2| [1, 2]|
|   3|   4| [3, 4]|
+----+----+-------+



### Array type functions
#### explode()
#### split()
#### array()
#### array_contains()

In [44]:
# Re-display the current df (the num1/num2 frame built above).
df.show()

+----+----+
|num1|num2|
+----+----+
|   1|   2|
|   3|   4|
+----+----+



In [45]:
# Each row: numeric id, name, and a list of skill strings.
data = [
    (1, "xyz", ["dotnet", "azure"]),
    (2, "abc", ["java", "aws"]),
]
schema = ["id", "name", "skills"]

# NOTE: this rebinds `df` to the skills frame used by the explode() example.
df = spark.createDataFrame(data=data, schema=schema)

In [46]:
# Inspect the inferred schema, then the rows, of the skills frame.
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---+----+---------------+
| id|name|         skills|
+---+----+---------------+
|  1| xyz|[dotnet, azure]|
|  2| abc|    [java, aws]|
+---+----+---------------+



#### explode(_)

In [47]:
from pyspark.sql.functions import explode

In [48]:
# explode() yields one output row per element of the "skills" array;
# the other columns are repeated on every generated row.
df1 = df.withColumn(
    "skill",
    explode(df.skills),
)
df1.show()

+---+----+---------------+------+
| id|name|         skills| skill|
+---+----+---------------+------+
|  1| xyz|[dotnet, azure]|dotnet|
|  1| xyz|[dotnet, azure]| azure|
|  2| abc|    [java, aws]|  java|
|  2| abc|    [java, aws]|   aws|
+---+----+---------------+------+



#### split(_)

In [49]:
# Same people as before, but skills are now one comma-separated string.
data = [
    (1, "xyz", "dotnet,azure"),
    (2, "abc", "java,aws"),
]
schema = ["id", "name", "skills"]

# NOTE: this rebinds `df`; "skills" is a plain string column here.
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: string (nullable = true)

+---+----+------------+
| id|name|      skills|
+---+----+------------+
|  1| xyz|dotnet,azure|
|  2| abc|    java,aws|
+---+----+------------+



In [50]:
from pyspark.sql.functions import split

In [51]:
# Split the comma-separated "skills" string into an array column.
df1 = df.withColumn(
    "skillsArray",
    split(df.skills, ","),
)

In [52]:
# Show the original string column next to the derived array column.
df1.show()

+---+----+------------+---------------+
| id|name|      skills|    skillsArray|
+---+----+------------+---------------+
|  1| xyz|dotnet,azure|[dotnet, azure]|
|  2| abc|    java,aws|    [java, aws]|
+---+----+------------+---------------+



In [None]:
# split() requires a pattern as its second argument (it is treated as a
# regular expression); calling it without one raises a TypeError.
# The result is not assigned, so the notebook just displays the new frame.
df.withColumn("skillsArray", split(df.skills, ","))

In [54]:
# Single-row frame used to demonstrate the `limit` argument of split().
# The limit=2 and limit=-1 calls live in the next two cells, one per cell:
# only the last expression of a cell is displayed, so running both here
# silently discarded the first result.
df3 = spark.createDataFrame([("oneAtwoBthreeC",)], ["s"])

In [56]:
# limit=2: the string is split only at the first match of the pattern,
# so the result has two elements.
df3.select(
    split(df3.s, "[ABC]", 2).alias("s")
).collect()

[Row(s=['one', 'twoBthreeC'])]

In [57]:
# limit=-1: split at every match; the input ends with a delimiter,
# so the last element is an empty string.
df3.select(
    split(df3.s, "[ABC]", -1).alias("s")
).collect()

[Row(s=['one', 'two', 'three', ''])]

#### array_contains(_)

In [59]:
from pyspark.sql.functions import array_contains

In [60]:
# Boolean flag per row: does the skills array contain the value "java"?
df4 = df1.withColumn(
    "HasJava",
    array_contains(df1.skillsArray, "java"),
)
df4.show()

+---+----+------------+---------------+-------+
| id|name|      skills|    skillsArray|HasJava|
+---+----+------------+---------------+-------+
|  1| xyz|dotnet,azure|[dotnet, azure]|  false|
|  2| abc|    java,aws|    [java, aws]|   true|
+---+----+------------+---------------+-------+



In [61]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()