In [0]:
import os
import sys
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [0]:

from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .appName("built-in")\
        .getOrCreate()

### Creating data frame

In [0]:

data = [("James, A, Smith","2018","M",3000),
            ("Michael, Rose, Jones","2010","M",4000),
            ("Robert,K,Williams","2010","M",4000),
            ("Maria,Anne,Jones","2005","F",4000),
            ("Jen,Mary,Brown","2010","",-1)
            ]

columns=["name","dob_year","gender","salary"]
df=spark.createDataFrame(data,columns)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- dob_year: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
df.show()

+--------------------+--------+------+------+
|                name|dob_year|gender|salary|
+--------------------+--------+------+------+
|     James, A, Smith|    2018|     M|  3000|
|Michael, Rose, Jones|    2010|     M|  4000|
|   Robert,K,Williams|    2010|     M|  4000|
|    Maria,Anne,Jones|    2005|     F|  4000|
|      Jen,Mary,Brown|    2010|      |    -1|
+--------------------+--------+------+------+



#### Converting String to Array

In [0]:
from pyspark.sql.functions import split
# split used to convert any string into array/list
df.select(split(df.name,",").alias("full_name"))\
    .drop("name").show(truncate=False) 

+------------------------+
|full_name               |
+------------------------+
|[James,  A,  Smith]     |
|[Michael,  Rose,  Jones]|
|[Robert, K, Williams]   |
|[Maria, Anne, Jones]    |
|[Jen, Mary, Brown]      |
+------------------------+



#### converting String to Array(Using sql)

In [0]:
df.createOrReplaceTempView("employee")
spark.sql("""
         select name,SPLIT(name,",") as full_name from employee
         """).show()

+--------------------+--------------------+
|                name|           full_name|
+--------------------+--------------------+
|     James, A, Smith| [James,  A,  Smith]|
|Michael, Rose, Jones|[Michael,  Rose, ...|
|   Robert,K,Williams|[Robert, K, Willi...|
|    Maria,Anne,Jones|[Maria, Anne, Jones]|
|      Jen,Mary,Brown|  [Jen, Mary, Brown]|
+--------------------+--------------------+



#### concat_ws() function

In [0]:
columns = ["name","languagesAtSchool","currentState"]
data = [("James,,Smith",["Java","Scala","C++"],"CA"), 
    ("Michael,Rose,",["Spark","Java","C++"],"NJ"), \
    ("Robert,,Williams",["CSharp","VB"],"NV")]
df=spark.createDataFrame(data,columns)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)

+----------------+------------------+------------+
|            name| languagesAtSchool|currentState|
+----------------+------------------+------------+
|    James,,Smith|[Java, Scala, C++]|          CA|
|   Michael,Rose,|[Spark, Java, C++]|          NJ|
|Robert,,Williams|      [CSharp, VB]|          NV|
+----------------+------------------+------------+



#### Combining array elements

In [0]:
from pyspark.sql.functions import concat_ws
df.select(df.languagesAtSchool)\
    .withColumn("new_col",concat_ws(";",df.languagesAtSchool)).show()

+------------------+--------------+
| languagesAtSchool|       new_col|
+------------------+--------------+
|[Java, Scala, C++]|Java;Scala;C++|
|[Spark, Java, C++]|Spark;Java;C++|
|      [CSharp, VB]|     CSharp;VB|
+------------------+--------------+



#### combining array elements(using sql)

In [0]:
df.createOrReplaceTempView("school")
spark.sql("""
          select languagesAtSchool,concat_ws(";",languagesAtSchool) as new_col from school
          """).show()

+------------------+--------------+
| languagesAtSchool|       new_col|
+------------------+--------------+
|[Java, Scala, C++]|Java;Scala;C++|
|[Spark, Java, C++]|Spark;Java;C++|
|      [CSharp, VB]|     CSharp;VB|
+------------------+--------------+



#### substring() function

In [0]:
data=[(1,"20200828"),(2,"20180525")]
columns=['id','calender']
df=spark.createDataFrame(data,columns)
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- calender: string (nullable = true)



In [0]:
df.show()

+---+--------+
| id|calender|
+---+--------+
|  1|20200828|
|  2|20180525|
+---+--------+



#### withColumn using substring

In [0]:
from pyspark.sql.functions import substring
df.withColumn("Year",substring("calender",1,4))\
    .withColumn("Month",substring("calender",5,2))\
        .withColumn("date",substring("calender",7,2))\
            .show()

+---+--------+----+-----+----+
| id|calender|Year|Month|date|
+---+--------+----+-----+----+
|  1|20200828|2020|   08|  28|
|  2|20180525|2018|   05|  25|
+---+--------+----+-----+----+



#### select using substring

In [0]:
df.select("calender",substring(df.calender,1,4).alias("Year"),
          substring(df.calender,5,2).alias("Month"),
          substring(df.calender,7,2).alias("Date"))\
    .show()

+--------+----+-----+----+
|calender|Year|Month|Date|
+--------+----+-----+----+
|20200828|2020|   08|  28|
|20180525|2018|   05|  25|
+--------+----+-----+----+



#### with selectExpr()

In [0]:
df.selectExpr('calender','substring(calender,1,4) as year',
              'substring(calender,5,2) as month',
              'substring(calender,7,2) as date')\
                  .show()

+--------+----+-----+----+
|calender|year|month|date|
+--------+----+-----+----+
|20200828|2020|   08|  28|
|20180525|2018|   05|  25|
+--------+----+-----+----+



#### substr() from column

In [0]:

df.withColumn("year", df.calender.substr(1, 4))\
    .withColumn("month", df.calender.substr(5, 2))\
    .withColumn("date", df.calender.substr(7, 2))\
    .show()

+---+--------+----+-----+----+
| id|calender|year|month|date|
+---+--------+----+-----+----+
|  1|20200828|2020|   08|  28|
|  2|20180525|2018|   05|  25|
+---+--------+----+-----+----+



#### Using sql

In [0]:
df.createOrReplaceTempView("details")
spark.sql("""
          SELECT id,calender,
          substring(calender,1,4) as year,
          substring(calender,5,2) as month,
          substring(calender,7,2) as date from details
          """).show()

+---+--------+----+-----+----+
| id|calender|year|month|date|
+---+--------+----+-----+----+
|  1|20200828|2020|   08|  28|
|  2|20180525|2018|   05|  25|
+---+--------+----+-----+----+



In [0]:
select_statement='Select id,calender, substring(calender,1,4) as year, substring(calender,5,2) as month, substring(calender,7,2) as date from details'
spark.sql(f"{select_statement}").show()

+---+--------+----+-----+----+
| id|calender|year|month|date|
+---+--------+----+-----+----+
|  1|20200828|2020|   08|  28|
|  2|20180525|2018|   05|  25|
+---+--------+----+-----+----+



In [0]:

address = [(1,"14851 Jeffrey Rd","DE"),
    (2,"43421 Margarita St","NY"),
    (3,"13111 Siemon Ave","CA")]
df=spark.createDataFrame(address,['id','address','state'])
df.show()

+---+------------------+-----+
| id|           address|state|
+---+------------------+-----+
|  1|  14851 Jeffrey Rd|   DE|
|  2|43421 Margarita St|   NY|
|  3|  13111 Siemon Ave|   CA|
+---+------------------+-----+



#### Replace String with columns

In [0]:
from pyspark.sql.functions import regexp_replace
df.withColumn("new_addr",regexp_replace('address','Rd','Road'))\
    .drop("address").show()

+---+-----+------------------+
| id|state|          new_addr|
+---+-----+------------------+
|  1|   DE|14851 Jeffrey Road|
|  2|   NY|43421 Margarita St|
|  3|   CA|  13111 Siemon Ave|
+---+-----+------------------+



#### Replce Column values conditionally

In [0]:
from pyspark.sql.functions import when
df.withColumn('address',when(df.address.endswith("Rd"),regexp_replace(df.address,'Rd','Road'))\
    .when(df.address.endswith("St"),regexp_replace(df.address,'St','Street'))\
             .when(df.address.endswith("Ave"), regexp_replace(df.address, 'Ave', 'Avenue'))\
            .otherwise(df.address))\
            .show()

+---+--------------------+-----+
| id|             address|state|
+---+--------------------+-----+
|  1|  14851 Jeffrey Road|   DE|
|  2|43421 Margarita S...|   NY|
|  3| 13111 Siemon Avenue|   CA|
+---+--------------------+-----+



#### Replace column with other column value

In [0]:
df3 = spark.createDataFrame(
   [("ABCDE_XYZ", "XYZ","FGH")], 
    ("col1", "col2","col3")
  )

In [0]:
from pyspark.sql.functions import expr
# col1 is the column that has to be replaced with wherever the values from the col2 should be replaced with col3
df3.withColumn("new_col",
              expr("regexp_replace(col1, col2, col3)")).show()

+---------+----+----+---------+
|     col1|col2|col3|  new_col|
+---------+----+----+---------+
|ABCDE_XYZ| XYZ| FGH|ABCDE_FGH|
+---------+----+----+---------+



#### Replace column values with dictionry values

In [0]:
stateDict = {"DE":"Delaware", "NY":"NewYork", "CA":"California"}
df2 = df.rdd.map(lambda x :
                (x.id, x.address, stateDict[x.state])
                ).toDF(["id", "address", "state"])
df2.show()

+---+------------------+----------+
| id|           address|     state|
+---+------------------+----------+
|  1|  14851 Jeffrey Rd|  Delaware|
|  2|43421 Margarita St|   NewYork|
|  3|  13111 Siemon Ave|California|
+---+------------------+----------+



#### translate() function

In [0]:

from pyspark.sql.functions import translate
df.withColumn("new_add", translate('address','123','ABC')).show()

+---+------------------+-----+------------------+
| id|           address|state|           new_add|
+---+------------------+-----+------------------+
|  1|  14851 Jeffrey Rd|   DE|  A485A Jeffrey Rd|
|  2|43421 Margarita St|   NY|4C4BA Margarita St|
|  3|  13111 Siemon Ave|   CA|  ACAAA Siemon Ave|
+---+------------------+-----+------------------+



#### overlay() function

In [0]:

df4 = spark.createDataFrame([("ABCDE_XYZ", "FGH")], ("col1", "col2"))
df4.printSchema()

root
 |-- col1: string (nullable = true)
 |-- col2: string (nullable = true)



In [0]:

from pyspark.sql.functions import overlay
df4.withColumn("new_overlay", overlay("col1", "col2", 9)).show()

+---------+----+-----------+
|     col1|col2|new_overlay|
+---------+----+-----------+
|ABCDE_XYZ| FGH|ABCDE_XYFGH|
+---------+----+-----------+

