In [0]:
import os
import sys
# Set both PySpark interpreter variables to the interpreter running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [0]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('Transform in Pyspark')
    .getOrCreate()
)

# Sample rows, one tuple per course: (CourseName, fee, discount).
simpleData = (
    ("Java", 4000, 5),
    ("Python", 4600, 10),
    ("Scala", 4100, 15),
    ("Scala", 4500, 15),
    ("PHP", 3000, 20),
)
columns = ["CourseName", "fee", "discount"]

# Build the DataFrame from the rows above and print its inferred schema.
df = spark.createDataFrame(simpleData, columns)
df.printSchema()

root
 |-- CourseName: string (nullable = true)
 |-- fee: long (nullable = true)
 |-- discount: long (nullable = true)



#### Transform

In [0]:
#Custom_function_1
from pyspark.sql.functions import upper
def to_upper_str_columns(df):
    """Return ``df`` with an extra ``Course_name`` column holding ``CourseName`` in upper case."""
    upper_course = upper(df["CourseName"])
    return df.withColumn("Course_name", upper_course)
#Custom_function_2
def reduce_price(df, reduceBy):
    """Return ``df`` with an extra ``new_fee`` column equal to ``fee`` minus ``reduceBy``."""
    lowered_fee = df["fee"] - reduceBy
    return df.withColumn("new_fee", lowered_fee)

#custom_function_3
def apply_discount(df):
    """Return ``df`` with an extra ``discount_fee`` column.

    The new column is ``new_fee`` minus ``new_fee * discount / 100``.
    """
    base_fee = df["new_fee"]
    rebate = (base_fee * df["discount"]) / 100
    return df.withColumn("discount_fee", base_fee - rebate)


In [0]:
# Chain the three custom functions; the extra argument 1000 is forwarded to
# reduce_price as ``reduceBy``.
df2 = (
    df.transform(to_upper_str_columns)
    .transform(reduce_price, 1000)
    .transform(apply_discount)
)

In [0]:
# Display the result of the chained transformations.
df2.show()

+----------+----+--------+-----------+-------+------------+
|CourseName| fee|discount|Course_name|new_fee|discount_fee|
+----------+----+--------+-----------+-------+------------+
|      Java|4000|       5|       JAVA|   3000|      2850.0|
|    Python|4600|      10|     PYTHON|   3600|      3240.0|
|     Scala|4100|      15|      SCALA|   3100|      2635.0|
|     Scala|4500|      15|      SCALA|   3500|      2975.0|
|       PHP|3000|      20|        PHP|   2000|      1600.0|
+----------+----+--------+-----------+-------+------------+



In [0]:
def sel_col(df):
    """Return ``df`` with all of its own columns selected.

    The column list is taken from the frame passed in, not from the global
    ``df2``, so the function works with any DataFrame handed to it (for
    example through ``DataFrame.transform``).
    """
    return df.select(df.columns)
# Apply sel_col through transform() and display the resulting DataFrame.
df3=df2.transform(sel_col)
df3.show()

+----------+----+--------+-----------+-------+------------+
|CourseName| fee|discount|Course_name|new_fee|discount_fee|
+----------+----+--------+-----------+-------+------------+
|      Java|4000|       5|       JAVA|   3000|      2850.0|
|    Python|4600|      10|     PYTHON|   3600|      3240.0|
|     Scala|4100|      15|      SCALA|   3100|      2635.0|
|     Scala|4500|      15|      SCALA|   3500|      2975.0|
|       PHP|3000|      20|        PHP|   2000|      1600.0|
+----------+----+--------+-----------+-------+------------+



#### Array Type

In [0]:
# Build a DataFrame with one string column and two array-of-string columns.
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"]),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"]),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"]),
]
df = spark.createDataFrame(
    data,
    ["Name", "Languages1", "Languages2"],
)
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Languages1: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Languages2: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
from pyspark.sql.functions import upper
from pyspark.sql.functions import transform
# Upper-case each element of the Languages1 array and show the result.
df.select(
    transform("Languages1", lambda language: upper(language)).alias("languages1")
).show()


+------------------+
|        languages1|
+------------------+
|[JAVA, SCALA, C++]|
|[SPARK, JAVA, C++]|
|      [CSHARP, VB]|
+------------------+



#### Apply()

In [0]:
# Small two-column frame (both columns are strings) used by the apply examples.
columns = ['no', 'name']
data = [
    ('1', 'john jones'),
    ('2', 'tracey smith'),
    ('3', 'amy sanders'),
]
df = spark.createDataFrame(data, columns)
df.show()

+---+------------+
| no|        name|
+---+------------+
|  1|  john jones|
|  2|tracey smith|
|  3| amy sanders|
+---+------------+



##### Applying Function using withColumn()

In [0]:
from pyspark.sql.functions import upper
# Add an upper-cased copy of ``name`` using the built-in upper() function.
df.withColumn("upper_name", upper(df["name"])).show()

+---+------------+------------+
| no|        name|  upper_name|
+---+------------+------------+
|  1|  john jones|  JOHN JONES|
|  2|tracey smith|TRACEY SMITH|
|  3| amy sanders| AMY SANDERS|
+---+------------+------------+



##### Applying with select

In [0]:
# Same result via select(); the unaliased expression keeps its generated name.
df.select("no", "name", upper(df["name"])).show()

+---+------------+------------+
| no|        name| upper(name)|
+---+------------+------------+
|  1|  john jones|  JOHN JONES|
|  2|tracey smith|TRACEY SMITH|
|  3| amy sanders| AMY SANDERS|
+---+------------+------------+



##### With Spark SQL

In [0]:
# Register the DataFrame as the temporary view "tem" so it can be queried with SQL.
df.createOrReplaceTempView("tem")
# Run the same upper-casing through the SQL upper() function.
df_sql=spark.sql("""
                 select no,name,upper(name) from tem
                 """)
df_sql.show()

+---+------------+------------+
| no|        name| upper(name)|
+---+------------+------------+
|  1|  john jones|  JOHN JONES|
|  2|tracey smith|TRACEY SMITH|
|  3| amy sanders| AMY SANDERS|
+---+------------+------------+



##### Create a custom function

In [0]:
def upper_case(str):
    """Return ``str`` converted to upper case, or ``None`` when ``str`` is ``None``.

    The ``None`` guard keeps the function from raising ``AttributeError``
    when it is wrapped as a UDF and a missing value is passed in.

    Note: the parameter name shadows the built-in ``str`` type; it is kept
    unchanged so existing keyword callers continue to work.
    """
    if str is None:
        return None
    return str.upper()

In [0]:
# Convert function to UDF
from pyspark.sql.functions import udf,col
from pyspark.sql.types import StringType
# Wrap upper_case in a lambda and expose it as a string-returning Spark UDF.
upperCaseUDF = udf(
    f=lambda value: upper_case(value),
    returnType=StringType(),
)

In [0]:
# Use the custom UDF inside select(), aliasing the computed column.
df.select(
    "no",
    "name",
    upperCaseUDF(col("name")).alias("naameUpper"),
).show()

+---+------------+------------+
| no|        name|  naameUpper|
+---+------------+------------+
|  1|  john jones|  JOHN JONES|
|  2|tracey smith|TRACEY SMITH|
|  3| amy sanders| AMY SANDERS|
+---+------------+------------+



In [0]:
# Use the custom UDF with withColumn() to append an "upper" column.
df.withColumn(
    "upper",
    upperCaseUDF(col("name")),
).show()

+---+------------+------------+
| no|        name|       upper|
+---+------------+------------+
|  1|  john jones|  JOHN JONES|
|  2|tracey smith|TRACEY SMITH|
|  3| amy sanders| AMY SANDERS|
+---+------------+------------+



In [0]:
# Custom function with Spark SQL:
# register the UDF under the name "upperCaseUDF" so SQL queries can call it.
spark.udf.register("upperCaseUDF",upperCaseUDF)
# Expose the DataFrame as the temporary view "udfTable" for the query below.
df.createOrReplaceTempView('udfTable')
spark.sql("""
          SELECT no, name, upperCaseUDF(Name) as upperCase from udfTable
          """).show()

+---+------------+------------+
| no|        name|   upperCase|
+---+------------+------------+
|  1|  john jones|  JOHN JONES|
|  2|tracey smith|TRACEY SMITH|
|  3| amy sanders| AMY SANDERS|
+---+------------+------------+



#### PySpark pandas apply()

In [0]:
import pyspark.pandas as pd
import numpy as np

# Sample data with one missing fee. ``np.nan`` is used instead of the
# ``np.NaN`` alias, which was removed in NumPy 2.0.
technologies = {
    'Fee': [20000, 40000, 25000, 22000, np.nan],
    'Discount': [1000, 2500, 1500, 1200, 3000],
}

# NOTE: ``pd`` here is the ``pyspark.pandas`` alias imported in this cell,
# not the plain pandas library.
psdf = pd.DataFrame(technologies)
print(psdf)

       Fee  Discount
0  20000.0      1000
1  40000.0      2500
2  25000.0      1500
3  22000.0      1200
4      NaN      3000


In [0]:
def add(data):
    """Return the sum of the first two items of ``data``, taken by position.

    ``data`` may be a pandas Series (such as a row passed by
    ``DataFrame.apply(..., axis=1)``) or a plain sequence. For objects that
    expose ``.iloc`` the lookup goes through it, because integer keys on a
    label-indexed Series are only treated as positions through a deprecated
    fallback in ``Series.__getitem__``.
    """
    # Fall back to the object itself for lists, tuples and other sequences.
    values = getattr(data, "iloc", data)
    return values[0] + values[1]

In [0]:
# Apply add() along axis=1, i.e. once per row of psdf.
addDF=psdf.apply(add,axis=1)

In [0]:
# Print the per-row sums.
print(addDF)

0    21000.0
1    42500.0
2    26500.0
3    23200.0
4        NaN
dtype: float64
