In [5]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

# Column names, listed in the same order as the fields of each employee tuple.
employee_schema = ["employee_id", "Name", "doj", "employee_dept_id", "salary"]

# One tuple per employee; doj and employee_dept_id are given as strings.
employee_data = [
    (10, "Michael Robinson", "1999-06-01", "100", 2000),
    (20, "James Wood", "2003-03-01", "200", 8000),
    (30, "Chris Andrews", "2005-04-01", "100", 6000),
    (40, "Mark Bond", "2008-10-01", "100", 7000),
    (50, "Steve Watson", "1996-02-01", "400", 1000),
    (60, "Mathews Simon", "1998-11-01", "500", 5000),
    (70, "Peter Paul", "2011-04-01", "600", 5000),
]

# Build the employee DataFrame from the rows and column names above.
empDF = spark.createDataFrame(employee_data, employee_schema)

empDF.show()

+-----------+----------------+----------+----------------+------+
|employee_id|            Name|       doj|employee_dept_id|salary|
+-----------+----------------+----------+----------------+------+
|         10|Michael Robinson|1999-06-01|             100|  2000|
|         20|      James Wood|2003-03-01|             200|  8000|
|         30|   Chris Andrews|2005-04-01|             100|  6000|
|         40|       Mark Bond|2008-10-01|             100|  7000|
|         50|    Steve Watson|1996-02-01|             400|  1000|
|         60|   Mathews Simon|1998-11-01|             500|  5000|
|         70|      Peter Paul|2011-04-01|             600|  5000|
+-----------+----------------+----------+----------------+------+



## Define a helper function to rename columns

In [6]:
import pyspark.sql.functions as f

def rename_columns(rename_df, prefix="Col_"):
    """Return a DataFrame whose column names all carry ``prefix`` in front.

    Args:
        rename_df: DataFrame whose columns should be renamed. It is not
            modified; a new DataFrame is returned.
        prefix: String prepended to every column name. Defaults to
            ``"Col_"``.

    Returns:
        A DataFrame with the same columns in the same order, each named
        ``prefix + <original name>``.
    """
    prefixed_names = [prefix + column for column in rename_df.columns]
    # toDF assigns the new names by position in a single step. Renaming one
    # column at a time by name adds a projection per column and can rename the
    # wrong columns when a prefixed name collides with an existing one
    # (e.g. columns "a" and "Col_a").
    return rename_df.toDF(*prefixed_names)

## Execute the helper function

In [7]:
# Apply rename_columns to the employee DataFrame and print the result.
rename_df = rename_columns(empDF)
rename_df.show()

+---------------+----------------+----------+--------------------+----------+
|Col_employee_id|        Col_Name|   Col_doj|Col_employee_dept_id|Col_salary|
+---------------+----------------+----------+--------------------+----------+
|             10|Michael Robinson|1999-06-01|                 100|      2000|
|             20|      James Wood|2003-03-01|                 200|      8000|
|             30|   Chris Andrews|2005-04-01|                 100|      6000|
|             40|       Mark Bond|2008-10-01|                 100|      7000|
|             50|    Steve Watson|1996-02-01|                 400|      1000|
|             60|   Mathews Simon|1998-11-01|                 500|      5000|
|             70|      Peter Paul|2011-04-01|                 600|      5000|
+---------------+----------------+----------+--------------------+----------+



## Helper function to convert names to upper case

In [8]:
from pyspark.sql.functions import col,upper

def upper_case(df):
    """Return ``df`` with its ``Name`` column converted to upper case.

    ``withColumn`` is called with the existing column name ``"Name"``, so the
    returned DataFrame carries the upper-cased values under that same name.
    """
    name_in_caps = upper(df.Name)
    return df.withColumn("Name", name_in_caps)

In [9]:
# Apply upper_case to the employee DataFrame and print the result.
upper_df = upper_case(empDF)
upper_df.show()

+-----------+----------------+----------+----------------+------+
|employee_id|            Name|       doj|employee_dept_id|salary|
+-----------+----------------+----------+----------------+------+
|         10|MICHAEL ROBINSON|1999-06-01|             100|  2000|
|         20|      JAMES WOOD|2003-03-01|             200|  8000|
|         30|   CHRIS ANDREWS|2005-04-01|             100|  6000|
|         40|       MARK BOND|2008-10-01|             100|  7000|
|         50|    STEVE WATSON|1996-02-01|             400|  1000|
|         60|   MATHEWS SIMON|1998-11-01|             500|  5000|
|         70|      PETER PAUL|2011-04-01|             600|  5000|
+-----------+----------------+----------+----------------+------+

