In [15]:
# Spark web UI for a locally running session (4040 is Spark's default UI port):
# http://localhost:4040/

# Packages
import os
import sys

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    lit,
    rank,
    regexp_replace,
    row_number,
    split,
    when,
)
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)
from pyspark.sql.window import Window

# Point both the Spark driver and the Python workers at the interpreter that
# runs this kernel. These assignments must come after `os` and `sys` are
# imported (otherwise a fresh kernel raises NameError) and before the session
# is created below.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


# Reuse the active session if there is one; otherwise start a new one.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()


# when(): sample rows for the conditional-column examples.
# The rows include a None gender, an empty-string gender and a None salary.
data = [
    ("James", "M", 60000),
    ("Michael", "M", 70000),
    ("Robert", None, 400000),
    ("Maria", "F", 500000),
    ("Jen", "", None),
]
columns = ["name", "gender", "salary"]

# Column types are inferred from the rows; only the names are supplied.
df_when = spark.createDataFrame(data, columns)
df_when.show()


# split(): sample rows whose `dob` is a 'YYYY-MM-DD' string.
# Missing middle/last names are represented by empty strings.
data = [
    ('James', '', 'Smith', '1991-04-01'),
    ('Michael', 'Rose', '', '2000-05-19'),
    ('Robert', '', 'Williams', '1978-09-05'),
    ('Maria', 'Anne', 'Jones', '1967-12-01'),
    ('Jen', 'Mary', 'Brown', '1980-02-17'),
]
columns = ["firstname", "middlename", "lastname", "dob"]

# Column types are inferred from the rows; only the names are supplied.
df_split = spark.createDataFrame(data, columns)
df_split.show()


# df_filtering: each row is (name tuple, list of languages, state, gender, gender2).
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M", "Male"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F", "Female"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F", "Male"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M", "Female"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M", "Male"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M", "Female"),
]

# Nested struct used for the `name` column: three nullable string fields.
name_struct = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])

# Explicit schema so that `name` becomes a struct and `languages` an array of
# strings rather than whatever inference would pick.
schema = StructType([
    StructField('name', name_struct),
    StructField('languages', ArrayType(StringType()), True),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('gender2', StringType(), True),
])

df_filtering = spark.createDataFrame(data, schema)
df_filtering.show()




+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  null|400000|
|  Maria|     F|500000|
|    Jen|      |  null|
+-------+------+------+

+---------+----------+--------+----------+
|firstname|middlename|lastname|       dob|
+---------+----------+--------+----------+
|    James|          |   Smith|1991-04-01|
|  Michael|      Rose|        |2000-05-19|
|   Robert|          |Williams|1978-09-05|
|    Maria|      Anne|   Jones|1967-12-01|
|      Jen|      Mary|   Brown|1980-02-17|
+---------+----------+--------+----------+

+--------------------+------------------+-----+------+-------+
|                name|         languages|state|gender|gender2|
+--------------------+------------------+-----+------+-------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|   Male|
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F| Female|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|   Male|
|{Maria, Anne, Jo

In [17]:
######################################################
# By Yourself
######################################################
# Collects the cleaned DataFrames, in the same order as DATASETS.
TRANSFORMS = list()

# Source DataFrames that will be passed through cleaning().
DATASETS = [
    df_when,
    df_split,
    df_filtering,
]

def cleaning(df):
    """Apply the table-specific cleanup for one of the notebook's sample DataFrames.

    The table is recognised by object identity against the module-level
    ``df_when``, ``df_split`` and ``df_filtering`` frames, so only those exact
    objects are transformed:

    * ``df_when``      -- drop ``salary`` and upper-case ``name``.
    * ``df_split``     -- drop ``dob`` and upper-case ``lastname``.
    * ``df_filtering`` -- drop ``gender2``.

    Parameters
    ----------
    df : the DataFrame to clean.

    Returns
    -------
    The transformed DataFrame, or ``df`` itself (unchanged) when it is none of
    the three known frames.
    """
    # `is` states the identity check explicitly. `==` only worked here because
    # these objects presumably define no custom __eq__ and so compare by identity.
    if df is df_when:
        return df.drop("salary").withColumn("name", F.upper(col("name")))
    if df is df_split:
        return df.drop("dob").withColumn("lastname", F.upper(col("lastname")))
    if df is df_filtering:
        return df.drop("gender2")
    return df

# Clean every source DataFrame and collect the results in DATASETS order.
TRANSFORMS.extend(cleaning(dataset) for dataset in DATASETS)


# Preview the first cleaned table.
TRANSFORMS[0].show()

+-------+------+
|   name|gender|
+-------+------+
|  JAMES|     M|
|MICHAEL|     M|
| ROBERT|  null|
|  MARIA|     F|
|    JEN|      |
+-------+------+



In [20]:
######################################################
# From Stackexchange (WORKING)
######################################################
# Collects the transformed DataFrames, in DATASETS insertion order.
TRANSFORMS = list()

# Source DataFrames keyed by the label that multi_output() dispatches on.
DATASETS = dict(
    ONE=df_when,
    TWO=df_split,
    THREE=df_filtering,
)

def multi_output(Input_table, table_name):
    """Drop the column associated with ``table_name`` from ``Input_table``.

    Parameters
    ----------
    Input_table : object exposing ``drop(column_name)`` (a DataFrame here).
    table_name : label selecting the rule: ``"ONE"`` drops ``name``,
        ``"TWO"`` drops ``gender`` and ``"THREE"`` drops ``salary``.

    Returns
    -------
    The result of ``Input_table.drop(...)`` for a known label, otherwise
    ``Input_table`` itself, untouched.
    """
    # NOTE(review): in this notebook the tables passed as "TWO" and "THREE" do
    # not appear to have a `gender` / `salary` column, and their displayed
    # output is unchanged -- confirm these are the intended columns.
    drop_rules = (
        ("ONE", "name"),
        ("TWO", "gender"),
        ("THREE", "salary"),
    )
    for label, column in drop_rules:
        if table_name == label:
            return Input_table.drop(column)
    return Input_table
      
# Transform each registered table according to its label, keeping DATASETS order.
for table_name, table in DATASETS.items():
    TRANSFORMS.append(multi_output(table, table_name))

# Sanity check: exactly one output per input table. A bare `len(TRANSFORMS)`
# in the middle of a cell is evaluated and discarded, so it displayed nothing.
assert len(TRANSFORMS) == len(DATASETS), "expected one transformed table per dataset"

# Display every transformed table in order.
for transformed in TRANSFORMS:
    transformed.show()

+------+------+
|gender|salary|
+------+------+
|     M| 60000|
|     M| 70000|
|  null|400000|
|     F|500000|
|      |  null|
+------+------+

+---------+----------+--------+----------+
|firstname|middlename|lastname|       dob|
+---------+----------+--------+----------+
|    James|          |   Smith|1991-04-01|
|  Michael|      Rose|        |2000-05-19|
|   Robert|          |Williams|1978-09-05|
|    Maria|      Anne|   Jones|1967-12-01|
|      Jen|      Mary|   Brown|1980-02-17|
+---------+----------+--------+----------+

+--------------------+------------------+-----+------+-------+
|                name|         languages|state|gender|gender2|
+--------------------+------------------+-----+------+-------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|   Male|
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F| Female|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|   Male|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M| Female|
|  {Jen, Mary, Brown}|    