In [8]:
import os
import sys

os.environ["PYSPARK_PYTHON"] = sys.executable

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName('SparkHelloWorld').getOrCreate()

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", -1),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", None, "F", 3000)

]

schema = StructType([
    StructField("first name", StringType(), True), \
    StructField("middle name", StringType(), True), \
    StructField("last name", StringType(), True), \
    StructField("id", StringType(), True), \
    StructField("sex", StringType(), True), \
    StructField("salary", IntegerType(), True) \
])

df = spark.createDataFrame(data=data, schema=schema)
df.show()

+----------+-----------+---------+-----+---+------+
|first name|middle name|last name|   id|sex|salary|
+----------+-----------+---------+-----+---+------+
|     James|           |    Smith|36636|  M|  3000|
|   Michael|       Rose|         |40288|  M|    -1|
|    Robert|           | Williams|42114|  M|  4000|
|     Maria|       Anne|    Jones|39192|  F|  4000|
|       Jen|       Mary|    Brown| null|  F|  3000|
+----------+-----------+---------+-----+---+------+



In [10]:
from pyspark.sql.functions import col

df_filtered = df.where(
    (col("id").isNotNull()) & (col("salary") > 0)
)

df_filtered.show()

+----------+-----------+---------+-----+---+------+
|first name|middle name|last name|   id|sex|salary|
+----------+-----------+---------+-----+---+------+
|     James|           |    Smith|36636|  M|  3000|
|    Robert|           | Williams|42114|  M|  4000|
|     Maria|       Anne|    Jones|39192|  F|  4000|
+----------+-----------+---------+-----+---+------+



In [13]:
from pyspark.sql.functions import concat, lit

df_full_name = df_filtered.withColumn(
    "full name",
    concat(col("first name"), lit(" "), col("middle name"), lit(" "), col("last name"))
)

df_full_name.show()

+----------+-----------+---------+-----+---+------+----------------+
|first name|middle name|last name|   id|sex|salary|       full name|
+----------+-----------+---------+-----+---+------+----------------+
|     James|           |    Smith|36636|  M|  3000|    James  Smith|
|    Robert|           | Williams|42114|  M|  4000|Robert  Williams|
|     Maria|       Anne|    Jones|39192|  F|  4000|Maria Anne Jones|
+----------+-----------+---------+-----+---+------+----------------+



In [32]:
df_filtered.createOrReplaceTempView("df")

df_full_name3 = spark.sql("""
    SELECT *,
    CASE
        WHEN `middle name` = '' THEN concat(`first name`, " ", `last name`)
        ELSE concat(`first name`, " ", `middle name`, " ", `last name`)
    END AS `full name`
    FROM df
""")
df_full_name3.show()

+----------+-----------+---------+-----+---+------+----------------+
|first name|middle name|last name|   id|sex|salary|       full name|
+----------+-----------+---------+-----+---+------+----------------+
|     James|           |    Smith|36636|  M|  3000|     James Smith|
|    Robert|           | Williams|42114|  M|  4000| Robert Williams|
|     Maria|       Anne|    Jones|39192|  F|  4000|Maria Anne Jones|
+----------+-----------+---------+-----+---+------+----------------+



In [40]:
print = hello world =]

SyntaxError: unmatched ']' (2026580970.py, line 1)