# When / Otherwise

- This requirement is similar to the last, but now you want to add multiple values based on the voter's position. Modify your `voter_df` DataFrame to add a random number to any voting member that is defined as a `Councilmember`. Use `2` for the `Mayor` and `0` for anything other position.

- The `voter_df` Data Frame is defined and available to you. The `pyspark.sql.functions` library is available as `F`. You can use `F.rand()` to generate the random value.

## Instructions

- Add a column to `voter_df` named `random_val` with the results of the `F.rand()` method for any voter with the title `Councilmember`. Set `random_val` to `2` for the `Mayor`. Set any other title to the value `0`.
- Show some of the Data Frame rows, noting whether the clauses worked.
- Use the `.filter` clause to find 0 in `random_val`.

In [1]:
# Intialization
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell' 
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [2]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [4]:
import pyspark.sql.functions as F

# Load the CSV file
voter_df = spark.read.format('csv').options(Header=True).load('file:///home/talentum/test-jupyter/P2/M1/SM15/Dataset/DallasCouncilVoters.csv')

# Add a column to voter_df for a voter based on their position
voter_df = voter_df.withColumn('random_val',
                               F.when(voter_df.TITLE == 'Councilmember', 1)
                               .otherwise(2)
                               )

# Show some of the DataFrame rows
voter_df.show()

+----------+-------------+-------------------+----------+
|      DATE|        TITLE|         VOTER_NAME|random_val|
+----------+-------------+-------------------+----------+
|02/08/2017|Councilmember|  Jennifer S. Gates|         1|
|02/08/2017|Councilmember| Philip T. Kingston|         1|
|02/08/2017|        Mayor|Michael S. Rawlings|         2|
|02/08/2017|Councilmember|       Adam Medrano|         1|
|02/08/2017|Councilmember|       Casey Thomas|         1|
|02/08/2017|Councilmember|Carolyn King Arnold|         1|
|02/08/2017|Councilmember|       Scott Griggs|         1|
|02/08/2017|Councilmember|   B. Adam  McGough|         1|
|02/08/2017|Councilmember|       Lee Kleinman|         1|
|02/08/2017|Councilmember|      Sandy Greyson|         1|
|02/08/2017|Councilmember|  Jennifer S. Gates|         1|
|02/08/2017|Councilmember| Philip T. Kingston|         1|
|02/08/2017|        Mayor|Michael S. Rawlings|         2|
|02/08/2017|Councilmember|       Adam Medrano|         1|
|02/08/2017|Co

In [5]:
# Use the .filter() clause with random_val
voter_df.filter(voter_df.random_val == 1).show()

+----------+-------------+-------------------+----------+
|      DATE|        TITLE|         VOTER_NAME|random_val|
+----------+-------------+-------------------+----------+
|02/08/2017|Councilmember|  Jennifer S. Gates|         1|
|02/08/2017|Councilmember| Philip T. Kingston|         1|
|02/08/2017|Councilmember|       Adam Medrano|         1|
|02/08/2017|Councilmember|       Casey Thomas|         1|
|02/08/2017|Councilmember|Carolyn King Arnold|         1|
|02/08/2017|Councilmember|       Scott Griggs|         1|
|02/08/2017|Councilmember|   B. Adam  McGough|         1|
|02/08/2017|Councilmember|       Lee Kleinman|         1|
|02/08/2017|Councilmember|      Sandy Greyson|         1|
|02/08/2017|Councilmember|  Jennifer S. Gates|         1|
|02/08/2017|Councilmember| Philip T. Kingston|         1|
|02/08/2017|Councilmember|       Adam Medrano|         1|
|02/08/2017|Councilmember|       Casey Thomas|         1|
|02/08/2017|Councilmember|Carolyn King Arnold|         1|
|02/08/2017|Co

In [14]:
import pyspark.sql.functions as F

# Load the CSV file with correct options
voter_df = spark.read.format('csv').options(header=True).load('file:///home/talentum/test-jupyter/P2/M1/SM15/Dataset/DallasCouncilVoters.csv')

# Correcting the column name in options() from 'Header' to 'header'
# Add a column based on the voter's position
voter_df = voter_df.withColumn('random_val',
                               F.when(F.col('TITLE') == 'Councilmember', 1)
                               .otherwise(2)
                              )

# Splitting full name into first, second (middle), and last name
voter_df = voter_df.withColumn('LAST_NAME', F.split(F.col('VOTER_NAME'), ' ')[3])


voter_df = voter_df.withColumn('FIRST_NAME', F.split(F.col('VOTER_NAME'), ' ')[0])

# Handling middle name (if present)
voter_df = voter_df.withColumn('SECOND_NAME', 
                               F.when(F.size(F.split(F.col('VOTER_NAME'), ' ')) > 2,
                                      F.split(F.col('VOTER_NAME'), ' ')[1]
                                     ).otherwise(F.lit('None')))


# Show the DataFrame with separate columns and the added 'random_val' column
voter_df.show(truncate=False)

+----------+-------------+-------------------+----------+---------+----------+-----------+
|DATE      |TITLE        |VOTER_NAME         |random_val|LAST_NAME|FIRST_NAME|SECOND_NAME|
+----------+-------------+-------------------+----------+---------+----------+-----------+
|02/08/2017|Councilmember|Jennifer S. Gates  |1         |null     |Jennifer  |S.         |
|02/08/2017|Councilmember|Philip T. Kingston |1         |null     |Philip    |T.         |
|02/08/2017|Mayor        |Michael S. Rawlings|2         |null     |Michael   |S.         |
|02/08/2017|Councilmember|Adam Medrano       |1         |null     |Adam      |None       |
|02/08/2017|Councilmember|Casey Thomas       |1         |null     |Casey     |None       |
|02/08/2017|Councilmember|Carolyn King Arnold|1         |null     |Carolyn   |King       |
|02/08/2017|Councilmember|Scott Griggs       |1         |null     |Scott     |None       |
|02/08/2017|Councilmember|B. Adam  McGough   |1         |McGough  |B.        |Adam       |

In [15]:
# Correcting the column name in options() from 'Header' to 'header'
# Add a column based on the voter's position
voter_df = voter_df.withColumn('random_val',
                               F.when(F.col('TITLE') == 'Councilmember', 1)
                               .otherwise(2)
                              )

# Splitting full name into first, second (middle), and last name
voter_df = voter_df.withColumn('FIRST_NAME', F.split(F.col('VOTER_NAME'), ' ')[0])

# Handling middle name (if present)
voter_df = voter_df.withColumn('SECOND_NAME', 
                               F.when(F.size(F.split(F.col('VOTER_NAME'), ' ')) > 2,
                                      F.array_join(F.slice(F.split(F.col('VOTER_NAME'), ' '), 1, -1), ' ')
                                     ).otherwise(F.lit('None')))

# Handling last name
voter_df = voter_df.withColumn('LAST_NAME', 
                               F.array_join(F.slice(F.split(F.col('VOTER_NAME'), ' '), -1, 1), ' '))

# Show the DataFrame with separate columns and the added 'random_val' column
voter_df.show(truncate=False)

Py4JJavaError: An error occurred while calling o343.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 24.0 failed 1 times, most recent failure: Lost task 0.0 in stage 24.0 (TID 24, localhost, executor driver): java.lang.RuntimeException: Unexpected value for length in function slice: length must be greater than or equal to 0.
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.RuntimeException: Unexpected value for length in function slice: length must be greater than or equal to 0.
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
