In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
# Mount Google Drive inside the Colab VM at /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive")
# Switch the working directory to the data folder so later cells can open the
# CSV by its bare file name.
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_2/data_day_4'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_2/data_day_4


In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sb
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime

In [4]:
# Build (or reuse) the Spark session. getOrCreate() keeps this cell safe to re-run,
# whereas constructing SparkContext directly fails once a context is already active.
spark = (
    SparkSession.builder
    .master("local")
    .appName("New Spark Context")
    .getOrCreate()
)
# Keep the `sc` handle available for cells that use the SparkContext directly.
sc = spark.sparkContext

In [58]:
# Load the council voting records; the first CSV row holds the column names and
# Spark is asked to infer the column types.
df = spark.read.csv(
    path="DallasCouncilVoters.csv",
    inferSchema=True,
    header=True,
)
df.show(5)

+----------+-------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|
+----------+-------------+-------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|
|02/08/2017|Councilmember| Philip T. Kingston|
|02/08/2017|        Mayor|Michael S. Rawlings|
|02/08/2017|Councilmember|       Adam Medrano|
|02/08/2017|Councilmember|       Casey Thomas|
+----------+-------------+-------------------+
only showing top 5 rows



In [59]:
# Row count of the freshly loaded frame, before any cleaning.
df.count()

44625

In [60]:
# Inspect the inferred column types.
df.printSchema()

root
 |-- DATE: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- VOTER_NAME: string (nullable = true)



In [61]:
# Sort the still-textual DATE column in descending order to see what sorts to the top.
df.orderBy(df['DATE'].desc()).show(5)

+-----------+-----+----------+
|       DATE|TITLE|VOTER_NAME|
+-----------+-----+----------+
|[VICE CHAIR| null|      null|
|[VICE CHAIR| null|      null|
|[VICE CHAIR| null|      null|
|[VICE CHAIR| null|      null|
|[VICE CHAIR| null|      null|
+-----------+-----+----------+
only showing top 5 rows



In [65]:
def convert_datetime(x):
    """Parse an ``MM/DD/YYYY`` string into a ``datetime``.

    Values that are already ``date``/``datetime`` objects are returned unchanged,
    so applying the conversion to an already-converted DATE column (for example
    when the cell is re-run) does not turn every value into ``None``.

    Args:
        x: Raw cell value; normally a string such as ``'02/08/2017'``, possibly
            ``None`` or text that is not a date.

    Returns:
        The parsed ``datetime``, the input itself when it is already a date
        object, or ``None`` when the value cannot be parsed.
    """
    # Local import: the notebook's import cell only brings in the datetime class.
    from datetime import date

    if isinstance(x, date):  # also matches datetime, which subclasses date
        return x
    try:
        return datetime.strptime(x, '%m/%d/%Y')
    except (TypeError, ValueError):
        # TypeError: x is None or not a string; ValueError: text does not match the format.
        return None
# Wrap the tolerant parser as a Spark UDF producing DateType, then overwrite the
# DATE column with the parsed values (unparseable rows become null).
func = udf(convert_datetime, DateType())
df = df.withColumn('DATE', func(df['DATE']))
# Most recent dates first.
df.orderBy(df['DATE'].desc()).show(5)

+----------+-------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|
+----------+-------------+-------------------+
|2018-11-20|Councilmember|      Scott  Griggs|
|2018-11-20|Councilmember|Philip T.  Kingston|
|2018-11-20|Councilmember|   B. Adam  McGough|
|2018-11-20|Councilmember|    Lee M. Kleinman|
|2018-11-20|Councilmember|     Sandy  Greyson|
+----------+-------------+-------------------+
only showing top 5 rows



In [66]:
# Check the column types after the DATE conversion.
df.printSchema()

root
 |-- DATE: date (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- VOTER_NAME: string (nullable = true)



In [67]:
# Report, column by column, how many rows hold a null.
print('Number of Null values')
for col_name in df.columns:
    is_missing = df[col_name].isNull()
    print('\t -', col_name, ": ", df.where(is_missing).count())

Number of Null values
	 - DATE :  405
	 - TITLE :  195
	 - VOTER_NAME :  503


In [68]:
# Discard rows missing either the voter name or the date; with the default
# how='any' a null in any listed column removes the row.
df = df.dropna(subset=['VOTER_NAME', 'DATE'])
df.count()

43912

In [69]:
# Re-check the per-column null counts after dropping incomplete rows.
# The loop variable is named col_name rather than col so that it does not
# overwrite the `col` function pulled in by `from pyspark.sql.functions import *`.
print('Number of Null values')
for col_name in df.columns:
    print('\t -', col_name, ": ", df.where(df[col_name].isNull()).count())

Number of Null values
	 - DATE :  0
	 - TITLE :  0
	 - VOTER_NAME :  0


In [70]:
# Duplicates = total rows minus the number of distinct rows.
total_rows = df.count()
distinct_rows = df.distinct().count()
num_dup_rows = total_rows - distinct_rows
num_dup_rows

42653

In [71]:
# Keep a single copy of every fully identical row.
df = df.distinct()
df.count()

1259

In [72]:
# Look at a sample of the distinct voter-name spellings.
df.select('VOTER_NAME').distinct().show(10)

+-------------------+
|         VOTER_NAME|
+-------------------+
|     Tennell Atkins|
|       Scott Griggs|
|      Scott  Griggs|
|      Sandy Greyson|
|Michael S. Rawlings|
|       Kevin Felder|
|       Adam Medrano|
|      Casey  Thomas|
|      Mark  Clayton|
|       Casey Thomas|
+-------------------+
only showing top 10 rows



In [73]:
# Keep only plausible names: non-empty, shorter than 20 characters, and free of
# underscores.
df = (
    df.filter('length(VOTER_NAME) > 0 and length(VOTER_NAME) <20')
      .filter(~df['VOTER_NAME'].contains('_'))
)
df.count()

1259

In [74]:
# Break each name on runs of whitespace so that double spaces ("Scott  Griggs")
# do not produce empty parts. The pattern is a raw string because '\s' in a
# plain literal is an invalid escape sequence that Python warns about.
df = df.withColumn('splits', split(df.VOTER_NAME, r'\s+'))
df = df.withColumn('first_name', df.splits.getItem(0))
# The surname is the last array element, however many parts the name has.
df = df.withColumn('last_name', df.splits.getItem(size('splits') - 1))
df.show(5)

+----------+--------------------+-------------------+--------------------+----------+---------+
|      DATE|               TITLE|         VOTER_NAME|              splits|first_name|last_name|
+----------+--------------------+-------------------+--------------------+----------+---------+
|2017-02-08|       Councilmember|       Scott Griggs|     [Scott, Griggs]|     Scott|   Griggs|
|2017-12-06|               Mayor|Michael S. Rawlings|[Michael, S., Raw...|   Michael| Rawlings|
|2017-01-18|       Councilmember|       Mark Clayton|     [Mark, Clayton]|      Mark|  Clayton|
|2016-10-26|Deputy Mayor Pro Tem|        Erik Wilson|      [Erik, Wilson]|      Erik|   Wilson|
|2018-05-09|       Councilmember|      Scott  Griggs|     [Scott, Griggs]|     Scott|   Griggs|
+----------+--------------------+-------------------+--------------------+----------+---------+
only showing top 5 rows



In [50]:
# df = df.drop('splits')

In [75]:
# Derive random_val from TITLE: council members get a random number, the mayor
# gets 2, everyone else 0. rand() is seeded so that re-running the notebook
# produces the same values.
df = df.withColumn('random_val', when(df.TITLE == 'Councilmember', rand(seed=42))
                                .when(df.TITLE == 'Mayor', 2)
                                .otherwise(0)
                                )
df.show(3)

+----------+-------------+-------------------+--------------------+----------+---------+------------------+
|      DATE|        TITLE|         VOTER_NAME|              splits|first_name|last_name|        random_val|
+----------+-------------+-------------------+--------------------+----------+---------+------------------+
|2017-02-08|Councilmember|       Scott Griggs|     [Scott, Griggs]|     Scott|   Griggs|0.3674881883233443|
|2017-12-06|        Mayor|Michael S. Rawlings|[Michael, S., Raw...|   Michael| Rawlings|               2.0|
|2017-01-18|Councilmember|       Mark Clayton|     [Mark, Clayton]|      Mark|  Clayton|0.9857938909892774|
+----------+-------------+-------------------+--------------------+----------+---------+------------------+
only showing top 3 rows



In [76]:
# Rows that fell through to the otherwise(0) branch.
df.where(df['random_val'] == 0).show(5)

+----------+--------------------+------------+---------------+----------+---------+----------+
|      DATE|               TITLE|  VOTER_NAME|         splits|first_name|last_name|random_val|
+----------+--------------------+------------+---------------+----------+---------+----------+
|2016-10-26|Deputy Mayor Pro Tem| Erik Wilson| [Erik, Wilson]|      Erik|   Wilson|       0.0|
|2018-02-14|Deputy Mayor Pro Tem|Adam Medrano|[Adam, Medrano]|      Adam|  Medrano|       0.0|
|2018-03-21|Deputy Mayor Pro Tem|Adam Medrano|[Adam, Medrano]|      Adam|  Medrano|       0.0|
|2018-11-14|Deputy Mayor Pro Tem|Adam Medrano|[Adam, Medrano]|      Adam|  Medrano|       0.0|
|2018-10-17|Deputy Mayor Pro Tem|Adam Medrano|[Adam, Medrano]|      Adam|  Medrano|       0.0|
+----------+--------------------+------------+---------------+----------+---------+----------+
only showing top 5 rows



In [77]:
def getFirstAndMiddle(names):
    """Join every name part except the last with single spaces.

    Args:
        names: Sequence of name parts, e.g. ``['Michael', 'S.', 'Rawlings']``,
            or ``None`` when the source value is null.

    Returns:
        The leading parts joined by a space (``'Michael S.'``), an empty string
        when there are fewer than two parts, or ``None`` for a ``None`` input.
    """
    # A null array arrives as None inside a UDF; pass the null through instead
    # of failing on the slice.
    if names is None:
        return None
    return ' '.join(names[:-1])

# Expose the helper as a string-returning Spark UDF.
func_get_first_middle = udf(getFirstAndMiddle, StringType())

# Add the combined first/middle name, then discard the helper columns that are
# no longer needed.
df = (
    df.withColumn('first_and_middle_name', func_get_first_middle(df['splits']))
      .drop('splits', 'first_name')
)
df.show(5)

+----------+--------------------+-------------------+---------+------------------+---------------------+
|      DATE|               TITLE|         VOTER_NAME|last_name|        random_val|first_and_middle_name|
+----------+--------------------+-------------------+---------+------------------+---------------------+
|2017-02-08|       Councilmember|       Scott Griggs|   Griggs|0.3674881883233443|                Scott|
|2017-12-06|               Mayor|Michael S. Rawlings| Rawlings|               2.0|           Michael S.|
|2017-01-18|       Councilmember|       Mark Clayton|  Clayton|0.9857938909892774|                 Mark|
|2016-10-26|Deputy Mayor Pro Tem|        Erik Wilson|   Wilson|               0.0|                 Erik|
|2018-05-09|       Councilmember|      Scott  Griggs|   Griggs|0.6599395082952412|                Scott|
+----------+--------------------+-------------------+---------+------------------+---------------------+
only showing top 5 rows



In [78]:
# Order chronologically, then tag each row with an increasing identifier.
# NOTE: monotonically_increasing_id() yields increasing, unique values, but they
# are not guaranteed to be consecutive across partitions.
df = (
    df.sort(df['DATE'].asc())
      .withColumn('ROW_ID', monotonically_increasing_id())
)
df.show(5)

+----------+--------------------+------------------+---------+-------------------+---------------------+------+
|      DATE|               TITLE|        VOTER_NAME|last_name|         random_val|first_and_middle_name|ROW_ID|
+----------+--------------------+------------------+---------+-------------------+---------------------+------+
|2016-09-07|       Councilmember|Philip T. Kingston| Kingston|0.42551775695177896|            Philip T.|     0|
|2016-09-07|       Councilmember|      Casey Thomas|   Thomas| 0.8253233380321066|                Casey|     1|
|2016-09-07|Deputy Mayor Pro Tem|       Erik Wilson|   Wilson|                0.0|                 Erik|     2|
|2016-09-07|       Councilmember|      Mark Clayton|  Clayton|0.18587961872839343|                 Mark|     3|
|2016-09-07|       Mayor Pro Tem|  Monica R. Alonzo|   Alonzo|                0.0|            Monica R.|     4|
+----------+--------------------+------------------+---------+-------------------+---------------------+

In [81]:
# Number of partitions backing the DataFrame's underlying RDD.
df.rdd.getNumPartitions()

85