# acquisition

In [1]:
import pyspark
from pyspark.sql.types import *

import pandas as pd
from pyspark.sql import SparkSession

In [2]:
# Reuse the running Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [3]:
# No header option is set, so Spark assigns default column names (_c0, _c1)
# and reads every column as a string.
spark.read.csv('./sa311/source.csv')

DataFrame[_c0: string, _c1: string]

In [4]:
# Same read spelled with the generic format(...).load(...) form; the result has
# the same default-named string columns as spark.read.csv(...).
spark.read.format('csv').load('./sa311/source.csv')

DataFrame[_c0: string, _c1: string]

Or, using the first line as column names and letting Spark infer the column types:

In [5]:
# Use the first line as column names and ask Spark to infer the column types.
(
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load('./sa311/source.csv')
)

DataFrame[source_id: string, source_username: string]

`inferSchema` above tells Spark to guess each column's data type from the data. Alternatively, the schema can be defined explicitly:

In [6]:
from pyspark.sql.types import StructType, StructField, StringType

# Explicit schema for source.csv: both columns are declared as (nullable) strings.
field_names = ['source_id', 'source_username']
schema = StructType([StructField(name, StringType()) for name in field_names])

schema

StructType(List(StructField(source_id,StringType,true),StructField(source_username,StringType,true)))

In [7]:
# Load source.csv using the explicit schema defined above.
# The schema must be applied with DataFrameReader.schema(); passing it through
# .option('schema', ...) only sets an unrecognized reader option that the CSV
# source silently ignores, so the declared types would never be used.
df = (
    spark.read
    .schema(schema)
    .option('header', True)  # first line holds the column names, not data
    .csv('./sa311/source.csv')
)

In [9]:
# Example of writing the DataFrame back out as CSV.
# NOTE(review): presumably left disabled so that re-running the notebook does not
# write to disk again (the writer's default save mode fails if the output path
# already exists) — confirm, or remove this cell.
#df.write.csv('sa311_source.csv')

In [10]:
# Read without a header option, so the columns come back as _c0 / _c1;
# show() prints only the first 20 rows.
# NOTE(review): the path presumably points at a directory of CSV part files — confirm.
spark.read.csv('./sa311/sources_partitioned_data').show()

+------+--------------------+
|   _c0|                 _c1|
+------+--------------------+
|100137|    Merlene Blodgett|
|103582|         Carmen Cura|
|106463|     Richard Sanchez|
|119403|      Betty De Hoyos|
|119555|      Socorro Quiara|
|119868| Michelle San Miguel|
|120752|      Eva T. Kleiber|
|124405|           Lori Lara|
|132408|       Leonard Silva|
|135723|        Amy Cardenas|
|136202|    Michelle Urrutia|
|136979|      Leticia Garcia|
|137943|    Pamela K. Baccus|
|138605|        Marisa Ozuna|
|138650|      Kimberly Green|
|138650|Kimberly Green-Woods|
|138793| Guadalupe Rodriguez|
|138810|       Tawona Martin|
|139342|     Jessica Mendoza|
|139344|        Isis Mendoza|
+------+--------------------+
only showing top 20 rows



In [11]:
# Register df as a temporary view named 'sources' so it can be queried with spark.sql().
df.createOrReplaceTempView('sources')

In [12]:
# spark.sql() returns a DataFrame; without .show() only its schema repr is displayed.
spark.sql('SELECT * FROM sources')

DataFrame[source_id: string, source_username: string]

Looking at the data:

In [13]:
# Preview the source_username column (show() prints the first 20 rows).
df.select('source_username').show()

+--------------------+
|     source_username|
+--------------------+
|    Merlene Blodgett|
|         Carmen Cura|
|     Richard Sanchez|
|      Betty De Hoyos|
|      Socorro Quiara|
| Michelle San Miguel|
|      Eva T. Kleiber|
|           Lori Lara|
|       Leonard Silva|
|        Amy Cardenas|
|    Michelle Urrutia|
|      Leticia Garcia|
|    Pamela K. Baccus|
|        Marisa Ozuna|
|      Kimberly Green|
|Kimberly Green-Woods|
| Guadalupe Rodriguez|
|       Tawona Martin|
|     Jessica Mendoza|
|        Isis Mendoza|
+--------------------+
only showing top 20 rows



In [14]:
# Without header=True the 14 columns get default names (_c0 ... _c13).
spark.read.csv('./sa311/case.csv')

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string]

In [15]:
# header=True takes the column names from the first line of the file;
# every column is still read as a string.
spark.read.csv('./sa311/case.csv', header=True)

DataFrame[case_id: string, case_opened_date: string, case_closed_date: string, SLA_due_date: string, case_late: string, num_days_late: string, case_closed: string, dept_division: string, service_request_type: string, SLA_days: string, case_status: string, source_id: string, request_address: string, council_district: string]

In [16]:
# NOTE: `df` is rebound here — from this cell on it holds two columns of the
# case data rather than the sources data loaded earlier.
df = (
    spark.read
    .csv('./sa311/case.csv', header=True)
    .select('case_late', 'num_days_late')
)

In [17]:
# Preview the selected columns (show() prints the first 20 rows).
df.show()

+---------+-------------------+
|case_late|      num_days_late|
+---------+-------------------+
|       NO| -998.5087616000001|
|       NO|-2.0126041669999997|
|       NO|       -3.022337963|
|       NO|       -15.01148148|
|      YES|0.37216435200000003|
|       NO|       -29.74398148|
|       NO|       -14.70673611|
|       NO|       -14.70662037|
|       NO|       -14.70662037|
|       NO|       -14.70649306|
|       NO|       -14.70649306|
|       NO|       -14.70636574|
|       NO|          -14.70625|
|       NO|       -14.70636574|
|       NO|       -14.70623843|
|       NO|-14.705891199999998|
|       NO|       -14.70600694|
|       NO|       -14.70576389|
|       NO|       -14.70576389|
|       NO|       -14.70564815|
+---------+-------------------+
only showing top 20 rows



## Summarizing data:

In [18]:
# Both columns are strings: the CSV was read without inferSchema or an explicit schema.
df.dtypes

[('case_late', 'string'), ('num_days_late', 'string')]

Spark DataFrames have no pandas-style `.shape` attribute, so the row count and the column count are computed separately:

In [19]:
# Number of rows in df.
df.count()

841704

In [20]:
# Build a pandas-like (rows, columns) tuple by hand.
n_rows = df.count()
n_cols = len(df.columns)
shape = (n_rows, n_cols)
print(shape)

(841704, 2)


For numeric (continuous) variables, `describe()` gives summary statistics:

In [21]:
# num_days_late was read as a string, so cast it to a number before summarizing.
days_late = df['num_days_late'].cast('float')
df.select(days_late).describe().show()

+-------+------------------+
|summary|     num_days_late|
+-------+------------------+
|  count|            841671|
|   mean|-49.07486758743872|
| stddev|176.53002499593143|
|    min|        -1417.0006|
|    max|         519.69806|
+-------+------------------+



In [22]:
# List the distinct values that appear in case_late.
df.select('case_late').distinct().show()

+---------+
|case_late|
+---------+
|      YES|
|       NO|
+---------+



## Using SQL commands on the df:

In [23]:
# Register df as a temporary view named 'cases' so it can be queried with spark.sql().
df.createOrReplaceTempView('cases')

In [24]:
# Count the rows for each case_late value using SQL against the 'cases' view.
late_counts_query = '''
SELECT case_late, COUNT(*)
FROM cases
GROUP BY case_late
'''
spark.sql(late_counts_query).show()

+---------+--------+
|case_late|count(1)|
+---------+--------+
|      YES|   94503|
|       NO|  747201|
+---------+--------+



In [25]:
# groupBy() on its own returns a GroupedData object, not a DataFrame;
# an aggregation such as .count() has to follow before anything can be shown.
df.groupBy(df.case_late)

<pyspark.sql.group.GroupedData at 0x11349a128>

In [26]:
# DataFrame-API equivalent of the GROUP BY query: rows per case_late value.
df.groupBy('case_late').count().show()

+---------+------+
|case_late| count|
+---------+------+
|      YES| 94503|
|       NO|747201|
+---------+------+



or another way...

In [27]:
# `count` is not used in this cell but is imported here for the cell that follows.
from pyspark.sql.functions import col, expr, count

# Aggregate with a SQL expression string so the result column can be named.
n_cases_expr = expr('count(*) AS n_cases')
df.groupBy(col('case_late')).agg(n_cases_expr).show()

+---------+-------+
|case_late|n_cases|
+---------+-------+
|      YES|  94503|
|       NO| 747201|
+---------+-------+



or yet another way...

In [28]:
# Same aggregation using the count() column function and an alias for the result.
# NOTE(review): count(<column>) counts only non-null values of that column, whereas
# count(*) counts rows; the results match here but would differ for a null
# case_late group — confirm this is the intended form.
df.groupBy(df.case_late).agg(count(df.case_late).alias('n_cases')).show()

+---------+-------+
|case_late|n_cases|
+---------+-------+
|      YES|  94503|
|       NO| 747201|
+---------+-------+

