## Extraction and Preprocessing of Data for Visualization 

#### Requirements
This notebook was developed and tested on *** within the following environment.  


```
conda ...
```

In [25]:
import pyspark
import pandas as pd
#import geopandas as gpd
import numpy as np
from pyspark.sql import SQLContext
from pyspark.sql import *
from pyspark.sql.functions import *

#### Data
Historical crime data of the City of Los Angeles for all crimes recorded from 2010 to 2021 should be downloaded from [Los Angeles open data](https://data.lacity.org/Public-Safety/Number-of-crimes-2010-today/rvrw-58iu) as CSV into the working directory and renamed to `crime_big.csv`. Please note the file is about 500 MB.  
Data is loaded as a Spark Dataframe for further cleaning.

In [None]:

sc = pyspark.context.SparkContext('local')
spark = SparkSession(sc)

In [7]:
### This cell may take up to 20s to run.
input_path = "crime_big.csv"
crime_df = spark.read.csv(input_path, header=True,inferSchema=True)
print("Record Count: ", crime_df.count())
crime_df.printSchema()

Record Count:  2060948
root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer 

----
Select the needed columns to refine the data. Then filter out the records with NA/NULL values.

In [60]:
# Select colunmns and rename
crime_clean = crime_df.select('Date OCC', 'Crm Cd Desc', 'LAT', 'LON')\
    .withColumnRenamed('Date OCC', 'date').withColumnRenamed('Crm Cd Desc', 'crime type')\
    .withColumnRenamed('LAT', 'latitude').withColumnRenamed('LON', 'longitude')

# convert date
#crime_clean = crime_clean.withColumn("date", to_date(crime_clean["date"], "MM/dd/yyyy HH:mm:ss a"))
crime_clean = crime_clean.withColumn("date", date_format(crime_clean["date"], "MM/dd/yyyy HH:mm:ss a"))

# extract day, month, year
"""
crime_clean = crime_clean.withColumn("day", date_format(crime_clean["date"], "dd"))\
    .withColumn("month", date_format(crime_clean["date"], "MM"))\
    .withColumn("year", date_format(crime_clean["date"], "yyyy"))
"""

crime_clean.show(10)



+----+--------------------+--------+---------+
|date|          crime type|latitude|longitude|
+----+--------------------+--------+---------+
|null|VIOLATION OF COUR...| 33.9825|-118.2695|
|null|VANDALISM - FELON...| 33.9599|-118.3962|
|null|OTHER MISCELLANEO...| 34.0224|-118.2524|
|null|VIOLATION OF COUR...| 34.1016|-118.3295|
|null|     RAPE, ATTEMPTED| 34.0387|-118.2488|
|null|SHOPLIFTING - PET...|  34.048|-118.2577|
|null|BURGLARY FROM VEH...| 34.0389|-118.2643|
|null|ASSAULT WITH DEAD...| 34.0435|-118.2427|
|null|ASSAULT WITH DEAD...|  34.045| -118.264|
|null|THEFT-GRAND ($950...| 34.0538|-118.2488|
|null|BATTERY - SIMPLE ...|  34.064|-118.2375|
|null|             ROBBERY|  34.035|-118.2386|
|null|VANDALISM - FELON...| 34.0409|-118.2609|
|null|          BOMB SCARE| 34.0502| -118.254|
|null|             ROBBERY| 34.0515|-118.2424|
|null|OTHER MISCELLANEO...| 34.0389| -118.255|
|null|CHILD NEGLECT (SE...| 34.0401|-118.2668|
|null|             ROBBERY| 34.0428|-118.2461|
|null|INTIMAT

In [None]:
# Exclude zero and null value
crime_clean = crime_clean.where(crime_clean['date'].isNotNull()).where(crime_clean['crime type'].isNotNull())\
.where(crime_clean['latitude'].isNotNull() & (crime_clean['latitude'] != 0))\
.where(crime_clean['lontitude'].isNotNull() & (crime_clean['lontitude'] != 0))
print("Record Count: ", crime_clean.count())

In [None]:
#crime_clean = crime_clean

crime_clean.show(10)