## Extraction and Preprocessing of Data for Visualization 

#### Requirements
This notebook was developed and tested on *** within the following environment.  


```
conda ...
```

In [2]:
#import pandas as pd
#import numpy as np
#import geopandas as gpd
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import *
from pyspark.sql.functions import *

#### Data
Historical crime data for the City of Los Angeles, covering all crimes recorded from 2010 to 2021, should be downloaded as a CSV file from [Los Angeles open data](https://data.lacity.org/Public-Safety/Number-of-crimes-2010-today/rvrw-58iu), placed in the working directory and renamed to `crime_big.csv`. Please note that the file is about 500 MB.  
The data is loaded as a Spark DataFrame for further cleaning.

In [3]:
# Start a local Spark session, or reuse the active one when this cell is re-run.
# Constructing SparkContext('local') directly raises a ValueError if a context
# already exists in the kernel, so the session is obtained through the builder.
spark = SparkSession.builder.master('local').getOrCreate()
# Keep the underlying SparkContext available under its original name.
sc = spark.sparkContext

In [89]:
# To try the pipeline on a small sample, use this path instead:
#input_path = "data_sample.csv" # test dataset

input_path = "crime_big.csv"

# The first CSV row is used as the header; column types are inferred from the data.
crime_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(input_path)
)

print("Record Count: ", crime_df.count())
crime_df.printSchema()

Record Count:  2060948
root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer 

----
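Select the columns needed for the visualization and give them readable names. Then filter out records with a NULL date, crime type, area, latitude or longitude, as well as records whose latitude or longitude is 0.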

In [93]:
# Select the needed columns and rename each one in a single pass.
crime_clean = crime_df.select(*[
    crime_df[source_name].alias(new_name)
    for source_name, new_name in (
        # Incident info
        ('Date OCC', 'date'),
        ('Crm Cd Desc', 'crime type'),
        ('LAT', 'latitude'),
        ('LON', 'longitude'),
        ('AREA NAME', 'area'),
        # Victim info
        ('Vict Age', 'victim age'),
        ('Vict Sex', 'victim sex'),
        ('Vict Descent', 'victim descent'),
        ('Weapon Used Cd', 'weapon type'),
    )
])

In [94]:
# Drop records with a missing date, crime type or area, and records whose
# latitude or longitude is missing or equal to zero.
crime_clean = crime_clean.filter(
    crime_clean['date'].isNotNull()
    & crime_clean['crime type'].isNotNull()
    & crime_clean['area'].isNotNull()
    & crime_clean['latitude'].isNotNull() & (crime_clean['latitude'] != 0)
    & crime_clean['longitude'].isNotNull() & (crime_clean['longitude'] != 0)
)

print("Record Count: ", crime_clean.count())

Record Count:  2058998


In [95]:
crime_clean = (
    crime_clean
    # Keep the first 10 characters of the date string and parse them as MM/dd/yyyy.
    .withColumn('date', to_date(substring(col('date'), 1, 10), "MM/dd/yyyy"))
    # Keep only the text before the first space or comma of the crime description.
    .withColumn('crime type', split(col('crime type'), pattern=" |,")[0])
    # Add month, day and year columns formatted from the parsed date.
    .withColumn("month", date_format(col("date"), "M"))
    .withColumn("day", date_format(col("date"), "d"))
    .withColumn("year", date_format(col("date"), "y"))
)

crime_clean.show(10)

+----------+-----------+--------+---------+---------+----------+----------+--------------+-----------+-----+---+----+
|      date| crime type|latitude|longitude|     area|victim age|victim sex|victim descent|weapon type|month|day|year|
+----------+-----------+--------+---------+---------+----------+----------+--------------+-----------+-----+---+----+
|2010-02-20|  VIOLATION| 33.9825|-118.2695|   Newton|        48|         M|             H|       null|    2| 20|2010|
|2010-09-12|  VANDALISM| 33.9599|-118.3962|  Pacific|         0|         M|             W|       null|    9| 12|2010|
|2010-08-09|      OTHER| 34.0224|-118.2524|   Newton|         0|         M|             H|       null|    8|  9|2010|
|2010-01-05|  VIOLATION| 34.1016|-118.3295|Hollywood|        47|         F|             W|        102|    1|  5|2010|
|2010-01-02|       RAPE| 34.0387|-118.2488|  Central|        47|         F|             H|        400|    1|  2|2010|
|2010-01-04|SHOPLIFTING|  34.048|-118.2577|  Central|   

In [100]:
# Convert the cleaned Spark DataFrame to pandas and save it as one CSV file,
# leaving out the pandas index column.
output_path = "./crime_clean.csv"
crime_clean.toPandas().to_csv(path_or_buf=output_path, index=False)

In [88]:
# Number of crime cases per area
def getCases(df):
    """Count the crime records in each area.

    Args:
        df: DataFrame with an "area" column and a "crime type" column.

    Returns:
        A DataFrame grouped by "area", with the count of "crime type"
        values exposed in a column named "crime cases".
    """
    per_area = df.groupBy("area")
    counted = per_area.agg({"crime type": "count"})
    # The dict-style aggregation names its result "count(crime type)".
    return counted.withColumnRenamed("count(crime type)", "crime cases")