# RDDs

## Download and install Spark

In [None]:
# List the contents of the current working directory (shell command via IPython's `!`).
!ls

In [None]:
# One-time environment setup: Java 8, Spark 2.3.1 (Hadoop 2.7 build) and findspark.
# The commands are left commented out; presumably they need to be uncommented on a
# fresh runtime, since the setup cell below points SPARK_HOME at
# /content/spark-2.3.1-bin-hadoop2.7 — TODO confirm.
#!apt-get update
#!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
#!tar xf spark-2.3.1-bin-hadoop2.7.tgz
#!pip install -q findspark

## Setup environment

In [None]:
import os

# Point the JVM and Spark locations at the Java 8 install and the unpacked
# Spark 2.3.1 distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.3.1-bin-hadoop2.7",
})

# findspark.init() must run before any pyspark import: it is what makes the
# pyspark package from the Spark distribution importable.
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Reuse an already-running context/session if there is one, otherwise start them.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

# Last expression of the cell: display the session summary.
spark

## Downloading Chicago's police station dataset

In [None]:
!wget -O police-stations.csv https://data.cityofchicago.org/api/views/z8bn-74gv/rows.csv?accessType=DOWNLOAD
!ls -l

## RDDs setup

In [None]:
# Resilient Distributed Datasets (RDDs) are immutable, partitioned collections of records that can be worked on in parallel.
# Unlike DataFrames, where each record is a structured row containing fields with a known schema,
# RDD records are just Java, Scala, or Python objects.
# You will need to manually recreate any optimizations that DataFrames would get automatically.
# Python and Scala DataFrames have similar performance.
# Transformations return RDDs (e.g. map, flatMap, filter).
# Actions return another data type (e.g. reduce, count).

In [None]:
# Load the downloaded CSV as an RDD of text lines (one element per line).
# The path must match the file name written by the wget cell: police-stations.csv.
psrdd = sc.textFile('police-stations.csv')
psrdd.first()       # view the first line of the police station RDD (the header row)

In [None]:
# Keep the first line of the file (the CSV header row) so it can be filtered out of the data later.
ps_header = psrdd.first()       # first element of the RDD, as a single string

In [None]:
# Data rows only: drop every line that is identical to the header line.
ps_rest = psrdd.filter(lambda row: row != ps_header)

# Show the first remaining line, i.e. the first actual data row.
ps_rest.first()

In [None]:
# Break every data row into a list of fields and bring all rows back to the driver.
# Replace collect() with count() to get the number of rows instead.
# NOTE(review): a plain split(',') also splits inside quoted fields that contain
# commas — verify against the CSV if trailing columns are needed.
(
    ps_rest
    .map(lambda row: row.split(','))
    .collect()
)

**How many police stations are there?**

**Display the District ID, District name, Address and Zip for the police station with District ID 7**



In [None]:
# Show District ID, District name, Address and Zip for the station whose
# District ID is 7.
# The whole chain is wrapped in parentheses so it can span several lines;
# a bare trailing '.' at the end of a line is a SyntaxError in Python.
(
    ps_rest
    .map(lambda line: line.split(','))          # split each row once into its fields
    .filter(lambda fields: fields[0] == '7')    # condition where first column = 7
    .map(lambda fields: (
        fields[0],   # District ID
        fields[1],   # District Name
        fields[2],   # Address
        fields[5],   # Zip code
    ))
    .collect()
)

**Police stations 10 and 11 are geographically close to each other. Display the District ID, District name, address and zip code**

In [None]:
# Show District ID, District name, Address and Zip for police districts 10 and 11.
# The chain is parenthesised so it can span several lines (a trailing '.' before a
# newline is a SyntaxError), and the parentheses are balanced.
(
    ps_rest
    .map(lambda line: line.split(','))                   # split each row once into its fields
    .filter(lambda fields: fields[0] in ['10', '11'])    # first column is the District ID
    .map(lambda fields: (
        fields[0],   # District ID
        fields[1],   # District Name
        fields[2],   # Address
        fields[5],   # Zip code
    ))
    .collect()
)
