# Getting Started With Spark using Python

In [2]:
# Installing required packages
!pip install pyspark
!pip install findspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
     |████████████████████████████████| 281.4 MB 13 kB/s
  Preparing metadata (setup.py) ... done
Collecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
     |████████████████████████████████| 198 kB 62.3 MB/s
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=467d8acf1e9e54b040f65132856aaee9103a713905e514c7665eda26d812ad2d
  Stored in directory: /home/jupyterlab/.cache/pi

In [3]:
# findspark.init() runs before the pyspark imports in the next cell.
# NOTE(review): it is expected to locate the Spark installation and make
# `pyspark` importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [4]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the spark context. 
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [5]:
# Creating (or reusing) a spark context.
# SparkContext.getOrCreate() returns the already-running context, so this cell
# can be executed more than once in the same kernel; a second plain
# SparkContext() call would raise because only one context may be active.
sc = SparkContext.getOrCreate()

# Creating (or reusing) a spark session.
# NOTE(review): "spark.some.config.option" looks like a placeholder key/value
# rather than a real Spark setting — confirm before relying on it.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

22/02/13 13:13:41 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
# Display the SparkSession object created above as this cell's output.
spark

In [7]:
# The integers 1..29 (the upper bound of range() is exclusive).
data = range(1, 30)
# print the first element of the range (a range is an indexable sequence, not an iterator)
print(data[0])

# Distribute the range as an RDD; the second argument (4) is the number of
# slices passed to parallelize().
xrangeRDD = sc.parallelize(data, 4)

# this will let us know that we created an RDD
xrangeRDD

1


PythonRDD[1] at RDD at PythonRDD.scala:53

In [8]:
# Shift every element of xrangeRDD down by one ...
subRDD = xrangeRDD.map(lambda value: value - 1)
# ... then keep only the shifted values that are below 10.
filteredRDD = subRDD.filter(lambda value: value < 10)

In [9]:
# Print every element of the filtered RDD as a list.
print(filteredRDD.collect())
# The element count is the cell's last expression, so it is shown as the cell output.
filteredRDD.count()

                                                                                

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


10

In [10]:
import time

test = sc.parallelize(range(1, 50000), 4)
test.cache()

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() can jump if the system clock changes.
t1 = time.perf_counter()
# first count will trigger evaluation of count *and* cache
count1 = test.count()
dt1 = time.perf_counter() - t1
print("dt1: ", dt1)

t2 = time.perf_counter()
# second count operates on cached data only
count2 = test.count()
dt2 = time.perf_counter() - t2
print("dt2: ", dt2)

                                                                                

dt1:  0.9988055229187012
dt2:  0.3607344627380371


In [11]:
# Display the SparkSession object again before the DataFrame section.
spark

In [12]:
# Download the data first into a local `people.json` file
!curl https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/labs/data/people.json >> people.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    73  100    73    0     0    246      0 --:--:-- --:--:-- --:--:--   246


In [None]:
# Read the dataset into a spark dataframe using the `read.json()` function.
# `.cache()` marks the DataFrame for caching so the following cells can reuse it.
df = spark.read.json("people.json").cache()

In [None]:
# Print the dataframe as well as the data schema:
# show() prints the rows, printSchema() prints the schema.
df.show()
df.printSchema()

In [None]:
# Register the DataFrame as a SQL temporary view.
# createOrReplaceTempView keeps this cell re-runnable: createTempView raises
# when a view named "people" is already registered in the session.
df.createOrReplaceTempView("people")

In [None]:
# Select and show basic data columns.
# The same `name` column is selected three ways: by column name, by column
# object, and with a SQL query against the "people" temporary view.

df.select("name").show()
df.select(df["name"]).show()
spark.sql("SELECT name FROM people").show()

In [None]:
# Perform basic filtering: keep rows whose age is greater than 21.
# Note the DataFrame version keeps every column of `df`, while the SQL
# version selects only `age` and `name`.

df.filter(df["age"] > 21).show()
spark.sql("SELECT age, name FROM people WHERE age > 21").show()

In [None]:
# Perform basic aggregation of data: number of rows per distinct age.
# COUNT(*) counts rows per group, matching groupBy("age").count();
# COUNT(age) would skip NULLs and report 0 for the group whose age is NULL.

df.groupBy("age").count().show()
spark.sql("SELECT age, COUNT(*) as count FROM people GROUP BY age").show()

In [None]:
##########Exercise - Question 1 - RDDs

In [None]:
# starter code
# numbers = range(1, 50)
data = range(1, 50)

# numbers_RDD = ...
# print the first element of the range
print(data[0])

# even_numbers_RDD = numbers_RDD.map(lambda x: ..)
# NOTE(review): the map step sketched in the template line above is not
# implemented here; only the base RDD is created.
xrangeRDD = sc.parallelize(data, 4)

# Code block for learners to answer
# this will let us know that we created an RDD
xrangeRDD

#### Exercise - Question 2 - DataFrames and SparkSQL

In [None]:
# starter code
spark

!curl https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/labs/people2.json >> people2.json

# df = spark.read...
df = spark.read.json("people2.json").cache()

# df.createTempView..
df.createTempView("people2")

# spark.sql("SELECT ...")
df.select("name").show()
df.select(df["name"]).show()
spark.sql("SELECT name FROM people2").show()

In [None]:
# Show the `age` column, then compute the average age with a SQL query
# against the "people2" temporary view.
df.select(df["age"]).show()
spark.sql("Select AVG(age) from people2").show()

#### Exercise - Question 3 - SparkSession

In [5]:
# Stop the Spark session created earlier in the notebook.
# NOTE(review): the NameError recorded below (and the out-of-order execution
# count) indicate this cell was last run in a kernel where `spark` did not
# exist yet — run the session-creation cell first.
spark.stop()

NameError: name 'spark' is not defined

##                                **The End**

|  Compiled By      |    Last Update    |    Social Media   |
| ----------------- | ----------------- | ----------------- |
|  Jeremias Tivane  |    12-02-2022     | [LinkedIn](https://www.linkedin.com/in/jeremiastivane/) |
|  Jeremias Tivane  |    14-02-2022     | [GitHub](https://github.com/Jeremias-Tivane) |
      