# WSPR Query Using PySpark

The following query uses the PySpark SQL engine to read a WSPR spots Parquet
file, count its records, and list the top 20 Reporters by number of spots.


In [1]:
import os
import sys
import time

from os.path import expanduser

import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as f

In [2]:
# Simple function to get file size
def getSize(filename):
    """Return the size, in bytes, of the file at ``filename``.

    Raises ``OSError`` (e.g. ``FileNotFoundError``) if the path cannot be
    stat'ed.
    """
    return os.path.getsize(filename)


# File location and size variables
# Adjust the path as needed
home_dir = expanduser("~")
parquet_file = os.path.join(home_dir, 'Dev/Data/wspr/wsprspots-2020-02.parquet')
# Stat the file before starting Spark so a bad path fails fast.
file_size = getSize(parquet_file)


# Setup the Spark Cluster Config Variables
conf = SparkConf().setAppName("Radio Data Science - WSPR Read Tests").setMaster("local[*]")


# Instantiate the Spark Session.
# The application name and master both come from `conf`, so no separate
# .appName() call is needed on the builder.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

try:
    # Print some basic headers and open the file.
    # NOTE: spark.read.load() is lazy -- the "Read Time" below covers opening
    # the Parquet source and resolving its schema, not scanning the rows.
    print(f'\n* Reading file ..: {os.path.basename(parquet_file)}')
    print(f'* File Size .....: {file_size:,} bytes compressed')
    start = time.perf_counter()
    df = spark.read.load(parquet_file, format="parquet")
    end = time.perf_counter()
    print(f"* Read Time .....: {round((end-start), 5)} sec")


    # count() is the first action on the DataFrame, so this is where Spark
    # actually goes to the file to obtain the total number of records.
    print('\n* Counting Records')
    start = time.perf_counter()
    print(f'* Record Count ..: {df.count():,}')
    end = time.perf_counter()
    print(f"* Count Time ....: {round((end-start), 5)} sec")


    # This is a group by aggregate for the top 20 by count.
    # DataFrame.show() only prints and returns None, so bind the aggregated
    # DataFrame to df2 first and display it in a separate step.
    print('\n* Running Group by Count Query and return the dataframe')
    start = time.perf_counter()
    df2 = df.groupBy('Reporter').count().orderBy('count', ascending=False)
    df2.show(20)
    end = time.perf_counter()
    print(f"* Query Time ....: {round((end-start), 5)} sec\n")
finally:
    # Shutdown the PySpark engine, even if one of the steps above failed,
    # so a failed run does not leave a live session behind in the kernel.
    spark.stop()



* Reading file ..: wsprspots-2020-02.parquet
* File Size .....: 490,259,730 bytes compressed
* Read Time .....: 1.51541 sec

* Counting Records
* Record Count ..: 47,310,649
* Count Time ....: 0.89148 sec

* Running Group by Count Query and return the dataframe
+--------+------+
|Reporter| count|
+--------+------+
|   DK6UG|838081|
|  OE9GHV|690104|
|  EA8BFK|648670|
|   KD2OM|589003|
|KA7OEI-1|576788|
|   K4RCG|571445|
|     KPH|551690|
|    K9AN|480759|
|   DF5FH|480352|
|   DJ9PC|474211|
|  HB9TMC|472900|
|    ND7M|461383|
|  IW2NKE|455781|
|    WO7I|437582|
|   ON5KQ|427628|
|  N6GN/K|361590|
|  WA2ZKD|328003|
|  KJ6MKI|318174|
|   LX1DQ|309909|
|   W2GNN|308290|
+--------+------+
only showing top 20 rows

* Query Time ....: 4.89078 sec

