In [1]:
# findspark is a separate package; install it first with `pip install findspark`.
import findspark
# Must run before the first `pyspark` import (done in the next cell).
# NOTE(review): presumably this makes the local Spark installation importable —
# confirm against the findspark documentation.
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Create the local context named "MyShell", or reuse the one that is already
# running. Calling the SparkContext constructor a second time in the same
# kernel (e.g. when this cell is re-executed) raises
# "ValueError: Cannot run multiple SparkContexts at once"; getOrCreate makes
# the cell safe to re-run.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("MyShell")
)  # Spark UI at http://localhost:4040/jobs/


In [3]:
from pyspark.sql import SQLContext
# Wrap the SparkContext created above; later cells load the JSON files
# through `sqlContext.read`.
sqlContext = SQLContext(sc)

In [15]:
# Load the two JSON datasets (shipments first, then carriers) into DataFrames.
shipments, carriers = (
    sqlContext.read.json(path)
    for path in ("shipments.json", "carriers.json")
)

In [16]:
# Show the column names and types Spark derived for the shipments data.
shipments.printSchema()

root
 |-- destination: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- pallets: long (nullable = true)
 |-- shipment: long (nullable = true)
 |-- value: long (nullable = true)



In [17]:
# Shipments whose origin is SAVI.
from_savi = shipments.origin == 'SAVI'
shipments.where(from_savi).show()


+-----------+------+-------+--------+-----+
|destination|origin|pallets|shipment|value|
+-----------+------+-------+--------+-----+
|        DCA|  SAVI|     70|     123|54171|
|        IAD|  SAVI|     33|     125|64432|
|        DCA|  SAVI|      1|     154|93095|
|        BWI|  SAVI|     68|     160|89950|
+-----------+------+-------+--------+-----+



In [19]:
# Shipments of more than 50 pallets, each matched to its carrier record.
(
    shipments
    .where(shipments.pallets > 50)
    .join(carriers, shipments.shipment == carriers.shipment)
    .show()
)

+-----------+------+-------+--------+-----+-------+--------+
|destination|origin|pallets|shipment|value|carrier|shipment|
+-----------+------+-------+--------+-----+-------+--------+
|        DCA|  SAVI|     70|     123|54171|   YODA|     123|
|       SAVI|   IAD|     68|     130|28564|   DART|     130|
|       SAVI|   BWI|    100|     153|84057|   USXI|     153|
|        BWI|  SAVI|     68|     160|89950|   YODA|     160|
|        IAD|   DCA|     56|     162|99620|   SCNN|     162|
+-----------+------+-------+--------+-----+-------+--------+



In [20]:
# Largest `value` across all shipments (aggregate over the whole frame,
# i.e. a group-by with no grouping columns).
shipments.groupBy().agg({"value": "max"}).collect()

[Row(max(value)=99620)]

In [23]:
# Inner-join every shipment with its carrier record. Joining on the column
# *name* (instead of `shipments.shipment == carriers.shipment`) keeps a single
# `shipment` column in the result; the expression form leaves two columns with
# the same name, which makes any later reference to `shipment` ambiguous.
x = shipments.join(carriers, "shipment")

In [31]:
s = shipments  # short alias, reused by the cells that follow
# Count shipments for every (origin, destination) pair.
s.stat.crosstab("origin", "destination").show()

+------------------+---+----+---+---+
|origin_destination|BWI|SAVI|IAD|DCA|
+------------------+---+----+---+---+
|               BWI|  0|   1|  0|  0|
|               IAD|  0|   1|  0|  0|
|               DCA|  1|   2|  1|  0|
|              SAVI|  1|   0|  1|  2|
+------------------+---+----+---+---+



In [33]:
# Summary statistics (count, mean, stddev, min, max) for the shipments frame;
# `s` is the alias for `shipments` defined in an earlier cell.
s.describe().show()

+-------+-----------------+-----------------+------------------+
|summary|          pallets|         shipment|             value|
+-------+-----------------+-----------------+------------------+
|  count|               10|               10|                10|
|   mean|             44.2|            146.2|           72221.2|
| stddev|32.67278854201323|15.58346702259945|21604.252121592486|
|    min|                1|              123|             28564|
|    max|              100|              165|             99620|
+-------+-----------------+-----------------+------------------+



In [39]:
# Average pallet count and average value per origin.
(
    s
    .groupby('origin')
    .avg('pallets', 'value')
    .show()
)

+------+------------+----------+
|origin|avg(pallets)|avg(value)|
+------+------------+----------+
|   DCA|        25.5|  76985.75|
|  SAVI|        43.0|   75412.0|
|   BWI|       100.0|   84057.0|
|   IAD|        68.0|   28564.0|
+------+------------+----------+



In [60]:
# Average shipment value per carrier, lowest average first.
(
    x
    .groupby('carrier')
    .avg('value')
    .orderBy('avg(value)')
    .show()
)

+-------+-----------------+
|carrier|       avg(value)|
+-------+-----------------+
|   DART|          46498.0|
|   USXI|          73002.0|
|   YODA|          75418.0|
|   SCNN|85652.66666666667|
+-------+-----------------+

