In [1]:
from pyspark.sql import SparkSession

In [2]:
# Entry point for every DataFrame operation below. getOrCreate() hands back
# an already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("Tutorial")
    .getOrCreate()
)

In [3]:
# Load the station CSV. "header" takes the column names from the first row;
# "inferSchema" asks Spark to work out column types from the data rather
# than leaving every column as a string.
df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("201508_station_data.csv")
)

In [4]:
# Preview the table. These are show()'s defaults written out: at most 20 rows,
# with long cell values cut short and suffixed by "...".
df_csv.show(n=20, truncate=True)

+----------+--------------------+---------+-----------+---------+------------+------------+
|station_id|                name|      lat|       long|dockcount|    landmark|installation|
+----------+--------------------+---------+-----------+---------+------------+------------+
|         2|San Jose Diridon ...|37.329732|-121.901782|       27|    San Jose|    8/6/2013|
|         3|San Jose Civic Ce...|37.330698|-121.888979|       15|    San Jose|    8/5/2013|
|         4|Santa Clara at Al...|37.333988|-121.894902|       11|    San Jose|    8/6/2013|
|         5|    Adobe on Almaden|37.331415|  -121.8932|       19|    San Jose|    8/5/2013|
|         6|    San Pedro Square|37.336721|-121.894074|       15|    San Jose|    8/7/2013|
|         7|Paseo de San Antonio|37.333798|-121.886943|       15|    San Jose|    8/7/2013|
|         8| San Salvador at 1st|37.330165|-121.885831|       15|    San Jose|    8/5/2013|
|         9|           Japantown|37.348742|-121.894715|       15|    San Jose|  

In [5]:
# Fetch the first five records as a Python list of Row objects. Unlike
# show(), the values come back in full (no "..." truncation).
df_csv.head(n=5)

[Row(station_id=2, name='San Jose Diridon Caltrain Station', lat=37.329732, long=-121.901782, dockcount=27, landmark='San Jose', installation='8/6/2013'),
 Row(station_id=3, name='San Jose Civic Center', lat=37.330698, long=-121.888979, dockcount=15, landmark='San Jose', installation='8/5/2013'),
 Row(station_id=4, name='Santa Clara at Almaden', lat=37.333988, long=-121.894902, dockcount=11, landmark='San Jose', installation='8/6/2013'),
 Row(station_id=5, name='Adobe on Almaden', lat=37.331415, long=-121.8932, dockcount=19, landmark='San Jose', installation='8/5/2013'),
 Row(station_id=6, name='San Pedro Square', lat=37.336721, long=-121.894074, dockcount=15, landmark='San Jose', installation='8/7/2013')]

In [6]:
# Check the Python class of the loaded object (a pyspark.sql DataFrame).
type(df_csv)

pyspark.sql.dataframe.DataFrame

In [7]:
# Print each column's name, inferred type and nullability as a tree.
# Note: `installation` holds dates such as 8/6/2013 but was inferred as a
# plain string, so it is not yet usable for date arithmetic or ordering.
df_csv.printSchema()

root
 |-- station_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- dockcount: integer (nullable = true)
 |-- landmark: string (nullable = true)
 |-- installation: string (nullable = true)



In [8]:
# Column names as a plain Python list, in schema order.
df_csv.columns

['station_id', 'name', 'lat', 'long', 'dockcount', 'landmark', 'installation']

In [9]:
# Project a single column; the result is still a DataFrame, so show() works.
df_csv.select(df_csv['station_id']).show()

+----------+
|station_id|
+----------+
|         2|
|         3|
|         4|
|         5|
|         6|
|         7|
|         8|
|         9|
|        10|
|        11|
|        12|
|        13|
|        14|
|        16|
|        21|
|        22|
|        23|
|        24|
|        25|
|        26|
+----------+
only showing top 20 rows



In [10]:
# Project several columns at once; they appear in the order listed here.
df_csv.select(
    df_csv['station_id'],
    df_csv['name'],
).show()

+----------+--------------------+
|station_id|                name|
+----------+--------------------+
|         2|San Jose Diridon ...|
|         3|San Jose Civic Ce...|
|         4|Santa Clara at Al...|
|         5|    Adobe on Almaden|
|         6|    San Pedro Square|
|         7|Paseo de San Antonio|
|         8| San Salvador at 1st|
|         9|           Japantown|
|        10|  San Jose City Hall|
|        11|         MLK Library|
|        12|SJSU 4th at San C...|
|        13|       St James Park|
|        14|Arena Green / SAP...|
|        16|SJSU - San Salvad...|
|        21|   Franklin at Maple|
|        22|Redwood City Calt...|
|        23|San Mateo County ...|
|        24|Redwood City Publ...|
|        25|Stanford in Redwo...|
|        26|Redwood City Medi...|
+----------+--------------------+
only showing top 20 rows



In [11]:
# (column name, type name) pairs as a Python list of tuples, which is handy
# when the types need to be inspected programmatically rather than printed.
df_csv.dtypes

[('station_id', 'int'),
 ('name', 'string'),
 ('lat', 'double'),
 ('long', 'double'),
 ('dockcount', 'int'),
 ('landmark', 'string'),
 ('installation', 'string')]

In [12]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# String columns report null for mean/stddev. Their min/max appear to follow
# text ordering, so the `installation` min/max should not be read as the
# earliest/latest dates while that column is still a string.
df_csv.describe().show()

+-------+------------------+--------------------+-------------------+-------------------+-----------------+-------------+------------+
|summary|        station_id|                name|                lat|               long|        dockcount|     landmark|installation|
+-------+------------------+--------------------+-------------------+-------------------+-----------------+-------------+------------+
|  count|                70|                  70|                 70|                 70|               70|           70|          70|
|   mean|              43.0|                null|  37.59024338428572|-122.21841616428571|17.65714285714286|         null|        null|
| stddev|24.166091947189145|                null|0.20347253639672416| 0.2094460497964454|4.010441857493954|         null|        null|
|    min|                 2|       2nd at Folsom|          37.329732|        -122.418954|               11|Mountain View|   1/22/2014|
|    max|                84|Yerba Buena Cente...|      

In [13]:
# Keep only the rows whose name matches exactly. Bracket lookup is used for
# the column because `name` is a common attribute name, which makes dotted
# access easy to misread as something other than a column.
df_csv.filter(df_csv['name'] == 'San Pedro Square').show()

+----------+----------------+---------+-----------+---------+--------+------------+
|station_id|            name|      lat|       long|dockcount|landmark|installation|
+----------+----------------+---------+-----------+---------+--------+------------+
|         6|San Pedro Square|37.336721|-121.894074|       15|San Jose|    8/7/2013|
+----------+----------------+---------+-----------+---------+--------+------------+



In [15]:
# Count the stations that have exactly 19 docks. count() returns a plain
# Python integer rather than a DataFrame.
(
    df_csv
    .filter(df_csv['dockcount'] == 19)
    .count()
)

18