# PySpark DataFrame Basics: Loading, Inspecting, Filtering, and Aggregating Data

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Reuse the already-active SparkSession if there is one; otherwise start a new one
# with default settings. Every later cell reads data through this `spark` object.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Show the session object as the cell output to confirm it was created.
spark

### Read CSV

In [3]:
# Read the iris CSV, using its first line as the column names.
# No schema inference is requested here, so every column is loaded as a string.
data_path = 'data/iris.csv'
df1 = spark.read.csv(data_path, header=True)

In [4]:
# Fetch the first 10 rows as a Python list of Row objects.
df1.head(10)

[Row(sepal_length='5.1', sepal_width='3.5', petal_length='1.4', petal_width='0.2', species='setosa'),
 Row(sepal_length='4.9', sepal_width='3.0', petal_length='1.4', petal_width='0.2', species='setosa'),
 Row(sepal_length='4.7', sepal_width='3.2', petal_length='1.3', petal_width='0.2', species='setosa'),
 Row(sepal_length='4.6', sepal_width='3.1', petal_length='1.5', petal_width='0.2', species='setosa'),
 Row(sepal_length='5.0', sepal_width='3.6', petal_length='1.4', petal_width='0.2', species='setosa'),
 Row(sepal_length='5.4', sepal_width='3.9', petal_length='1.7', petal_width='0.4', species='setosa'),
 Row(sepal_length='4.6', sepal_width='3.4', petal_length='1.4', petal_width='0.3', species='setosa'),
 Row(sepal_length='5.0', sepal_width='3.4', petal_length='1.5', petal_width='0.2', species='setosa'),
 Row(sepal_length='4.4', sepal_width='2.9', petal_length='1.4', petal_width='0.2', species='setosa'),
 Row(sepal_length='4.9', sepal_width='3.1', petal_length='1.5', petal_width='0.1', species='setosa')]

In [5]:
# Print the first 10 rows as a formatted text table.
df1.show(10)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 10 rows



In [6]:
# Total number of rows in df1.
df1.count()

150

In [7]:
# Load a CSV that has no header row and let Spark infer each column's type.
# Without a header, Spark assigns positional column names (_c0, _c1, ...).
data_path = 'data/iris_nohead.csv'
df2 = spark.read.csv(data_path, header=False, inferSchema=True)

In [8]:
# Print the first 10 rows; the columns still carry the auto-generated _cN names.
df2.show(10)

+---+---+---+---+------+
|_c0|_c1|_c2|_c3|   _c4|
+---+---+---+---+------+
|5.1|3.5|1.4|0.2|setosa|
|4.9|3.0|1.4|0.2|setosa|
|4.7|3.2|1.3|0.2|setosa|
|4.6|3.1|1.5|0.2|setosa|
|5.0|3.6|1.4|0.2|setosa|
|5.4|3.9|1.7|0.4|setosa|
|4.6|3.4|1.4|0.3|setosa|
|5.0|3.4|1.5|0.2|setosa|
|4.4|2.9|1.4|0.2|setosa|
|4.9|3.1|1.5|0.1|setosa|
+---+---+---+---+------+
only showing top 10 rows



In [9]:
# Replace the auto-generated positional column names with descriptive ones.
# withColumnRenamed is a no-op for a name that is not present, so re-running
# this cell leaves an already-renamed frame unchanged.
renamed_columns = {
    '_c0': 'sepal_length',
    '_c1': 'sepal_width',
    '_c2': 'petal_length',
    '_c3': 'petal_width',
    '_c4': 'species',
}
for old_name, new_name in renamed_columns.items():
    df2 = df2.withColumnRenamed(old_name, new_name)

In [10]:
# Print the first 10 rows again to check the column names.
df2.show(10)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 10 rows



### Load JSON

In [11]:
# Read the iris JSON file. The multiLine option lets a single JSON record
# span several lines of the file instead of requiring one record per line.
data_path = 'data/iris.json'
df3 = spark.read.option('multiLine', True).json(data_path)

In [12]:
# Print the first 10 rows of the JSON-backed frame.
df3.show(10)

+------------+-----------+------------+-----------+-------+
|petal_length|petal_width|sepal_length|sepal_width|species|
+------------+-----------+------------+-----------+-------+
|         1.4|        0.2|         5.1|        3.5| setosa|
|         1.4|        0.2|         4.9|        3.0| setosa|
|         1.3|        0.2|         4.7|        3.2| setosa|
|         1.5|        0.2|         4.6|        3.1| setosa|
|         1.4|        0.2|         5.0|        3.6| setosa|
|         1.7|        0.4|         5.4|        3.9| setosa|
|         1.4|        0.3|         4.6|        3.4| setosa|
|         1.5|        0.2|         5.0|        3.4| setosa|
|         1.4|        0.2|         4.4|        2.9| setosa|
|         1.5|        0.1|         4.9|        3.1| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 10 rows



In [13]:
# Total number of rows loaded from the JSON file.
df3.count()

150

### Basic operations

In [14]:
# Load the California housing CSV: first line is the header, and column
# types are inferred from the data instead of defaulting to strings.
data_path = 'data/california.csv'
df4 = spark.read.csv(data_path, header=True, inferSchema=True)

In [15]:
# List the column names as plain Python strings.
df4.columns

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity']

In [16]:
# Print each column's name, inferred type, and nullability.
df4.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [17]:
# Total number of rows in the housing dataset.
df4.count()

20640

In [18]:
# Draw a random sample of roughly 25% of the rows, without replacement.
# `fraction` is a per-row inclusion probability, not an exact size, so the
# resulting count is only approximately a quarter of df4.
# A fixed seed keeps the sample (and everything derived from it) the same
# across Restart & Run All for the same input and Spark configuration.
df4_sampled = df4.sample(withReplacement=False, fraction=0.25, seed=42)
df4_sampled.count()

5118

In [19]:
# Order the sampled rows by median house value (ascending, the default).
# This only defines the sorted frame; nothing is displayed in this cell.
df4_sorted = df4_sampled.orderBy('median_house_value')

### Filter a dataframe

In [20]:
# Count the rows whose ocean_proximity is exactly 'INLAND'.
is_inland = F.col('ocean_proximity') == 'INLAND'
df4.filter(is_inland).count()

6551

In [21]:
# Count the rows whose ocean_proximity is exactly 'NEAR BAY'.
is_near_bay = F.col('ocean_proximity') == 'NEAR BAY'
df4.filter(is_near_bay).count()

2290

### Aggregate data

In [22]:
# Number of rows in each ocean_proximity category.
df4.groupBy('ocean_proximity').count().show()

+---------------+-----+
|ocean_proximity|count|
+---------------+-----+
|         ISLAND|    5|
|     NEAR OCEAN| 2658|
|       NEAR BAY| 2290|
|      <1H OCEAN| 9136|
|         INLAND| 6551|
+---------------+-----+



In [23]:
# Average median_house_value within each ocean_proximity category.
mean_value_by_proximity = (
    df4.groupBy('ocean_proximity')
       .agg(F.avg('median_house_value'))
)
mean_value_by_proximity.show(10)

+---------------+-----------------------+
|ocean_proximity|avg(median_house_value)|
+---------------+-----------------------+
|         ISLAND|               380440.0|
|     NEAR OCEAN|     249433.97742663656|
|       NEAR BAY|     259212.31179039303|
|      <1H OCEAN|     240084.28546409807|
|         INLAND|     124805.39200122119|
+---------------+-----------------------+



### Simple statistics

In [24]:
# Summary statistics (count, mean, stddev, min, max) for median_house_value.
df4.describe(['median_house_value']).show()

+-------+------------------+
|summary|median_house_value|
+-------+------------------+
|  count|             20640|
|   mean|206855.81690891474|
| stddev|115395.61587441359|
|    min|           14999.0|
|    max|          500001.0|
+-------+------------------+



In [25]:
# List the distinct ocean_proximity values.
df4.select('ocean_proximity').distinct().show()

+---------------+
|ocean_proximity|
+---------------+
|         ISLAND|
|     NEAR OCEAN|
|       NEAR BAY|
|      <1H OCEAN|
|         INLAND|
+---------------+



In [26]:
# Number of distinct ocean_proximity values.
df4.select('ocean_proximity').distinct().count()

5

In [27]:
# Correlation between house value and income, returned as a single float
# (DataFrame.corr computes the Pearson coefficient by default).
df4.corr('median_house_value', 'median_income')

0.6880752079585578

### Save dataframe to disk

In [28]:
# Both writes below are commented out, so this cell currently does nothing.
# Uncomment one to save df4 as CSV or Parquet; the second argument is the save
# mode, and 'overwrite' replaces any existing output at that path.
# Note: Spark writes a directory of part files at the given path, not a single file.
# df4.write.csv('data/df4.csv', 'overwrite')
# df4.write.parquet('data/df4', 'overwrite')