In [1]:
sc

<pyspark.context.SparkContext at 0x7f8e2acbd5c0>

In [2]:
!rm ./metastore_db/*.lck
examples_folder = "/usr/local/share/spark/examples/src/main/resources/"

In [3]:
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Parquet Files

In [4]:
df = sqlc.read.load(examples_folder + "users.parquet")
df.show()

AnalysisException: 'Path does not exist: file:/usr/local/share/spark/examples/src/main/resources/users.parquet;'

In [6]:
!rm -rf namesAndFavColors.parquet/
df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")

In [7]:
!ls -l namesAndFavColors.parquet/

total 4
-rw------- 1 dvgodoy dvgodoy 509 Set 25 17:04 part-r-00000-876c04e5-04ce-44a9-a451-ec2074838894.snappy.parquet
-rw------- 1 dvgodoy dvgodoy   0 Set 25 17:04 _SUCCESS


In [8]:
sqlc.read.format("org.apache.spark.sql.parquet").load("users.parquet")

DataFrame[name: string, favorite_color: string, favorite_numbers: array<int>]

In [9]:
sqlc.read.format("parquet").load("users.parquet")

DataFrame[name: string, favorite_color: string, favorite_numbers: array<int>]

In [10]:
sqlc.read.parquet("users.parquet")

DataFrame[name: string, favorite_color: string, favorite_numbers: array<int>]

## JSON Files

In [21]:
df_json = sqlc.read.format("json").load(examples_folder + "people.json")
df_json.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [22]:
df_json.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [23]:
json_strings = df_json.toJSON()
json_strings.collect()

[u'{"name":"Michael"}',
 u'{"age":30,"name":"Andy"}',
 u'{"age":19,"name":"Justin"}']

In [25]:
!rm -rf namesAndAges.parquet/
df_json.select("name", "age").write.format("parquet").save("namesAndAges.parquet")

In [26]:
!ls -l namesAndAges.parquet

total 1
-rw------- 1 dvgodoy dvgodoy 522 Set 25 17:09 part-r-00000-931a292f-ba66-4df2-8f10-87a8abb42289.snappy.parquet
-rw------- 1 dvgodoy dvgodoy   0 Set 25 17:09 _SUCCESS


In [28]:
!rm -rf namesAndAgesCoalesced.parquet/
df_json.select("name", "age").coalesce(1).write.format("parquet").save("namesAndAgesCoalesced.parquet")

In [29]:
!ls -l namesAndAgesCoalesced.parquet

total 1
-rw------- 1 dvgodoy dvgodoy 522 Set 25 17:10 part-r-00000-038eb1d1-1907-4923-a0ea-d60dc1401151.snappy.parquet
-rw------- 1 dvgodoy dvgodoy   0 Set 25 17:10 _SUCCESS


In [30]:
df_json.write.mode("error").parquet("namesAndAges.parquet")
# This IS supposed to throw an exception, as it is trying to write an already existent file!

AnalysisException: u'path file:/media/dvgodoy/FILES/DataScienceRetreat/SparkClass/namesAndAges.parquet already exists.;'

In [31]:
df_json.write.mode("overwrite").parquet("namesAndAges.parquet")

## SQL on Files

In [32]:
df = sqlc.sql("SELECT * FROM json.`people.json` WHERE age = 19")
df.show()

+---+------+
|age|  name|
+---+------+
| 19|Justin|
+---+------+



## MySQL

In [33]:
# Run this commands in MYSQL

# CREATE TABLE users (user_id int PRIMARY KEY, fname text, lname text);
# INSERT INTO users (user_id,  fname, lname) VALUES (1, 'john', 'smith');
# INSERT INTO users (user_id,  fname, lname) VALUES (2, 'john', 'doe');
# INSERT INTO users (user_id,  fname, lname) VALUES (3, 'john', 'smith');

In [35]:
df_mysql = sqlc.read.format("jdbc") \
                .option("url", "jdbc:mysql://localhost:3306/test") \
                .option("driver", "com.mysql.jdbc.Driver") \
                .option("dbtable", "users") \
                .option("user", "root") \
                .option("password", "").load()

In [36]:
df_mysql.show()

+-------+-----+-----+
|user_id|fname|lname|
+-------+-----+-----+
|   1744| john|  doe|
|   1745| john|smith|
|   1746| john|smith|
+-------+-----+-----+



In [37]:
df_mysql.printSchema()

root
 |-- user_id: integer (nullable = false)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)



In [38]:
df_mysql.registerTempTable("users")

In [40]:
sqlc.sql("select * from users").collect()

[Row(user_id=1744, fname=u'john', lname=u'doe'),
 Row(user_id=1745, fname=u'john', lname=u'smith'),
 Row(user_id=1746, fname=u'john', lname=u'smith')]