In [1]:
# Echo the SparkContext so its repr is rendered as this cell's output.
# NOTE(review): `sc` is not created anywhere in this notebook; presumably the
# PySpark kernel/shell injects it at startup. Confirm before running elsewhere.
sc

<pyspark.context.SparkContext at 0x7fecea20a5c0>

In [2]:
!rm ./metastore_db/*.lck

rm: cannot remove './metastore_db/*.lck': No such file or directory


In [3]:
from pyspark.sql import SQLContext

# Wrap the existing SparkContext in a SQLContext; every DataFrame read in the
# cells below goes through `sqlc.read`.
sqlc = SQLContext(sparkContext=sc)

## CSV

In [4]:
!wget https://github.com/databricks/spark-csv/raw/master/src/test/resources/cars.csv

--2017-02-02 16:47:14--  https://github.com/databricks/spark-csv/raw/master/src/test/resources/cars.csv
Resolving github.com (github.com)... 192.30.253.112, 192.30.253.113
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/databricks/spark-csv/master/src/test/resources/cars.csv [following]
--2017-02-02 16:47:15--  https://raw.githubusercontent.com/databricks/spark-csv/master/src/test/resources/cars.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 134 [text/plain]
Saving to: ‘cars.csv’


2017-02-02 16:47:15 (29.1 MB/s) - ‘cars.csv’ saved [134/134]



In [5]:
# Load cars.csv into a DataFrame. The first line supplies the column names
# (header) and column types are inferred from the data (inferSchema).
df_cars = (
    sqlc.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferSchema="true")
    .load("cars.csv")
)

In [6]:
# Print the rows of df_cars as a text table to eyeball the parsed CSV.
df_cars.show()

+----+-----+-----+--------------------+-----+
|year| make|model|             comment|blank|
+----+-----+-----+--------------------+-----+
|2012|Tesla|    S|          No comment|     |
|1997| Ford| E350|Go get one now th...|     |
|2015|Chevy| Volt|                null| null|
+----+-----+-----+--------------------+-----+



In [7]:
# Print the column names and types of df_cars (the schema produced by inferSchema).
df_cars.printSchema()

root
 |-- year: integer (nullable = true)
 |-- make: string (nullable = true)
 |-- model: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- blank: string (nullable = true)



In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for cars.csv: every column, including `year`, is declared as
# a nullable string.
customSchema = StructType([
    StructField(_column_name, StringType(), True)
    for _column_name in ("year", "make", "model", "comment", "blank")
])

In [9]:
# Load cars.csv again, this time applying customSchema instead of inferring
# the column types.
df_cars2 = (
    sqlc.read
    .format("com.databricks.spark.csv")
    .schema(customSchema)
    .option("header", True)
    .load("cars.csv")
)

In [10]:
# Print the schema of df_cars2 to check that the explicitly supplied schema was applied.
df_cars2.printSchema()

root
 |-- year: string (nullable = true)
 |-- make: string (nullable = true)
 |-- model: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- blank: string (nullable = true)



In [11]:
!rm -rf newcars.csv

selectedData = df_cars.select("year", "model","comment")
selectedData.coalesce(1).write.format("com.databricks.spark.csv") \
                        .option("header", "true") \
                        .option("nullValue","NA") \
                        .save("newcars.csv") \

In [15]:
# List the output path: newcars.csv is a directory containing the part file(s)
# plus a _SUCCESS marker, not a single CSV file.
!ls -l newcars.csv

total 4
-rw-r--r-- 1 dsi-student dsi-student 95 Feb  2 16:47 part-r-00000-3c303ffa-76ac-453a-a731-a0ec829de174.csv
-rw-r--r-- 1 dsi-student dsi-student  0 Feb  2 16:47 _SUCCESS


In [16]:
!rm -rf newcars.csv.gz
selectedData.write.format("com.databricks.spark.csv") \
                    .option("header", "true") \
                    .option("codec", "gzip") \
                    .save("newcars.csv.gz")

In [17]:
# List the compressed output; like the plain CSV output, newcars.csv.gz is a
# directory holding the .csv.gz part file(s) and a _SUCCESS marker.
!ls -l newcars.csv.gz

total 4
-rw-r--r-- 1 dsi-student dsi-student 104 Feb  2 16:50 part-r-00000-61ce5cad-fe59-45e3-9543-7a406af8e3e0.csv.gz
-rw-r--r-- 1 dsi-student dsi-student   0 Feb  2 16:50 _SUCCESS


## XML

In [18]:
!wget https://github.com/databricks/spark-xml/raw/master/src/test/resources/books.xml

--2017-02-02 16:51:52--  https://github.com/databricks/spark-xml/raw/master/src/test/resources/books.xml
Resolving github.com (github.com)... 192.30.253.113, 192.30.253.112
Connecting to github.com (github.com)|192.30.253.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/databricks/spark-xml/master/src/test/resources/books.xml [following]
--2017-02-02 16:51:53--  https://raw.githubusercontent.com/databricks/spark-xml/master/src/test/resources/books.xml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5542 (5.4K) [text/plain]
Saving to: ‘books.xml’


2017-02-02 16:51:53 (2.42 MB/s) - ‘books.xml’ saved [5542/5542]



In [19]:
!cat books.xml

<?xml version="1.0"?>
<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price>44.95</price>
      <publish_date>2000-10-01</publish_date>
      <description>


         An in-depth look at creating applications
         with XML.This manual describes Oracle XML DB, and how you can use it to store, generate, manipulate, manage,
         and query XML data in the database.


         After introducing you to the heart of Oracle XML DB, namely the XMLType framework and Oracle XML DB repository,
         the manual provides a brief introduction to design criteria to consider when planning your Oracle XML DB
         application. It provides examples of how and where you can use Oracle XML DB.


         The manual then describes ways you can store and retrieve XML data using Oracle XML DB, APIs for manipulating
         XMLType data, and ways you can view, generate

In [20]:
# Parse books.xml into a DataFrame, with each <book> element becoming one row.
df_books = (
    sqlc.read
    .format("com.databricks.spark.xml")
    .options(rowTag="book")
    .load("books.xml")
)

In [21]:
# Print the column names and types the XML reader produced for df_books.
df_books.printSchema()

root
 |-- _id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- price: double (nullable = true)
 |-- publish_date: string (nullable = true)
 |-- title: string (nullable = true)



In [22]:
# Print the rows of df_books as a text table.
# NOTE: the first description value contains line breaks, which is why the
# rendered table below is broken across several lines for bk101.
df_books.show()

+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|  _id|              author|         description|          genre|price|publish_date|               title|
+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|bk101|Gambardella, Matthew|


         An in...|       Computer|44.95|  2000-10-01|XML Developer's G...|
|bk102|          Ralls, Kim|A former architec...|        Fantasy| 5.95|  2000-12-16|       Midnight Rain|
|bk103|         Corets, Eva|After the collaps...|        Fantasy| 5.95|  2000-11-17|     Maeve Ascendant|
|bk104|         Corets, Eva|In post-apocalyps...|        Fantasy| 5.95|  2001-03-10|     Oberon's Legacy|
|bk105|         Corets, Eva|The two daughters...|        Fantasy| 5.95|  2001-09-10|  The Sundered Grail|
|bk106|    Randall, Cynthia|When Carla meets ...|        Romance| 4.95|  2000-09-02|         Lover Birds|
|bk107|      Thurman, Paula|A deep sea diver .

In [23]:
!rm -rf newbooks.xml

selectedData = df_books.select("author", "_id")
selectedData.write.format("com.databricks.spark.xml") \
                .option("rootTag", "books") \
                .option("rowTag", "book") \
                .save("newbooks.xml")

In [24]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Explicit schema for books.xml: all columns are nullable strings except
# `price`, which is a nullable double.
# Note: this rebinds `customSchema`, which earlier held the cars schema.
customSchema = StructType([
    StructField(_field_name, _field_type, nullable=True)
    for _field_name, _field_type in (
        ("_id", StringType()),
        ("author", StringType()),
        ("description", StringType()),
        ("genre", StringType()),
        ("price", DoubleType()),
        ("publish_date", StringType()),
        ("title", StringType()),
    )
])

In [25]:
# Re-read books.xml, this time supplying customSchema explicitly instead of
# relying on the reader's inferred schema.
df_books = (
    sqlc.read
    .format("com.databricks.spark.xml")
    .option("rowTag", "book")
    .schema(customSchema)
    .load("books.xml")
)

# Write author/_id back out as XML; the "overwrite" save mode is requested, so
# no prior `rm -rf` of newbooks.xml is issued in this cell.
selectedData = df_books.select("author", "_id")
(
    selectedData.write
    .format("com.databricks.spark.xml")
    .options(rootTag="books", rowTag="book")
    .mode("overwrite")
    .save("newbooks.xml")
)