In [1]:
# findspark.init() is called here, before any pyspark import in this notebook.
# NOTE(review): it is expected to locate the local Spark installation and put
# pyspark on sys.path -- confirm SPARK_HOME is set in this environment.
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Spark configuration: "local" master, application name "FileFormats".
# Extra properties can be added with config.set("property", "value").
config = SparkConf().setMaster("local").setAppName("FileFormats")

# SparkSession is the entry point for Spark SQL and the DataFrame API;
# getOrCreate() reuses an already running session if there is one.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

# SparkContext backing the session.
sc = spark.sparkContext

22/03/04 21:01:56 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.80.128 instead (on interface ens33)
22/03/04 21:01:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/04 21:01:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Load the raw CSV datasets from HDFS.
# header=True      -> the first line of each file supplies the column names.
# inferSchema=True -> Spark makes an extra pass over the data to guess column
#                     types, which slows loading down on large inputs.
movieDf = spark.read.csv(
    "hdfs://localhost:9000/movies",
    header=True,
    inferSchema=True,
)

ratingDf = spark.read.csv(
    "hdfs://localhost:9000/ratings",
    header=True,
    inferSchema=True,
)

                                                                                

In [9]:
# Convert both DataFrames to JSON on HDFS.
# mode="overwrite" replaces any existing output directory, so this cell can be re-run.
movieDf.write.json("hdfs://localhost:9000/movies-json", mode="overwrite")

ratingDf.write.json("hdfs://localhost:9000/ratings-json", mode="overwrite")

In [16]:
# Read the JSON datasets back into DataFrames.
# The JSON source always infers its schema by scanning the data. 'inferSchema'
# is a CSV reader option that the JSON reader does not recognise, so it is not
# passed here (setting it had no effect).
# As the printed schemas show, the inferred columns come back in alphabetical
# order and whole numbers are typed as long rather than integer.
movieJsonDf = spark.read.format("json")\
                    .load("hdfs://localhost:9000/movies-json")

movieJsonDf.printSchema()
movieJsonDf.show(5)  # first 5 rows only

ratingJsonDf = spark.read.format("json")\
                    .load("hdfs://localhost:9000/ratings-json")

ratingJsonDf.printSchema()
ratingJsonDf.show(5)  # first 5 rows only

root
 |-- genres: string (nullable = true)
 |-- movieId: long (nullable = true)
 |-- title: string (nullable = true)

+--------------------+-------+--------------------+
|              genres|movieId|               title|
+--------------------+-------+--------------------+
|Adventure|Animati...|      1|    Toy Story (1995)|
|Adventure|Childre...|      2|      Jumanji (1995)|
|      Comedy|Romance|      3|Grumpier Old Men ...|
|Comedy|Drama|Romance|      4|Waiting to Exhale...|
|              Comedy|      5|Father of the Bri...|
+--------------------+-------+--------------------+
only showing top 5 rows

root
 |-- movieId: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- userId: long (nullable = true)

+-------+------+---------+------+
|movieId|rating|timestamp|userId|
+-------+------+---------+------+
|      1|   4.0|964982703|     1|
|      3|   4.0|964981247|     1|
|      6|   4.0|964982224|     1|
|     47|   5.0|964983815|   

In [10]:
# Parquet
# Persist both DataFrames in Parquet format on HDFS.
# mode="overwrite" replaces any existing output directory, so this cell can be re-run.
movieDf.write.parquet("hdfs://localhost:9000/movies-parquet", mode="overwrite")

ratingDf.write.parquet("hdfs://localhost:9000/ratings-parquet", mode="overwrite")

In [14]:
# Read the Parquet datasets back into DataFrames.
# No inferSchema option is needed: Parquet files store their own schema, so the
# column order and integer types written above are returned unchanged.
movieParquetDf = spark.read.parquet("hdfs://localhost:9000/movies-parquet")

movieParquetDf.printSchema()
movieParquetDf.show(5)  # first 5 rows only

ratingParquetDf = spark.read.parquet("hdfs://localhost:9000/ratings-parquet")

ratingParquetDf.printSchema()
ratingParquetDf.show(5)  # first 5 rows only

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47| 

In [12]:
# ORC: an optimized columnar format (https://orc.apache.org)
# Persist both DataFrames in ORC format on HDFS.
# mode="overwrite" replaces any existing output directory, so this cell can be re-run.
movieDf.write.orc("hdfs://localhost:9000/movies-orc", mode="overwrite")

ratingDf.write.orc("hdfs://localhost:9000/ratings-orc", mode="overwrite")

                                                                                

In [15]:
# Read the ORC datasets back into DataFrames.
# No inferSchema option is needed: ORC files store their own schema, so the
# column order and integer types written above are returned unchanged.
movieOrcDf = spark.read.orc("hdfs://localhost:9000/movies-orc")

movieOrcDf.printSchema()
movieOrcDf.show(5)  # first 5 rows only

ratingOrcDf = spark.read.orc("hdfs://localhost:9000/ratings-orc")

ratingOrcDf.printSchema()
ratingOrcDf.show(5)  # first 5 rows only

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47| 