<img src= "/files/tables/avatar.jpg" width="100" height="100" />
 
```

Name:         3-nb-read-n-write-data

Design Phase:
    Author:   John Miner
    Date:     12-01-2020
    Purpose:  How to read and write with spark dataframes

Learning Guide:
    1 - Create local data lake
    2 - Read + Write CSV files (infer data)
    3 - Read + Write CSV files (schema defined)
    4 - Read + Write Parquet Files
    5 - Read + Write TSV files
    
```

In [0]:
%run "./n-tool-box-code"

In [0]:
#
# 1 - Create sample data lake
#

# Remove any lake left over from a previous run so the notebook can be
# re-run from the top. This stays best effort on purpose (a failed delete
# must not stop the notebook), but only ordinary errors are caught - a bare
# except would also swallow KeyboardInterrupt / SystemExit - and the reason
# is reported instead of being silently discarded.
try:
  dbutils.fs.rm("/lake", recurse=True)
except Exception as err:
  print("Skipping removal of /lake:", err)

# root
dbutils.fs.mkdirs("/lake")

# amazon data
dbutils.fs.mkdirs("/lake/bronze/amazon")

# diamond data
dbutils.fs.mkdirs("/lake/bronze/diamonds")

# power plant
dbutils.fs.mkdirs("/lake/bronze/power")

# weather
dbutils.fs.mkdirs("/lake/bronze/weather")

# list the bronze folders just created (last expression = cell output)
dbutils.fs.ls("/lake/bronze")



In [0]:
#
# 2 - Read + write csv data - no schema
#


In [0]:
# List the sample weather dataset folder; the following cells preview its
# two entries, low_temps and high_temps.
dbutils.fs.ls("/databricks-datasets/weather")


In [0]:
# Peek at the start of the low temps file (second argument caps the preview
# at 500 bytes) to check the delimiter and header before reading it.
dbutils.fs.head("/databricks-datasets/weather/low_temps", 500)


In [0]:
# Peek at the start of the high temps file (second argument caps the preview
# at 500 bytes) to check the delimiter and header before reading it.
dbutils.fs.head("/databricks-datasets/weather/high_temps", 500)


In [0]:
# Load the low temperature readings as CSV: comma separated, first row is
# the header, and column types are inferred by Spark rather than declared.
path1 = "/databricks-datasets/weather/low_temps"
df1 = spark.read.options(sep=",", header="true", inferSchema="true").csv(path1)

In [0]:
# Rename the generic "temp" column to "low_temp" so it stays distinguishable
# once this frame is joined with the high temps frame further down.
df1 = df1.withColumnRenamed("temp", "low_temp")


In [0]:
# Render the first five rows of the low temps frame to verify the read and
# the column rename.
display(df1.head(5))

date,low_temp
2015-01-01T00:00:00.000+0000,26
2015-01-02T00:00:00.000+0000,32
2015-01-03T00:00:00.000+0000,35
2015-01-04T00:00:00.000+0000,38
2015-01-05T00:00:00.000+0000,49


In [0]:
# Load the high temperature readings as CSV: comma separated, first row is
# the header, and column types are inferred by Spark rather than declared.
path2 = "/databricks-datasets/weather/high_temps"
df2 = spark.read.options(sep=",", header="true", inferSchema="true").csv(path2)

In [0]:
# Give both columns distinct names in one chain: "temp" becomes "high_temp"
# and "date" becomes "date2", so neither name collides with df1's columns
# in the join below.
df2 = (
  df2
  .withColumnRenamed("temp", "high_temp")
  .withColumnRenamed("date", "date2")
)


In [0]:
# Render the first five rows of the high temps frame to verify the read and
# the two column renames.
display(df2.head(5))

date2,high_temp
2015-01-01T00:00:00.000+0000,42
2015-01-02T00:00:00.000+0000,42
2015-01-03T00:00:00.000+0000,41
2015-01-04T00:00:00.000+0000,51
2015-01-05T00:00:00.000+0000,54


In [0]:
# Inner join the low and high temps on matching date values, then drop the
# redundant "date2" key that came from the high temps frame.
df3 = df1.join(df2, on=df1["date"] == df2["date2"], how="inner").drop("date2")

In [0]:
# Render the first five rows of the joined frame (date, low_temp, high_temp)
# to verify the join.
display(df3.head(5))


date,low_temp,high_temp
2015-01-01T00:00:00.000+0000,26,42
2015-01-02T00:00:00.000+0000,32,42
2015-01-03T00:00:00.000+0000,35,41
2015-01-04T00:00:00.000+0000,38,51
2015-01-05T00:00:00.000+0000,49,54


In [0]:
# Report how many partitions back the joined frame; the write cell below
# collapses it to a single partition with repartition(1).
df3.rdd.getNumPartitions()

In [0]:
# Save the joined temperatures as CSV into a temp folder, replacing any
# earlier output. The frame is collapsed to one partition first; no header
# option is set here, so the writer's default applies.
dst_path = "/lake/bronze/weather/temp"
df3.repartition(1).write.mode("overwrite").csv(dst_path)

In [0]:
# List what the write produced inside the temp output folder.
dbutils.fs.ls("/lake/bronze/weather/temp")

In [0]:
# Create a single file: call unwanted_file_cleanup with the temp folder, the
# target file path and the file extension.
# NOTE(review): the helper is not defined in this notebook - presumably it
# comes from the %run of n-tool-box-code; confirm its exact behavior there.
unwanted_file_cleanup("/lake/bronze/weather/temp/", "/lake/bronze/weather/temperature-data.csv", "csv")

In [0]:
# List the weather folder to check the result of the cleanup call above.
dbutils.fs.ls("/lake/bronze/weather/")

In [0]:
#
# 3 - Read + write csv data - schema defined
#


In [0]:
# References for Spark data types and the DDL string schema syntax:
#   https://spark.apache.org/docs/latest/sql-ref-datatypes.html
#   https://vincent.doba.fr/posts/20211004_spark_data_description_language_for_defining_spark_schema/

# Schema declared up front as a DDL string (no inference this time); the
# literal is only split across two lines for readability.
schema = (
  "_c0 INTEGER, carat DOUBLE, cut STRING, color STRING, clarity STRING, "
  "depth DOUBLE, table DOUBLE, price INTEGER, x DOUBLE, y DOUBLE, z DOUBLE"
)

# Source file in the Databricks sample datasets
path = "/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv"

# Read the CSV with the declared schema; the first row is the header
df = (
  spark.read
  .schema(schema)
  .option("header", "true")
  .csv(path)
)


In [0]:
# Render the first five rows of the diamonds frame to verify that the
# declared schema lines up with the file's columns.
display(df.head(5))

_c0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [0]:
# Write the diamonds data out as parquet (not csv) into a temp folder,
# replacing any earlier output; the frame is collapsed to one partition first.
dst_path = "/lake/bronze/diamonds/temp"
df.repartition(1).write.mode("overwrite").parquet(dst_path)

In [0]:
# Create a single file: call unwanted_file_cleanup with the temp folder, the
# target file path and the file extension.
# NOTE(review): the helper is not defined in this notebook - presumably it
# comes from the %run of n-tool-box-code; confirm its exact behavior there.
unwanted_file_cleanup("/lake/bronze/diamonds/temp/", "/lake/bronze/diamonds/diamonds-data.parquet", "parquet")

In [0]:
# List the diamonds folder to check the result of the cleanup call above.
dbutils.fs.ls("/lake/bronze/diamonds/")

In [0]:
#
# 4 - Read + write parquet file format
#


In [0]:
# List the entries of the Amazon test4K sample folder, then print the
# listing followed by the number of entries it contains.
lst = dbutils.fs.ls("/databricks-datasets/amazon/test4K")
print(lst, "\n\n", len(lst))


In [0]:
# Load the Amazon product data; the whole folder is read as one parquet
# data set, so no schema or parsing options are supplied.
path4 = "/databricks-datasets/amazon/test4K"
df4 = spark.read.format("parquet").load(path4)

In [0]:
# Render up to 500 rows of the Amazon product data to inspect its columns.
display(df4.head(500))

asin,brand,helpful,img,price,rating,review,time,title,user
B00014JKG0,Nature's Gate,"List(0, 0)",http://ecx.images-amazon.com/images/I/31RIumF2GIL._SY300_.jpg,10.02,5.0,"I bought this for my gf and she loves it. Unlike other lotions you don't need several coats for dry shin and it smells nice too. The oils leave her skin smooth for a whole day not just an hr or 2. I'd recommend this to anyone with any skin type, you can't go wrong with this stuff and I'll be ordering more when it runs out.",1339372800,"Nature's Gate Tea Tree Moisturizing Lotion for Irritated, Distressed Skin, 18-Ounce Pump Bottle",A1E57U9XDQCSKC
B000L596FE,Luxor Pro,"List(0, 0)",http://ecx.images-amazon.com/images/I/41xj70ovtmL._SY300_.jpg,8.97,5.0,My 5 yr old wakes up with terribly knotted hair & this brush glides right through it. It truly is some kind of magic! I'd love to give these brushes out as party favors for her birthday :-),1388620800,"The Wet Brush Detangling Shower Brush, Colors Vary",AO95JTORR64XZ
B008FXKOI2,Greenies,"List(2, 2)",http://ecx.images-amazon.com/images/I/51ocmMt8tfL._SY300_.jpg,4.72,4.0,"A healthier choice than the Friskies line for kitty crunchies. My vet uses this brand, which is why I switched.I alternate between the chicken skin & fur and the salmon hairball control, depending on what kind of dry food currently in use. These smartbites are easier for my cat to chew than the dental ones, though I will use the dental every other month or so.Only negative: the volume seems less than from a year ago on the smartbites, but that's in the brick & mortar stores, also.",1385856000,"Greenies Smartbites Cat Treats, 2.1-Ounce",AKLVHFSQRFQL7
B000UJW676,Nelson,"List(2, 2)",http://ecx.images-amazon.com/images/I/51M2elTqCpL._SY300_.jpg,6.39,1.0,"Item fell short of expectations, it did not fit my 3/4"" hose as it stated it would and that is why I purchased it",1376092800,Nelson Brass/Metal Hose Repair Clamp Connector Female 50521,A13KOZOGKOF5R4
B004TK0IG8,Stanley,"List(0, 0)",http://ecx.images-amazon.com/images/I/41OpTlmDsKL._SX300_.jpg,99.99,4.0,"It took much work to flatten the backs of the larger (1"" and 3/4"") chisels. The 1"" chisel has a major grinding flaw near the socket, but not worth returning, since I only needed to flatten the first inch or so to get a good polish. These stay sharp longer than my Irwin chisels, but no where near as long as my Veritas chisels, but the price was right for the set. Once flattened and honed, they are a pleasure to use. These chisels are well balanced easy to hold with just the fingers.I can't compare them to L.N. since I don't own any. Great chisels for a serious hobbyist.",1355443200,Stanley 16-791 Sweetheart 750 Series Socket Chisel 4 Piece Set,A19RQK364II90W
B00D12OBEU,MSI,"List(1, 1)",http://ecx.images-amazon.com/images/I/51X0FrnHE6L._SX300_.jpg,150.23,2.0,"I have one sata HDD. I bought an SSD and my motherboard sometimes doesn't recognize it, only the first HDD. Today I bought one optical drive and I've gotten the same problem.In brief, this motherboard does not work good with two sata devices.If I would had known this I would have bought an asus rampagePD: I updated bios and nothing.",1393632000,MSI Computer Corp. Motherboard ATX DDR3 1333 LGA 1150 Motherboards Z87-G45 GAMING,A30XSY2ZUDF9S9
B00121XRFQ,Maximatic,"List(6, 8)",http://ecx.images-amazon.com/images/I/417lsJA%2BknL._SX300_.jpg,68.88,1.0,After my TSM grinder broke because of a plastic gear housing that was not repairable I bought this- it is not in the same league and I would not recommend it for anything. The stainless turned to dull gray after several washings- it stains your hands and gets on everything including the meat. I ground 5# of pork and beef today and it took 2x as long as the TSM #8. There are no easily available accessories or replacement parts like plates or knives. Pay $30 more and get something better.,1297468800,"MaxiMatic HA-3433A Elite Platinum 550-Watt Stainless Finish Meat Grinder, Gray",A2U7961Z040GLF
B000CCYPAM,Hanes,"List(0, 0)",http://ecx.images-amazon.com/images/I/51NWQdoWeYL._SX342_.jpg,6.52,5.0,loved them...... great for the price and they are not cheap and deserved it. maybe will buy more a a,1360627200,Hanes Men's No Show White with Grey Heel and Toe Sock,A16YXYSIIS6ZBI
B0087T38AK,SunTime,"List(1, 1)",http://ecx.images-amazon.com/images/I/412hflQzHjL._SY300_.jpg,85.95,5.0,This watch was a gift. The watch is extremely nice and looks great. Great company and high quality product. True description of the watch.,1361577600,Georgia Bulldogs Logo Gunmetal Black FantomSportTM Watch,A3DHC1MI6FG0T7
B0006IOTJO,Empowered Products,"List(1, 2)",http://ecx.images-amazon.com/images/I/41wRw5F-XeL._SY300_.jpg,7.6,4.0,I suffer from post partum/nursing dryness and this has worked better than what we were able to find locally. It does build up heat though-we find adding water helps (using a spray bottle).,1220400000,Gun Oil Silicone Lubricant For Men - 4oz,A3B7RERZSD9H3F


In [0]:
# Reduce columns: keep only asin, brand, price and rating in df5; the review
# text and the remaining fields of df4 are left out.
df5 = df4.select("asin", "brand", "price", "rating")

In [0]:
# Write the reduced product data out as parquet (not csv) into a temp folder,
# replacing any earlier output; the frame is collapsed to one partition first.
dst_path = "/lake/bronze/amazon/temp"
df5.repartition(1).write.mode("overwrite").parquet(dst_path)

In [0]:
# Create a single file: call unwanted_file_cleanup with the temp folder, the
# target file path and the file extension.
# NOTE(review): the helper is not defined in this notebook - presumably it
# comes from the %run of n-tool-box-code; confirm its exact behavior there.
unwanted_file_cleanup("/lake/bronze/amazon/temp/", "/lake/bronze/amazon/product-data.parquet", "parquet")

In [0]:
# List the amazon folder to check the result of the cleanup call above.
dbutils.fs.ls("/lake/bronze/amazon/")

In [0]:
#
# 5 - Read + write tsv file format
#


In [0]:

# Every column of the power plant data is declared as FLOAT via a DDL string
src_schema = "AT FLOAT, V FLOAT, AP FLOAT, RH FLOAT, PE FLOAT"

# Wildcard picks up every .tsv file in the source folder
src_path = "/databricks-datasets/power-plant/data/*.tsv"

# TSV is read with the CSV reader: tab delimiter, header row present and the
# quote character set to an empty string. The result is collapsed to a
# single partition.
df6 = (
  spark.read
  .schema(src_schema)
  .options(header="true", delimiter="\t", quote="")
  .csv(src_path)
  .repartition(1)
)
display(df6)

AT,V,AP,RH,PE
14.96,41.76,1024.07,73.17,463.26
25.18,62.96,1020.04,59.08,444.37
5.11,39.4,1012.16,92.14,488.56
20.86,57.32,1010.24,76.64,446.48
10.82,37.5,1009.23,96.62,473.9
26.27,59.44,1012.23,58.77,443.67
15.89,43.96,1014.02,75.24,467.35
9.48,44.71,1019.12,66.43,478.42
14.64,45.0,1021.78,41.25,475.98
11.74,43.56,1015.14,70.72,477.5


In [0]:
# Write the power plant data out as parquet (not csv) into a temp folder,
# replacing any earlier output; the frame is collapsed to one partition first.
path = "/lake/bronze/power/temp"
df6.repartition(1).write.mode("overwrite").parquet(path)

In [0]:
# Create a single file: call unwanted_file_cleanup with the temp folder, the
# target file path and the file extension.
# NOTE(review): the helper is not defined in this notebook - presumably it
# comes from the %run of n-tool-box-code; confirm its exact behavior there.
unwanted_file_cleanup("/lake/bronze/power/temp/", "/lake/bronze/power/plant-data.parquet", "parquet")

In [0]:
# List the power folder to check the result of the cleanup call above.
dbutils.fs.ls("/lake/bronze/power/")

In [0]:
#
#  Supported file formats
#

# https://spark.apache.org/docs/latest/sql-data-sources.html

# AVRO
# CSV
# JSON
# PARQUET
# TEXT
# JDBC
