-sandbox
<img src= "/files/tables/avatar.jpg" width="100" height="100" />
 
```

Name:         01-read-n-write-files

Design Phase:
    Author:   John Miner
    Date:     12-15-2020
    Purpose:  Reading and writing files.
    
Assumptions:
    None
    
Algorithm:
    1 - Show sample dataset directory.
    2 - Show power plant dataset directory.
    3 - Show content of the 1st tsv file.
    4 - Display the readme file for the power plant sample.
    5 - Read all 4 tsv files into a dataframe.
    6 - Make new directory.
    7 - Display record count.
    8 - Write dataframe to new directory.
    9 - Display output results.
    
    
```

In [0]:
#
# 1 - Sample data sets - all
#


In [0]:
%fs
ls /databricks-datasets

path,name,size
dbfs:/databricks-datasets/COVID/,COVID/,0
dbfs:/databricks-datasets/README.md,README.md,976
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359
dbfs:/databricks-datasets/adult/,adult/,0
dbfs:/databricks-datasets/airlines/,airlines/,0
dbfs:/databricks-datasets/amazon/,amazon/,0
dbfs:/databricks-datasets/asa/,asa/,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0


In [0]:
%sh

# Same listing as the %fs cell, but run as a shell command against the /dbfs path.
ls /dbfs/databricks-datasets
# Alternative kept for reference: list the top level of /dbfs instead.
# ls /dbfs/

In [0]:
#
# 2 - Sample data sets - power plant
#

In [0]:
%sh
# List the contents of the power-plant sample's data directory.
ls /dbfs/databricks-datasets/power-plant/data

In [0]:
#
# 3 - Print the content of the 1st tsv file
#

In [0]:
%sh
# Print the whole of Sheet1.tsv to the cell output.
# Uses the same %sh shell magic as the other shell cells in this notebook.
cat /dbfs/databricks-datasets/power-plant/data/Sheet1.tsv

In [0]:
#
# 4 - View power plant readme
#

# Read the readme through the /dbfs path in one call and print it as-is.
path = "/dbfs/databricks-datasets/power-plant/README.md"
with open(path) as readme:
  lines = readme.read()
print(lines)

In [0]:
#
# - Dataset information from UCI
#

# Attribute notes for the power plant data, kept as a string for reference.
# NOTE(review): the header displayed for the loaded dataframe is AT, V, AP, RH, PE,
# so the "T" and "EP" labels below presumably correspond to AT and PE - confirm.
info = """

Attribute Information:

Features consist of hourly average ambient variables
- Temperature (T) in the range 1.81°C to 37.11°C,
- Ambient Pressure (AP) in the range 992.89-1033.30 millibar,
- Relative Humidity (RH) in the range 25.56% to 100.16%
- Exhaust Vacuum (V) in the range 25.36-81.56 cm Hg
- Net hourly electrical energy output (EP) 420.26-495.76 MW

The averages are taken from various sensors located around the plant that record the ambient variables every second.
The variables are given without normalization.

"""


In [0]:
#
# 5 - Read 4 files into one dataframe
#

# The glob picks up every .tsv file in the power-plant data directory.
src_path = "/databricks-datasets/power-plant/data/*.tsv"

# CSV reader settings: first row is the header, fields are tab separated,
# the quote option is set to the empty string, column types are inferred.
csv_options = {
  "header": "true",
  "delimiter": "\t",
  "quote": "",
  "inferSchema": "true",
}

plant_reader = spark.read.format("csv").options(**csv_options)

# Load the matching files and redistribute the rows over 2 partitions.
df_plant = plant_reader.load(src_path).repartition(2)
display(df_plant)

AT,V,AP,RH,PE
22.92,61.9,1013.27,78.32,446.08
20.07,51.3,1011.9,94.62,450.46
18.63,61.27,1019.69,73.67,454.03
21.22,59.15,1014.68,59.71,450.96
24.03,58.46,1015.71,70.15,442.03
28.4,75.23,1011.12,55.01,441.78
12.27,41.17,1019.39,52.18,473.84
11.42,40.43,1025.5,75.85,490.02
23.02,69.94,1007.94,64.66,439.4
20.11,51.19,1007.82,92.06,449.03


In [0]:
#
# 6 - Make new directories
#

lake_root = "/rissug"

# Start clean: recursively remove any existing copy of the simulated data lake.
dbutils.fs.rm(lake_root, recurse=True)

# Recreate the root, then one folder per data lake quality level.
lake_dirs = [lake_root] + [lake_root + "/" + zone for zone in ("bronze", "silver", "gold")]
for lake_dir in lake_dirs:
  made = dbutils.fs.mkdirs(lake_dir)

# The result of the final mkdirs call is the value of the cell.
made


In [0]:
#
# Remove from bundle
#
# NOTE(review): the heading above appears to mark this as a scratch cell - confirm
# whether it should ship with the notebook.

# Show what is currently stored in the bronze folder.
bronze_dir = "/rissug/bronze"
dbutils.fs.ls(bronze_dir)

# Optional clean-up of the power-plant output, left disabled:
# dbutils.fs.rm("/rissug/bronze/power-plant", recurse=True)

In [0]:
#
# 7 - Show record count
#

# Count the rows of the combined dataframe, then report the total.
record_count = df_plant.count()
print("Total number of data points in set is {}.".format(record_count))


In [0]:
#
# 8 - Write file to new directory
#

# Target folder for the power-plant data under the bronze level.
dst_path = "/rissug/bronze/power-plant/"

# Save the dataframe as parquet, replacing whatever is already at the target.
plant_writer = df_plant.write.mode("overwrite")
plant_writer.parquet(dst_path)

In [0]:
#
# 9 - List directory contents
#

# To inspect the whole sample datalake instead, list its root:
# dbutils.fs.ls("/rissug")

# Files written for the current dataset.
written_dir = "/rissug/bronze/power-plant/"
dbutils.fs.ls(written_dir)

In [0]:
#
#  Supported file formats
#

# https://spark.apache.org/docs/latest/sql-data-sources.html

# AVRO
# CSV
# JSON
# PARQUET
# TEXT
# JDBC
