In [2]:
#******************************************************
#*
#* Name:         nb-06-make-apache-files
#*     
#* Design Phase:
#*     Author:   John Miner
#*     Date:     12-04-2024
#*     Purpose:  Convert the stock parquet file into single
#*               avro and orc files, then compare file sizes.
#* 
#******************************************************/


StatementMeta(, 9d390e71-1418-45b2-bb14-7326821c8d3b, 4, Finished, Available, Finished)

In [3]:
#
#  1 - Define function to find matching files
# 

# import libraries
import fnmatch

# define function
def get_file_list(path_txt, pattern_txt):
  """Return the entry names under a folder that match a wildcard pattern.

  Args:
    path_txt: Folder path handed to `mssparkutils.fs.ls`.
    pattern_txt: Shell-style pattern (as understood by `fnmatch`),
      for example "part*.avro".

  Returns:
    A list of matching names (names only, not full paths), in the
    order the listing returned them. Empty when nothing matches.
  """

  # collect just the name of every entry in the listing
  entry_names = [entry.name for entry in mssparkutils.fs.ls(path_txt)]

  # keep the names that match the pattern
  return fnmatch.filter(entry_names, pattern_txt)

StatementMeta(, 9d390e71-1418-45b2-bb14-7326821c8d3b, 5, Finished, Available, Finished)

In [4]:
# 
#  2 - Keep only the single delimited file
#

# Define function
def unwanted_file_cleanup(folder_name, file_name, file_ext):
  """Replace an output folder with the single "part*" file inside it.

  Finds the first file in `folder_name` whose name matches
  "part*.<file_ext>", copies it to `file_name`, and then deletes
  `folder_name` recursively.

  Args:
    folder_name: Folder that holds the part file; removed on success.
    file_name: Destination path for the single file. Any existing file
      at this path is removed first.
    file_ext: Extension (without the dot) of the part file to keep.

  Returns:
    True when the file was copied and the folder was removed.

  Raises:
    FileNotFoundError: No "part*.<file_ext>" file exists in `folder_name`.
    Exception: Whatever `mssparkutils.fs.cp` / `mssparkutils.fs.rm` raise
      when the copy or the final folder removal fails. In that case the
      source folder has NOT been deleted by this function.
  """

  # define tmp dir
  tmp_dir = folder_name

  # find new file; fail with a clear message instead of an IndexError
  tmp_lst = get_file_list(tmp_dir, "part*." + file_ext)
  if not tmp_lst:
    raise FileNotFoundError(
      "no file matching 'part*." + file_ext + "' found in " + tmp_dir
    )
  tmpfile_txt = tmp_dir + "/" + tmp_lst[0]

  # remove old file - best effort, presumably because the target may not
  # exist yet on a first run; a real problem will surface in the copy below
  try:
    mssparkutils.fs.rm(file_name, recurse=False)
  except Exception:
    pass

  # copy new file - a failure must propagate: swallowing it here would let
  # the next step delete the only copy of the data
  mssparkutils.fs.cp(tmpfile_txt, file_name)

  # remove tmp dir, clean up step (only reached after a successful copy)
  mssparkutils.fs.rm(tmp_dir, recurse=True)

  # success
  return True

StatementMeta(, 9d390e71-1418-45b2-bb14-7326821c8d3b, 6, Finished, Available, Finished)

In [5]:
#
#  3 - read in parquet, write out avro
# 

# import only the column functions this cell uses; the wildcard form drops
# every public name of pyspark.sql.functions into the notebook namespace
from pyspark.sql.functions import col, month, year

# define paths: parquet input, avro output folder
path1 = "Files/Stocks/all_stock_data.parquet"
path2 = "Files/Stocks/Avro"

# read in parquet file and shape it in one chain
df1 = (
  spark.read.parquet(path1)
    # rename bad column (its name contains a space)
    .withColumnRenamed('Stock Splits', 'StockSplits')
    # add calc column 1
    .withColumn("Year", year(col("Date")))
    # add calc column 2
    .withColumn("Month", month(col("Date")))
)

# write folder/file - one partition, so the folder holds a single part file
df1.repartition(1).write.format("avro").mode("overwrite").save(path2)



StatementMeta(, 9d390e71-1418-45b2-bb14-7326821c8d3b, 7, Finished, Available, Finished)

In [6]:
#
#  4 - create file instead of directory
#

# keep the single part file from the Avro folder as all_stock_data.avro
unwanted_file_cleanup(
    folder_name="Files/Stocks/Avro",
    file_name="Files/Stocks/all_stock_data.avro",
    file_ext="avro",
)


StatementMeta(, 9d390e71-1418-45b2-bb14-7326821c8d3b, 8, Finished, Available, Finished)

True

In [7]:
#
#  5 - read in parquet, write out orc
# 

# import only the column functions this cell uses; the wildcard form drops
# every public name of pyspark.sql.functions into the notebook namespace
from pyspark.sql.functions import col, month, year

# define paths: parquet input, orc output folder
path1 = "Files/Stocks/all_stock_data.parquet"
path2 = "Files/Stocks/Orc"

# read in parquet file and shape it in one chain
df1 = (
  spark.read.parquet(path1)
    # rename bad column (its name contains a space)
    .withColumnRenamed('Stock Splits', 'StockSplits')
    # add calc column 1
    .withColumn("Year", year(col("Date")))
    # add calc column 2
    .withColumn("Month", month(col("Date")))
)

# write folder/file - one partition, so the folder holds a single part file
df1.repartition(1).write.format("orc").mode("overwrite").save(path2)


StatementMeta(, 9d390e71-1418-45b2-bb14-7326821c8d3b, 9, Finished, Available, Finished)

In [8]:
#
#  6 - create file instead of directory
#

# keep the single part file from the Orc folder as all_stock_data.orc
unwanted_file_cleanup(
    folder_name="Files/Stocks/Orc",
    file_name="Files/Stocks/all_stock_data.orc",
    file_ext="orc",
)

StatementMeta(, 9d390e71-1418-45b2-bb14-7326821c8d3b, 10, Finished, Available, Finished)

True

In [2]:
#
#  7 - review file sizes
#

# list every entry directly under the Stocks folder
files = mssparkutils.fs.ls("Files/Stocks")
for file in files:
    # size is divided by 10^9 and printed without a unit;
    # presumably size is in bytes, making this decimal gigabytes - TODO confirm
    print(f"{file.name} - {file.size / 1000000000}")


StatementMeta(, ac04551b-4cc8-4198-a8f2-0ca154560563, 4, Finished, Available, Finished)

all_stock_data.avro - 1.247383213
all_stock_data.csv - 3.512208588
all_stock_data.orc - 1.096080995
all_stock_data.parquet - 0.962269317
archive.zip - 1.918054636
