In [None]:
# 1) Fetches the SharePoint service-account credentials (an account with full read/write access to the site).
#    Both values are stored as secrets in Azure Key Vault and read through the Databricks secret scope below.

secret_scope = 'Key-Vault-Secret'
sharepoint_usr = dbutils.secrets.get(scope=secret_scope, key='SharePoint-Acc')
sharepoint_pwd = dbutils.secrets.get(scope=secret_scope, key='SharePoint-PWD')

In [None]:
# 2) Connects to the SharePoint site, downloads the csv file, reads it into a pandas dataframe and converts that to a Spark dataframe

# The sharepy library must be installed separately on the cluster used
# The json and pandas libraries are available on the cluster by default, so they do not need to be installed separately
import sharepy
import json
import pandas as pd

# Authenticate against the SharePoint site with the service-account credentials
# fetched from the key vault in step 1
s = sharepy.connect('<enter_sharepoint_site_url>',username=sharepoint_usr, password=sharepoint_pwd)

file_url = '<enter_csv_filepath_in_sharepoint_site>'

# Local file name (relative to the current working directory) the download is saved under
filename = 'Customers.csv'

# getfile saves the remote file to `filename` and returns the HTTP response
# (a requests response object per the sharepy documentation).
# Fail fast on an error status so an error page is never parsed as CSV below.
r = s.getfile(file_url, filename=filename)
r.raise_for_status()

# Read every column as text and keep empty cells as empty strings.
# Letting pandas infer the types and casting with astype(str) afterwards turns
# empty cells into the literal string 'nan' and rewrites integer-looking columns
# that contain a blank as floats (e.g. POSTALCODE 44000 -> '44000.0').
customers_pdf = pd.read_csv(filename, dtype=str, keep_default_na=False)

# Spark dataframe consumed by the later cells (display and parquet write)
df = spark.createDataFrame(customers_pdf)

In [None]:
# 3) Renders the Spark dataframe built in step 2 as the cell output, as a quick visual check of the load

display(df)

CUSTOMERNAME,PHONE,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY
Atelier graphique,40.32.2555,"54, rue Royale",,Nantes,,44000,France
Mini Caravy,88.60.1555,"24, place Kluber",,Strasbourg,,67000,France
Online Diecast Creations Co.,6035558647,2304 Long Airport Avenue,,Nashua,NH,62005,USA
Signal Gift Stores,7025551838,8489 Strong St.,,Las Vegas,NV,83030,USA
"Vida Sport, Ltd",0897-034555,Grenzacherweg 237,,Gensve,,1203,Switzerland
"Toms Spezialitten, Ltd",0221-5554327,Mehrheimerstr. 369,,Koln,,50739,Germany
Marseille Mini Autos,91.24.4555,"12, rue des Bouchers",,Marseille,,13008,France
Signal Collectibles Ltd.,4155554312,2793 Furth Circle,,Brisbane,CA,94217,USA
"Australian Gift Network, Co",61-7-3844-6555,31 Duncan St. West End,,South Brisbane,Queensland,4101,Australia
Mini Wheels Co.,6505555787,5557 North Pendale Street,,San Francisco,CA,,USA


In [None]:
# 4) Exposes a text widget named 'path' so the data lake folder for the parquet output
#    can be supplied as a notebook parameter; the value is read back and echoed.

# e.g. path - dimension/customer
widget_name = 'path'
dbutils.widgets.text(widget_name, '')  # empty default value
path = dbutils.widgets.get(widget_name)
print(path)




In [None]:
# 5) Writes the dataframe in parquet format to the data lake

# Storage account and container (file system) that make up the data lake target;
# kept in one place so the config key and the abfss URI cannot drift apart
storage_account = 'heshtestdwhstorage'
container = 'data-lake'
account_host = storage_account + '.dfs.core.windows.net'

# Access key used to connect to the data lake
# This value is saved in the Azure key vault
str_key = dbutils.secrets.get(scope = 'Key-Vault-Secret', key = 'Data-Lake')

spark.conf.set('fs.azure.account.key.' + account_host, str_key)

# Normalise the widget value: surrounding whitespace and slashes would otherwise
# end up as '//' (or a trailing '/') in the target URI.
relative_path = path.strip().strip('/')

# The widget defaults to '', which would make the 'overwrite' write below target
# the root of the container. Refuse to run instead of replacing its contents.
if not relative_path:
    raise ValueError(
        "Widget 'path' is empty: pass the target folder (e.g. dimension/customer) "
        "so the overwrite does not target the container root"
    )

# Data lake folder path where we want to write the parquet file
target_folder_path = 'abfss://' + container + '@' + account_host + '/' + relative_path

# coalesce(1) collapses the data to a single partition before writing.
# No 'header' option is set: it is a CSV option and has no meaning for parquet.
df.coalesce(1).write.mode('overwrite').parquet(target_folder_path)