In [9]:
#default_exp wrapper.azurewrapper

In [10]:
#hide
from nbdev.showdoc import *
import numpy as np
import pandas as pd

In [11]:
#export
from sdsde.azure.filehandling import FileHandling

import os
import numpy as np
import logging


# Configure root logging at INFO, then raise the threshold for specific
# third-party loggers so their lower-severity records are not emitted.
logging.basicConfig(level=logging.INFO)
logging.getLogger("azure.core").setLevel(logging.WARNING)
logging.getLogger("urllib3.connectionpool").setLevel(logging.CRITICAL)
logging.getLogger("snowflake.connector").setLevel(logging.WARNING)
# Logger named after this module.
logger = logging.getLogger(__name__)

# Azure Wrappers

## blob_pusher

In [12]:
#export


def blob_pusher(container_name: str,
                connection_str: str,
                file_path: list = None,
                blob_dest: list = None,
                **kwargs):
    """
    Push local file(s) to an azure blob container

    Each file is uploaded as ``<destination folder>/<file base name>``.
    When ``blob_dest`` has exactly one entry per file, the two are paired up
    in order; otherwise every file goes to the first entry of ``blob_dest``.

    Args:
    * container_name (str): container name
    * connection_str (str): connection str
    * file_path (list, optional): local path(s) of the file(s) to push; a single
      str is treated as one file. Defaults to None (nothing is pushed).
    * blob_dest (list, optional): destination folder(s) inside the container; a
      single str is treated as one folder. Defaults to container_name.
    * **kwargs: forwarded unchanged to ``FileHandling.upload``

    Returns:
        list: the file path(s) that were pushed
    """
    # Local import: blob names use '/' as the folder delimiter whatever the host
    # OS is, so destinations are joined with posixpath rather than os.path.
    import posixpath

    fh = FileHandling(connection_str)

    # A bare string is one path, not a sequence of characters to iterate.
    if file_path is None:
        file_path = []
    elif isinstance(file_path, str):
        file_path = [file_path]

    # Same for the destination: indexing a str with [0] would yield its first
    # character and silently upload into the wrong folder.
    if isinstance(blob_dest, str):
        blob_dest = [blob_dest]
    elif not blob_dest:
        blob_dest = [container_name]

    # No one-to-one pairing available: send every file to the first destination.
    if len(blob_dest) != len(file_path):
        blob_dest = [blob_dest[0]] * len(file_path)

    for local_file, folder in zip(file_path, blob_dest):
        fh.upload(container_name=container_name,
                  file_path=local_file,
                  dest=posixpath.join(folder, os.path.basename(local_file)),
                  **kwargs)
    return file_path

The use case for this function is pushing multiple files to different locations in blob storage. For example, the data prep output may need to go to one location while the data set goes to another. A good example of this is the ``ML_Reservation`` repo, which uses this function in action.

In [13]:
show_doc(blob_pusher)

<h4 id="blob_pusher" class="doc_header"><code>blob_pusher</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>blob_pusher</code>(**`container_name`**:`str`, **`connection_str`**:`str`, **`file_path`**:`list`=*`None`*, **`blob_dest`**:`list`=*`None`*, **\*\*`kwargs`**)

function that will push file(s) to azure blob

Args:
* container_name (str): container name
* connection_str (str): connection str
* file_path (list, optional): file location of file(s). Defaults to None.
* blob_dest (list, optional): where to drop in azure blob. Defaults to container_name.

Returns:
    str: file_path

In [14]:
# Push a local fixture file into the 'snowflake_load_test' folder of the test
# container, then list that folder to confirm the blob is there.
# `data_loaders`, `container_name` and `fh` are reused by later cells.
data_loaders = ['testing/test_df.csv']
container_name = 'sdsdetesting'
blob_pusher(file_path=data_loaders,
            container_name=container_name,
            blob_dest=['snowflake_load_test'],
            connection_str=os.environ['connection_str'])
# NOTE(review): wildcard import in the middle of the notebook; FileHandling is
# already imported explicitly in the export cell at the top.
from sdsde.azure.filehandling import *
fh = FileHandling(os.environ['connection_str'])
# NOTE(review): the recorded output logs "BlobAlreadyExists", so this check may
# be passing on a blob left over from an earlier run — confirm.
assert fh.ls_blob(container_name=container_name, path='snowflake_load_test') == ['test_df.csv'],' File should have made it'

INFO:sdsde.azure.filehandling:sdsdetesting is a valid
INFO:sdsde.azure.filehandling:ContainerAlreadyExists
INFO:sdsde.azure.filehandling:Uploading testing/test_df.csv, to to Azure Storage snowflake_load_test/test_df.csv
ERROR:sdsde.azure.filehandling:"Error Message: BlobAlreadyExists"
INFO:sdsde.azure.filehandling:Azure Upload Complete


> Note: The urllib3 warning is a known community bug that has to do with the HTTP response. Fixing it isn't worth the effort (the juice isn't worth the squeeze), but if it bothers people, sdsde can suppress the warnings from urllib3.

## blob_puller

In [15]:
#export


def blob_puller(files: list,
                connection_str: str,
                container_name: str,
                drop_location: str = '.',
                **kwargs):
    """
    Pull one or more files from an azure blob container

    Args:
    * files (list): blob path(s) to download; a single str is treated as one file
    * connection_str (str): connection string to azure blob storage
    * container_name (str): container name
    * drop_location (str, optional): local folder to drop file(s) in. An empty
      string means the current directory. Defaults to '.'.
    * **kwargs: forwarded unchanged to ``FileHandling.download_file``

    Returns:
        None
    """
    fh = FileHandling(connection_str)

    # A bare string is one blob path, not a sequence of characters to iterate.
    if isinstance(files, str):
        files = [files]

    # An empty location would otherwise turn into '/' (the filesystem root)
    # once the trailing slash is appended below.
    drop_location = drop_location or '.'
    if not drop_location.endswith('/'):
        drop_location += '/'

    for blob in files:
        fh.download_file(container_name=container_name,
                         file=blob,
                         file_path=drop_location,
                         **kwargs)

The use case of this function is pulling multiple files down at once. For example, a train, valid, and test set may be sitting in the same container, and you want all of them pulled down to your computer.

> TODO: Add container locations as well as adding different drop locations similar to ``blob_pusher``

In [16]:
show_doc(blob_puller)

<h4 id="blob_puller" class="doc_header"><code>blob_puller</code><a href="__main__.py#L4" class="source_link" style="float:right">[source]</a></h4>

> <code>blob_puller</code>(**`files`**:`list`, **`connection_str`**:`str`, **`container_name`**:`str`, **`drop_location`**:`str`=*`'.'`*, **\*\*`kwargs`**)

Can pull a list or one file from azure

Args:
* files (list): list of files or a file wrapped in []
* connection_str (str): connection string to azure blob storage
* container_name (str): container name
* drop_location (str, optional): where to drop file(s) locally. Defaults to ''.

In [17]:
# Download the blob pushed by the blob_pusher demo into ./testing, then remove
# the 'snowflake_load_test/' folder from the container.
# Relies on `data_loaders` and `fh` defined in the blob_pusher demo cell.
container_name = 'sdsdetesting'
blob_puller(files=['snowflake_load_test/test_df.csv'],
            connection_str=os.environ['connection_str'],
            container_name=container_name,
            drop_location='./testing',
            overwrite=True)
# NOTE(review): data_loaders[0] is 'testing/test_df.csv', the same local file
# that was uploaded earlier, so it is expected to exist before the download
# runs; this check cannot tell whether the pull actually happened.
assert os.path.exists(data_loaders[0]) == True, 'above function did not run'
fh.rm_files(container_name=container_name, delete_path='snowflake_load_test/', recursive=True)

INFO:sdsde.azure.filehandling:snowflake_load_test/test_df.csv to ./testing/test_df.csv
INFO:sdsde.azure.filehandling:files to be removed ['snowflake_load_test/test_df.csv']


## unlink_files

The use case is cleaning up your files after a function has run. **s/o Clay Elmore** for making this something we do as an sdsde team; there is now a supported function for this exact use case. This is used everywhere in the repos that Jeremy and Clay develop, and by the time you read this I am sure the rest of the team will be doing this as well.

In [18]:
#export


def unlink_files(files: list, file_path: str = './'):
    """
    Delete local files, e.g. as clean up after model prediction

    Args:
    * files (list): name(s) of the file(s) to delete
    * file_path (str, optional): folder the file(s) live in. Defaults to './'.
    """
    # Paths are built lazily so each file is joined and removed in turn.
    targets = (os.path.join(file_path, name) for name in files)
    for target in targets:
        os.unlink(target)

In [19]:
# Fixture: write a small DataFrame to 'df_file.csv' and an array to
# 'np_file.npy' in the working directory so unlink_files has files to delete.
# `df_names` / `np_names` are consumed by the next cell.
dict1 = [{'ecid': 150, 'home': 'CA', 'avg_visits': 0.20, 'LTR': 6},
         {'ecid': 151, 'home': 'LA', 'avg_visits': 10, 'LTR': 2},
         {'ecid': 160, 'home': 'CO', 'avg_visits': 0.56, 'LTR': 4},
         {'ecid': 100, 'home': 'LA', 'avg_visits': 2.0, 'LTR': 3}]
df = pd.DataFrame(dict1)
df.to_csv('df_file.csv')
numpy_save = np.arange(10)
np.save('np_file.npy', numpy_save)
df_names = ['df_file.csv']
np_names = ['np_file.npy']

In [20]:
# The fixture files must exist before the clean up and be gone after it.
assert os.path.exists(df_names[0]), 'above function did not run'
assert os.path.exists(np_names[0]), 'above function did not run'
unlink_files(df_names + np_names)
assert not os.path.exists(df_names[0]), 'file name change?'
assert not os.path.exists(np_names[0]), 'file name change?'

## save_and_push_data

A use case for this is when you have a lot of in-memory Python objects, for example ``np.array``, ``pd.DataFrame`` or ``dict``, and you want to push them to a certain location in an azure blob container.

In [21]:
#export


def save_and_push_data(container_name: str,
                       connection_str: str,
                       df_names: list = None,
                       dfs: list = None,
                       np_names: list = None,
                       nps: list = None,
                       blob_dest: str = None,
                       parquet: bool = True,
                       **kwargs):
    """
    Takes pandas dataframes and writes them to parquet (or csv) files
    Takes numpy arrays, lists, and dictionaries and writes them out
    as numpy files, then pushes every written file to azure blob.

    Note: to get a dictionary back upon load you need to add a .item()
    this will return the dict instead of a np array.

    Note: ``np.save`` appends ``.npy`` to a file name that does not already
    end with it, so give the entries of ``np_names`` that extension to keep
    the pushed file names in line with what is written to disk.

    Args:
    * container_name (str): location in blob storage
    * connection_str (str): connect_str for azure
    * df_names (list, optional): file names for ``dfs``, paired in order. Defaults to None (no dataframes).
    * dfs (list, optional): list of pandas dataframes. Defaults to None.
    * np_names (list, optional): file names for ``nps``, paired in order. Defaults to None (no arrays).
    * nps (list, optional): list of numpy arrays to write out. Defaults to None.
    * blob_dest (str or list, optional): destination folder(s) handed to
      ``blob_pusher``; a single str is wrapped in a list. Defaults to None.
    * parquet (bool): True means save dfs as parquet files, anything else as csv. Defaults to True.
    * **kwargs: forwarded unchanged to ``blob_pusher``

    Returns:
        list: names of the files handed to ``blob_pusher`` (df_names then np_names)
    """
    # The None defaults mean "nothing of this kind to write".
    df_names = [] if df_names is None else list(df_names)
    dfs = [] if dfs is None else dfs
    np_names = [] if np_names is None else list(np_names)
    nps = [] if nps is None else nps

    # blob_pusher expects a list of folders; passing a bare string through
    # would make it index into the string's characters.
    if isinstance(blob_dest, str):
        blob_dest = [blob_dest]

    # Once again Snowflake Parquet upload isn't easy will figure out
    for frame, name in zip(dfs, df_names):
        if parquet is True:
            frame.to_parquet(f"{name}")
        else:
            frame.to_csv(f"{name}")
    for array, name in zip(nps, np_names):
        np.save(f'{name}', array)

    files_list = df_names + np_names
    blob_pusher(container_name=container_name,
                connection_str=connection_str,
                file_path=files_list,
                blob_dest=blob_dest,
                **kwargs)
    return files_list

In [22]:
# Demo/test for save_and_push_data: build a small DataFrame and array, push
# them to the 'save_and_push' folder, check the listing, then delete the local
# copies. Relies on `container_name` and `fh` from earlier cells.
dict1 = [{'ecid': 150, 'home': 'CA', 'avg_visits': 0.20, 'LTR': 6},
         {'ecid': 151, 'home': 'LA', 'avg_visits': 10, 'LTR': 2},
         {'ecid': 160, 'home': 'CO', 'avg_visits': 0.56, 'LTR': 4},
         {'ecid': 100, 'home': 'LA', 'avg_visits': 2.0, 'LTR': 3}]
df = pd.DataFrame(dict1)
# NOTE(review): these two writes target the same file names that
# save_and_push_data writes below, so they look redundant here — confirm.
df.to_csv('df_file.csv')
numpy_save = np.arange(10)
np.save('np_file.npy', numpy_save)
df_names = ['df_file.csv']
np_names = ['np_file.npy']

# parquet=False -> the DataFrame is written with to_csv.
save_and_push_data(container_name=container_name,
                   df_names=df_names,
                   dfs=[df],
                   np_names=np_names,
                   nps=[numpy_save],
                   blob_dest=['save_and_push'],
                   connection_str=os.environ['connection_str'],
                   parquet=False)

# NOTE(review): no removal of the 'save_and_push' blobs is visible in this
# notebook (unlike the 'snowflake_load_test/' clean up) — confirm whether one is needed.
assert fh.ls_blob(container_name=container_name, path='save_and_push') == ['df_file.csv', 'np_file.npy'], 'files sent'
unlink_files(df_names + np_names)

INFO:sdsde.azure.filehandling:sdsdetesting is a valid
INFO:sdsde.azure.filehandling:ContainerAlreadyExists
INFO:sdsde.azure.filehandling:Uploading df_file.csv, to to Azure Storage save_and_push/df_file.csv
INFO:sdsde.azure.filehandling:Azure Upload Complete
INFO:sdsde.azure.filehandling:sdsdetesting is a valid
INFO:sdsde.azure.filehandling:ContainerAlreadyExists
INFO:sdsde.azure.filehandling:Uploading np_file.npy, to to Azure Storage save_and_push/np_file.npy
INFO:sdsde.azure.filehandling:Azure Upload Complete


#  Create

In [23]:
#hide
# Run nbdev's notebook-to-module conversion; the recorded output lists the
# notebooks that were converted.
from nbdev.export import notebook2script
notebook2script()

Converted 01_azure.ipynb.
Converted 02_utils_dataframes.ipynb.
Converted 02_utils_parseyaml.ipynb.
Converted 02_utils_stfp.ipynb.
Converted 02_utils_traininghelpers.ipynb.
Converted 02_utils_traininghelpers_fastai.ipynb.
Converted 03_dstools_preparedata.ipynb.
Converted 04_snowflake_copyinto.ipynb.
Converted 04_snowflake_copyinto2.ipynb.
Converted 04_snowflake_query.ipynb.
Converted 05_azure_wrappers.ipynb.
Converted 06_modeling_inference.ipynb.
Converted 06_modeling_inference_fastai.ipynb.
Converted 06_modeling_premodel.ipynb.
Converted 06_modeling_preprocessing.ipynb.
Converted 06_modeling_preprocessing_fastai.ipynb.
Converted 06_modeling_training.ipynb.
Converted 06_modeling_training_fastai.ipynb.
Converted 07_Binary_Classification_Fastai_Example_Notebook.ipynb.
Converted 08_yaml_ingestion_binary_classification.ipynb.
Converted index.ipynb.
