In [1]:
#default_exp snowflake.copyinto2

In [2]:
#hide
%reload_ext autoreload
%autoreload 2
from nbdev import *

In [3]:
#hide
from nbdev.showdoc import *

In [4]:
#export
from sdsde import files

import pandas as pd
import sys
import logging
import os


# Configure the root logger at INFO level (basicConfig only installs a handler if none exists yet).
logging.basicConfig(level=logging.INFO)
# Raise the thresholds of the third-party loggers below so their lower-level records are dropped.
logging.getLogger("azure.core").setLevel(logging.WARNING)
logging.getLogger("urllib3.connectionpool").setLevel(logging.CRITICAL)
logging.getLogger("snowflake.connector").setLevel(logging.WARNING)
# Module-level logger used by the query generator functions defined in this notebook.
logger = logging.getLogger(__name__)

# ``SF_Copy_Into``

TODO: Consider wrapping these functions in a single class so callers get one simple API.

## ``make_data_lake_stage``

In [5]:
# export
def make_data_lake_stage(stage_name: str,
                         account: str,
                         container: str,
                         data_lake_path: str,
                         sas_token: str,
                         file_type: str,
                         compression: str = None,
                         field_delimiter: str = None,
                         field_optionally_enclosed_by: str = None,
                         encoding: str = None):
    """
    Build the snowflake query that creates an external stage pointing at an azure
    blob location. The azure url is assembled here and the sql text is rendered by
    ``stage_query_generator``. The query is logged with the sas token masked and is
    returned un-masked so the caller can execute it.

    how to use:

    ```python
    stage_query = make_data_lake_stage(stage_name='sdsdestage_test',
                                       account=os.environ['azure_account'],
                                       container='sdsdetesting',
                                       data_lake_path='testing_stage/',
                                       field_delimiter=r",",
                                       compression='None',
                                       encoding='UTF-8',
                                       sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                       file_type='csv'
                                       )
    sf.run_str_query(stage_query)

    ```

    Args:
    * stage_name (str): name of stage in snowflake
    * account (str): blob storage account
    * container (str): blob container
    * data_lake_path (str): path in the container to stage in
    * sas_token (str): shared access token for blob
    * file_type (str): for most use cases csv has been used but parquet and others can be used
    * compression (str, optional): the file type compression, None if you want the raw file type like csv
      AUTO | GZIP | BZ2 | BROTLI | ZSTD | DEFLATE | RAW_DEFLATE | NONE
    * field_delimiter (str, optional): file type delimiter like ; or /t
    * field_optionally_enclosed_by (str, optional): forwarded unchanged to ``stage_query_generator``
    * encoding (str, optional): file encoding method used to parse files on snowflakes side

    Returns:
    * str: snowflake query to create the stage (contains the real sas token)
    """
    azure_location = f'azure://{account}.blob.core.windows.net/{container}/{data_lake_path}'
    logger.info(f'Datalake Stage path that copy into will use {data_lake_path}')
    create_stage_sql = stage_query_generator(
        stage_name=stage_name,
        url=azure_location,
        sas_token=sas_token,
        field_delimiter=field_delimiter,
        encoding=encoding,
        compression=compression,
        file_type=file_type,
        field_optionally_enclosed_by=field_optionally_enclosed_by,
    )
    # never write the secret to the logs, only the returned query carries it
    masked_sql = create_stage_sql.replace(sas_token, '**MASKED**')
    logger.info(f"stage_query: \n {masked_sql}")
    return create_stage_sql

In [6]:
# Render the nbdev documentation entry for make_data_lake_stage.
show_doc(make_data_lake_stage)

<h4 id="make_data_lake_stage" class="doc_header"><code>make_data_lake_stage</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>make_data_lake_stage</code>(**`stage_name`**:`str`, **`account`**:`str`, **`container`**:`str`, **`data_lake_path`**:`str`, **`sas_token`**:`str`, **`file_type`**:`str`, **`compression`**:`str`=*`None`*, **`field_delimiter`**:`str`=*`None`*, **`field_optionally_enclosed_by`**:`str`=*`None`*, **`encoding`**:`str`=*`None`*)

creates a data lake staging environment from snowflake this calls ``stage_query_generator``
which does the manipulation to the sdsde file that has the options currently available be
sure to rip this whole file out if there is something that you need to add before it can
be a request add to sdsde.

how to use:

```python
stage_query = make_data_lake_stage(sf_connection=sf,
                                   stage_name='sdsdestage_test',
                                   account=os.environ['azure_account'],
                                   container='sdsdetesting',
                                   data_lake_path='testing_stage/',
                                   field_delimiter=r",",
                                   compression='None',
                                   encoding='UTF-8',
                                   sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                   file_type='csv'
                                   )
sf.run_str_query(stage_query)

```

Args:
* stage_name (str): name of stage in snowflake
* account (str): blob storage account
* container (str): blob container
* data_lake_path (str): path in the container to stage in
* sas_token (str): shared access token for blob
* file_type (str): for most use cases csv has been used but parquet and others can be used
* compression (str): the file type compression None if you want the raw file type like csv
  AUTO | GZIP | BZ2 | BROTLI | ZSTD | DEFLATE | RAW_DEFLATE | NONE
* encoding (str): file encoding method used to parse files on snowflakes side
* field_delimiter (str): file type deliminter like ; or /t

### ``stage_query_generator``

In [7]:
# export
def stage_query_generator(stage_name: str,
                          url: str,
                          sas_token: str,
                          field_delimiter: str,
                          encoding: str,
                          compression: str,
                          file_type: str,
                          field_optionally_enclosed_by: str):
    """
    Render the ``stage_template.sql`` file shipped in the ``files`` package into the
    snowflake query that creates an external azure blob stage. Every argument maps to
    a ``<ARGUMENT_NAME>`` placeholder in the template: a value replaces its placeholder,
    while a ``None`` argument has its placeholder (quoted or not) and its
    ``argument_name =`` prefix stripped from the query.

    Args:
    * stage_name (str): name of the stage in snowflake
    * url (str): azure formatted string for account, container, and path
    * sas_token (str): blob sas token for shared access
    * field_delimiter (str): file type delimiter like ; or /t
    * encoding (str): file encoding method used to parse files on snowflakes side
    * compression (str): the file type compression None if you want the raw file type like csv
    AUTO | GZIP | BZ2 | BROTLI | ZSTD | DEFLATE | RAW_DEFLATE | NONE
    * file_type (str): type of files expected in stage, for example 'parquet' or 'csv'
    * field_optionally_enclosed_by (str): value for the matching template option, None to drop it

    Returns:
    * str: snowflake query to create stage
    """
    # same order as the signature, the substitutions are applied in this order
    placeholders = {
        'stage_name': stage_name,
        'url': url,
        'sas_token': sas_token,
        'field_delimiter': field_delimiter,
        'encoding': encoding,
        'compression': compression,
        'file_type': file_type,
        'field_optionally_enclosed_by': field_optionally_enclosed_by,
    }
    template_path = os.path.join(files.__path__[0], 'stage_template.sql')
    with open(template_path, 'r') as template:
        query = template.read()
    for name, value in placeholders.items():
        token = f'<{name.upper()}>'
        if value is None:
            # drop the quoted placeholder, then the option prefix, then any bare placeholder
            query = query.replace(f"'{token}'", '')
            query = query.replace(f"{name} =", '')
            query = query.replace(token, '')
            continue
        query = query.replace(token, value)
    return query

In [8]:
# Render the nbdev documentation entry for stage_query_generator.
show_doc(stage_query_generator)

<h4 id="stage_query_generator" class="doc_header"><code>stage_query_generator</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>stage_query_generator</code>(**`stage_name`**:`str`, **`url`**:`str`, **`sas_token`**:`str`, **`field_delimiter`**:`str`, **`encoding`**:`str`, **`compression`**:`str`, **`file_type`**:`str`, **`field_optionally_enclosed_by`**:`str`)

generates the snowflake query needed to create an external stage in
azure blob this is inside of ``make_data_lake_stage`` that is the only
due to the vars() that makes this a little simpler and more robost for this
use case.

TODO: figure out string manipulation inside of a list comp, but is not supported
in python 3.8 and figure out a better way to have the chained replace calls

Args:
* stage_name (str): name of the stage in snowflake
* url (str): azure formated string for account, container, and path
* sas_token (str): blob sas token for shared access
* field_delimiter (str): file type deliminter like ; or /t
* encoding (str): file encoding method used to parse files on snowflakes side
* compression (str): the file type compression None if you want the raw file type like csv
AUTO | GZIP | BZ2 | BROTLI | ZSTD | DEFLATE | RAW_DEFLATE | NONE
* file_type (str, optional): type of files expected in stage. Defaults to 'parquet'. Can use 'csv' as well.

Returns:
* str: snowflake query to create stage

## ``copy_into_adls_query_generator``

In [9]:
#export
def copy_into_adls_query_generator(stage_name: str = None,
                                   azure_sas_token: str = None,
                                   data_lake_path: str = None,
                                   azure_path: str = None,
                                   sf_query: str = None,
                                   table_name: str = None,
                                   field_delimiter: str = None,
                                   partition_by: str = None,
                                   max_file_size: str = None,
                                   header: str = None,
                                   encoding: str = None,
                                   file_type: str = None,
                                   field_optionally_enclosed_by: str = None,
                                   skip_header: str = None,
                                   compression: str = None,
                                   over_write: str = None):
    """
    Generate the query that dumps snowflake data to adls. The sql comes from a template
    in the ``files`` package: ``copy_into_adls_from_sf_stage.sql`` when ``azure_path`` is
    None, otherwise ``copy_into_adls_from_sf.sql``. Each argument fills the
    ``<ARGUMENT_NAME>`` placeholder of the template; arguments left as None have their
    placeholder and ``argument_name =`` option removed. The query is logged with the
    sas token masked and returned un-masked.

    How To Use:

    Note: that the sf_query could also be sf.execute_file(custom_query) to allow for more complex
    queries to dump to azure for the use case at hand.

    ```python
    sg_query = copy_into_adls_query_generator(stage_name='sdsdestage_test',
                                              azure_sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                              sf_query=r'''SELECT * FROM my_database.my_schema.my_table
                                                       LIMIT 100''',
                                              data_lake_path='testing_stage/',
                                              max_file_size = '32000',
                                              header='True',
                                              over_write='True')
    sf.run_str_query(sg_query)

    ```
    Args:
    * stage_name (str, optional): name of the stage in snowflake
    * azure_sas_token (str, optional): blob sas token for shared access, used when moving to a direct azure location
    * data_lake_path (str, optional): Path inside of the created stage used when stage isn't the direct path
    * azure_path (str, optional): direct azure url path; selects the direct (non-stage) template
    * sf_query (str, optional): the sf query to use to dump to adls
    * table_name (str, optional): full table name make sure you add the database.schema unless the sf is initialized
    properly
    * field_delimiter (str, optional): file type delimiter like ; or /t
    * partition_by (str, optional): Currently not tested, but this would allow you to dump file structure to adls in
    a partitioned manner
    * max_file_size (str, optional): upper bound on the size of the files snowflake writes
    * header (str, optional): True = Give the file columns names | False = Skip header and only dump data
    * encoding (str, optional): file encoding engine to be read. Defaults to None.
    * file_type (str, optional): file type option; when None the ``type =`` option is removed. Defaults to None.
    * field_optionally_enclosed_by (str, optional): value for the matching template option. Defaults to None.
    * skip_header (str, optional): this is to skip rows in a file. Defaults to None.
    * compression (str, optional): what type of compression is the file in. Defaults to None.
    * over_write (str, optional): True = overwrite files that are named the same | False = if file is there fail

    Returns:
    * str: Snowflake Query
    """
    # Must stay the first statement so only the call arguments are captured. The copy
    # guards the loop below against the frame's locals mapping being refreshed while it
    # is iterated (older CPython shares one dict between locals()/vars() calls).
    values = dict(vars())
    file_sql = 'copy_into_adls_from_sf_stage.sql' if azure_path is None else 'copy_into_adls_from_sf.sql'
    with open(os.path.join(files.__path__[0], file_sql), 'r') as f:
        lines = f.read()
    if file_type is None:
        # the template option is spelled "type =", which the generic "file_type =" cleanup misses
        lines = lines.replace("type =", '')
    for k, v in values.items():
        placeholder = f'<{k.upper()}>'
        if v is not None:
            lines = lines.replace(placeholder, v)
        else:
            # NOTE(review): stripping every '' pair also touches text already substituted
            # (e.g. an sf_query containing an escaped quote); kept as-is because the
            # templates are not visible from here -- confirm before tightening.
            lines = (lines.replace(placeholder, '')
                          .replace(f"'{placeholder}'", '')
                          .replace(f"{k} =", '')
                          .replace(r"''", ''))
            lines = lines.replace("partition by = ()", '')
    # Mask whenever a token was supplied, whichever template was used. A missing or
    # empty token is skipped: str.replace(None, ...) raises and replace('', ...) would
    # inject the mask between every character.
    loggable = lines.replace(azure_sas_token, "**MASKED**") if azure_sas_token else lines
    logger.info(f'\n{loggable}')
    return lines

In [10]:
# Render the nbdev documentation entry for copy_into_adls_query_generator.
show_doc(copy_into_adls_query_generator)

<h4 id="copy_into_adls_query_generator" class="doc_header"><code>copy_into_adls_query_generator</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>copy_into_adls_query_generator</code>(**`stage_name`**:`str`=*`None`*, **`azure_sas_token`**:`str`=*`None`*, **`data_lake_path`**:`str`=*`None`*, **`azure_path`**:`str`=*`None`*, **`sf_query`**:`str`=*`None`*, **`table_name`**:`str`=*`None`*, **`field_delimiter`**:`str`=*`None`*, **`partition_by`**:`str`=*`None`*, **`max_file_size`**:`str`=*`None`*, **`header`**:`str`=*`None`*, **`encoding`**:`str`=*`None`*, **`file_type`**:`str`=*`None`*, **`field_optionally_enclosed_by`**:`str`=*`None`*, **`skip_header`**:`str`=*`None`*, **`compression`**:`str`=*`None`*, **`over_write`**:`str`=*`None`*)

Generate query to dump snowflake data to an adls stage that has already been created.
There are a lot of optional arguements to allow the user to have a pleasurable experience
unlocking more than a user typically needs in our current sdsde stage of technology.

How To Use:

Note: that the sf_query could also be sf.execute_file(custom_query) to allow for more complex
queries to dump to azure for the use case at hand.

```python
sg_query = copy_into_adls_query_generator(stage_name='sdsdestage_test',
                                          azure_sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                          sf_query=r'''SELECT * FROM BIDE_EDWDB_ARA_PROD.dbo.FactScan
                                                   WHERE ECID = 84412913 LIMIT 100''',
                                          data_lake_path='testing_stage/',
                                          max_file_size = '32000',
                                          header='True',
                                          over_write='True')
sf.run_str_query(sg_query)

```
Args:
* stage_name (str): name of the stage in snowflake
* azure_sas_token (str): blob sas token for shared access when used to be able to move to a direct azure location
* data_lake_path (str, optional): Path inside of the created stage used when stage isn't the direct path
* azure_path (str, optional): Currently not supported do to not having a storage itergration, but this
would be a direct azure url path
* sf_query (str, optional): the sf query to use to dump to adls
* table_name (str, optional): full table name make sure you add the database.schema unless the sf is initialized
properly
* field_delimiter (str, optional): file type deliminter like ; or /t
* partition_by (str, optional): Currently not tested, but this would allow you to dump file structure to adls in
a partitioned manner
* max_file_size (str, optional): this allows snowflake to make files as big as this integer number or as small
* header (str, optional): True = Give the file columns names | False = Skip header and only dump data
* encoding (str, optional): file encoding engine to be read. Defaults to None.
* file_type (str, optional): . Defaults to None.
* field_optionally_enclosed_by (str, optional): . Defaults to None.
* skip_header (str, optional): this is to skip rows in a file. Defaults to None.
* compression (str, optional): what type of compression is the file in. Defaults to None.
* over_write (str, optional): True = overwrite files that are named the same | False = if file is there fail
Returns:
* str: Snowflake Query

##  ``copy_into_sf_query_generator``

In [11]:
# export
def copy_into_sf_query_generator(database: str,
                                 schema: str,
                                 table_name: str,
                                 file_type: str,
                                 stage_name: str = None,
                                 data_lake_path: str = None,
                                 azure_path: str = None,
                                 azure_sas_token: str = None,
                                 pattern: str = None,
                                 skip_header: str = None,
                                 compression: str = None,
                                 field_delimiter: str = None,
                                 encoding: str = None):
    """
    Generate the query that copies staged (or directly addressed) files into a
    snowflake table. The sql comes from a template in the ``files`` package:
    ``copy_into_sf_table.sql`` when ``azure_path`` is None, otherwise
    ``copy_into_sf_table_direct.sql``. Each argument fills the ``<ARGUMENT_NAME>``
    placeholder; arguments left as None have their placeholder and
    ``argument_name =`` option removed. The query is logged with the sas token
    masked and returned un-masked.

    The target table is expected to exist already; the example shows how to keep a
    failed copy from stopping the caller.

    How to use:

    ```python
    cp_query = copy_into_sf_query_generator(stage_name='sdsdestage_test/',
                                            data_lake_path='testing_stage/',
                                            table_name='sdsde_DELETE_TEST_TABLE',
                                            database=sf.sfDatabase,
                                            schema=sf.sfSchema,
                                            skip_header='1',
                                            field_delimiter=',',
                                            encoding='UTF-8',
                                            file_type='csv',
                                            pattern='.*.csv')
    try:
        sf.run_str_query(cp_query)
    except Exception as e:
        logger.error(f'Error Created Trying to Copy Into sf table {e}')
        logger.warning('Most this table needs to be initialized' )
    ```

    Args:
    * database (str): snowflake database
    * schema (str): snowflake schema
    * table_name (str): snowflake table name that data will be put into
    * file_type (str): file type that will be ingested
    * stage_name (str, optional): name of the stage in snowflake
    * data_lake_path (str, optional): Path inside of the created stage used when stage isn't the direct path
    * azure_path (str, optional): direct azure url path; selects the direct (non-stage) template
    * azure_sas_token (str, optional): blob sas token used with ``azure_path``
    * pattern (str, optional): regex such as .*.csv selecting the files to ingest; leave None when
    data_lake_path points at a single file
    * skip_header (str, optional): during development files with columns failed so skipping the header with column names is needed
    * compression (str, optional): what type of compression is being used for this set of data
    * field_delimiter (str, optional): file type delimiter like ; or /t
    * encoding (str, optional): file encoding method used to parse files on snowflakes side

    Returns:
    * str: snowflake query that copies the files into the table
    """
    # Must stay the first statement so only the call arguments are captured. The copy
    # guards the loop below against the frame's locals mapping being refreshed while it
    # is iterated (older CPython shares one dict between locals()/vars() calls).
    values = dict(vars())
    file_sql = 'copy_into_sf_table.sql' if azure_path is None else 'copy_into_sf_table_direct.sql'
    with open(os.path.join(files.__path__[0], file_sql), 'r') as f:
        lines = f.read()
    for k, v in values.items():
        placeholder = f'<{k.upper()}>'
        if v is not None:
            lines = lines.replace(placeholder, v)
        else:
            lines = lines.replace(placeholder, '').replace(f"{k} = ''", '').replace(f"{k} =", '')
    # The direct template embeds the sas token, so mask it before logging just like the
    # sibling generators do; skip when no (or an empty) token was supplied.
    loggable = lines.replace(azure_sas_token, "**MASKED**") if azure_sas_token else lines
    logger.info(f'\n{loggable}')
    return lines

In [12]:
# Render the nbdev documentation entry for copy_into_sf_query_generator.
show_doc(copy_into_sf_query_generator)

<h4 id="copy_into_sf_query_generator" class="doc_header"><code>copy_into_sf_query_generator</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>copy_into_sf_query_generator</code>(**`database`**:`str`, **`schema`**:`str`, **`table_name`**:`str`, **`file_type`**:`str`, **`stage_name`**:`str`=*`None`*, **`data_lake_path`**:`str`=*`None`*, **`azure_path`**:`str`=*`None`*, **`azure_sas_token`**:`str`=*`None`*, **`pattern`**:`str`=*`None`*, **`skip_header`**:`str`=*`None`*, **`compression`**:`str`=*`None`*, **`field_delimiter`**:`str`=*`None`*, **`encoding`**:`str`=*`None`*)

Generates query to take data from a snowflake stage and puts the data
directly into a table requested by this function and only if the table in sf
doesn't exisit it will fail in the how to will show you how to not have it fail
out, but the notebook shows how to make a snowflake with the data that is coming
from adls.

How to use:

```python
cp_query = copy_into_sf_query_generator(stage_name='sdsdestage_test/',
                                        data_lake_path='testing_stage/',
                                        table_name='sdsde_DELETE_TEST_TABLE',
                                        database=sf.sfDatabase,
                                        schema=sf.sfSchema,
                                        skip_header='1',
                                        field_delimiter=',',
                                        encoding='UTF-8',
                                        file_type='csv',
                                        pattern='.*.csv')
try:
    sf.run_str_query(cp_query)
except Exception as e:
    logger.error(f'Error Created Trying to Copy Into sf table {e}')
    logger.warning('Most this table needs to be initialized' )
```

Args:
* stage_name (str): name of the stage in snowflake
* data_lake_path (str): Path inside of the created stage used when stage isn't the direct path
* database (str): snowflake database
* schema (str): snowflake schema
* table_name (str): snowflake table name that data will be put into
* file_type (str): file type that will be ingested
* pattern (str): either this is grabbing many files or the data_lake_path will be point to one file to ingest
if there is a patter like .*.csv it will use regex to find files with this regex pattern
* skip_header (str, optional): during development files with columns failed so skipping the header with column names is needed
* compression (str, optional): what type of compression is being used for this set of data
* field_delimiter (str, optional): file type deliminter like ; or /t
* encoding (str, optional): file encoding method used to parse files on snowflakes side

Returns:
* str: snowflake query to create stage

## ``parquet_copy_into_sf_query_generator``

In [13]:
# export
def parquet_copy_into_sf_query_generator(data_types: dict,
                                         database: str,
                                         schema: str,
                                         table_name: str,
                                         file_type: str,
                                         stage_name: str = None,
                                         data_lake_path: str = None,
                                         azure_path: str = None,
                                         azure_sas_token: str = None,
                                         pattern: str = None,
                                         skip_header: str = None,
                                         compression: str = None,
                                         field_delimiter: str = None,
                                         encoding: str = None,
                                         infer_dtypes: bool = False,
                                         header: bool = True):
    """
    Generate the query that copies parquet files into a snowflake table. The sql comes
    from a template in the ``files`` package: ``copy_into_sf_table_parquet.sql`` when
    ``azure_path`` is None, otherwise ``copy_into_sf_table_direct_parquet.sql``. The
    string arguments fill the ``<ARGUMENT_NAME>`` placeholders (None arguments have their
    placeholder and ``argument_name =`` option removed) and ``<SELECT_STATEMENT>`` is
    replaced by a select list built from ``data_types``:

    * header is True: one ``$1:"<column>"`` item per key of ``data_types``
    * otherwise: one ``$1:"_COL_<position>"::<type> as <column>`` item per entry, where
      the type comes from ``return_sf_type``

    The query is logged with the sas token masked and returned un-masked.

    How to use:

    ```python
    cp_query = parquet_copy_into_sf_query_generator(data_types={'ID': 'NUMBER', 'NAME': 'VARCHAR'},
                                                    stage_name='sdsdestage_test/',
                                                    data_lake_path='testing_stage/',
                                                    table_name='sdsde_DELETE_TEST_TABLE',
                                                    database=sf.sfDatabase,
                                                    schema=sf.sfSchema,
                                                    file_type='parquet',
                                                    pattern='.*.parquet')
    try:
        sf.run_str_query(cp_query)
    except Exception as e:
        logger.error(f'Error Created Trying to Copy Into sf table {e}')
        logger.warning('Most this table needs to be initialized' )
    ```

    Args:
    * data_types (dict): column name -> data type, in the column order of the files
    * database (str): snowflake database
    * schema (str): snowflake schema
    * table_name (str): snowflake table name that data will be put into
    * file_type (str): file type that will be ingested
    * stage_name (str, optional): name of the stage in snowflake
    * data_lake_path (str, optional): Path inside of the created stage used when stage isn't the direct path
    * azure_path (str, optional): direct azure url path; selects the direct (non-stage) template
    * azure_sas_token (str, optional): blob sas token used with ``azure_path``
    * pattern (str, optional): regex selecting the files to ingest
    * skip_header (str, optional): value for the matching template option
    * compression (str, optional): what type of compression is being used for this set of data
    * field_delimiter (str, optional): file type delimiter like ; or /t
    * encoding (str, optional): file encoding method used to parse files on snowflakes side
    * infer_dtypes (bool, optional): passed to ``return_sf_type`` as ``infer``; when False the
    values of ``data_types`` are used as the snowflake types unchanged
    * header (bool, optional): True when the parquet columns carry the names in ``data_types``

    Returns:
    * str: snowflake query that copies the files into the table

    Raises:
    * ValueError: ``data_types`` is empty, so no select list can be built
    """
    # Must stay the first statement so only the call arguments are captured. The copy
    # guards the loop below against the frame's locals mapping being refreshed while it
    # is iterated (older CPython shares one dict between locals()/vars() calls).
    values = dict(vars())
    # these three are not template placeholders (and are not strings)
    for non_template_arg in ('data_types', 'infer_dtypes', 'header'):
        del values[non_template_arg]
    if not data_types:
        raise ValueError('data_types must contain at least one column to build the select statement')
    file_sql = 'copy_into_sf_table_parquet.sql' if azure_path is None else 'copy_into_sf_table_direct_parquet.sql'
    with open(os.path.join(files.__path__[0], file_sql), 'r') as f:
        lines = f.read()
    for k, v in values.items():
        placeholder = f'<{k.upper()}>'
        if v is not None:
            lines = lines.replace(placeholder, v)
        else:
            lines = lines.replace(placeholder, '').replace(f"{k} = ''", '').replace(f"{k} =", '')
    if header is True:
        select_items = [f'$1:"{column}"' for column in data_types]
    else:
        # The cast belongs after the closing quote: inside the quotes it would be read
        # as part of the element name instead of converting the value.
        select_items = [
            f'$1:"_COL_{position}"::{return_sf_type(str(dtype), varchar=False, infer=infer_dtypes)} as {column}'
            for position, (column, dtype) in enumerate(data_types.items())
        ]
    query = 'SELECT ' + ', '.join(select_items)
    lines = lines.replace('<SELECT_STATEMENT>', query)
    # Mask whenever a token was supplied; a missing or empty token is skipped because
    # str.replace(None, ...) raises and replace('', ...) would inject the mask everywhere.
    loggable = lines.replace(azure_sas_token, "**MASKED**") if azure_sas_token else lines
    logger.info(f'\n{loggable}')
    return lines

### ``create_sf_table_from_df`` & ``return_sf_type``

In [14]:
# export
def clean_special_chars(text):
    """
    small clean up tool for column names coming from vendor data: every
    punctuation / symbol character listed below and every space is replaced
    with ``_`` in a single pass. All other characters are left untouched.

    Args:
        text (str): dataframe column name as a string

    Returns:
        str: clean column name
    """
    # the backslash before × is written as an escaped backslash; the bare form is an
    # invalid escape sequence that newer Python versions warn about
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'  # noqa:
    punct += '©^®` <→°€™› ♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√'  # noqa:
    # one translation table instead of two str.replace calls per listed character
    to_underscore = str.maketrans(dict.fromkeys(punct + ' ', '_'))
    return text.translate(to_underscore)

def create_sf_table_from_df(df: pd.DataFrame, table_name_sf: str, varchar: bool):
    """
    Dynamically build a ``create or replace table`` statement from a dataframe:
    each column name is cleaned with ``clean_special_chars`` and each pandas dtype
    is mapped to a snowflake type with ``return_sf_type``.

    Args:
    * df (pd.DataFrame): data frame to get column names and dtypes from
    * table_name_sf (str): snowflake table name
    * varchar (bool): forwarded to ``return_sf_type``; True defaults every column to VARCHAR

    Returns:
    * str: snowflake query that creates the table

    Raises:
    * ValueError: ``df`` has no columns
    """
    # Build the column list directly; substituting a sentinel string repeatedly breaks
    # when a column name contains the sentinel and rebuilds the whole query per column.
    column_definitions = [
        f'{clean_special_chars(column)} {return_sf_type(str(dtype), varchar=varchar)}'
        for column, dtype in dict(df.dtypes).items()
    ]
    if not column_definitions:
        raise ValueError('df has no columns, cannot build a create table statement')
    columns_sql = ', '.join(column_definitions)
    select_query = f'''
        create or replace table {table_name_sf} ({columns_sql});
        '''
    logger.info(select_query)
    logger.warning('Note: Remember this is created data types for sf based on this file if')
    return select_query

def create_sf_table_from_dict(columns_and_types: dict, table_name_sf: str, varchar: bool, infer_types: bool = False):
    """
    Dynamically build a ``create or replace table`` statement from a mapping of
    column names to types: each column name is cleaned with ``clean_special_chars``
    and each type is passed through ``return_sf_type``.

    Args:
    * columns_and_types (dict): column name -> data type, in the desired column order
    * table_name_sf (str): snowflake table name
    * varchar (bool): forwarded to ``return_sf_type``; True defaults every column to VARCHAR
    when types are inferred
    * infer_types (bool, optional): forwarded to ``return_sf_type`` as ``infer``; when False
    the given types are used unchanged. Defaults to False.

    Returns:
    * str: snowflake query that creates the table

    Raises:
    * ValueError: ``columns_and_types`` is empty
    """
    # Build the column list directly; substituting a sentinel string repeatedly breaks
    # when a column name contains the sentinel and rebuilds the whole query per column.
    column_definitions = [
        f'{clean_special_chars(column)} {return_sf_type(str(dtype), infer=infer_types, varchar=varchar)}'
        for column, dtype in columns_and_types.items()
    ]
    if not column_definitions:
        raise ValueError('columns_and_types is empty, cannot build a create table statement')
    columns_sql = ', '.join(column_definitions)
    select_query = f'''
        create or replace table {table_name_sf} ({columns_sql});
        '''
    logger.info(select_query)
    logger.warning('Note: Remember this is created data types for sf based on this file if')
    return select_query

def return_sf_type(dtype: str, varchar: bool, infer: bool = True):
    """
    Simple utility function that maps a pandas dtype (in string form) to a
    snowflake dtype so that tables can be created dynamically. It aims at
    the most common dtypes only and matches on substrings of the lower-cased
    dtype name.

    TODO: make more robust if possible, but for now complications will lead
    to the user needing to just default everything to a VARCHAR. Bool types
    are mapped to VARCHAR because they didn't work at the time this function
    was created.

    Args:
    * dtype (str): dtype from a df in string form
    * varchar (bool): to default all variables to VARCHAR (only consulted
      when ``infer`` is True)
    * infer (bool, optional): when not True, ``dtype`` is returned unchanged.
      Defaults to True.

    Returns:
    * str: snowflake dtype

    Raises:
    * SystemExit: with exit status 1 when ``infer`` is True and the dtype is
      not recognised
    """
    if infer is not True:
        # No inference requested: hand the type back untouched. ``varchar``
        # is not consulted on this path, matching the existing behaviour.
        return dtype
    if varchar is True:
        return 'VARCHAR'

    pandas_dtype = dtype.lower()
    if 'int' in pandas_dtype:
        return 'NUMBER'
    if 'float' in pandas_dtype:
        return 'FLOAT'
    if 'object' in pandas_dtype:
        return 'VARCHAR'
    if 'bool' in pandas_dtype:
        return 'VARCHAR'  # TODO: Limitation found before change once resolved by sf
    if 'date' in pandas_dtype:
        return 'DATETIME'  # TODO: Might break with certain datetimes most generic
    if 'str' in pandas_dtype:
        # pandas string dtypes ('string', 'str') are text just like 'object'.
        # Checked last so every previously supported dtype maps as before.
        return 'VARCHAR'

    # Unknown dtype: report which one it was and exit with a failure status.
    # A bare sys.exit() would report success (status 0) to the calling shell.
    logger.error('odd dtype not seen needs to be resolved: %s', dtype)
    sys.exit(1)

In [15]:
# Render the nbdev documentation for the table-creation helpers defined above.
show_doc(return_sf_type)
show_doc(create_sf_table_from_df)

<h4 id="return_sf_type" class="doc_header"><code>return_sf_type</code><a href="__main__.py#L63" class="source_link" style="float:right">[source]</a></h4>

> <code>return_sf_type</code>(**`dtype`**:`str`, **`varchar`**:`bool`, **`infer`**:`bool`=*`True`*)

Simple utility function that tries to make the process of making a
snowflake table dynamic and there are of course situtation this will fail
this is trying to solve 80% of all the data types

TODO: make more robust if possible, but for now complications will lead
to the user needing to just default everything to a VARCHAR search bool
types that didn't work at the time of creation of this function

Args:
* dtype (str): dtype from a df in sting form
* varchar (bool): to default all variables to VARCHAR

Returns:
* str: snowflake dtype

<h4 id="create_sf_table_from_df" class="doc_header"><code>create_sf_table_from_df</code><a href="__main__.py#L21" class="source_link" style="float:right">[source]</a></h4>

> <code>create_sf_table_from_df</code>(**`df`**:`DataFrame`, **`table_name_sf`**:`str`, **`varchar`**:`bool`)

Dynamically create a table from a dataframe and
change the dtypes to snowflake dytpes this may have
a limitation, but can be added.

Args:
* df (pd.DataFrame): data frame to get dtypes
* table_name_sf (str): snowflake table name
* varchar: (bool, optional): this will default all dytpes to varchars if True.

## Overview

**Why?**

This eases the process of moving Snowflake data to ADLS as well as taking data from ADLS into Snowflake. The need has come up in a couple of places, such as APT and Adobe click stream, and this module is an attempt at a good way of attacking these problems beyond plain ``SnowflakeConnect`` queries while taking a little more advantage of what Snowflake offers.

Another reason for this is that we will be able to take in data from ADLS for our models with this method. I haven't quite thought through how this is going to work, but I do know that this is a really good use case. This idea needs fleshing out.

**Do We Need It?**

Do we need anything that gets added to the sdsde Library not really we can always hack our way through, but the point of software is to make the 80-90% of actions simpler and unified so when things get upgrades or break that one fix will fix all down stream use cases. 

I am sure that a class will be written to make this all a lot simpler with fewer lines of code, but *patches* are a gap in my current knowledge. I have seen/heard about monkey patching a function for these flexible use cases; when I have time I will look into it, as I believe it's a good skill. It is seen a lot in Dask/Snorkel.

**Does it work?**

Sure does, but there are probably things that need to be added and there are pieces that haven't been explored yet like partition by.

**What are things to still look into?**

Probably a lot, but right now the ability to name the files that are being dumped into azure for example APT has to have a naming convention for the files we drop into their SFTP this means the files can't be ``data_0_0.csv``, but currently haven't  looked into that much.

Haven't tested complex queries in the SF to ADLS just did a simple where Brian Trost because he is a good candidate employee as well as a skier that has the worst skierability on the team and make the algo for skierability favor him (All Facts).


###### USE Case 1: SF -> ADLS

Step 1: Make A Data Stage In SnowFlake ``make_data_lake_stage``

Step 2: Send data to adls stage ``copy_into_adls_query_generator``

What you do from there is fully up to you, but if you have an SFTP use case check out the SFTP.ipynb for help on that.


###### USE Case 2: ADLS-> SF

Step 1: Make A Data Stage In SnowFlake ``make_data_lake_stage``

Step 2: Create a table for data from ADLS to move into ``create_sf_table_from_df``

Step 3: Move ADLS Data to Snowflake table ``copy_into_sf_query_generator``

> In this notebook we take data from Snowflake, move it to ADLS, and then move that data into another table.


## GZIP Example

### Recommended Stage Approach From SnowFlake

In [25]:
#skip
# End-to-end GZIP round trip through a named snowflake stage:
# snowflake -> ADLS stage -> local file -> new snowflake table -> cleanup.
from sdsde.snowflake.query import SnowflakeConnect
from sdsde.wrapper.azurewrapper import blob_puller
from sdsde.azure.filehandling import FileHandling

# Quiet the chatty third-party loggers so the generated queries stand out.
logging.basicConfig(level=logging.INFO)
logging.getLogger("azure.core").setLevel(logging.WARNING)
logging.getLogger("urllib3.connectionpool").setLevel(logging.CRITICAL)
logging.getLogger("snowflake.connector").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)


# All connection details come from environment variables; nothing is hardcoded.
sf = SnowflakeConnect(sfAccount=os.environ['sfAccount'],
                   sfUser=os.environ['sfUser'],
                   sfPswd=os.environ['sfPswd'],
                   sfWarehouse=os.environ['sfWarehouse'],
                   sfDatabase=os.environ['sfDatabase'],
                   sfSchema=os.environ['sfSchema'],
                   sfRole=os.environ['sfRole'])

# Step 1: build and run the "create or replace stage" statement for the ADLS path.
stage_query = make_data_lake_stage(stage_name='sdsdestage_test',
                                   account=os.environ['azure_account'],
                                   container='sdsdetesting',
                                   data_lake_path='testing_stage/',
                                   compression='GZIP',
                                   sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                   file_type='csv',
                                   field_optionally_enclosed_by= '\042' # '\042' # double quotes ascii # '\047' single quote
                                )
sf.run_str_query(stage_query)
# Step 2: unload 100 rows from snowflake into the stage (COPY INTO @stage/...).
sg_query = copy_into_adls_query_generator(stage_name='sdsdestage_test',
                                          sf_query=r'SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.DL_LTR LIMIT 100',
                                          data_lake_path='testing_stage/',
                                          max_file_size = '32000',
                                          header='True',
                                          over_write='True')
sf.run_str_query(sg_query)
# Step 3: prepare (but do not run yet) the COPY INTO <table> statement; the
# target table is only created further down.
cp_query = copy_into_sf_query_generator(stage_name='sdsdestage_test/',
                                        data_lake_path='testing_stage/',
                                        table_name='sdsde_DELETE_TEST_TABLE',
                                        database=sf.sfDatabase,
                                        schema=sf.sfSchema,
                                        skip_header='1',
                                        compression='GZIP',
                                        file_type='csv',
                                        pattern='.*.csv.gz')

# The stage url already ends in testing_stage/ and the unload wrote to
# @stage/testing_stage/, so the files sit under testing_stage/testing_stage.
fh = FileHandling(os.environ['connection_str'])
file_list = fh.ls_blob(container_name='sdsdetesting',  path='testing_stage/testing_stage', recursive=True)
logger.info(f'files in blob {file_list}')

# Pull the first unloaded file into the working directory to read its columns.
blob_puller(files=['testing_stage/testing_stage/' + file_list[0]],
            connection_str=fh.connection_string,
            container_name='sdsdetesting',
            drop_location='.',
            overwrite=True)

df = pd.read_csv(file_list[0])
logger.info(f'df size {df.shape}')
# Step 4: create the target table from the file's columns, all as VARCHAR.
create_table_sql = create_sf_table_from_df(df=df, 
                                           table_name_sf='MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE',
                                           varchar=True)
sf.run_str_query(create_table_sql)

# Step 5: load the staged files into the table. A failure is only logged here
# so the rest of the cell (including the cleanup) still runs.
try:
    sf.run_str_query(cp_query)
except Exception as e:
    logger.error(f'Error Created Trying to Copy Into sf table {e}')
    logger.warning('Most this table needs to be initialized' )

# Read the loaded rows back so the row/column count can be checked below.
df = sf.run_str_query('SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE;')
# Cleanup: call rm_files for every listed blob except the 'testing_stage'
# entry itself, then for the folder path.
for f in fh.ls_blob(container_name='sdsdetesting', path='testing_stage/', recursive=True):
    if f == 'testing_stage':
        pass
    else:
        fh.rm_files(container_name='sdsdetesting', delete_path='testing_stage/' + f)
fh.rm_files(container_name='sdsdetesting', delete_path='testing_stage/')
# NOTE(review): this assert runs before the DROP statements, so a failed check
# leaves the test table and stage behind in snowflake. The downloaded local
# file is also not removed in this cell (later cells call os.unlink).
assert df.shape == (100, 8), 'Copy Into Failed to Load Expected'
sf.run_str_query("DROP TABLE MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE;")
sf.run_str_query('DROP STAGE sdsdestage_test;')

INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:connection to snowflake successful
INFO:__main__:Datalake Stage path that copy into will use testing_stage/
INFO:__main__:stage_query: 
 create or replace stage sdsdestage_test
url='azure://vaildtscadls.blob.core.windows.net/sdsdetesting/testing_stage/'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = csv     compression = GZIP field_optionally_enclosed_by = '"')
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:Stage area SDSDESTAGE_TEST successfully created.
INFO:__main__:
COPY INTO @sdsdestage_test/testing_stage/
FROM  (SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.DL_LTR LIMIT 100)

max_file_size = 32000
ove

### Using Azure Paths Only

In [26]:
#skip
# Same unload as the stage example, but addressed by a full azure:// path and
# SAS token instead of a named stage, so the file format options are passed
# here rather than on the stage.
sg_query = copy_into_adls_query_generator(sf_query=r'SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.DL_LTR LIMIT 100',
                                          azure_sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                          azure_path='azure://vaildtscadls.blob.core.windows.net/sdsdetesting/testing_stage/',
                                          max_file_size = '32000',
                                          compression='GZIP',
                                          file_type='CSV',
                                          field_optionally_enclosed_by= '\042',
                                          header='True',
                                          over_write='True')
# Last expression of the cell: the unload summary frame is displayed below.
sf.run_str_query(sg_query)

INFO:__main__:
COPY INTO 'azure://vaildtscadls.blob.core.windows.net/sdsdetesting/testing_stage/'
FROM  (SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.DL_LTR LIMIT 100)

max_file_size = 32000
OVERWRITE = True
file_format = (type = CSV field_optionally_enclosed_by = '"'     compression = GZIP  )
credentials= (azure_sas_token = '**MASKED**')
header = True;
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off


Unnamed: 0,rows_unloaded,input_bytes,output_bytes
0,100,12004,1396


```python
try:
    sf.run_str_query(cp_query)
except Exception as e:
    logger.error(f'Error Created Trying to Copy Into sf table {e}')
    logger.warning('Most this table needs to be initialized' )

```

> Note: The copy above failed, telling the user that the action cannot be completed. Because of the try/except the python session keeps going on to the next step; this is really to show the user what is happening. If this were used in production we would most likely want it to fail out completely with something like a ``sys.exit(1)``.

This above is known however for now while this is in testing keeping this here is the ideal way of approaching this right now what I would do in a class is catch this error and do what happens below.

> Note: Below there is only one file in the blob location what we would want is to do ``file_list[0][0]`` most likely to only grab one of the files.

Also note the ``varchar`` option when creating the table: with ``varchar=False`` the table will not default to only VARCHARs and will do its best to use pandas to understand the dtypes of the file. Another hint: make sure ``low_memory=False`` is set when reading the file if you do not want to run into issues. The VARCHAR default was created because vendors don't always take care of their data, e.g. MTA (Nielsen).

In [27]:
#skip
# List what the azure-path unload wrote; with a direct path the files sit
# directly under testing_stage/.
fh = FileHandling(os.environ['connection_str'])
file_list = fh.ls_blob(container_name='sdsdetesting',  path='testing_stage/', recursive=True)
logger.info(f'files in blob {file_list}')

# Download the first file into the working directory so pandas can read it.
blob_puller(files=['testing_stage/' + file_list[0]],
            connection_str=fh.connection_string,
            container_name='sdsdetesting',
            drop_location='.',
            overwrite=True)

df = pd.read_csv(file_list[0])
logger.info(f'df size {df.shape}')

# Create the target table from the file's columns, every column as VARCHAR.
create_table_sql = create_sf_table_from_df(df=df, 
                                           table_name_sf='MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE',
                                           varchar=True)
sf.run_str_query(create_table_sql)

INFO:__main__:files in blob ['data_0_0_0.csv.gz']
INFO:sdsde.azure.filehandling:testing_stage/data_0_0_0.csv.gz to ./data_0_0_0.csv.gz
INFO:__main__:df size (100, 8)
INFO:__main__:
        create or replace table MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE (ECID VARCHAR, LIKELIHOODTORETURNRATE VARCHAR, LIKELIHOODTORETURNLABEL VARCHAR, FISCALYEAR VARCHAR, MODELTYPE VARCHAR, EXPERIMENT VARCHAR, MODELNAME VARCHAR, UPLOADTIME VARCHAR);
        
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:Table SDSDE_DELETE_TEST_TABLE successfully created.


Now that the target table exists, we are ready to try again and have an ADLS file loaded directly into a Snowflake table.

In [28]:
#skip
# Load the gzipped csv files from the azure:// path into the table created in
# the previous cell.
cp_query = copy_into_sf_query_generator(azure_path='azure://vaildtscadls.blob.core.windows.net/sdsdetesting/testing_stage/',
                                        azure_sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                        table_name='sdsde_DELETE_TEST_TABLE',
                                        database=sf.sfDatabase,
                                        schema=sf.sfSchema,
                                        skip_header='1',
                                        compression='GZIP',
                                        file_type='csv',
                                        pattern='.*.csv.gz')

sf.run_str_query(cp_query)

# Read the rows back, then drop the table before asserting so a failed check
# does not leave the test table behind.
df = sf.run_str_query('SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE;')
sf.run_str_query("DROP TABLE MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE;")

assert df.shape == (100, 8), 'Copy Into Failed to Load Expected'

# Cleanup: call rm_files for every listed blob except the 'testing_stage'
# entry itself, then for the folder path, and remove the local download
# (file_list comes from the previous cell).
for f in fh.ls_blob(container_name='sdsdetesting', path='testing_stage/', recursive=True):
    if f == 'testing_stage':
        pass
    else:
        fh.rm_files(container_name='sdsdetesting', delete_path='testing_stage/' + f)
fh.rm_files(container_name='sdsdetesting', delete_path='testing_stage/')
os.unlink(file_list[0])

INFO:__main__:
copy into MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE
from 'azure://vaildtscadls.blob.core.windows.net/sdsdetesting/testing_stage/'
file_format = (type = csv   compression = GZIP skip_header = 1)
credentials= (azure_sas_token = '**MASKED**')
pattern = '.*.csv.gz';
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snow

## CSV Example

In [29]:
#skip
# CSV variant: recreate the stage with an explicit delimiter and encoding and
# no compression.
stage_query = make_data_lake_stage(stage_name='sdsdestage_test',
                                   account=os.environ['azure_account'],
                                   container='sdsdetesting',
                                   data_lake_path='testing_stage/',
                                   field_delimiter=r",",
                                   compression='None',
                                   encoding='UTF-8',
                                   sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                   file_type='csv'
                                   )
sf.run_str_query(stage_query)
# Unload 100 rows into the stage as plain csv.
sg_query = copy_into_adls_query_generator(stage_name='sdsdestage_test',
                                          azure_sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                          sf_query=r'SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.DL_LTR LIMIT 100',
                                          data_lake_path='testing_stage/',
                                          max_file_size = '32000',
                                          header='True',
                                          over_write='True')
# Last expression of the cell: the unload summary frame is displayed below.
sf.run_str_query(sg_query)

INFO:__main__:Datalake Stage path that copy into will use testing_stage/
INFO:__main__:stage_query: 
 create or replace stage sdsdestage_test
url='azure://vaildtscadls.blob.core.windows.net/sdsdetesting/testing_stage/'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = csv field_delimiter = ',' encoding = 'UTF-8' compression = None  )
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:Stage area SDSDESTAGE_TEST successfully created.
INFO:__main__:
COPY INTO @sdsdestage_test/testing_stage/
FROM  (SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.DL_LTR LIMIT 100)

max_file_size = 32000
overwrite = True
header = True;
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflak

Unnamed: 0,rows_unloaded,input_bytes,output_bytes
0,100,10578,10578


In [30]:
#skip
# Build the COPY INTO <table> statement for the csv files in the stage.
cp_query = copy_into_sf_query_generator(stage_name='sdsdestage_test/',
                                        data_lake_path='testing_stage/',
                                        table_name='sdsde_DELETE_TEST_TABLE',
                                        database=sf.sfDatabase,
                                        schema=sf.sfSchema,
                                        skip_header='1',
                                        field_delimiter=',',
                                        encoding='UTF-8',
                                        file_type='csv',
                                        pattern='.*.csv')
# Demonstration: the target table does not exist yet, so this copy fails (see
# the logged error below). The error is caught and logged so the notebook can
# continue; cp_query is reused once the table has been created.
try:
    sf.run_str_query(cp_query)
except Exception as e:
    logger.error(f'Error Created Trying to Copy Into sf table {e}')
    logger.warning('Most this table needs to be initialized' )

INFO:__main__:
copy into MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE
from @sdsdestage_test/testing_stage/
file_format = (type = csv field_delimiter = ',' encoding = 'UTF-8'   skip_header = 1)
pattern = '.*.csv';
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
ERROR:__main__:Error Created Trying to Copy Into sf table (snowflake.connector.errors.ProgrammingError) SQL compilation error:
Table 'MACHINELEARNINGOUTPUTS.DEV.SDSDE_DELETE_TEST_TABLE' does not exist
[SQL: copy into MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE
from @sdsdestage_test/testing_stage/
file_format = (type = csv field_delimiter = ',' encoding = 'UTF-8'   skip_header = 1)
pattern = '.*.csv';]
(Background on this error at: https://sqlalche.me/e/14/f405)


In order to fix this problem, ``create_sf_table_from_df`` is a utility that you can use, or you can do this any other way you see fit. There are many ways to go about doing this, but this is the nifty way that STFP.py does it, so offering it here seemed right to let users have it in their utility belt.

1. Here we are going to use sdsde to pull from the stage ``sdsdestage_test``, which is really just inside of azure data lake v2 not to be confused with the old azure blob. Remember that a stage is really just a location in azure ``azure://<your_azure_>.blob.core.windows.net/sdsdetesting/testing_stage/`` = ``sdsdestage_test``

In [31]:
#skip
fh = FileHandling(os.environ['connection_str'])

# The stage url already ends in testing_stage/ and the unload wrote to
# @stage/testing_stage/, so the files sit under testing_stage/testing_stage.
file_list = fh.ls_blob(container_name='sdsdetesting',  path='testing_stage/testing_stage', recursive=True)
logger.info(f'files in blob {file_list}')
# Download the first file into the working directory so pandas can read it.
blob_puller(files=['testing_stage/testing_stage/' + file_list[0]],
            connection_str=fh.connection_string,
            container_name='sdsdetesting',
            drop_location='.',
            overwrite=True)
df = pd.read_csv(file_list[0])
# df = pd.read_csv(file_list[0], header=None,
#             names=sf.run_str_query("SELECT * FROM BIDE_EDWDB_ARA_PROD.dbo.FactScan LIMIT 2;").columns)
logger.info(f'Query Size {df.shape}')
# varchar=False: let the pandas dtypes drive the snowflake column types.
create_table_sql = create_sf_table_from_df(df=df, 
                                           table_name_sf='MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE',
                                           varchar=False)
sf.run_str_query(create_table_sql)
# Re-run the COPY INTO from the previous cell now that the table exists, then
# drop the test table and the stage.
sf.run_str_query(cp_query)
sf.run_str_query("DROP TABLE MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE;")
sf.run_str_query('DROP STAGE sdsdestage_test;')
# Cleanup: call rm_files for every listed blob except the 'testing_stage'
# entry itself, then for the folder path, and remove the local download.
for f in fh.ls_blob(container_name='sdsdetesting', path='testing_stage/', recursive=True):
    if f == 'testing_stage':
        pass
    else:
        fh.rm_files(container_name='sdsdetesting', delete_path='testing_stage/' + f)
fh.rm_files(container_name='sdsdetesting', delete_path='testing_stage/')
os.unlink(file_list[0])

INFO:__main__:files in blob ['data_0_0_0.csv']
INFO:sdsde.azure.filehandling:testing_stage/testing_stage/data_0_0_0.csv to ./data_0_0_0.csv
INFO:__main__:Query Size (100, 8)
INFO:__main__:
        create or replace table MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE (ECID NUMBER, LIKELIHOODTORETURNRATE FLOAT, LIKELIHOODTORETURNLABEL VARCHAR, FISCALYEAR VARCHAR, MODELTYPE VARCHAR, EXPERIMENT VARCHAR, MODELNAME VARCHAR, UPLOADTIME VARCHAR);
        
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:Table SDSDE_DELETE_TEST_TABLE successfully created.
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loa

## Parquet  Example

### Stage Approach 

In [33]:
#skip
# Parquet round trip through a named stage:
# snowflake -> ADLS stage -> local file -> new snowflake table -> cleanup.
stage_query = make_data_lake_stage(stage_name='sdsdestage_test',
                                   account=os.environ['azure_account'],
                                   container='sdsdetesting',
                                   data_lake_path='testing_stage/',
                                   sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                   file_type='parquet'
                                  )

sf.run_str_query(stage_query)

# Unload 100 rows into the stage; the file format comes from the stage.
sg_query = copy_into_adls_query_generator(stage_name='sdsdestage_test',
                                          azure_sas_token=os.environ['DATALAKE_SAS_TOKEN_SECRET'],
                                          sf_query=r'SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.DL_LTR LIMIT 100',
                                          data_lake_path='testing_stage/',
                                          max_file_size = '32000',
                                          header='True',
                                          over_write='True')

sf.run_str_query(sg_query)

# NOTE(review): these two names are already imported in the GZIP example cell;
# the repeat only matters if this cell is run on its own.
from sdsde.wrapper.azurewrapper import blob_puller
from sdsde.azure.filehandling import FileHandling

# The stage url already ends in testing_stage/ and the unload wrote to
# @stage/testing_stage/, so the files sit under testing_stage/testing_stage.
fh = FileHandling(os.environ['connection_str'])
file_list = fh.ls_blob(container_name='sdsdetesting',  path='testing_stage/testing_stage', recursive=True)
logger.info(f'files in blob {file_list}')
# Download the first file into the working directory so pandas can read it.
blob_puller(files=['testing_stage/testing_stage/' + file_list[0]],
            connection_str=fh.connection_string,
            container_name='sdsdetesting',
            drop_location='.',
            overwrite=True)
df = pd.read_parquet(file_list[0])

logger.info(f'Query Size {df.shape}')

# varchar=False: let the pandas dtypes drive the snowflake column types.
create_table_sql = create_sf_table_from_df(df=df, 
                                           table_name_sf='MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE',
                                           varchar=False)
sf.run_str_query(create_table_sql)

# The parquet copy generator is additionally given the dataframe's dtypes.
cp_query = parquet_copy_into_sf_query_generator(data_types=dict(df.dtypes),
                                                stage_name='sdsdestage_test/',
                                                data_lake_path='testing_stage/',
                                                table_name='sdsde_DELETE_TEST_TABLE',
                                                database=sf.sfDatabase,
                                                schema=sf.sfSchema,
                                                file_type='parquet',
                                                pattern='.*.parquet',
                                                infer_dtypes=True)
sf.run_str_query(cp_query)
# Drop the test table and stage.
sf.run_str_query("DROP TABLE MACHINELEARNINGOUTPUTS.DEV.sdsde_DELETE_TEST_TABLE;")
sf.run_str_query('DROP STAGE sdsdestage_test;')
# Cleanup: call rm_files for every listed blob except the 'testing_stage'
# entry itself, then for the folder path, and remove the local download.
for f in fh.ls_blob(container_name='sdsdetesting', path='testing_stage/', recursive=True):
    if f == 'testing_stage':
        pass
    else:
        fh.rm_files(container_name='sdsdetesting', delete_path='testing_stage/' + f)
fh.rm_files(container_name='sdsdetesting', delete_path='testing_stage/')
os.unlink(file_list[0])

INFO:__main__:Datalake Stage path that copy into will use testing_stage/
INFO:__main__:stage_query: 
 create or replace stage sdsdestage_test
url='azure://vaildtscadls.blob.core.windows.net/sdsdetesting/testing_stage/'
credentials=(azure_sas_token='**MASKED**')
encryption=(type= 'NONE')
file_format = (type = parquet        )
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:executing query
INFO:sdsde.snowflake.query:data loaded from snowflake
INFO:sdsde.snowflake.query:connection to snowflake has been turned off
INFO:sdsde.snowflake.query:Stage area SDSDESTAGE_TEST successfully created.
INFO:__main__:
COPY INTO @sdsdestage_test/testing_stage/
FROM  (SELECT * FROM MACHINELEARNINGOUTPUTS.DEV.DL_LTR LIMIT 100)

max_file_size = 32000
overwrite = True
header = True;
INFO:sdsde.snowflake.query:testing connection
INFO:sdsde.snowflake.query:sqlalchemy snowflake engine created
INFO:sdsde.snowflake.query:execut

# Create

In [16]:
#hide
# Export the `#export` cells of the project's notebooks to python modules
# (see the "Converted ..." lines printed below).
from nbdev.export import notebook2script
notebook2script()

Converted 01_azure.ipynb.
Converted 02_utils_dataframes.ipynb.
Converted 02_utils_parseyaml.ipynb.
Converted 02_utils_stfp.ipynb.
Converted 02_utils_traininghelpers.ipynb.
Converted 02_utils_traininghelpers_fastai.ipynb.
Converted 03_dstools_preparedata.ipynb.
Converted 04_snowflake_copyinto.ipynb.
Converted 04_snowflake_copyinto2.ipynb.
Converted 04_snowflake_query.ipynb.
Converted 05_azure_wrappers.ipynb.
Converted 06_modeling_inference.ipynb.
Converted 06_modeling_inference_fastai.ipynb.
Converted 06_modeling_premodel.ipynb.
Converted 06_modeling_preprocessing.ipynb.
Converted 06_modeling_preprocessing_fastai.ipynb.
Converted 06_modeling_training.ipynb.
Converted 06_modeling_training_fastai.ipynb.
Converted 07_Binary_Classification_Fastai_Example_Notebook.ipynb.
Converted index.ipynb.
