In [1]:
# Core python utils
import os
from contextlib import contextmanager

import pandas as pd

import yaml         # %pip install pyyaml
import pydataset
# import helpers

# Acquire config

I've chosen to go with .yaml as the config file language.  Although working with .json or .conf would mean that the project could avoid adding another module dependency, I wanted something that could handle a hierarchical data structure and also support comments.  In the end .yaml won.

To work with yaml, we require the help of `pyyaml`

In [2]:
import yaml

# Parse the project's YAML configuration and keep the result in CFG.
with open('config.yaml') as config_file:
    CFG = yaml.safe_load(config_file)

# Show what kind of object the parser handed back.
print(type(CFG))

<class 'dict'>


With `pyyaml` installed, we can parse the configuration file easily, loading it into a dictionary object.

In [3]:
def load_config(path='config.yaml'):
    """Load a YAML configuration file.

    Args:
        path: Location of the YAML file to read.  Defaults to 'config.yaml',
            resolved against the current working directory.

    Returns:
        The parsed contents of the file, as returned by `yaml.safe_load`.
    """
    # Honor the `path` argument rather than a hard-coded file name, so callers
    # can point at a different configuration file.
    with open(path) as f:
        config = yaml.safe_load(f)
    return config

CFG = load_config()

In [28]:
def get_config(dot_notation: str, config: dict = CFG) -> object:
    """Look up a value in a nested config mapping using a dotted key path.

    Args:
        dot_notation: Keys joined by '.', e.g. 'acquire.cache.path'.  Each
            segment selects one level of nesting.
        config: Mapping to search.  Defaults to the CFG object that was bound
            when this function was defined; pass a config explicitly to
            search a freshly loaded one.

    Returns:
        The value stored at the given path (a nested mapping or a leaf
        value), or None if any key along the path is missing.
    """
    for key in dot_notation.split('.'):
        # A missing key, or a leaf value reached before the path is used up,
        # means the path does not exist.  Report that as None instead of
        # failing with AttributeError on the next `.get` call.
        if not hasattr(config, 'get'):
            return None
        config = config.get(key)
    return config

get_config('acquire.cache')

{'path': './cache/data.csv',
 'refresh': True,
 'auto_update_refresh_config': True}

In [5]:
'acquire' in CFG

True

In [6]:
# Acquire methods by name.  Of these, only 'pydataset' and 'script' have
# loader functions in the cells shown in this notebook.
supported_acquire_methods = [
    'pydataset',
    'script',
    'csv', # TODO: no loader yet
    'json', # TODO: no loader yet
    # 'sql'  (not enabled)
    ]

For now I am only planning on supporting the intake of data through pydataset or a custom script.

Pydataset is easy enough.  All I need to do is ask which dataset and then use pydataset to load the data.

Scripting is a bit more complex.  The only method I can find to load data from an external script is to use import.  For that purpose I'm using importlib.  I would like this section to be flexible, so I'm going to try to implement it in a way where the script can be stored anywhere in the filesystem.

## Getting script data

In [7]:
from contextlib import contextmanager

## Allows the temporary changing of python's directory context.  Useful for running scripts in a different context.

@contextmanager
def cwd(path):
    """Run the body of a `with` block with `path` as the working directory.

    The directory that was current on entry is restored on exit, including
    when the body raises.
    """
    original_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        # Always switch back, whether or not the body succeeded.
        os.chdir(original_dir)

with cwd('/'):
    print(f'inside context manager: {os.getcwd()}')
print(f'after context manager: {os.getcwd()}')

inside context manager: a:\
after context manager: a:\github\codeup-data-science\boilerplate-ds-project


In [8]:
# path = 'C:\\Users\\Crux\\Desktop\\script.py'

def get_script_data(path: str) -> object:
    """Executes a python script in the specified path and returns its DATA variable.

    The script is executed with its own folder as the working directory, so
    relative paths used inside the script resolve against that folder.

    Args:
        path: Path to a python script.  The script must contain a DATA variable to be imported.

    Returns:
        DATA: A data object in a suitable format.  Usually a Pandas DataFrame.

    Raises:
        ImportError: If no module loader can be created for `path`.
        Exception: If DATA variable is not found in target script.
    """
    # Imports are within the function since they are only needed if this function is called.
    # `importlib.util` is a submodule and has to be imported explicitly;
    # `import importlib` alone does not guarantee it is loaded.
    import importlib.util
    import sys

    # Resolve the path before changing directory.  Otherwise a relative path
    # would be re-interpreted against the script's folder, and a bare file
    # name would give an empty directory that os.chdir rejects.
    script_path = os.path.abspath(path)
    directory, name = os.path.split(script_path)

    spec = importlib.util.spec_from_file_location(name, script_path)
    if spec is None or spec.loader is None:
        raise ImportError(f'Could not create a module loader for "{path}". Is it a python script?')
    custom_script = importlib.util.module_from_spec(spec)

    # Allow target script to run in its own relative paths
    with cwd(directory):
        # Register the module before executing it, as the importlib recipe
        # for importing a source file directly does.
        sys.modules[name] = custom_script
        spec.loader.exec_module(custom_script)

    try:
        return custom_script.DATA
    except AttributeError as err:
        # Only a missing DATA attribute is translated; other errors propagate.
        raise Exception(
            f'Variable: DATA does not exist in script located at "{path}". '
            'Assign the data you wish to load to the DATA variable. '
            "eg. `DATA = pydataset.data('iris')`"
        ) from err


In [9]:
#### This works, but don't want to wait every time.
# path = '../detection_project/acquire.py'
# get_script_data(path).head()

In [10]:
path = 'C:\\Users\\Crux\\Desktop\\script.py'
df = get_script_data(path)
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [11]:
os.path.split(path)

('C:\\Users\\Crux\\Desktop', 'script.py')

### Using config file to get script data

In [41]:
# Load the script named under 'acquire.script'; fall back to an empty frame
# when no script is configured.
loc = get_config('acquire.script')
temp = get_script_data(loc) if loc else pd.DataFrame()
temp.head()

script.py


Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [45]:
def read_script(config=CFG):
    """Load data from the script named under 'acquire.script' in the config.

    Args:
        config: Parsed configuration mapping.  Defaults to the CFG object
            bound when this function was defined.

    Returns:
        Whatever get_script_data returns for the configured script, or None
        when no script is configured.

    Raises:
        Exception: If a script path is configured but does not exist.
    """
    # Look the path up in the config that was passed in, not the global
    # default, so a freshly loaded config is honored.
    path = get_config('acquire.script', config)
    if not path:
        return None
    if not os.path.exists(path):
        raise Exception(f'The path specified in "acquire.script": "{path}" does not exist.')
    return get_script_data(path)

In [44]:
read_script().head(2)

script.py


Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa


## Pydataset from config

In [46]:
def read_pydataset(config=CFG):
    """Load the pydataset dataset named under 'acquire.pydataset' in the config.

    Args:
        config: Parsed configuration mapping.  Defaults to the CFG object
            bound when this function was defined.

    Returns:
        The object returned by `pydataset.data` for the configured name, or
        None when no dataset is configured or loading it fails.
    """
    import pydataset

    # Use the config that was passed in rather than the global default.
    dataset = get_config('acquire.pydataset', config)

    # Without a name there is nothing to load.  pydataset.data() called with
    # no item returns its catalogue of available datasets rather than
    # failing, which would otherwise be passed on as if it were real data.
    if not dataset:
        return None

    try:
        return pydataset.data(dataset)
    except Exception:
        # Best effort: a dataset that cannot be loaded is reported as "no data".
        return None

## Caching via config

In [13]:
cache_cfg = get_config('acquire.cache')
cache_cfg

{'path': './cache/data.csv',
 'refresh': True,
 'auto_update_refresh_config': True}

In [14]:
path, name = os.path.split(cache_cfg['path'])
path, name

('./cache', 'data.csv')

In [15]:
# Might be useful for allowing different file types later.
os.path.splitext(name)

('data', '.csv')

In [16]:
# Create the cache folder (the directory part of the configured cache path)
# when it is not there yet.
if not os.path.exists(path):
    os.makedirs(path)

In [48]:
import pandas as pd
def write_cache(data: pd.DataFrame, config=CFG):
    """Write `data` as CSV to the cache file named under 'acquire.cache.path'.

    The folder part of the cache path is created when it does not exist.
    The frame's index is not written to the file.

    Args:
        data: Frame to store.
        config: Parsed configuration mapping.  Defaults to the CFG object
            bound when this function was defined.

    Raises:
        ValueError: If the config has no cache path.
    """
    cache_cfg = get_config('acquire.cache', config) or {}
    path = cache_cfg.get('path')
    if not path:
        raise ValueError('No cache path configured under "acquire.cache.path".')

    folder = os.path.dirname(path)
    # A bare file name has no folder part: os.makedirs('') would fail, and the
    # current directory already exists, so there is nothing to create.
    if folder:
        os.makedirs(folder, exist_ok=True)

    data.to_csv(path, index=False)

In [18]:
write_cache(df)

In [19]:
def read_cache(config=CFG):
    """Read the cached CSV named under 'acquire.cache.path' in the config.

    Args:
        config: Parsed configuration mapping.  Defaults to the CFG object
            bound when this function was defined.

    Returns:
        The cache file loaded with `pd.read_csv`, or None when no cache path
        is configured or the file does not exist.
    """
    # Use the config that was passed in rather than the global CFG.
    cache_cfg = get_config('acquire.cache', config) or {}
    path = cache_cfg.get('path')

    if not path or not os.path.exists(path):
        return None
    return pd.read_csv(path)

## Wrangle logic

In [20]:
# Get a fresh config
# Try load cache if not refresh == True
# If no data
    # Try load script
# If no data
    # Try load pydataset
# If no data
    # raise exception
# If cache tried but failed write cache 
# If refresh is true write cache


In [49]:
def wrangle_from_config(path='config.yaml'):
    """Acquire the project's data as described by the configuration file.

    Sources are tried in this order, stopping at the first that yields data:
      1. the cache file, unless 'acquire.cache.refresh' is truthy;
      2. the script configured under 'acquire.script';
      3. the dataset configured under 'acquire.pydataset'.
    Afterwards the data is written to the cache when a cache path is
    configured and either a refresh was requested or no cache file exists.

    Args:
        path: Configuration file passed to load_config.

    Returns:
        The data obtained from the first source that provided any.

    Raises:
        Exception: If none of the sources produced data.
    """
    # Grab a fresh copy of config to avoid having to restart kernel.
    config = load_config(path)
    data = None

    # Read the cache settings from the fresh config, not the global default,
    # so edits to the file take effect on this call.
    cache_config = get_config('acquire.cache', config) or {}
    cache_path = cache_config.get('path')
    cache_refresh = cache_config.get('refresh')

    if cache_path and not cache_refresh:
        data = read_cache(config)

    if data is None:
        data = read_script(config)

    if data is None:
        data = read_pydataset(config)

    if data is None:
        raise Exception('Data not found. Is configuration file correct?')

    # Write the cache when a refresh was requested or no cache file exists yet.
    if cache_path and (cache_refresh or not os.path.exists(cache_path)):
        write_cache(data, config)

    return data
    

In [54]:
wrangle_from_config().head()

Unnamed: 0.1,Unnamed: 0,date,time,ip,endpoint,user_id,cohort_id,cohort_name,slack,start_date,end_date,program_id
0,0,2018-01-26,09:55:03,97.105.19.61,/,1,8.0,Hampton,#hampton,2015-09-22,2016-02-06,1.0
1,1,2018-01-26,09:56:02,97.105.19.61,java-ii,1,8.0,Hampton,#hampton,2015-09-22,2016-02-06,1.0
2,2,2018-01-26,09:56:05,97.105.19.61,java-ii/object-oriented-programming,1,8.0,Hampton,#hampton,2015-09-22,2016-02-06,1.0
3,3,2018-01-26,09:56:06,97.105.19.61,slides/object_oriented_programming,1,8.0,Hampton,#hampton,2015-09-22,2016-02-06,1.0
4,4,2018-01-26,09:56:24,97.105.19.61,javascript-i/conditionals,2,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2.0
