## Debugging Pandas

Working through different debugging techniques for pandas method chains, so that the result of each processing step can be inspected and understood.

### 1. Datasets

In [27]:
import pandas as pd
from IPython.display import display, HTML

In [2]:
# Opt in to pandas' future behaviour of not silently downcasting results
# (see the pandas options reference for `future.no_silent_downcasting`).
pd.set_option('future.no_silent_downcasting', True)

In [3]:
# Remote location of the zipped vehicles CSV (hosted on GitHub).
fname = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'

In [5]:
# Load the raw vehicles data.
# NOTE(review): low_memory=False presumably avoids chunked parsing and the
# mixed-dtype inference that comes with it — confirm against the pandas.read_csv docs.
autos = pd.read_csv(fname, low_memory=False)

In [6]:
# Preview the first rows of the raw frame to sanity-check the load.
autos.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [43]:
# Class statements
class DebugException(Exception):
    """Raised by ``debug_var`` to halt execution once a value has been captured."""


def debug_var(thing, *, name='debug_item', raise_ex=True):
    """Stash ``thing`` in a module-level global and optionally stop execution.

    Intended as a callback for ``DataFrame.apply`` / ``.pipe`` so that the
    object being processed can be inspected afterwards in the notebook.

    Parameters
    ----------
    thing : object
        The value to capture (e.g. a row ``Series`` when used with
        ``apply(..., axis=1)``).
    name : str, keyword-only
        Name of the global variable that will hold ``thing``.
    raise_ex : bool, keyword-only
        When true, raise ``DebugException`` right after capturing so the
        surrounding operation stops at this item.

    Returns
    -------
    object
        ``thing`` itself, when ``raise_ex`` is false.

    Raises
    ------
    DebugException
        When ``raise_ex`` is true. The message carries ``thing.name`` if the
        object has one, otherwise ``None``.
    """
    globals()[name] = thing
    if raise_ex:
        # Not every captured object has a ``.name`` (a DataFrame handed over by
        # ``.pipe`` does not); fall back to None so the caller still gets the
        # intended DebugException instead of an unrelated AttributeError.
        label = getattr(thing, 'name', None)
        raise DebugException(
            f"Debug stop at index {label}"
        )
    return thing

In [49]:
# Functions
# Timezone conversion
def to_tz(df_, time_col, tz_offset, tz_name):
    """Parse ``time_col`` group-wise by ``tz_offset`` and convert to ``tz_name``.

    Rows of ``df_`` are grouped on the ``tz_offset`` column. Within each group
    the ``time_col`` values are parsed with ``pd.to_datetime``, localized to
    the zone named by that group's key, and then converted to ``tz_name``.

    Returns the transformed ``time_col`` as produced by ``groupby.transform``.
    """
    def _localize_group(chunk):
        # ``chunk.name`` is used as the source zone for this group's rows.
        parsed = pd.to_datetime(chunk)
        localized = parsed.dt.tz_localize(chunk.name, ambiguous='infer')
        return localized.dt.tz_convert(tz_name)

    grouped_times = df_.groupby(tz_offset)[time_col]
    return grouped_times.transform(_localize_group)

# Display the DataFrame with custom row/column display limits and an optional HTML title
def show(df_, rows=20, cols=30, title=None):
    """Render ``df_`` under temporary display limits and return it unchanged.

    Parameters
    ----------
    df_ : object
        The frame to display; returned as-is so the call can sit inside a
        ``.pipe`` chain.
    rows : int
        Value applied to pandas' ``display.min_rows`` while rendering.
    cols : int
        Value applied to pandas' ``display.max_columns`` while rendering.
    title : str or None
        When truthy, shown first as an ``<h2>`` heading.
    """
    if title:
        heading = HTML(f'<h2>{title}</h2>')
        display(heading)
    # The options only apply inside the ``with`` block and are restored afterwards.
    display_opts = ('display.min_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*display_opts):
        display(df_)
    return df_

# Capture the intermediate state of the DataFrame in a global variable for later inspection
def get_var(df, var_name):
    """Bind ``df`` to the module-level global ``var_name`` and return ``df``.

    Returning the input unchanged lets this be dropped into a ``.pipe`` chain
    to snapshot an intermediate result without altering the chain's output.
    """
    module_namespace = globals()
    module_namespace.update({var_name: df})
    return df

# Deliberately raise an error, to see how exceptions from a piped step surface in a Jupyter notebook
def err(*args):
    """Ignore every argument and always raise ``ZeroDivisionError``."""
    numerator, denominator = 1, 0
    numerator / denominator

# Main tweak functions for the dataset
def tweak_autos(autos):
    """Select, clean and downcast columns of the raw vehicles frame.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw frame; must contain every column listed in ``cols`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected columns (minus ``trany`` and
        ``eng_dscr``, which are dropped at the end) plus the derived columns
        ``automatic``, ``speeds``, ``offset``, ``str_date`` and ``ffs``.
        ``createdOn`` is replaced by the result of ``to_tz``.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders',
            'displ', 'drive', 'eng_dscr', 'fuelCost08',
            'make', 'model', 'trany', 'range', 'createdOn',
            'year']
    return (autos
            [cols]
            .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
                    displ=autos.displ.fillna(0).astype('float16'),
                    drive=autos.drive.fillna('Other').astype('category'),
                    automatic=autos.trany.str.contains('Auto'),
                    # `(\d+)` captures the whole number. The previous `(\d)+`
                    # repeated a one-digit group, so only the LAST digit was
                    # kept and "10-spd" turned into 0. Rows without any digit
                    # fall back to 20.
                    speeds=autos.trany.str.extract(r'(\d+)', expand=False).fillna('20').astype('int8'),
                    # Three-letter zone code after the time; 'EDT' is mapped to
                    # 'EST5EDT' before being used as a zone name by to_tz.
                    offset=autos.createdOn.str.extract(r'\d\d:\d\d ([A-Z]{3}?)', expand=False).replace('EDT', 'EST5EDT'),
                    # Rebuild the timestamp text without the weekday and zone code:
                    # characters 4-18 plus the trailing 4 characters.
                    str_date=(autos.createdOn.str.slice(4,19) + ' ' + autos.createdOn.str.slice(-4)),
                    # Lambda form: needs the `str_date` / `offset` columns created just above.
                    createdOn=lambda df_: to_tz(df_, 'str_date', 'offset', 'America/New_York'),
                    ffs=autos.eng_dscr.str.contains('FFS')
            )
            # Pipe the HTML display func
            # .pipe(show, rows=2, title='New Cols')
            # Pipe the intermediate DataFrame step
            # .pipe(get_var, 'new_cols')
            # Pipe error to highlight error handling
            # .pipe(err)
            .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16',
                     'range': 'int16', 'year': 'int16', 'make': 'category'})
            .drop(columns=['trany', 'eng_dscr'])
    )

In [50]:
# Run the full cleaning chain on the raw frame, then display the result.
autos2 = tweak_autos(autos)
autos2

Unnamed: 0,city08,comb08,highway08,cylinders,displ,drive,fuelCost08,make,model,range,createdOn,year,automatic,speeds,offset,str_date,ffs
0,19,21,25,4,2.000000,Rear-Wheel Drive,2000,Alfa Romeo,Spider Veloce 2000,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
1,9,11,14,12,4.898438,Rear-Wheel Drive,3850,Ferrari,Testarossa,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,False
2,23,27,33,4,2.199219,Front-Wheel Drive,1550,Dodge,Charger,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
3,10,11,12,8,5.199219,Rear-Wheel Drive,3850,Dodge,B150/B250 Wagon 2WD,0,2013-01-01 00:00:00-05:00,1985,True,3,EST,Jan 01 00:00:00 2013,
4,17,19,23,4,2.199219,4-Wheel or All-Wheel Drive,2700,Subaru,Legacy AWD Turbo,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41139,19,22,26,4,2.199219,Front-Wheel Drive,1900,Subaru,Legacy,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41140,20,23,28,4,2.199219,Front-Wheel Drive,1850,Subaru,Legacy,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
41141,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,2000,Subaru,Legacy AWD,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41142,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,2000,Subaru,Legacy AWD,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True


In [48]:
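# Uncomment to stop at the first row and stash it in the global `debug_item`
# (raises DebugException on purpose):
# autos2.apply(debug_var, axis=1)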