In [2]:
import seaborn as sns

### Scatter Plotting

In [1]:
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import patheffects
%matplotlib inline

In [6]:
def scatter_hfa(df, slices, setlims=None, markercolors=True, save=True):
    """Scatter HECTOR against RECS average heated floor area, one marker per group.

    The RECS rows of ``df`` supply the x values, the HECTOR rows the y values,
    and marker area scales with the RECS ``count`` column. A y=x reference
    line and lighter y=1.2x / y=0.8x guide lines are drawn on equal axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Label' (with 'RECS' and 'HECTOR' rows),
        'Heated Floor Area', 'count' and every column named in ``slices``.
        NOTE(review): x and y are paired by position, so the RECS and HECTOR
        rows are presumably listed in the same group order -- confirm with
        the caller.
    slices : sequence of str
        One or two column names. They label the markers and appear in the
        title; the last one selects the marker colour.
    setlims : sequence or None
        Optional overrides for the computed [low, high] axis limits. A
        ``None`` entry leaves that limit untouched.
    markercolors : bool
        Colour markers by ``slices[-1]`` when true, plain blue otherwise.
    save : bool
        When true, write the figure into 'saved images' (file name built from
        the title and the module-level ``outputdb_filename``) and crop it
        with ``trim_white``.

    Raises
    ------
    ValueError
        If ``slices`` does not hold exactly one or two names.
    """
    if len(slices) not in (1, 2):
        raise ValueError('slices must name one or two columns, got {!r}'.format(slices))

    plt.rcParams['figure.figsize'] = 15, 15  # set image size
    max_marker_size = 400

    def add_labels(marker_labels, xs, ys):
        # Offset each label to the side of its point that faces away from y=x.
        for label, px, py in zip(marker_labels, xs, ys):
            above = py > px
            plt.annotate(label, xy=(px, py),
                         xytext=(-5, 5) if above else (5, -5),
                         textcoords='offset points',
                         ha='right' if above else 'left',
                         va='bottom' if above else 'top',
                         alpha=0.8)

    is_recs = df['Label'] == 'RECS'
    is_hector = df['Label'] == 'HECTOR'
    x = df.loc[is_recs, 'Heated Floor Area']
    y = df.loc[is_hector, 'Heated Floor Area']

    # Marker area: the largest count anywhere in df maps to max_marker_size.
    z = df.loc[is_recs, 'count'] / df['count'].max() * max_marker_size

    if markercolors:
        categories = df.loc[is_recs, slices[-1]].tolist()
        # Integer code per distinct category; the numbering follows set order
        # and is therefore arbitrary, it only has to be consistent.
        color_index = dict((cat, k) for k, cat in enumerate(set(categories)))
        plt.scatter(x, y, s=z, c=[color_index[cat] for cat in categories],
                    cmap=plt.cm.Set1, alpha=0.7)
    else:
        plt.scatter(x, y, s=z, c='b', alpha=0.7)

    # Labels are read from the RECS rows only so they line up with x
    # regardless of how the RECS and HECTOR rows are ordered within df.
    if len(slices) == 1:
        marker_labels = df.loc[is_recs, slices[0]]
        title = 'Average Heated Floor Area, by {}'.format(slices[0])
    else:
        marker_labels = zip(df.loc[is_recs, slices[0]], df.loc[is_recs, slices[1]])
        title = 'Average Heated Floor Area, by {} and {}'.format(slices[0], slices[1])

    add_labels(marker_labels, x, y)

    # Shared limits for both axes so the y=x line runs corner to corner.
    ax = plt.gca()
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]

    if setlims is not None:
        print("Overwriting calculated scale limits ({}) with user-specified limits ({})".format(lims, setlims))
        for i, setlim in enumerate(setlims):
            if setlim is not None:
                lims[i] = setlim

    # y = x line
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)

    # +20% line (y = 1.2 x); both end points are scaled so the slope stays
    # correct even when the lower limit is not zero
    ax.plot(lims, [lim * 1.2 for lim in lims], 'k-', alpha=0.1, zorder=0)

    # -20% line (y = 0.8 x)
    ax.plot(lims, [lim * 0.8 for lim in lims], 'k-', alpha=0.1, zorder=0)

    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)

    title_size = 20
    tick_size = 16

    ax.set_xlabel('RECS', fontsize=title_size)
    ax.set_ylabel('NREL National-Scale Analysis', fontsize=title_size)
    plt.tick_params(axis='both', which='major', labelsize=tick_size)
    plt.title(title, fontsize=title_size)

    if save:
        filename = os.path.basename(outputdb_filename)
        filename = os.path.join('saved images','Scatter_{}_{}.png'.format(title, filename.replace('Output_','').replace('.sqlite','')))
        plt.savefig(filename, bbox_inches='tight', dpi=200)
        trim_white(filename)

In [None]:
def draw_scatter_plot(df, cols, marker_labels, slicer, weighted_area=True, setlims=None, marker_colors=None, marker_shapes=None, size='medium', axis_titles=None, marker_color_all=None, show_labels=True, leg_label=None):
    """Draw one x-vs-y comparison scatter with a y=x line into the current axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Holds the two columns named in ``cols``. It must also hold
        'HouseCount' when ``weighted_area`` is true, and 'level_0' /
        'level_1' when ``marker_shapes`` is used.
    cols : sequence of str
        ``cols[0]`` is plotted on x, ``cols[1]`` on y.
    marker_labels : iterable
        One annotation per point, in row order of ``df`` (ignored in the
        per-shape mode, which builds (level_0, level_1) labels itself).
    slicer : str
        Key into the built-in title table; an unknown key is used verbatim
        as the plot title.
    weighted_area : bool
        Take marker areas from ``df['HouseCount']`` instead of a fixed size.
    setlims : sequence or None
        Optional overrides for the computed [low, high] axis limits; a
        ``None`` entry leaves that limit untouched.
    marker_colors : sequence or None
        Per-point colour values mapped through the Set1 colormap.
    marker_shapes : sequence or None
        Only honoured together with ``marker_colors``. One scatter call is
        made per distinct value; rows are selected via ``df['level_0']`` and
        the marker glyph is the value's text from its third character on.
    size : {'large', 'medium', 'small'}
        Selects title / axis-label / tick font sizes.
    axis_titles : sequence of str or None
        (x label, y label); defaults to RECS vs. NREL National-Scale Analysis.
    marker_color_all : color or None
        Colour used when ``marker_colors`` is None; defaults to 'b'.
    show_labels : bool
        Annotate the points when true.
    leg_label : str or None
        Legend label handed to every scatter call.

    Raises
    ------
    ValueError
        If ``size`` is not one of the three supported names.
    """
    # (title, axis label, tick label) font sizes per figure size
    font_sizes = {'large': (20, 24, 16),
                  'medium': (16, 20, 12),
                  'small': (16, 16, 12)}
    if size not in font_sizes:
        raise ValueError("size must be 'large', 'medium' or 'small', got {!r}".format(size))
    title_size, axis_label_size, tick_size = font_sizes[size]

    def add_labels(labels, xs, ys):
        if not show_labels:
            return
        # Offset each label to the side of its point that faces away from y=x.
        for label, px, py in zip(labels, xs, ys):
            above = py > px
            plt.annotate(label, xy=(px, py),
                         xytext=(-5, 5) if above else (5, -5),
                         textcoords='offset points',
                         ha='right' if above else 'left',
                         va='bottom' if above else 'top',
                         alpha=0.8)

    if marker_color_all is None:
        marker_color_all = 'b'

    title_dict = {'by_region':'By Custom Region',
                  'by_vintage':'By Vintage',
                  'by_fuel':'By Space Heating Fuel Type',
                  'by_region_fuel':'By Custom Region and Space Heating Fuel Type',
                  'by_vintage_fuel':'By Vintage and Space Heating Fuel Type',
                  'by_region_vintage':'By Custom Region and Vintage'}
    title = title_dict.get(slicer, slicer)

    x = df[cols[0]]
    y = df[cols[1]]

    if marker_colors is not None and marker_shapes is not None:
        # One colour code per level_1 value, shared by every shape, with the
        # colormap range pinned so a category keeps its colour in each call.
        color_index = dict((cat, k) for k, cat in enumerate(set(df['level_1'])))
        color_max = len(color_index) - 1
        for shape in set(marker_shapes):
            this_marker = df.loc[df['level_0'] == shape, :]
            xs = this_marker[cols[0]]
            ys = this_marker[cols[1]]
            colors = [color_index[cat] for cat in this_marker['level_1'].tolist()]
            # Sizes must come from the same subset as xs / ys.
            area = this_marker['HouseCount'] if weighted_area else 1000
            plt.scatter(xs, ys, c=colors, cmap=plt.cm.Set1, vmin=0, vmax=color_max,
                        marker='${}$'.format(shape[2:]), s=area, alpha=0.7, label=leg_label)
            add_labels(zip(this_marker['level_0'], this_marker['level_1']), xs, ys)
    elif marker_colors is not None:
        area = df['HouseCount'] if weighted_area else 50
        plt.scatter(x, y, c=marker_colors, cmap=plt.cm.Set1, s=area, alpha=0.7, label=leg_label)
        add_labels(marker_labels, x, y)
    else:
        if weighted_area:
            plt.scatter(x, y, s=df['HouseCount'], c=marker_color_all, alpha=0.5, label=leg_label)
        else:
            plt.scatter(x, y, c=marker_color_all, alpha=0.5, label=leg_label)
        add_labels(marker_labels, x, y)

    # Shared limits for both axes so the y=x line runs corner to corner.
    ax = plt.gca()
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]

    if setlims is not None:
        print("Overwriting calculated scale limits ({}) with user-specified limits ({})".format(lims, setlims))
        for i, setlim in enumerate(setlims):
            if setlim is not None:
                lims[i] = setlim

    # y = x line
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)

    # +20% line (y = 1.2 x); both end points are scaled so the slope stays
    # correct even when the lower limit is not zero
    ax.plot(lims, [lim * 1.2 for lim in lims], 'k-', alpha=0.1, zorder=0)

    # -20% line (y = 0.8 x)
    ax.plot(lims, [lim * 0.8 for lim in lims], 'k-', alpha=0.1, zorder=0)

    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)

    if axis_titles is None:
        ax.set_xlabel('RECS', fontsize=axis_label_size)
        ax.set_ylabel('NREL National-Scale Analysis', fontsize=axis_label_size)
    else:
        ax.set_xlabel(axis_titles[0], fontsize=axis_label_size)
        ax.set_ylabel(axis_titles[1], fontsize=axis_label_size)
    plt.tick_params(axis='both', which='major', labelsize=tick_size)
    plt.title(title, fontsize=title_size)

In [2]:
def do_plot(slices, fields, size='medium', weighted_area=True, save=False, setlims=None, marker_color=False, marker_shape=False, version=None, marker_color_all=None, show_labels=True, leg_label=None):
    """Draw one RECS-vs-HECTOR scatter subplot per entry of ``slices``.

    For every slicer name the attribute of that name is read from the
    module-level ``recs`` and ``hector`` objects, ``fields`` is looked up in
    both, and the two results are paired in a frame with columns 'RECS' and
    'HECTOR' that is handed to ``draw_scatter_plot``.

    Parameters
    ----------
    slices : sequence of str
        Attribute names on ``recs`` / ``hector``; one subplot each, laid out
        in a single row.
    fields : str
        Key looked up in each slice; also used in the saved file name.
    size : {'large', 'medium', 'small'}
        Selects the figure size and the largest marker area.
    weighted_area : bool
        Scale marker areas by the slice's HECTOR 'weights', normalised so the
        largest weight over all slices maps to the largest marker area.
    save : bool
        Write the figure into 'saved images' and crop it with ``trim_white``.
    setlims, marker_color_all, show_labels, leg_label
        Passed through to ``draw_scatter_plot``.
    marker_color, marker_shape : bool
        Request colour / shape coding of the markers by index level.
    version : object
        Accepted for backward compatibility; it currently has no effect on
        what is plotted.

    Raises
    ------
    ValueError
        If ``size`` is unknown or the selected data is neither 1-D nor 2-D.
    """
    # (figure size, largest marker area) per size name
    layouts = {'large': ((20, 20), 800),
               'medium': ((20, 10), 400),
               'small': ((10, 5), 400)}
    if size not in layouts:
        raise ValueError("size must be 'large', 'medium' or 'small', got {!r}".format(size))
    figsize, max_marker_size = layouts[size]
    plt.rcParams['figure.figsize'] = figsize  # set image size

    # Largest weight over all slices, so marker areas are comparable
    # between subplots.
    max_house_count = 0
    for slicer in slices:
        hector_weights = getattr(hector, slicer)['weights']
        try:
            if hector_weights.max() > max_house_count:
                max_house_count = hector_weights.max()
        except ValueError:
            # Presumably the 2-D case: .max() then yields one value per
            # column and the comparison cannot be truth-tested, so reduce
            # a second time.
            if hector_weights.max().max() > max_house_count:
                max_house_count = hector_weights.max().max()

    filename_adder = ''
    for plot_idx, slicer in enumerate(slices):
        plt.subplot(1, len(slices), plot_idx + 1)
        marker_colors = None
        marker_shapes = None
        recs_data = getattr(recs, slicer)[fields]
        hector_data = getattr(hector, slicer)[fields]
        ndim = len(recs_data.shape)
        cols = ['RECS', 'HECTOR']
        if ndim == 1:
            filename_adder = ''
            df = pd.DataFrame([recs_data, hector_data], index=['RECS', 'HECTOR'])
            df = df.transpose().reset_index()
            marker_labels = df.iloc[:, 0]
            if marker_color:
                # NOTE(review): this relies on reset_index() having produced a
                # column called 'level_0' -- confirm for single-level indexes.
                marker_colors = df['level_0']
        elif ndim == 2:
            filename_adder = '_' + slicer
            df = pd.DataFrame([recs_data.stack(), hector_data.stack()], index=['RECS', 'HECTOR'])
            df = df.transpose().reset_index()
            df.columns = ['level_0', 'level_1'] + list(df.columns)[2:]
            # list() keeps the labels reusable where zip() is a one-shot iterator
            marker_labels = list(zip(df['level_0'], df['level_1']))
            if marker_shape:
                marker_shapes = df['level_0']
            if marker_color:
                level_1_values = df['level_1'].tolist()
                # Integer code per distinct value; numbering follows set order.
                color_index = dict((val, k) for k, val in enumerate(set(level_1_values)))
                marker_colors = [color_index[val] for val in level_1_values]
        else:
            raise ValueError('{!r} of slice {!r} must be 1-D or 2-D'.format(fields, slicer))

        if weighted_area:
            hector_weights = getattr(hector, slicer)['weights']
            hector_weights = hector_weights / max_house_count * max_marker_size
            if ndim == 1:
                # NOTE(review): the joined column keeps the weights' own name
                # while draw_scatter_plot reads 'HouseCount' -- confirm the
                # weights are named accordingly.
                df = df.join(hector_weights, on=df.columns[0], how='left')
            else:
                try:
                    df = df.join(hector_weights, on=['level_1','level_0'], how='left')
                except ValueError:
                    hector_weights = pd.DataFrame(hector_weights.stack(), columns=['HouseCount'])
                    df = df.join(hector_weights, on=['level_0','level_1'], how='left')
        draw_scatter_plot(df, cols, marker_labels, slicer, weighted_area=weighted_area, setlims=setlims, marker_colors=marker_colors, marker_shapes=marker_shapes, size=size, marker_color_all=marker_color_all, show_labels=show_labels, leg_label=leg_label)
    if save:
        # filename_adder reflects the last slice that was drawn
        filename = os.path.basename(outputdb_filename)
        filename = os.path.join('saved images','Scatter_{}{}_{}.png'.format(fields, filename_adder,filename.replace('Output_','').replace('.sqlite','')))
        plt.savefig(filename, bbox_inches='tight', dpi=200)
        trim_white(filename)

In [4]:
def myround(x, base, direction=None):
    """Round ``x`` to a multiple of ``base`` and return it as an int.

    Parameters
    ----------
    x : number
        Value to round.
    base : number
        Step whose multiples are the candidate results.
    direction : {'up', 'down', None}
        'up' picks the next multiple at or above ``x``, 'down' the one at or
        below it; anything else rounds to the nearest multiple using the
        built-in ``round`` (whose tie handling differs between Python 2 and 3).
    """
    # Imported here so the helper does not rely on ceil/floor having been
    # star-imported into the notebook namespace.
    import math

    ratio = float(x) / base
    if direction == 'up':
        return int(base * math.ceil(ratio))
    if direction == 'down':
        return int(base * math.floor(ratio))
    return int(base * round(ratio))
    
def get_bins(data, bin_size):
    """Return histogram bin edges 0, bin_size, 2*bin_size, ... that reach ``data.max()``."""
    # Top edge: data.max() rounded up to a multiple of bin_size.
    top_edge = myround(data.max(), bin_size, 'up')
    # The +1 makes range() include the top edge itself.
    return range(0, top_edge + 1, bin_size)

In [5]:
def draw_histograms(field='site', bin_size=1, save=False):
    """Overlay weighted step histograms of ``field`` for HECTOR and RECS.

    Reads the module-level ``hector_sims`` (column ``field``, weighted by its
    'HouseCountScaled' column) and ``recs`` (attribute ``field``, weighted by
    ``recs.weights``). If the HECTOR histogram cannot be drawn, a warning is
    issued and only RECS is plotted.

    Parameters
    ----------
    field : str
        Column of ``hector_sims`` / attribute of ``recs`` to plot.
    bin_size : int
        Width of the histogram bins.
    save : bool
        When true, write the figure into 'saved images'; the file name is
        built from ``field`` and the module-level ``outputdb_filename``.
    """
    import warnings

    linewidth = 2
    alpha = 0.7
    fig3 = plt.figure()
    try:
        hector_sims[field].plot(label='HECTOR', kind='hist', histtype='step',
                                weights=hector_sims['HouseCountScaled'],
                                cumulative=False,
                                bins=get_bins(hector_sims[field], bin_size), figure=fig3,
                                lw=linewidth, alpha=alpha, linestyle='dashed')
    except Exception as err:
        # Best effort: keep going with RECS only, but say why HECTOR is missing.
        warnings.warn('Skipping HECTOR histogram for {!r}: {}'.format(field, err))

    rdata = getattr(recs, field)
    rdata.plot(label='RECS', kind='hist', histtype='step', weights=recs.weights,
               cumulative=False,
               bins=get_bins(rdata, bin_size), figure=fig3,
               lw=linewidth, color='k', linestyle='solid', alpha=alpha)

    ax = plt.gca()
    ax.set_xlabel('{} Consumption (MBtu/yr)'.format(field.capitalize()))
    ax.set_ylabel('Number of Homes')
    plt.legend()

    if save:
        # basename: same naming scheme as the scatter-plot savers in this file
        run_name = os.path.basename(outputdb_filename).replace('Output_','').replace('.sqlite','')
        filename = os.path.join('saved images','Hist_{}_{}.png'.format(field, run_name))
        # A bin size of 10 leaves the name unsuffixed; any other size is
        # spelled out in the file name.
        if not bin_size == 10:
            filename = filename.replace('.png',' (bin size={}).png'.format(bin_size))
        plt.savefig(filename, bbox_inches='tight', dpi=100)

In [6]:
def draw_cdfs(field='site', save=False, xlim=None):
    """Overlay weighted cumulative step histograms of ``field`` for ResStock and RECS.

    Reads the module-level ``hector_sims`` (column ``field``, weighted by its
    'HouseCountScaled' column) and ``recs`` (attribute ``field``, weighted by
    ``recs.weights``). Each curve uses as many bins as it has data points.

    Parameters
    ----------
    field : str
        Column of ``hector_sims`` / attribute of ``recs`` to plot.
    save : bool
        When true, write the figure into 'saved images'; the file name is
        built from ``field`` and the module-level ``outputdb_filename``.
    xlim : number or None
        Upper x-axis limit; the axis then spans [0, xlim].
    """
    linewidth = 2
    alpha = 0.7
    fig = plt.figure()

    sim_data = hector_sims[field]
    sim_data.plot(label='ResStock', kind='hist', histtype='step',
                  weights=hector_sims['HouseCountScaled'],
                  cumulative=True, bins=len(sim_data), figure=fig,
                  lw=linewidth, alpha=alpha, linestyle='dashed')

    rdata = getattr(recs, field)
    rdata.plot(label='RECS', kind='hist', histtype='step', weights=recs.weights,
               cumulative=True, bins=len(rdata), figure=fig,
               lw=linewidth, color='k', linestyle='solid', alpha=alpha)

    ax = plt.gca()
    ax.set_xlabel('{} Consumption (MBtu/yr)'.format(field.capitalize()))
    ax.set_ylabel('Cumulative Number of Homes')
    plt.legend()
    if xlim is not None:
        plt.xlim([0, xlim])

    if save:
        # basename: same naming scheme as the scatter-plot savers in this file
        run_name = os.path.basename(outputdb_filename).replace('Output_','').replace('.sqlite','')
        filename = os.path.join('saved images','CDF_{}_{}.png'.format(field, run_name))
        plt.savefig(filename, bbox_inches='tight', dpi=100)

In [7]:
def resample_slow(values, weights, size):
    """Draw ``size`` entries of ``values`` with probability proportional to ``weights``.

    Slower alternative to ``resample`` that samples positions through a
    ``scipy.stats.rv_discrete`` distribution.

    Parameters
    ----------
    values : indexable
        Candidates; indexed with an array of positions, so it has to accept
        array indexing.
    weights : array-like with ``.sum()``
        One weight per entry of ``values``.
    size : int or tuple
        Number (or shape) of draws.
    """
    from scipy.stats import rv_discrete

    probabilities = weights * 1. / weights.sum()
    # Discrete distribution over the positions 0 .. len(values)-1
    positions = rv_discrete(values=(range(len(values)), probabilities))
    return values[positions.rvs(size=size)]

def resample(values, weights, size):
    """Draw ``size`` entries of ``values`` with probability proportional to ``weights``.

    Uniform samples from [0, 1) are located in the cumulative distribution of
    the normalised weights.

    Parameters
    ----------
    values : indexable
        Candidates; indexed with an array of positions, so it has to accept
        array indexing.
    weights : array-like with ``.sum()``
        One weight per entry of ``values``.
    size : int or tuple
        Number (or shape) of draws.
    """
    import numpy as np
    from numpy.random import random_sample

    probabilities = weights * 1. / weights.sum()
    bins = np.add.accumulate(probabilities)
    positions = np.digitize(random_sample(size), bins)
    # Rounding can leave the last cumulative value a hair below 1.0; a sample
    # at or above it would map one past the end, so clamp to the last entry.
    return values[np.minimum(positions, len(bins) - 1)]

In [8]:
def facet_kdes(df, field, col, size=4):
    """Plot density curves of ``field``: one panel per ``col`` value, one curve per 'Label'.

    Switches seaborn to its "poster" context before drawing and adds a
    legend through pyplot afterwards.
    """
    sns.set_context("poster")
    grid_options = dict(col=col, hue="Label", size=size, aspect=0.7)
    grid = sns.FacetGrid(df, **grid_options)
    # distplot with hist=False leaves only the density curve
    grid.map(sns.distplot, field, hist=False)
    plt.legend()

In [9]:
def facet_scatter_grid(df, field, hue='CR', col=None, row=None, save=False, lims=None):
    """Facet grid of HECTOR-vs-RECS points for ``field`` with a y=x line per panel.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'RECS <field>' and 'HECTOR <field>', plus the
        columns named by ``hue`` / ``col`` / ``row``. When the matching
        '... std' columns are present they are drawn as error bars;
        otherwise plain scatter markers are used.
    field : str
        Quantity to compare; used to build the column names.
    hue : str
        Column for colour/marker coding; must be a key of the built-in
        ordering table.
    col, row : str or None
        Columns that define the grid. The built-in ordering is applied when
        the name is in the ordering table; otherwise seaborn picks the order.
    save : bool
        When true, write the figure into 'saved images'; the file name uses
        the module-level ``outputdb_filename``.
    lims : sequence or None
        (low, high) limits for both axes; defaults to 0 .. largest value of
        the two compared columns.
    """
    order_dict = {'Vintage':['pre-1950','1950s', '1960s','1970s','1980s','1990s','2000s'],
                  'CR':['CR02','CR03','CR04','CR05','CR06','CR07','CR08','CR09','CR10','CR11'],
                  'Size':['0-499','500-1499','1500-2499','2500-3499','3500-4499', '4500+'],
                  'Heating Fuel':['Natural Gas', 'Electricity','Fuel Oil', 'Propane/LPG', 'None']}
    recs_col = 'RECS ' + field
    hector_col = 'HECTOR ' + field
    if lims is None:
        lims = (0, max(df[recs_col].max(), df[hector_col].max()))

    num_markers = len(order_dict[hue])
    # Materialised as a list: a bare reversed() iterator can be consumed only once.
    palette = list(reversed(sns.color_palette("husl", num_markers+2)[:num_markers]))
    # .get(): col / row may be None or a column without a predefined order.
    g = sns.FacetGrid(df, col=col, col_order=order_dict.get(col), row=row, row_order=order_dict.get(row),
                      hue=hue, hue_order=order_dict[hue], palette=palette,
                      aspect=1, size=5, margin_titles=True,
                      hue_kws={"marker":(u'o', u'v', u'^', u'<', u'>', u'8', u's', u'p', u'*', u'h', u'H', u'D', u'd')})
    try:
        # plt.errorbar takes (x, y, yerr, xerr) positionally, so the HECTOR
        # (y-axis) std goes third and the RECS (x-axis) std fourth.
        g.map(plt.errorbar, recs_col, hector_col, hector_col + ' std', recs_col + ' std', fmt='o', elinewidth=0.5, alpha=0.8)
    except KeyError:
        # No std columns in df: fall back to plain markers.
        g.map(plt.scatter, recs_col, hector_col, alpha=.8, s=100)
    for ax in g.axes.flat:
        ax.plot(lims, lims, c=".2", ls="-", lw=0.5)
    g.set(xlim=lims, ylim=lims)
    try:
        g.add_legend()
    except IndexError:
        pass  # the figure is kept without a legend in that case
    if save:
        # basename: same naming scheme as the scatter-plot savers in this file
        run_name = os.path.basename(outputdb_filename).replace('Output_','').replace('.sqlite','')
        filename = os.path.join('saved images','Facet_Grid_{}_{}_{}_{}_{}.png'.format(col, row, hue, field, run_name))
        plt.savefig(filename, bbox_inches='tight', dpi=200)

In [None]:
def facet_line_grid(df, field, hue='CR', col=None, row=None, save=False, lims=None):
    """Facet grid of HECTOR-vs-RECS points for ``field`` with a y=x line per panel.

    The drawing logic is the same as in ``facet_scatter_grid``: markers (with
    error bars when std columns exist) are plotted; despite the name, no
    connecting lines are drawn, and the saved file uses the same
    'Facet_Grid_...' name pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'RECS <field>' and 'HECTOR <field>', plus the
        columns named by ``hue`` / ``col`` / ``row``. When the matching
        '... std' columns are present they are drawn as error bars;
        otherwise plain scatter markers are used.
    field : str
        Quantity to compare; used to build the column names.
    hue : str
        Column for colour/marker coding; must be a key of the built-in
        ordering table.
    col, row : str or None
        Columns that define the grid. The built-in ordering is applied when
        the name is in the ordering table; otherwise seaborn picks the order.
    save : bool
        When true, write the figure into 'saved images'; the file name uses
        the module-level ``outputdb_filename``.
    lims : sequence or None
        (low, high) limits for both axes; defaults to 0 .. largest value of
        the two compared columns.
    """
    order_dict = {'Vintage':['pre-1950','1950s', '1960s','1970s','1980s','1990s','2000s'],
                  'CR':['CR02','CR03','CR04','CR05','CR06','CR07','CR08','CR09','CR10','CR11'],
                  'Size':['0-499','500-1499','1500-2499','2500-3499','3500-4499', '4500+'],
                  'Heating Fuel':['Natural Gas', 'Electricity','Fuel Oil', 'Propane/LPG', 'None']}
    x_name = 'RECS ' + field
    y_name = 'HECTOR ' + field
    if lims is None:
        lims = (0, max(df[x_name].max(), df[y_name].max()))

    num_markers = len(order_dict[hue])
    # Materialised as a list: a bare reversed() iterator can be consumed only once.
    palette = list(reversed(sns.color_palette("husl", num_markers+2)[:num_markers]))
    # .get(): col / row may be None or a column without a predefined order.
    g = sns.FacetGrid(df, col=col, col_order=order_dict.get(col), row=row, row_order=order_dict.get(row),
                      hue=hue, hue_order=order_dict[hue], palette=palette,
                      aspect=1, size=5, margin_titles=True,
                      hue_kws={"marker":(u'o', u'v', u'^', u'<', u'>', u'8', u's', u'p', u'*', u'h', u'H', u'D', u'd')})
    try:
        # plt.errorbar takes (x, y, yerr, xerr) positionally, so the HECTOR
        # (y-axis) std goes third and the RECS (x-axis) std fourth.
        g.map(plt.errorbar, x_name, y_name, y_name + ' std', x_name + ' std', fmt='o', elinewidth=0.5, alpha=0.8)
    except KeyError:
        # No std columns in df: fall back to plain markers.
        g.map(plt.scatter, x_name, y_name, alpha=.8, s=100)
    for ax in g.axes.flat:
        ax.plot(lims, lims, c=".2", ls="-", lw=0.5)
    g.set(xlim=lims, ylim=lims)
    try:
        g.add_legend()
    except IndexError:
        pass  # the figure is kept without a legend in that case
    if save:
        # basename: same naming scheme as the scatter-plot savers in this file
        run_name = os.path.basename(outputdb_filename).replace('Output_','').replace('.sqlite','')
        filename = os.path.join('saved images','Facet_Grid_{}_{}_{}_{}_{}.png'.format(col, row, hue, field, run_name))
        plt.savefig(filename, bbox_inches='tight', dpi=200)

In [10]:
def facet_scatter_row(df, field, col, save=False):
    """Single row of HECTOR-vs-RECS panels for ``field``, coloured by 'Heating Fuel'.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'RECS <field>', 'HECTOR <field>', 'Heating Fuel'
        and the column named by ``col``. When the matching '... std' columns
        are present they are drawn as error bars; otherwise plain scatter
        markers are used.
    field : str
        Quantity to compare; used to build the column names.
    col : str
        Column that defines the panels. The built-in ordering is applied when
        the name is in the ordering table; otherwise seaborn picks the order.
    save : bool
        When true, write the figure into 'saved images'; the file name uses
        the module-level ``outputdb_filename``.
    """
    order_dict = {'Vintage':['pre-1950','1950s', '1960s','1970s','1980s','1990s','2000s'],
                  'CR':['CR02','CR03','CR04','CR05','CR06','CR07','CR08','CR09','CR10','CR11'],
                  'Size':['0-499','500-1499','1500-2499','2500-3499','3500-4499', '4500+']}
    recs_col = 'RECS ' + field
    hector_col = 'HECTOR ' + field
    # Both axes run from 0 to the largest value of either column.
    lims = (0, max(df[recs_col].max(), df[hector_col].max()))
    g = sns.FacetGrid(df, col=col, col_order=order_dict.get(col), hue="Heating Fuel", aspect=1, size=5,
                      hue_kws={"marker":['o','s','v','^','>','<','+','x','D']})
    try:
        # plt.errorbar takes (x, y, yerr, xerr) positionally, so the HECTOR
        # (y-axis) std goes third and the RECS (x-axis) std fourth.
        g.map(plt.errorbar, recs_col, hector_col, hector_col + ' std', recs_col + ' std', fmt='o', elinewidth=0.5, alpha=0.8)
    except KeyError:
        # No std columns in df: fall back to plain markers.
        g.map(plt.scatter, recs_col, hector_col, alpha=.8, s=60)
    for ax in g.axes.flat:
        ax.plot(lims, lims, c=".2", ls="-", lw=0.5)
    g.set(xlim=lims, ylim=lims)
    try:
        g.add_legend()
    except IndexError:
        pass  # the figure is kept without a legend in that case
    if save:
        # basename: same naming scheme as the scatter-plot savers in this file
        run_name = os.path.basename(outputdb_filename).replace('Output_','').replace('.sqlite','')
        filename = os.path.join('saved images','Facet_Row_{}_{}_{}.png'.format(col, field, run_name))
        plt.savefig(filename, bbox_inches='tight', dpi=200)

In [6]:
def region_facet_lmplot(x,y,col='CR',hue='Heating Fuel',square=False,data=None): # data=recs.microdata
    """Wrapped grid of seaborn regression plots of ``y`` against ``x``.

    Parameters
    ----------
    x, y : str
        Column names in ``data``.
    col : str
        Column that defines the panels (wrapped after four per row).
    hue : str
        Column that defines the colour groups.
    square : bool
        When true, give both axes the range 0 .. largest value of either
        column and draw a y=x line in every panel.
    data : pandas.DataFrame
        Source frame. The default of ``None`` is only a placeholder; a frame
        has to be supplied by the caller.
    """
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally, while older ones accept both forms.
    grid = sns.lmplot(x=x, y=y, col=col, hue=hue, data=data, col_wrap=4, x_bins=10, x_ci=95)
    if square:
        lims = (0, max(data[x].max(), data[y].max()))
        for ax in grid.axes.flat:
            ax.plot(lims, lims, c=".2", ls="-", lw=0.5)
        grid.set(xlim=lims, ylim=lims)

NameError: name 'recs' is not defined

In [6]:
from PIL import Image
import numpy as np

def trim_white(filename):
    """Crop the image file ``filename`` to the bounding box of its non-white pixels.

    The file is overwritten in place. A pixel counts as non-white when any of
    its first three channels differs from 255. An image without any non-white
    pixel is left untouched.

    Parameters
    ----------
    filename : str
        Path of the image to trim; it is indexed with three axes, so it is
        expected to have a channel dimension.
    """
    im = Image.open(filename)
    pix = np.asarray(im)

    pix = pix[:, :, 0:3]  # Drop the alpha channel
    # Row / column positions of every non-white sample (channel axis ignored)
    rows, cols = np.where(pix != 255)[0:2]
    if rows.size == 0:
        return  # nothing but white: no bounding box to crop to

    # PIL boxes are (left, upper, right, lower) with right/lower exclusive,
    # hence the +1 to keep the last non-white column and row.
    box = (int(cols.min()), int(rows.min()), int(cols.max()) + 1, int(rows.max()) + 1)
    im.crop(box).save(filename)

In [None]:
# Apply seaborn's default plot styling to the figures drawn from here on.
sns.set()

## Heat Maps

In [1]:
def heat_map(dfs, str_format='percent'):
    """Draw annotated heat maps for a series of titled DataFrames, two per row.

    Parameters
    ----------
    dfs : iterable of (title, pandas.DataFrame)
        One panel per pair. Up to four panels use a 2x2 grid; more panels add
        further rows.
    str_format : {'percent', 'sci'}
        Number format of the value printed in each cell: one-decimal
        percentage or one-decimal scientific notation.

    Raises
    ------
    KeyError
        If ``str_format`` is not one of the supported names.
    """
    plt.rcParams['figure.figsize'] = 20, 16  # image size for the heat-map figure

    formats = {'percent':'{0:.1%}',
               'sci':'{0:.1E}'}
    format_text = formats[str_format]

    panels = list(dfs)
    # Keep the 2x2 layout for up to four panels, grow downwards beyond that.
    n_rows = max(2, (len(panels) + 1) // 2)

    for panel_idx, (title, df) in enumerate(panels):
        plt.subplot(n_rows, 2, panel_idx + 1)
        m, n = df.shape
        ax = plt.imshow(df, interpolation='nearest', cmap='Oranges').axes
        # One tick per column / row, labelled from the frame itself
        ax.set_xticks(np.arange(n))
        ax.set_xticklabels(df.columns)
        ax.set_yticks(np.arange(m))
        ax.set_yticklabels(df.index)
        ax.grid(False)
        ax.xaxis.tick_top()
        ax.set(aspect=0.5, adjustable='box')
        plt.title(title, y=1.08, fontsize=20)

        # Print values, outlined in white so they stay readable on dark cells
        for r in range(m):
            for c in range(n):
                ax.text(c, r, format_text.format(df.iat[r, c]),
                        size='large', ha='center', va='center',
                        path_effects=[patheffects.withStroke(linewidth=3,foreground="w")])