In [1]:
import argparse
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool

In [None]:
# Diverging blue-to-red palette (11 steps) used when coloring points by a third variable.
_PALETTE = ["#053061", "#2166ac", "#4393c3", "#92c5de", "#d1e5f0",
            "#f7f7f7", "#fddbc7", "#f4a582", "#d6604d", "#b2182b", "#67001f"]

# Fill color given to every point when no color_by column is requested.
_DEFAULT_COLOR = "#2166ac"


def _read_csv_sample(filename, num_points=None):
    """Read the CSV, optionally keeping only a random sample of num_points data rows."""
    if not num_points:
        return pd.read_csv(filename, header=0, sep=',')

    # Count the data lines (subtract 1 for the header); the context manager
    # makes sure the handle is closed again.
    with open(filename) as handle:
        num_lines_file = sum(1 for _ in handle) - 1

    num_to_skip = num_lines_file - num_points
    if num_to_skip <= 0:
        # More points requested than rows available: nothing to sample away.
        return pd.read_csv(filename, header=0, sep=',')

    # Line 0 is the header, so candidate lines to skip start at 1.
    # range() works on both Python 2 and 3 (xrange is Python 2 only).
    skip = sorted(random.sample(range(1, num_lines_file + 1), num_to_skip))
    return pd.read_csv(filename, header=0, sep=',', skiprows=skip)


def _describe_url(url):
    """Return the last two path components of url with a trailing '.png' removed."""
    # str.strip('.png') removes any of the characters '.', 'p', 'n', 'g' from
    # both ends (e.g. 'spring.png' -> 'spri'); cut the suffix explicitly instead.
    if url.endswith('.png'):
        url = url[:-len('.png')]
    return url.split('/')[-2:]


def _palette_colors(values):
    """Map an array of numbers onto _PALETTE after min-max scaling to [0, 1]."""
    if len(values) == 0:
        return []
    # ravel() turns the (n, 1) scaler output into plain floats, so indexing
    # never relies on converting a 1-element array to int.
    scaled = MinMaxScaler().fit_transform(values.reshape(-1, 1) * 1.0).ravel()
    top = len(_PALETTE) - 1
    indices = np.clip((top * scaled).astype(int), 0, top)
    return [_PALETTE[i] for i in indices]


def main(filename, x_axis, y_axis, color_by=None, num_points=None):
    """Show an interactive Bokeh scatter plot of two CSV columns with image tooltips.

    Parameters
    ----------
    filename : str
        Path to a comma-separated file with a header row. It must contain the
        columns named by ``x_axis`` and ``y_axis`` plus a ``url`` column whose
        values are used as the tooltip image source.
    x_axis, y_axis : str
        Names of the columns plotted on the x and y axes.
    color_by : str, optional
        Name of a numeric column. When given, its values are min-max scaled
        and mapped onto an 11-step palette; the result is stored in a new
        ``colors`` column of the returned frame. Otherwise every point gets
        one default color.
    num_points : int, optional
        Number of rows to sample at random from the file. A falsy value, or a
        value not smaller than the number of rows, reads every row.

    Returns
    -------
    pandas.DataFrame
        The rows that were plotted. Rows with a null in any plotted column
        (x, y, ``url`` and, if used, ``color_by``) are dropped.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    ### read in data, sampling if necessary
    df = _read_csv_sample(filename, num_points)

    required = [x_axis, y_axis, 'url']
    if color_by:
        required.append(color_by)
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError('Missing required column(s) in {}: {}'.format(
            filename, ', '.join(str(col) for col in missing)))

    ### drop rows with null
    # Nulls in 'url' or color_by would otherwise crash further down; copy() so
    # adding the 'colors' column below does not warn about writing to a slice.
    df = df.dropna(subset=required).copy()

    ### create colors if requested
    if color_by:
        df['colors'] = _palette_colors(df[color_by].values)
        colors = df['colors']
    else:
        colors = [_DEFAULT_COLOR] * len(df)

    ### create title
    title = '{} vs. {}'.format(y_axis, x_axis)
    if color_by:
        title += ' (colored by {})'.format(color_by)

    ### start plotting
    data = dict(
        x=df.loc[:, x_axis],
        y=df.loc[:, y_axis],
        desc=df.loc[:, 'url'].apply(_describe_url),
        imgs=df.loc[:, 'url'],
        colors=colors,
    )
    # The tooltip template references @fonts; feed it when the file has such a
    # column. NOTE(review): presumably an optional HTML snippet per row -- confirm.
    if 'fonts' in df.columns:
        data['fonts'] = df.loc[:, 'fonts']
    source = ColumnDataSource(data=data)

    hover = HoverTool(
            tooltips="""
            <div>
                <div>
                    <img
                        src="@imgs" height="200" alt="@imgs" width="200"
                        style="float: left; margin: 0px 15px 15px 0px;"
                        border="2"
                    ></img>
                </div>
                <div>
                    <span style="font-size: 17px; font-weight: bold;">@desc</span>
                    <span style="font-size: 15px; color: #966;">[$index]</span>
                </div>
                <div>
                    <span>@fonts{safe}</span>
                </div>
                <div>
                    <span style="font-size: 15px;">Location</span>
                    <span style="font-size: 10px; color: #696;">($x, $y)</span>
                </div>
            </div>
            """
        )

    # Build the figure once. width/height replace plot_width/plot_height, which
    # Bokeh 3 no longer accepts.
    p = figure(width=800, height=800,
               tools=[hover, 'pan', 'wheel_zoom', 'reset', 'save'],
               title=title, x_axis_label=x_axis, y_axis_label=y_axis)
    # Set on the figure that is actually shown.
    p.title.text_font_size = '20pt'

    p.circle('x', 'y', size=10, source=source, fill_color='colors')

    show(p)

    return df

In [None]:
if __name__ == '__main__':
    # Command-line entry point: parse the CSV path, the two axis columns and
    # the optional coloring / sampling switches, then hand them to main().
    parser = argparse.ArgumentParser(description='description')

    # Required positionals, registered in the order they appear on the command line.
    positionals = (
        ('filename', 'Please provide a path to the csv'),
        ('x_dim', 'Value to plot on the x-axis'),
        ('y_dim', 'Value to plot on the y-axis'),
    )
    for arg_name, arg_help in positionals:
        parser.add_argument(arg_name, type=str, help=arg_help)

    # Optional switches; both are None when not supplied.
    parser.add_argument(
        '--color_by',
        type=str,
        nargs='?',
        help='Color data points by a 3rd variable',
    )
    parser.add_argument(
        '--n_rows',
        type=int,
        nargs='?',
        help='Number of rows to sample for illustration. Default uses all rows',
    )

    args = parser.parse_args()

    main(
        args.filename,
        x_axis=args.x_dim,
        y_axis=args.y_dim,
        color_by=args.color_by,
        num_points=args.n_rows,
    )