In [357]:
import pandas as pd
import numpy as np
import mysql.connector as msc
import datetime as dt
from collections import Counter
from bokeh.io import output_file, show, output_notebook
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, DataRange1d, PanTool, WheelZoomTool, BoxSelectTool
)
from sklearn.cluster import KMeans
import random

In [79]:
engine = msc.connect(user='root', password='asdfghjkl;',
                              host='127.0.0.1',
                              database='rideaustin')
df = pd.read_sql('SELECT start_location_lat,start_location_long, created_date,tod FROM rides  WHERE status = "NO_AVAILABLE_DRIVER" OR status = "Completed";', engine)

In [428]:
df['day_of_week'] = df['created_date'].dt.weekday_name
df = df[(df['start_location_lat'] >= 30.190833) & (df['start_location_lat'] <= 30.404041)]
df = df[(df['start_location_long'] >=-97.819014) & (df['start_location_long'] <= -97.647192)]

In [202]:
def period(row):
    '''
    To be .apply() to dataframe with 'created_date' column. Takes in a row and creates a new column that
    assigns that row to a particular 30 minute timeblock.
    '''
    timelables = list(range(0, 49))
    timevalues = []
    for x in list(range(0,25)):
        timevalues.append((x,0))
        timevalues.append((x,30))
    periods = dict(zip(timelables, timevalues))
    visit_start = {'hour': row.created_date.hour, 'min': row.created_date.minute} # get hour, min of visit start
    for label, tupe in periods.items():
        hour = tupe[0]
        thirty = tupe[1]
        if hour == visit_start['hour']:
            if thirty <= visit_start['min'] <= thirty+30:
                return label
            else:
                return label+1


In [323]:
df['period'] = df.apply(period, axis=1)

In [329]:
df.values[0]

array([38.676306, -121.038976, Timestamp('2016-06-03 21:14:40'),
       Timedelta('0 days 21:14:40'), 'Friday', 42], dtype=object)

In [232]:
dftest = df[(df['start_location_lat'] >= 30.252) & (df['start_location_lat'] <= 30.258)]
dftest = dftest[(dftest['start_location_long'] <= -97.760) & (dftest['start_location_long'] >= -97.766)]
#30.255456, -97.761994
len(dftest)

20489

# map this to see if it makes sense for two points within the same area

In [207]:
def create_centroids(dataframe):
    ''' 
    Takes a dataframe of my start_location_lats and start_location_longs and builds a K-Means model with 5 centroids.
    It returns a numpy array of the centroids (by lat-long pair) and a dictionary where the key is the centroid rank 
    and the value is a list of the [lat,long,# of datapoints, rank] for that centroid.
    
    INPUT:
    - Dataframe
    OUTPU:
    - numpy array
    - dictionary'''
    
    X = np.array(dataframe[['start_location_lat','start_location_long']])
    model = KMeans(n_clusters=5)
    model.fit(X)
    cents = model.cluster_centers_
    lables_model = model.labels_
    c = Counter(lables_model)
    centroids_by_intensity = c.most_common(5)
    ordered_labels = [i for i,x in centroids_by_intensity]
    ordered_centroids = []
    centroid_dict = {}

    for i, index in enumerate(ordered_labels):
        ordered_centroids.append(cents[index])
        centroid_dict[i] = [cents[index][0],cents[index][1],centroids_by_intensity[i][1],i]
    
    print np.array(ordered_centroids)
    print centroid_dict
    return np.array(ordered_centroids), centroid_dict 


In [208]:
def plot_Austin_centroids(centroids, centroid_dictionary,num_datapoints, completed_rides=None, unfulfilled_rides=None):
    '''
    Takes in centroid values from create_centroids() and centroid_dictionary and plots the centroids relative to their
    intensity. Optional inputs for the lat-long columns for completed_rides (green) and unfulfilled_rides(blue).
    
    INPUT:
    - centroids (numpy array)
    - centroid_dict (dictionary)
    - copmleted_rides (dataframe)
    - unfulfilled_rides (dataframe)
    
    OUTPUT:
    -None
    '''
    #creating the plot
    map_options = GMapOptions(lat=30.29, lng=-97.73, map_type="roadmap", zoom=11)

    plot = GMapPlot(
        x_range=DataRange1d(), y_range=DataRange1d(), map_options=map_options
    )
    plot.title.text = "Austin"
    plot.api_key = "AIzaSyBx-cLXm4jxpg0aX_nnUnwd2hir3Ve0j9w"
    
    #create alpha based on intensity
    alpha = []
    for key, value in centroid_dictionary.iteritems():
        al_value = value[2]/float(num_datapoints)
        al_fixed = al_value+.25
        alpha.insert(key,al_fixed)
    
    #try if completed_rides is populated
    try:
        completed_lats = list(completed_rides['start_location_lat'])
        completed_longs = list(completed_rides['start_location_long'])
        completed_source = ColumnDataSource( data=dict(
            lat=completed_lats,
            lon=completed_longs,
    )
)
        completed_dots = Circle(x="lon", y="lat", size=15, fill_color="green", fill_alpha=0.1, line_color=None)
        plot.add_glyph(completed_source, completed_dots)
    except:
        pass
    
    #try if unfulfilled_rides is populated
    try:
        unfulfilled_lats = list(unfulfilled_rides['start_location_lat'])
        unfulfilled_longs = list(unfulfilled_rides['start_location_long'])
        unfulfilled_source = ColumnDataSource(
        data=dict(
            lat=unfulfilled_lats,
            lon=unfulfilled_longs,

        )
    )
        unfulfilled_dots = Circle(x="lon", y="lat", size=15, fill_color="blue", fill_alpha=0.8, line_color=None)
        plot.add_glyph(unfulfilled_source, unfulfilled_dots)
    except:
        pass
    
    #creating centroid source and circle
    centroidlats = centroids[:,0]
    centroidlongs = centroids[:,1]
    print centroidlats
    centroid_source = ColumnDataSource(
        data=dict(
            lat=centroidlats, 
            lon=centroidlongs,
             alpha=alpha
        )
    )
    print alpha
    centroid_dots = Circle(x="lon", y="lat", size=45, fill_color='#8B008B', fill_alpha='alpha', line_color=None)
    plot.add_glyph(centroid_source, centroid_dots)
    
    #finishing the plot
    plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool())
    show(plot)

In [233]:
df_centroids, cent_dict = create_centroids(dftest)
plot_Austin_centroids(centroid_dictionary=cent_dict, centroids=df_centroids,num_datapoints=len(dftest),completed_rides=dftest)

[[ 30.25599169 -97.76301378]
 [ 30.25450931 -97.76216012]
 [ 30.25279069 -97.76430475]
 [ 30.25648119 -97.76102745]
 [ 30.25615182 -97.76518634]]
{0: [30.255991687245352, -97.763013784901986, 7460, 0], 1: [30.254509313056509, -97.762160123051117, 6314, 1], 2: [30.252790690687014, -97.764304752428217, 3421, 2], 3: [30.256481187498967, -97.761027450607386, 2494, 3], 4: [30.256151818842987, -97.765186342359655, 800, 4]}
[ 30.25599169  30.25450931  30.25279069  30.25648119  30.25615182]
[0.6140978085802138, 0.5581653570208405, 0.41696764117331253, 0.3717238518229294, 0.2890453414027039]


# Defining 'Similar' pairs:
- lat and long are within .006 of each other. 
- Weekdays are equal
- timeblock within 1 of each other.

In [393]:
def find_similar_pairs(dataframe,row):
    '''
    Takes in a dataframe to test against and a ride request. Populates a distribution of other rides that
    could follow that particular ride. 
    
    NOTE: row must be a ndarray in this order:
    ['period','day_of_week', 'start_location_lat','start_location_long']
     
    input:
    -original dataframe
    -row (ndarray)
    output:
    -list of possible new points to sample from (Dataframe)
    '''
    dataframe = dataframe[['period','day_of_week', 'start_location_lat','start_location_long']]
#     row2 = row2[['period','day_of_week', 'start_location_lat','start_location_long']].values
    following_rides_list = []
    dataframe = dataframe.values
    for i, request in enumerate(dataframe):
        if row1[0] <= request[0] <= row1[0]+1 and row1[1] == request[1] and row1[2]-.003 <= request[2] <= row1[2]+.003\
        and row1[3]-.003 <= request[3] <= row1[3]+.003:
#             request2 = dataframe[ind+1]
#             if row2[0]-1 <= request2[0] <= row2[0]+1:# and row2[1] == request2[1] \
#             #and row2[2]-.006 <= request2[2] <= row2[2]+.006\
#             #and row2[3]-.006 <= request2[3] <= row2[3]+.006:
            request2 = dataframe[i+1]
            following_rides_list.append(request2)
    return pd.DataFrame(following_rides_list, columns=['period','day_of_week', 'start_location_lat','start_location_long'])

In [394]:
row1 = dftemp[['period','day_of_week', 'start_location_lat','start_location_long']].iloc[9].values
possible_followups = find_similar_pairs(df,row1)

In [431]:
len(possible_followups)

1412

In [379]:
samples = pd.DataFrame(possible_followups, columns=['period','day_of_week','start_location_lat','start_location_long'])

In [386]:
possible_followups.describe()

Unnamed: 0,period,start_location_lat,start_location_long
count,1412.0,1412.0,1412.0
mean,8.5,30.205988,-97.534432
std,0.503003,1.394855,4.502218
min,8.0,0.0,-98.004072
25%,8.0,30.254734,-97.751113
50%,8.0,30.266425,-97.742559
75%,9.0,30.283002,-97.731037
max,10.0,30.541167,0.0


In [387]:
sample_cents, sample_dict = create_centroids(samples)

[[  3.02646336e+01  -9.77441346e+01]
 [  3.03561137e+01  -9.77321122e+01]
 [  3.02099452e+01  -9.76685450e+01]
 [  3.02047889e+01  -9.78486791e+01]
 [  3.55271368e-15   0.00000000e+00]]
{0: [30.264633639286096, -97.744134647736388, 1184, 0], 1: [30.356113685021437, -97.732112248025601, 138, 1], 2: [30.20994520777348, -97.668544994838328, 57, 2], 3: [30.204788922488017, -97.848679131626113, 30, 3], 4: [3.5527136788005009e-15, 0.0, 3, 4]}


In [388]:
plot_Austin_centroids(centroid_dictionary=sample_dict, centroids=sample_cents, num_datapoints=len(samples), completed_rides=samples)

[  3.02646336e+01   3.03561137e+01   3.02099452e+01   3.02047889e+01
   3.55271368e-15]
[1.0885269121813033, 0.34773371104815864, 0.29036827195467424, 0.27124645892351273, 0.2521246458923513]


In [444]:
def predict_rides(input_dataframe,row,n_rides=100):
    '''
    Takes in the most recent ride request and predicts the next half-hour worth of ride quests using
    find_similar_pairs method.
    
    Inputs:
    dataframe,
    row,
    n_rides (to be predicted in separate function)
    
    output:
    Dataframe (predicted ride requests)'''
    
    predicted= []
    for rep in xrange(n_rides):
        distribution = find_similar_pairs(input_dataframe,row)
        sample = distribution.sample().values[0].tolist()
        predicted.append(sample)
        row=sample
    return pd.DataFrame(predicted, columns=['period','day_of_week','start_location_lat','start_location_long'])
        

In [448]:
predictions = predict_rides(df,row1,n_rides=100)

In [449]:
predictions

Unnamed: 0,period,day_of_week,start_location_lat,start_location_long
0,8,Saturday,30.247687,-97.750092
1,8,Saturday,30.268810,-97.741741
2,8,Saturday,30.258760,-97.738616
3,9,Saturday,30.284904,-97.743470
4,9,Saturday,30.295504,-97.741758
5,9,Saturday,30.239783,-97.728038
6,9,Saturday,30.309670,-97.751848
7,8,Saturday,30.269832,-97.749145
8,8,Saturday,30.243907,-97.727401
9,8,Saturday,30.253820,-97.763311


In [451]:
predicted_centroids, predicted_dict = create_centroids(predictions)

[[ 30.26388925 -97.74136236]
 [ 30.2927438  -97.74258642]
 [ 30.23730771 -97.76030457]
 [ 30.3481335  -97.7485315 ]
 [ 30.20699521 -97.68507423]]
{0: [30.263889253659023, -97.741362356027636, 48, 0], 1: [30.292743795689226, -97.742586415711543, 30, 1], 2: [30.237307714285713, -97.760304571428577, 14, 2], 3: [30.348133499999999, -97.748531499999999, 4, 3], 4: [30.206995213763886, -97.685074230435021, 4, 4]}


In [453]:
plot_Austin_centroids(predicted_centroids,predicted_dict,len(predicted_df),unfulfilled_rides=predictions)

[ 30.26388925  30.2927438   30.23730771  30.3481335   30.20699521]
[5.05, 3.25, 1.65, 0.65, 0.65]
