# Creation of the PickupHubSet
Note that in this I assume that the user_id is unique and consistent which I am not sure that it is. However as I don't really plan on using it, it is not really important

In [1]:
# Setup
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import folium
import os

%matplotlib inline

plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (16,8)
plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (16,8)


In [2]:

def split_into_seqs(idx):
    shifted_temp_idx1 = idx[:-1]
    shifted_temp_idx2 = idx[1:]
    split_idx = (shifted_temp_idx1+1 != shifted_temp_idx2).nonzero()[0]
    return np.split(ary=idx, indices_or_sections = split_idx+1)


In [3]:
# Loading Donkey data

# importing data
df_searchlog = pd.read_csv("../data/created_data_sets/cleaned_aggregated_searchlogs.csv", parse_dates=[-1])


In [4]:

# Loading ClimaStation data and select columns we want

df_climatedata = pd.read_csv("../data/ClimateStationData/CleanedClimateStationData.csv", parse_dates=[0])

# First we prepare the clima data we need
df_climatedata = df_climatedata.set_index('Time(utc)')

# We select a subset a columns we want
clima_cols = ['wind_dir_avg_sin',
              'wind_dir_avg_cos',
              'wind_speed_avg', 
              'air_temperature', 
              'rain_accumulation',
              'rain_duration', 
              'rain_intensity']
df_subclima = df_climatedata[clima_cols]



In [5]:
df_searchlog

Unnamed: 0,user_location_latitude,user_location_longitude,anonymous_id,user_id,timestamp
0,55.684639,12.553777,F6ADD77A-B26C-407B-BE83-78294FE80D0B,119568,2018-04-01 06:40:22.853
1,55.705648,12.542835,3B38FBCA-4DB1-44A4-9DF4-5979B29FD6EA,0,2018-04-01 07:06:18.953
2,55.667523,12.585956,4BDF423C-8F62-4DC5-B688-D84F1C92540F,0,2018-04-01 07:55:12.593
3,55.688612,12.562045,36062CE5-827F-4F68-B33D-F5CAAD6C5DFE,0,2018-04-01 08:06:36.777
4,55.688612,12.562045,36062CE5-827F-4F68-B33D-F5CAAD6C5DFE,119604,2018-04-01 08:10:51.799
...,...,...,...,...,...
622799,55.710006,12.568747,2cf6ffa8-ea31-4029-820f-4870ee31f120,318098,2019-03-30 13:12:21.072
622800,55.688521,12.559107,9db389dd-f77a-4ce5-bf58-da103ace406d,309748,2019-04-02 01:46:00.818
622801,55.672349,12.566592,dec9ea9d-555b-412b-b015-f616c51b19cd,346710,2019-04-02 03:49:21.847
622802,55.666342,12.553002,9eacda24-bb3a-4a70-a1d4-ed6457cb54d3,231313,2019-02-25 09:55:19.650


# Data wrangling

### Adding timevariables

#### Time variables

In [6]:
# Add time caovariates
df_searchlog['hour'] = df_searchlog.timestamp.apply(lambda x : x.hour + 1) # Note this is a ceil
#df_searchlog['pickup_day'] = df_searchlog.timestamp.apply(lambda x : x.day)
df_searchlog['dayofweek'] = df_searchlog.timestamp.apply(lambda x : x.dayofweek + 1) # +1 so not 0 index
df_searchlog['month'] = df_searchlog.timestamp.apply(lambda x: x.month + 1) # +1 so not 0 index


In [7]:
# Convert time variables using fourier transform (i think)
df_searchlog['hour_sin'] = np.sin(2*np.pi*df_searchlog.hour/24)
df_searchlog['hour_cos'] = np.cos(2*np.pi*df_searchlog.hour/24)
df_searchlog = df_searchlog.drop('hour', axis=1)

df_searchlog['dayofweeek_sin'] = np.sin(2*np.pi*df_searchlog.dayofweek/7)
df_searchlog['dayofweek_cos'] = np.cos(2*np.pi*df_searchlog.dayofweek/7)
df_searchlog = df_searchlog.drop('dayofweek', axis=1)

df_searchlog['month_sin'] = np.sin(2*np.pi*df_searchlog.month/12)
df_searchlog['month_cos'] = np.cos(2*np.pi*df_searchlog.month/12)
df_searchlog = df_searchlog.drop('month', axis=1)


# Merging dataframes

In [8]:
# Add ceiled created_at to match with climate data
df_searchlog['merge_date'] = df_searchlog.timestamp.dt.ceil('h')

In [9]:
# Merge together the clima data and the donkey data
full_df = df_searchlog.join(df_subclima, on='merge_date', how='inner')
#full_df = df_searchlog.merge(df_subclima, left_on='merge_date', right_index=True, how="inner")
full_df.reset_index(inplace=True, drop=True)


In [10]:
full_df

Unnamed: 0,user_location_latitude,user_location_longitude,anonymous_id,user_id,timestamp,hour_sin,hour_cos,dayofweeek_sin,dayofweek_cos,month_sin,month_cos,merge_date,wind_dir_avg_sin,wind_dir_avg_cos,wind_speed_avg,air_temperature,rain_accumulation,rain_duration,rain_intensity
0,55.684639,12.553777,F6ADD77A-B26C-407B-BE83-78294FE80D0B,119568,2018-04-01 06:40:22.853,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0
1,55.677467,12.590161,52CA1450-FB3C-4A84-990D-6C529E284683,118385,2018-04-01 06:38:35.298,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0
2,55.674647,12.559726,8A0521A3-0183-45F3-BF80-05DC01245205,112257,2018-04-01 06:49:25.505,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0
3,55.672892,12.556500,C042F1A5-F56E-4309-AEFC-C7FECC2FE953,119181,2018-04-01 06:06:27.769,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0
4,55.677467,12.590161,52CA1450-FB3C-4A84-990D-6C529E284683,118385,2018-04-01 06:47:39.060,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406937,55.706167,12.511590,c2d30124-f608-4ca2-90aa-16a8c8485244,321829,2019-01-28 03:18:34.256,0.866025,0.500000,7.818315e-01,0.623490,0.866025,5.000000e-01,2019-01-28 04:00:00,-0.614285,-0.789084,1.198,3.2220,0.0,0.0,0.0
406938,55.616830,12.568246,20bd6bf4-d9f8-40a4-8614-55bb00549ae2,290524,2019-02-06 03:27:17.111,0.866025,0.500000,4.338837e-01,-0.900969,1.000000,6.123234e-17,2019-02-06 04:00:00,-0.977046,-0.213030,2.607,2.4920,0.0,0.0,0.0
406939,55.679996,12.592800,f20deb1b-9ed2-4798-a777-888467c0a704,0,2019-02-05 00:14:28.898,0.258819,0.965926,9.749279e-01,-0.222521,1.000000,6.123234e-17,2019-02-05 01:00:00,-0.709571,-0.704634,4.698,1.7970,0.0,0.0,0.0
406940,55.681881,12.552261,b0a8faa5-d912-4ad8-b2c0-29344c4e7ec0,126297,2019-02-25 02:51:49.739,0.707107,0.707107,7.818315e-01,0.623490,1.000000,6.123234e-17,2019-02-25 03:00:00,-0.799685,0.600420,1.680,3.2880,0.0,0.0,0.0


## Add day to split test train on

In [11]:
start_day = full_df.timestamp.min()

In [12]:
full_df['obs_day'] = full_df.timestamp.apply(lambda x: (x-start_day).days)

In [13]:
full_df

Unnamed: 0,user_location_latitude,user_location_longitude,anonymous_id,user_id,timestamp,hour_sin,hour_cos,dayofweeek_sin,dayofweek_cos,month_sin,month_cos,merge_date,wind_dir_avg_sin,wind_dir_avg_cos,wind_speed_avg,air_temperature,rain_accumulation,rain_duration,rain_intensity,obs_day
0,55.684639,12.553777,F6ADD77A-B26C-407B-BE83-78294FE80D0B,119568,2018-04-01 06:40:22.853,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0,0
1,55.677467,12.590161,52CA1450-FB3C-4A84-990D-6C529E284683,118385,2018-04-01 06:38:35.298,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0,0
2,55.674647,12.559726,8A0521A3-0183-45F3-BF80-05DC01245205,112257,2018-04-01 06:49:25.505,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0,0
3,55.672892,12.556500,C042F1A5-F56E-4309-AEFC-C7FECC2FE953,119181,2018-04-01 06:06:27.769,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0,0
4,55.677467,12.590161,52CA1450-FB3C-4A84-990D-6C529E284683,118385,2018-04-01 06:47:39.060,0.965926,-0.258819,-2.449294e-16,1.000000,0.500000,-8.660254e-01,2018-04-01 07:00:00,0.986658,-0.162809,2.898,0.9117,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406937,55.706167,12.511590,c2d30124-f608-4ca2-90aa-16a8c8485244,321829,2019-01-28 03:18:34.256,0.866025,0.500000,7.818315e-01,0.623490,0.866025,5.000000e-01,2019-01-28 04:00:00,-0.614285,-0.789084,1.198,3.2220,0.0,0.0,0.0,302
406938,55.616830,12.568246,20bd6bf4-d9f8-40a4-8614-55bb00549ae2,290524,2019-02-06 03:27:17.111,0.866025,0.500000,4.338837e-01,-0.900969,1.000000,6.123234e-17,2019-02-06 04:00:00,-0.977046,-0.213030,2.607,2.4920,0.0,0.0,0.0,311
406939,55.679996,12.592800,f20deb1b-9ed2-4798-a777-888467c0a704,0,2019-02-05 00:14:28.898,0.258819,0.965926,9.749279e-01,-0.222521,1.000000,6.123234e-17,2019-02-05 01:00:00,-0.709571,-0.704634,4.698,1.7970,0.0,0.0,0.0,310
406940,55.681881,12.552261,b0a8faa5-d912-4ad8-b2c0-29344c4e7ec0,126297,2019-02-25 02:51:49.739,0.707107,0.707107,7.818315e-01,0.623490,1.000000,6.123234e-17,2019-02-25 03:00:00,-0.799685,0.600420,1.680,3.2880,0.0,0.0,0.0,330


# Save Dataframe

In [14]:
folder_path = "../data/created_data_sets"
file_name = "CoordinateSearchlog2.csv"
file_path = os.path.join(folder_path, file_name)

In [15]:
full_df.to_csv(file_path, index=None)

In [16]:
file_name = "CoordinateSearchlog3.csv"
file_path = os.path.join(folder_path, file_name)

In [20]:
full_df.loc[:100000,:].to_csv(file_path, index=None)