In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import torch
import json
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
CITY_CENTER    = np.array([[-8.615223, 41.157819]], ndmin=2)
stand_location = pd.read_csv('metaData_taxistandsID_name_GPSlocation.csv')
stand_location.head()

Unnamed: 0,ID,Descricao,Latitude,Longitude
0,1.0,Agra,41.177146,-8.60967
1,2.0,Alameda,41.15619,-8.591064
2,3.0,Aldoar,41.170525,-8.665876
3,4.0,Alfândega,41.143764,-8.621803
4,5.0,Amial,41.18351,-8.612726


In [3]:
df_tr = pd.read_csv('train.csv') #need to split train_data into validation set later
predict_set = pd.read_csv('test_public.csv')

In [4]:
df_tr = df_tr.drop(df_tr[df_tr.MISSING_DATA == True].index)
print(df_tr.shape)

(1710660, 9)


In [5]:
def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 2, 0) * 15

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15, where polyline_length = count("[") - 1df_
df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

# remove outliers
df_tr = df_tr.drop(df_tr[df_tr.LEN <= 60].index)
df_tr = df_tr.drop(df_tr[df_tr.LEN >= 7200].index)
print(df_tr.shape)

(1652895, 10)


In [None]:
"""
def polyline_start_long(polyline):
    path = json.loads(polyline)
    return path[0][0] #longitude

def polyline_start_lat(polyline):
    path = json.loads(polyline)
    return path[0][1] #latitude

def polyline_end_long(polyline):
    path = json.loads(polyline)
    return path[-1][0]

def polyline_end_lat(polyline):
    path = json.loads(polyline)
    return path[-1][1]

df_tr['START_LONG'] = df_tr['POLYLINE'].apply(polyline_start_long)
df_tr['START_LAT'] = df_tr['POLYLINE'].apply(polyline_start_lat)
df_tr['END_LONG'] = df_tr['POLYLINE'].apply(polyline_end_long)
df_tr['END_LAT'] = df_tr['POLYLINE'].apply(polyline_end_lat)

bounds = (  # Bounds retrieved using http://boundingbox.klokantech.com
    (41.052431, -8.727951),
    (41.257678, -8.456039)
)

#df_tr = df_tr.drop(df_tr[df_tr.END_LAT <= bounds[0][0]].index)
#df_tr = df_tr.drop(df_tr[df_tr.END_LAT >= bounds[1][0]].index)
#df_tr = df_tr.drop(df_tr[df_tr.END_LONG <= bounds[0][1]].index)
#df_tr = df_tr.drop(df_tr[df_tr.END_LONG >= bounds[1][1]].index)

# remove trips who end up too far from Porto

df_tr.head()
"""

In [6]:
print(df_tr.shape)

(1652895, 10)


In [None]:
"""
def np_haversine(lon1,lat1,lon2, lat2):
    """
    Numpy version of the Haversine function to calculate distances between two sets of points.
    Converted to Python from the R version provided in the competition's evaluation script.
    Returns the distance in km.
    """
    
    REarth = 6371
    lat = np.abs(lat1 - lat2) * np.pi / 180
    lon = np.abs(lon1 - lon2) * np.pi / 180
    lat1 = lat1 * np.pi / 180
    lat2 = lat2 * np.pi / 180
    a = np.sin(lat / 2) * np.sin(lat / 2) + np.cos(lat1) * np.cos(lat2) * np.sin(lon / 2) * np.sin(lon / 2)
    d = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return REarth * d

df_tr['DIST'] = df_tr[['START_LONG','START_LAT','END_LONG','END_LAT']].apply(lambda x: np_haversine(*x), axis=1)
df_tr.head()
"""

In [7]:
#Approximate starting positions with city center if ORIGIN_STAND==NaN
def stand_to_latitude(stand):
    if stand==0:
        return CITY_CENTER[0][1]
    return float(stand_location['Latitude'].loc[stand-1])
def stand_to_longitude(stand):
    if stand==0:
        return CITY_CENTER[0][0]
    return float(stand_location['Longitude'].loc[stand-1])

for df in (df_tr,predict_set):
    df['ORIGIN_STAND']=df['ORIGIN_STAND'].fillna(0)
    df['STAND_LATITUDE'] = df['ORIGIN_STAND'].apply(stand_to_latitude)
    df['STAND_LONGITUDE'] = df['ORIGIN_STAND'].apply(stand_to_longitude)
    df['STAND_LATITUDE'] = (df['STAND_LATITUDE'] - df['STAND_LATITUDE'].mean())/df['STAND_LATITUDE'].std()
    df['STAND_LONGITUDE'] = (df['STAND_LONGITUDE'] - df['STAND_LONGITUDE'].mean())/df['STAND_LONGITUDE'].std()

In [8]:
df_tr.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LEN,STAND_LATITUDE,STAND_LONGITUDE
0,1372636858620000589,C,,0.0,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,0.128228,-0.044821
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,0.404056,-1.900178
2,1372636951620000320,C,,0.0,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,0.128228,-0.044821
3,1372636854620000520,C,,0.0,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,0.128228,-0.044821
4,1372637091620000337,C,,0.0,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,0.128228,-0.044821


In [9]:
for df in (df_tr, predict_set):
    df['TIMESTAMP'] = df['TIMESTAMP'].astype('datetime64[s]')
    datetime_index = pd.DatetimeIndex(df['TIMESTAMP'])
    df['QUARTER_HOUR'] = np.round(datetime_index.hour * 4 + datetime_index.minute / 15) % 96
    # Extract day of week
    df['DAY_OF_WEEK'] = datetime_index.dayofweek
    # Extract week of year
    df['WEEK_OF_YEAR'] = datetime_index.weekofyear - 1

  df['WEEK_OF_YEAR'] = datetime_index.weekofyear - 1
  df['WEEK_OF_YEAR'] = datetime_index.weekofyear - 1


In [11]:
df_tr = df_tr.drop(['MISSING_DATA','DAY_TYPE','TRIP_ID','TIMESTAMP'],axis=1)
df_tr.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LEN,STAND_LATITUDE,STAND_LONGITUDE,QUARTER_HOUR,DAY_OF_WEEK,WEEK_OF_YEAR
0,1372636858620000589,C,,0.0,20000589,2013-07-01 00:00:58,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,0.128228,-0.044821,0.0,0,26
1,1372637303620000596,B,,7.0,20000596,2013-07-01 00:08:23,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,0.404056,-1.900178,1.0,0,26
2,1372636951620000320,C,,0.0,20000320,2013-07-01 00:02:31,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,0.128228,-0.044821,0.0,0,26
3,1372636854620000520,C,,0.0,20000520,2013-07-01 00:00:54,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,0.128228,-0.044821,0.0,0,26
4,1372637091620000337,C,,0.0,20000337,2013-07-01 00:04:51,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,0.128228,-0.044821,0.0,0,26


In [12]:
predict_set = predict_set.drop(['MISSING_DATA','DAY_TYPE','TRIP_ID','TIMESTAMP'],axis=1)
predict_set.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,STAND_LATITUDE,STAND_LONGITUDE,QUARTER_HOUR,DAY_OF_WEEK,WEEK_OF_YEAR
0,B,,15.0,20000542,-1.12135,2.102446,72.0,3,32
1,B,,57.0,20000108,-1.527355,0.245767,71.0,3,32
2,B,,15.0,20000370,-1.12135,2.102446,71.0,3,32
3,B,,53.0,20000492,-2.156959,-0.00138,72.0,3,32
4,B,,18.0,20000621,-1.164146,-0.41946,72.0,3,32


In [13]:
def encode_feature(feature, train, test):
    """
    Encode the labels for the given feature across both the train and test datasets.
    """
    encoder = LabelEncoder()
    train_values = train[feature].copy()
    test_values = test[feature].copy()
    # Replace missing values with 0's so we can later encode them
    train_values[np.isnan(train_values)] = 0
    test_values[np.isnan(test_values)] = 0
    # Fit the labels across all possible values in both datasets
    encoder.fit(pd.concat([train_values, test_values]))
    # Add new column to the datasets with encoded values
    train[feature + '_ENCODED'] = encoder.transform(train_values)
    test[feature + '_ENCODED'] = encoder.transform(test_values)
    return encoder
client_encoder = encode_feature('ORIGIN_CALL', df_tr, predict_set)
taxi_encoder = encode_feature('TAXI_ID', df_tr, predict_set)
stand_encoder = encode_feature('ORIGIN_STAND', df_tr, predict_set)

In [14]:
df_tr.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LEN,STAND_LATITUDE,STAND_LONGITUDE,QUARTER_HOUR,DAY_OF_WEEK,WEEK_OF_YEAR,ORIGIN_CALL_ENCODED,TAXI_ID_ENCODED,ORIGIN_STAND_ENCODED
0,1372636858620000589,C,,0.0,20000589,2013-07-01 00:00:58,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,0.128228,-0.044821,0.0,0,26,0,366,0
1,1372637303620000596,B,,7.0,20000596,2013-07-01 00:08:23,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,0.404056,-1.900178,1.0,0,26,0,370,7
2,1372636951620000320,C,,0.0,20000320,2013-07-01 00:02:31,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,0.128228,-0.044821,0.0,0,26,0,203,0
3,1372636854620000520,C,,0.0,20000520,2013-07-01 00:00:54,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,0.128228,-0.044821,0.0,0,26,0,329,0
4,1372637091620000337,C,,0.0,20000337,2013-07-01 00:04:51,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,0.128228,-0.044821,0.0,0,26,0,216,0


In [15]:
predict_set.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,STAND_LATITUDE,STAND_LONGITUDE,QUARTER_HOUR,DAY_OF_WEEK,WEEK_OF_YEAR,ORIGIN_CALL_ENCODED,TAXI_ID_ENCODED,ORIGIN_STAND_ENCODED
0,B,,15.0,20000542,-1.12135,2.102446,72.0,3,32,0,341,15
1,B,,57.0,20000108,-1.527355,0.245767,71.0,3,32,0,78,57
2,B,,15.0,20000370,-1.12135,2.102446,71.0,3,32,0,238,15
3,B,,53.0,20000492,-2.156959,-0.00138,72.0,3,32,0,308,53
4,B,,18.0,20000621,-1.164146,-0.41946,72.0,3,32,0,392,18


In [None]:
#CREATE LABELS for distance and trip length

In [16]:
metadata = {
    'n_quarter_hours': 96,  # Number of quarter of hours in one day (i.e. 24 * 4).
    'n_days_per_week': 7,
    'n_weeks_per_year': 52,
    'n_client_ids': len(client_encoder.classes_),
    'n_taxi_ids': len(taxi_encoder.classes_),
    'n_stand_ids': len(stand_encoder.classes_),
}

In [17]:
train_labels = df_tr['LEN']

In [20]:
train, validation, train_labels, validation_labels = train_test_split(df_tr, train_labels, test_size=0.02)
validation, test, validation_labels, test_labels = train_test_split(validation, validation_labels, test_size=0.5)

In [23]:
train_cache = 'cache/train.pickle'
train_labels_cache = 'cache/train-labels.npy'
validation_cache = 'cache/validation.pickle'
validation_labels_cache = 'cache/validation-labels.npy'
test_cache = 'cache/test.pickle'
test_labels_cache = 'cache/test-labels.npy'
competition_test_cache = 'cache/competition-test.pickle'
metadata_cache = 'cache/metadata.pickle'

In [24]:
train.to_pickle(train_cache)
validation.to_pickle(validation_cache)
test.to_pickle(test_cache)
np.save(train_labels_cache, train_labels)
np.save(validation_labels_cache, validation_labels)
np.save(test_labels_cache, test_labels)

predict_set.to_pickle(competition_test_cache)
with open(metadata_cache, 'wb') as handle:
    pickle.dump(metadata, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#Don't use

In [None]:
train_labels_dist = df_tr['DIST']
df_tr = df_tr.drop(['DIST'],axis=1)

In [None]:
train, validation, train_labels_dist, validation_labels_dist = train_test_split(df_tr, train_labels_dist, test_size=0.02)
validation, test, validation_labels_dist, test_labels_dist = train_test_split(validation, validation_labels_dist, test_size=0.5)

In [None]:
train.to_pickle(train_cache)
validation.to_pickle(validation_cache)
test.to_pickle(test_cache)
np.save(train_labels_len_cache, train_labels_len)
np.save(train_labels_dist_cache, train_labels_dist)

np.save(validation_labels_len_cache, validation_labels_len)
np.save(validation_labels_dist_cache, validation_labels_dist)

np.save(test_labels_len_cache, test_labels_len)
np.save(test_labels_dist_cache, test_labels_dist)

predict_set.to_pickle(competition_test_cache)
with open(metadata_cache, 'wb') as handle:
    pickle.dump(metadata, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
train_cache = 'cache/train.pickle'
train_labels_len_cache = 'cache/train-labels-len.npy'
train_labels_dist_cache = 'cache/train-labels-dist.npy'
validation_cache = 'cache/validation.pickle'
validation_labels_len_cache = 'cache/validation-labels-len.npy'
validation_labels_dist_cache = 'cache/validation-labels-dist.npy'
test_cache = 'cache/test.pickle'
test_labels_len_cache = 'cache/test-labels-len.npy'
test_labels_dist_cache = 'cache/test-labels-dist.npy'
competition_test_cache = 'cache/competition-test.pickle'
metadata_cache = 'cache/metadata.pickle'

In [None]:
train = pd.read_pickle(train_cache)
validation = pd.read_pickle(validation_cache)
test = pd.read_pickle(test_cache)
train_labels = np.load(train_labels_cache)
validation_labels = np.load(validation_labels_cache)
test_labels = np.load(test_labels_cache)
competition_test = pd.read_pickle(competition_test_cache)
with open(metadata_cache, 'rb') as handle:
    metadata = pickle.load(handle)