In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import torch
import json
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Read the labelled trips and the public test trips.
# The labelled frame still needs a validation split carved out of it later.
df_tr, predict_set = map(pd.read_csv, ('train.csv', 'test_public.csv'))

In [None]:
# Discard every row whose MISSING_DATA flag is set, then report what is left.
df_tr = df_tr.drop(index=df_tr.loc[df_tr['MISSING_DATA'] == True].index)
print(df_tr.shape)

In [None]:
def polyline_to_trip_duration(polyline):
  """Return the trip duration implied by a POLYLINE string.

  The string is scanned for "[" characters: one belongs to the outer list
  and each remaining one opens a point, so ``count("[") - 2`` is the number
  of gaps between consecutive points. Every gap contributes 15 to the
  result. Strings with fewer than two points yield 0.
  """
  gaps_between_points = polyline.count("[") - 2
  if gaps_between_points < 0:
    return 0
  return gaps_between_points * 15

# Add a "LEN" column to the dataframe. Its value is
# (polyline_length - 1) * 15, where polyline_length = count("[") - 1.
df_tr["LEN"] = df_tr["POLYLINE"].map(polyline_to_trip_duration)

# Remove outliers: only trips with 300 < LEN < 10800 are kept.
df_tr = df_tr.drop(index=df_tr.loc[(df_tr["LEN"] <= 300) | (df_tr["LEN"] >= 10800)].index)
print(df_tr.shape)
print(df_tr['LEN'].mean())

In [None]:
import json
import pandas as pd
from datetime import datetime
from pytz import timezone
from tzlocal import get_localzone
from dataclasses import dataclass
# Derive start/end timestamps and calendar helper columns on both frames.
for df in (df_tr, predict_set):
    # TIMESTAMP is interpreted as seconds since the Unix epoch.
    df['start_time'] = pd.to_datetime(df['TIMESTAMP'], unit='s', utc=True)
    df['start_time'] = df['start_time'].dt.tz_convert('GMT')

    # Use the frame's OWN duration column. The previous code always added
    # df_tr['LEN'], which for predict_set index-aligned end times to
    # unrelated training rows. A frame without LEN gets an all-NaT end_time,
    # so the 'end_time' and 't2' columns still exist for later cells.
    if 'LEN' in df.columns:
        trip_length = pd.to_timedelta(df['LEN'], unit='s')
    else:
        trip_length = pd.Series(pd.NaT, index=df.index, dtype='timedelta64[ns]')
    df['end_time'] = df['start_time'] + trip_length

    # Calendar date of the start (midnight timestamp), plus start/end clock times.
    df['dt'] = pd.to_datetime(df['start_time'].dt.date)
    df['t1'] = df['start_time'].dt.time
    df['t2'] = df['end_time'].dt.time
    # dayofweek is 0-based starting on Monday, so 'day' runs 1 (Mon) .. 7 (Sun).
    df['day'] = df['start_time'].dt.dayofweek + 1

In [None]:
import datetime

# Keep working under the name `train`; the `df_tr` name is released.
train = df_tr
del df_tr

# Five stand-in dates. Each rule below stamps one of them onto the 'dt'
# column of the trips it selects, so 'dt' ends up acting as a group label.
dates = pd.to_datetime(['2014-09-30', '2014-10-06', '2014-11-01', '2014-08-14', '2014-12-21'])

# 'day' is not modified in this cell, so the day-based selectors are built once.
_weekday_rows = (train['day'] >= 1) & (train['day'] <= 5)
_day_six_rows = train['day'] == 6
_calendar_days_a = pd.to_datetime(['2013-08-14', '2013-12-24', '2014-04-17', '2014-04-24', '2014-04-30', '2014-06-09'])
_calendar_days_b = pd.to_datetime(['2013-08-11', '2013-12-08', '2013-12-22', '2013-12-29',  '2014-04-13', '2014-04-27', '2014-06-08'])

# (row selector, window start, window end, label).
# Selectors are callables because the last two read 'dt', which earlier rules
# may already have overwritten; each must be evaluated only when its turn
# comes. The rules are therefore order-dependent: do not reorder them.
_rules = (
    (lambda: _weekday_rows, datetime.time(7, 30), datetime.time(9, 30), dates[0]),
    (lambda: _weekday_rows, datetime.time(16, 45), datetime.time(18, 45), dates[1]),
    (lambda: _day_six_rows, datetime.time(2, 30), datetime.time(5, 0), dates[2]),
    (lambda: train['dt'].isin(_calendar_days_a), datetime.time(16, 0), datetime.time(20, 0), dates[3]),
    (lambda: train['dt'].isin(_calendar_days_b), datetime.time(13, 30), datetime.time(17, 15), dates[4]),
)
for _select_rows, _window_start, _window_end, _label in _rules:
    # A trip matches when it starts before the window closes and ends after
    # the window opens (comparison is on clock times only).
    _overlaps_window = (train['t1'] < _window_end) & (train['t2'] > _window_start)
    train.loc[_select_rows() & _overlaps_window, 'dt'] = _label

# Only trips that received one of the five labels are kept.
train = train[train['dt'].isin(dates)]

# Clean the temporaries out of the notebook namespace.
del _weekday_rows, _day_six_rows, _calendar_days_a, _calendar_days_b
del _rules, _select_rows, _window_start, _window_end, _label, _overlaps_window

In [None]:
# Display (rows, columns) of the current `train` frame.
train.shape

In [None]:
# Show the first 15 training rows whose 'dt' value is 2014-12-21.
d = pd.to_datetime(['2014-12-21'])
train[train['dt'].isin(d)].head(15)

In [None]:
# Remove the raw flag columns and the intermediate time columns from both
# frames; the same column list is applied to each.
train, predict_set = (
    frame.drop(columns=['DAY_TYPE','MISSING_DATA','start_time','end_time','t1','t2','day'])
    for frame in (train, predict_set)
)

In [None]:
# Preview the first rows of `train` after the column drop.
train.head()

In [None]:
# Preview the first rows of `predict_set` after the column drop.
predict_set.head()

In [None]:
# One-hot encode the 'dt' column of the training frame (one dummy column per
# distinct 'dt' value), then preview the result.
# NOTE(review): the same encoding is applied to predict_set separately, so the
# two frames only share dummy columns if their 'dt' values coincide — verify.
train = pd.get_dummies(train, columns=['dt'])
train.head()

In [None]:
# One-hot encode the 'dt' column of the prediction frame, then preview it.
# NOTE(review): the dummy columns are derived from this frame's own 'dt'
# values, independently of `train`; confirm the resulting columns line up
# with the training frame before feeding both to the same model.
predict_set = pd.get_dummies(predict_set,columns=['dt'])
predict_set.head()

In [None]:
"""
train=train.rename(columns={'dt_2014-08-14 00:00:00': 'dt_2014-08-14', 'dt_2014-09-30 00:00:00': 'dt_2014-09-30',
                            'dt_2014-10-06 00:00:00': 'dt_2014-10-06', 'dt_2014-11-01 00:00:00': 'dt_2014-11-01',
                            'dt_2014-12-21 00:00:00':'dt_2014-12-21'})
train.head()"""

In [None]:
# Add quarter-hour, day-of-week and week-of-year features to both frames.
for df in (train, predict_set):
    df['TIMESTAMP'] = df['TIMESTAMP'].astype('datetime64[s]')
    datetime_index = pd.DatetimeIndex(df['TIMESTAMP'])

    # Quarter-hour slot of the day. Rounding can push the last minutes of the
    # day up to 96, which the modulo wraps back to slot 0.
    df['QUARTER_HOUR'] = np.round(datetime_index.hour * 4 + datetime_index.minute / 15) % 96

    # Day of week, 0-based starting on Monday.
    df['DAY_OF_WEEK'] = datetime_index.dayofweek

    # ISO week of year, shifted to be 0-based.
    # `DatetimeIndex.weekofyear` was removed in pandas 2.0; `isocalendar()`
    # (pandas >= 1.1) yields the same ISO week numbers. Its result is indexed
    # by the timestamps, so it is converted to a plain array to be assigned
    # positionally. The int64 conversion raises if any timestamp is missing.
    if hasattr(datetime_index, 'isocalendar'):
        week_of_year = datetime_index.isocalendar().week.to_numpy(dtype='int64')
    else:
        week_of_year = datetime_index.weekofyear
    df['WEEK_OF_YEAR'] = week_of_year - 1

In [None]:
def encode_feature(feature, train, test):
    """
    Encode the labels for the given feature across both the train and test datasets.

    A single LabelEncoder is fitted on the values of ``feature`` from both
    frames together, so every value seen in either frame gets a code.

    Parameters:
        feature: name of the column to encode; must exist in both frames.
        train, test: DataFrames. Both are modified in place: a new column
            named ``feature + '_ENCODED'`` is added to each.

    Returns:
        The fitted LabelEncoder (its ``classes_`` lists the distinct values).

    Missing values are replaced with 0 before encoding, so they share a code
    with any genuine 0 in the column.
    """
    encoder = LabelEncoder()
    # fillna works for any column dtype, whereas np.isnan raises on
    # non-float data; it also returns a new Series, leaving the source
    # column untouched.
    train_values = train[feature].fillna(0)
    test_values = test[feature].fillna(0)
    # Fit the labels across all possible values in both datasets
    encoder.fit(pd.concat([train_values, test_values]))
    # Add new column to the datasets with encoded values
    train[feature + '_ENCODED'] = encoder.transform(train_values)
    test[feature + '_ENCODED'] = encoder.transform(test_values)
    return encoder
# Encode the three ID columns in turn (client, taxi, stand); each call adds
# an '<column>_ENCODED' column to both frames and hands back its encoder.
client_encoder, taxi_encoder, stand_encoder = (
    encode_feature(id_column, train, predict_set)
    for id_column in ('ORIGIN_CALL', 'TAXI_ID', 'ORIGIN_STAND')
)

In [None]:
# Preview `train` with the added time features and '_ENCODED' columns.
train.head()

In [None]:
# Preview `predict_set` with the added time features and '_ENCODED' columns.
predict_set.head()

In [None]:
# Cardinality of each categorical feature, saved alongside the data.
# NOTE(review): WEEK_OF_YEAR is an ISO week minus one, and ISO years can have
# 53 weeks; confirm 52 is enough for the date range in use.
metadata = dict(
    n_quarter_hours=96,  # Number of quarter of hours in one day (i.e. 24 * 4).
    n_days_per_week=7,
    n_weeks_per_year=52,
    # Distinct values seen by each encoder across train and prediction sets.
    n_client_ids=len(client_encoder.classes_),
    n_taxi_ids=len(taxi_encoder.classes_),
    n_stand_ids=len(stand_encoder.classes_),
)

In [None]:
# Regression target: the trip duration column.
# NOTE(review): 'LEN' also stays inside `train` as a column — make sure it is
# excluded from the model inputs downstream.
train_labels = train['LEN']

In [None]:
# Hold out 2% of the rows, then halve that hold-out into validation and test
# (roughly 1% each). A fixed seed makes the partition reproducible across
# kernel restarts; without it every run produced a different split.
# Note: this cell rebinds `train`, so re-running it alone shrinks the data again.
SPLIT_SEED = 42
train, validation, train_labels, validation_labels = train_test_split(
    train, train_labels, test_size=0.02, random_state=SPLIT_SEED)
validation, test, validation_labels, test_labels = train_test_split(
    validation, validation_labels, test_size=0.5, random_state=SPLIT_SEED)

In [None]:
# Output locations for the prepared datasets, all under the relative 'cache/'
# directory. Frames and metadata go to .pickle files, label arrays to .npy files.
train_cache = 'cache/train2.pickle'
train_labels_cache = 'cache/train-labels2.npy'
validation_cache = 'cache/validation2.pickle'
validation_labels_cache = 'cache/validation-labels2.npy'
test_cache = 'cache/test2.pickle'
test_labels_cache = 'cache/test-labels2.npy'
# The competition's prediction set, kept separate from the held-out test split.
competition_test_cache = 'cache/competition-test2.pickle'
metadata_cache = 'cache/metadata2.pickle'

In [None]:
# Make sure every target directory exists first; the writes below fail with
# an error when the parent directory is missing (e.g. on a fresh checkout).
for cache_path in (train_cache, validation_cache, test_cache,
                   train_labels_cache, validation_labels_cache, test_labels_cache,
                   competition_test_cache, metadata_cache):
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

# Feature frames are pickled; label Series are stored as NumPy arrays.
train.to_pickle(train_cache)
validation.to_pickle(validation_cache)
test.to_pickle(test_cache)
np.save(train_labels_cache, train_labels)
np.save(validation_labels_cache, validation_labels)
np.save(test_labels_cache, test_labels)
predict_set.to_pickle(competition_test_cache)
# The metadata dict is pickled with the newest protocol available.
with open(metadata_cache, 'wb') as handle:
    pickle.dump(metadata, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the pickled training frame back and preview it.
# `train_cache` is assigned again with the same path as in the earlier cell,
# presumably so this cell can be run without the ones above it.
# Only unpickle files you produced yourself: loading a pickle executes code.
train_cache = 'cache/train2.pickle'
train = pd.read_pickle(train_cache)
train.head()

In [None]:
# Display (rows, columns) of the reloaded training frame.
train.shape