In [1]:
%matplotlib inline
import os
import math
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import tensorflow as tf
from functools import partial

from pandas.plotting import scatter_matrix

from sklearn.externals import joblib

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

  from ._conv import register_converters as _register_converters


In [2]:
TAXIFARE_PATH = os.path.join("datasets", "NYCityTaxiFare")
FILE_NAME = "train.csv"
SIZE_CHUNK = 10*(10**6)
R = 6371

In [3]:
def random_batch(X_train, y_train, batch_size):
    rnd_index = np.random.randint(0, len(X_train), batch_size)
    X_batch = X_train[rnd_index, :]
    y_batch = y_train.values[rnd_index]
    return X_batch, y_batch

In [4]:
def load_taxifare_data(taxifare_path=TAXIFARE_PATH, file_name = FILE_NAME, size_chunk = SIZE_CHUNK):
    csv_path = os.path.join(taxifare_path, file_name)
    return pd.read_csv(csv_path, nrows = size_chunk)

In [5]:
def rmse(estimator, data_set, lables):
    lables_pred = estimator.predict(data_set)
    mse = mean_squared_error(lables, lables_pred)
    rmse = np.sqrt(mse)
    return rmse

In [6]:
def haversine_np(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    km = R * c
    
    return km

In [7]:
BB = (-74.5, -71, 40, 42)
nyc_map = plt.imread('https://aiblog.nl/download/nyc_-74.5_-72.8_40.5_41.8.png')

BB_zoom = (-74.1, -73.7, 40.2, 42)
nyc_map_zoom = plt.imread('https://github.com/WillKoehrsen/Machine-Learning-Projects/blob/master/images/nyc_-74.1_-73.7_40.6_40.85.PNG?raw=true')

In [8]:
def plot_on_map(df, BB, nyc_map, s=10, alpha=0.2, color = False):
    fig, axs = plt.subplots(1, 2, figsize=(18, 22))
    axs[0].scatter(df.pickup_longitude, df.pickup_latitude, zorder=1, alpha=alpha, c='r', s=s)
    axs[0].set_xlim((BB[0], BB[1]))
    axs[0].set_ylim((BB[2], BB[3]))
    axs[0].set_title('Pickup locations')
    axs[0].axis('off')
    axs[0].imshow(nyc_map, zorder=0, extent=BB)

    axs[1].scatter(df.dropoff_longitude, df.dropoff_latitude, zorder=1, alpha=alpha, c='b', s=s)
    axs[1].set_xlim((BB[0], BB[1]))
    axs[1].set_ylim((BB[2], BB[3]))
    axs[1].set_title('Dropoff locations')
    axs[1].axis('off')
    axs[1].imshow(nyc_map, zorder=0, extent=BB)

In [9]:
def remove_datapoints_from_water(df):
    def lonlat_to_xy(longitude, latitude, dx, dy, BB):
        return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
               (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')

    # define bounding box
    BB = (-74.5, -71, 40.2, 42)
    
    # read nyc mask and turn into boolean map with
    # land = True, water = False
    nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9
    
    # calculate for each lon,lat coordinate the xy coordinate in the mask map
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)    
    # calculate boolean index
    idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
    
    # return only datapoints on land
    return df[idx]

In [10]:
def minkowski_distance(x1, x2, y1, y2, p):
    return ((abs(x2 - x1) ** p) + (abs(y2 - y1)) ** p) ** (1 / p)

In [11]:
taxifare_load = load_taxifare_data()

In [12]:
taxifare = taxifare_load.copy()

In [13]:
taxifare = taxifare.loc[taxifare["passenger_count"].between(1, 6)]

In [14]:
taxifare = taxifare.loc[taxifare["fare_amount"].between(2.5, 100)]

In [15]:
taxifare = taxifare.loc[taxifare["pickup_longitude"].between(-74.3, -72)]
taxifare = taxifare.loc[taxifare["dropoff_longitude"].between(-74.3, -72)]

In [16]:
taxifare = taxifare.loc[taxifare["pickup_latitude"].between(40.3, 42.5)]
taxifare = taxifare.loc[taxifare["dropoff_latitude"].between(40.3, 42.5)]

In [17]:
print('Old size: %d' % len(taxifare))
taxifare = remove_datapoints_from_water(taxifare)
print('New size: %d' % len(taxifare))

Old size: 9749181
New size: 9749121


In [18]:
taxifare["distance_km"] = haversine_np(taxifare['pickup_longitude'], taxifare['pickup_latitude'],
                         taxifare['dropoff_longitude'], taxifare['dropoff_latitude'])

In [19]:
taxifare = taxifare.loc[taxifare["distance_km"].between(0.3, 100)]

In [20]:
taxifare["abs_lat_diff"] = (taxifare["dropoff_latitude"] - taxifare["pickup_latitude"]).abs()
taxifare["abs_lon_diff"] = (taxifare["dropoff_longitude"] - taxifare["pickup_longitude"]).abs()

In [21]:
taxifare = taxifare.loc[taxifare["abs_lat_diff"] > 0]
taxifare = taxifare.loc[taxifare["abs_lon_diff"] > 0]

In [22]:
taxifare["manhattan"] = minkowski_distance(taxifare["pickup_longitude"], taxifare["dropoff_longitude"],
                                       taxifare["pickup_latitude"], taxifare["dropoff_latitude"], 1)

taxifare["euclidean"] = minkowski_distance(taxifare["pickup_longitude"], taxifare["dropoff_longitude"],
                                       taxifare["pickup_latitude"], taxifare["dropoff_latitude"], 2)

In [23]:
columns = ["distance_km"]

#"key", "pickup_datetime", "pickup_latitude", "dropoff_latitude"
#"dropoff_longitude", "pickup_longitude", "abs_lat_diff", "abs_lon_diff", "euclidean", "manhattan"


In [24]:
X = taxifare[columns]
y = taxifare["fare_amount"]

In [25]:
pipeline = Pipeline([
    ('scaler', MinMaxScaler())
])

In [26]:
X = pipeline.fit_transform(X)

In [27]:
X_train, X_val, y_train, y_val= train_test_split(X, y, test_size = 20000, random_state = 42)

# MODEL ZONE

# DNN MODEL ZONE

In [28]:
tf.reset_default_graph()

In [29]:
n_inputs = len(columns)
n_hidden1 = 40
n_hidden2 = 10
#n_hidden3 = 150
#n_hidden4 = 50
#n_hidden5 = 20
#n_hidden6 = 10
n_outputs = 1

In [30]:
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.float32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name="training")

In [31]:
he_init = tf.contrib.layers.variance_scaling_initializer()

my_batch_norm_layer = partial(tf.layers.batch_normalization, training = training, momentum = 0.9)

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1", activation=tf.nn.relu, kernel_initializer = he_init)
    bn1 = my_batch_norm_layer(hidden1)
    bn1_act = tf.nn.elu(bn1)
    
    hidden2 = tf.layers.dense(bn1_act, n_hidden2, name="hidden2", activation=tf.nn.elu, kernel_initializer = he_init)
    bn2 = my_batch_norm_layer(hidden2)
    bn2_act = tf.nn.elu(bn2)
    
#    hidden3 = tf.layers.dense(bn2_act, n_hidden3, name="hidden3", activation=tf.nn.elu, kernel_initializer = he_init)
#    bn3 = my_batch_norm_layer(hidden3)
#    bn3_act = tf.nn.elu(bn3)
    
#    hidden4 = tf.layers.dense(bn3_act, n_hidden4, name="hidden4", activation=tf.nn.relu, kernel_initializer = he_init)
 #   bn4 = my_batch_norm_layer(hidden4)
 #   bn4_act = tf.nn.elu(bn4)
    
#    hidden5 = tf.layers.dense(bn4_act, n_hidden5, name="hidden5", activation=tf.nn.elu, kernel_initializer = he_init)
#    bn5 = my_batch_norm_layer(hidden5)
#    bn5_act = tf.nn.elu(bn5)
    
#    hidden6 = tf.layers.dense(bn5_act, n_hidden6, name="hidden6", activation=tf.nn.elu, kernel_initializer = he_init)
#    bn6 = my_batch_norm_layer(hidden6)
#    bn6_act = tf.nn.elu(bn6)
    
    output = tf.layers.dense(bn2_act, n_outputs, name="output")

In [32]:
with tf.name_scope("loss"):
    mse = tf.losses.mean_squared_error(labels=y, predictions=output)
    loss = tf.sqrt(mse, name="loss")

In [33]:
learning_rate = 0.001
momentum = 0.9
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
    training_op = optimizer.minimize(loss)

In [34]:
with tf.name_scope("eval"):
    mse = tf.losses.mean_squared_error(labels=y, predictions=output)
    rmse = tf.sqrt(mse, name="rmse")

In [35]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [36]:
n_epochs = 10
batch_size = 10000
n_batches = int(np.ceil(X_train.shape[0]/ batch_size))

extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

error = float("inf")

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(n_batches):
            X_batch, y_batch = random_batch(X_train, y_train, batch_size)
            sess.run([training_op, extra_update_ops], feed_dict={training: True, X: X_batch, y: y_batch})
        rmse_train = rmse.eval(feed_dict={X: X_batch, y: y_batch})
        rmse_test = rmse.eval(feed_dict={X: X_val, y: y_val})
        
        print(epoch, "Ошибка при обучении:", rmse_train,
                     "Ошибка при проверке:", rmse_test)
        
        if rmse_test < error:
            error = rmse_test
            save_path = saver.save(sess, "./taxifare/best_model_final2/best_model_final2.ckpt")
            print("Найдена лучшая модель на ", epoch+1, " итерации с ошибкой при проверке:", rmse_test)

0 Ошибка при обучении: 9.286522 Ошибка при проверке: 9.491141
Найдена лучшая модель на  1  итерации с ошибкой при проверке: 9.491141
1 Ошибка при обучении: 9.190817 Ошибка при проверке: 9.340859
Найдена лучшая модель на  2  итерации с ошибкой при проверке: 9.340859
2 Ошибка при обучении: 9.221015 Ошибка при проверке: 9.3438635
3 Ошибка при обучении: 9.474977 Ошибка при проверке: 9.352812
4 Ошибка при обучении: 9.296511 Ошибка при проверке: 9.340099
Найдена лучшая модель на  5  итерации с ошибкой при проверке: 9.340099
5 Ошибка при обучении: 9.411121 Ошибка при проверке: 9.341508
6 Ошибка при обучении: 9.239766 Ошибка при проверке: 9.340295
7 Ошибка при обучении: 9.307523 Ошибка при проверке: 9.340271
8 Ошибка при обучении: 9.32244 Ошибка при проверке: 9.3403015
9 Ошибка при обучении: 9.370398 Ошибка при проверке: 9.351635


In [None]:
TAXIFARE_PATH = os.path.join("datasets", "NYCityTaxiFare")
FILE_NAME = "test.csv"
csv_path = os.path.join(TAXIFARE_PATH, FILE_NAME)

X_test_final = pd.read_csv(csv_path)

In [None]:
X_test = X_test_final.copy()

In [None]:
#X_test["abs_lat_diff"] = (X_test["dropoff_latitude"] - X_test["pickup_latitude"]).abs()
#X_test["abs_lon_diff"] = (X_test["dropoff_longitude"] - X_test["pickup_longitude"]).abs()

#X_test = X_test.loc[X_test["abs_lat_diff"] > 0]
#X_test = X_test.loc[X_test["abs_lon_diff"] > 0]

X_test["distance_km"] = haversine_np(X_test['pickup_longitude'], X_test['pickup_latitude'],
                         X_test['dropoff_longitude'], X_test['dropoff_latitude'])

X_test["manhattan"] = minkowski_distance(X_test["pickup_longitude"], X_test["dropoff_longitude"],
                                       X_test["pickup_latitude"], X_test["dropoff_latitude"], 1)

X_test["euclidean"] = minkowski_distance(X_test["pickup_longitude"], X_test["dropoff_longitude"],
                                       X_test["pickup_latitude"], X_test["dropoff_latitude"], 2)

In [None]:
X_test = X_test[columns]
X_test = pipeline.fit_transform(X_test)

In [None]:
X_test.shape

In [None]:
with tf.Session() as sess:
    saver.restore(sess, "./taxifare/best_model_final2/best_model_final2.ckpt")
    y_pred = output.eval(feed_dict={X: X_test})

In [None]:
y_pred = np.round(y_pred, 2)

In [None]:
y_pred = y_pred.reshape(-1)

In [None]:
TAXIFARE_PATH = os.path.join("datasets", "NYCityTaxiFare")
csv_path = os.path.join(TAXIFARE_PATH, 'submission2.csv')

In [None]:
submission = pd.DataFrame(
    {'key': X_test_final.key, 'fare_amount': y_pred},
    columns = ['key', 'fare_amount'])
submission.to_csv(csv_path, index = False)