In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Plot appearance: FiveThirtyEight theme with a 14 pt base font
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 14})

import seaborn as sns
# Ten colours taken from seaborn's 'Paired' palette
palette = sns.color_palette(palette='Paired', n_colors=10)

# Random seed (passed as random_state to the train/validation split further down)
RSEED = 100

In [139]:
# Input CSV that is read here, and the output CSV the predicted frame is written to
# at the end of the notebook.
source_file = '01_Location/location_drift_total_02.csv'
target_file = '01_Location/location_drift_total_predicted_02.csv'

# Load the source dataset
test_data = pd.read_csv(source_file)

In [140]:
# Work on an independent (deep) copy so `test_data` keeps the values as loaded
# and can be compared against the predicted frame afterwards.
test_data_predicted = test_data.copy(deep=True)

In [141]:
def get_rf(data, columns):
    """Fit a random-forest regressor for ``fare_amount`` on selected columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'fare_amount'`` column (regression target) and a
        ``'fare-bin'`` column (used only to stratify the split).
    columns : iterable of int
        Positional indices into ``data.columns`` selecting the feature columns.

    Returns
    -------
    rf : sklearn.ensemble.RandomForestRegressor
        The fitted model.
    column_list : list
        Names of the feature columns, in the order given by ``columns``.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split

    # Resolve positional indices to column names once; the same list is returned
    # so the caller can select identical features at prediction time.
    column_list = [data.columns[column] for column in columns]

    # Stratified 67/33 split; only the training part is used for fitting, the
    # validation part is discarded (kept so the training rows stay the same).
    X_train, _, y_train, _ = train_test_split(data, np.array(data['fare_amount']),
                                              stratify = data['fare-bin'], test_size=0.33,
                                              random_state = RSEED)

    # Create the random forest. random_state pins the bootstrap sampling so that
    # re-running the notebook reproduces the same model and predictions.
    rf = RandomForestRegressor(n_estimators = 20, max_depth = 20, max_features = None, oob_score = True,
                               bootstrap = True, verbose = 1, n_jobs = -1, random_state = RSEED)

    # Train random forest
    rf.fit(X_train[column_list], y_train)

    return rf, column_list

In [142]:
def make_predictions(data, rf, column_list):
    """Overwrite ``fare_amount`` and ``fare-bin`` in ``data`` with model output.

    ``rf.predict`` is called on ``data[column_list]``; the result replaces the
    ``'fare_amount'`` column, and ``'fare-bin'`` is rebuilt from it. ``data`` is
    modified in place and nothing is returned.
    """
    data['fare_amount'] = rf.predict(data[column_list])

    # Bin the fare on edges 0, 5, ..., 45 and convert the interval labels to strings
    edges = list(range(0, 50, 5))
    data['fare-bin'] = pd.cut(data['fare_amount'], bins = edges).astype(str)

    # Relabelling rules, applied in order:
    #   'nan'          -> fares that fell into no bin are treated as the uppermost bin
    #   '(5.0, 10.0]'  -> zero-padded so that string sorting puts it in the right place
    # NOTE(review): the second rule only matches if pandas renders the bin edges as
    # floats ('(5.0, 10.0]' rather than '(5, 10]') - confirm against the pandas version in use.
    relabel = (
        ('nan', '[45+]'),
        ('(5.0, 10.0]', '(05.0, 10.0]'),
    )
    for old_label, new_label in relabel:
        data.loc[data['fare-bin'] == old_label, 'fare-bin'] = new_label

In [143]:
# DEFINE Minkowski distance: p = 1 gives the Manhattan distance, p = 2 the Euclidean distance
def minkowski_distance(x1, x2, y1, y2, p):
    """Return the order-``p`` Minkowski distance between (x1, y1) and (x2, y2)."""
    delta_x = abs(x2 - x1)
    delta_y = abs(y2 - y1)
    return (delta_x ** p + delta_y ** p) ** (1 / p)

# DEFINE Haversine distance - great-circle distance, taking into account the spherical surface of the earth

# Radius of the earth in kilometers
# (6378 km is roughly the equatorial radius; the mean radius is about 6371 km)
R = 6378

def haversine_np(lon1, lat1, lon2, lat2, radius=None):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    The coordinate arguments may be scalars, NumPy arrays or pandas Series;
    array-like arguments must be of equal length.

    radius: sphere radius used to scale the central angle. Defaults to the
    module-level R (kilometers), so the result is in kilometers unless a
    radius in another unit is passed.

    source: https://stackoverflow.com/a/29546836

    """
    if radius is None:
        radius = R

    # Convert latitude and longitude to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Find the differences
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Apply the formula
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    # Calculate the angle (in radians). Floating-point rounding can push sqrt(a)
    # marginally above 1 for (near-)antipodal points, where arcsin would return
    # NaN, so the argument is capped at 1.
    c = 2 * np.arcsin(np.minimum(np.sqrt(a), 1.0))
    # Scale the angle by the radius (kilometers by default)
    km = radius * c

    return km

# CREATE absolute difference in latitude and longitude
def absolute_differences(dataset):
    """Add 'abs_lat_diff' and 'abs_lon_diff' columns to ``dataset`` in place.

    Each column is the absolute value of (dropoff - pickup) for the
    corresponding coordinate. Nothing is returned.
    """
    lat_delta = dataset['dropoff_latitude'].sub(dataset['pickup_latitude'])
    dataset['abs_lat_diff'] = lat_delta.abs()

    lon_delta = dataset['dropoff_longitude'].sub(dataset['pickup_longitude'])
    dataset['abs_lon_diff'] = lon_delta.abs()

# CREATE MANHATTAN: order-1 Minkowski distance between pickup and dropoff coordinates
def calculate_manhatten_distance(dataset):
    """Add a 'manhattan' column to ``dataset`` in place (Minkowski distance with p = 1)."""
    pickup_lon, dropoff_lon = dataset['pickup_longitude'], dataset['dropoff_longitude']
    pickup_lat, dropoff_lat = dataset['pickup_latitude'], dataset['dropoff_latitude']
    dataset['manhattan'] = minkowski_distance(pickup_lon, dropoff_lon, pickup_lat, dropoff_lat, 1)
    
# CREATE EUCLIDEAN: order-2 Minkowski distance between pickup and dropoff coordinates
def calculate_euclidean_distance(dataset):
    """Add a 'euclidean' column to ``dataset`` in place (Minkowski distance with p = 2)."""
    pickup_lon, dropoff_lon = dataset['pickup_longitude'], dataset['dropoff_longitude']
    pickup_lat, dropoff_lat = dataset['pickup_latitude'], dataset['dropoff_latitude']
    dataset['euclidean'] = minkowski_distance(pickup_lon, dropoff_lon, pickup_lat, dropoff_lat, 2)

# CREATE Haversine distance
def calculate_haversine_distance(dataset):
    """Add a 'haversine' column to ``dataset`` in place.

    The value is ``haversine_np`` evaluated from the pickup coordinates to the
    dropoff coordinates (longitude first, then latitude, for each point).
    """
    pickup_point = (dataset['pickup_longitude'], dataset['pickup_latitude'])
    dropoff_point = (dataset['dropoff_longitude'], dataset['dropoff_latitude'])
    dataset['haversine'] = haversine_np(*pickup_point, *dropoff_point)
    
def calculate_distances(dataset):
    """Run every distance feature builder on ``dataset``, in place.

    Writes the columns produced by the four helpers below, in this order:
    absolute differences, Manhattan, Euclidean and haversine distance.
    """
    feature_builders = (
        absolute_differences,
        calculate_manhatten_distance,
        calculate_euclidean_distance,
        calculate_haversine_distance,
    )
    for build_feature in feature_builders:
        build_feature(dataset)

In [144]:
# Positional indices into the DataFrame's columns that select the model features.
# NOTE(review): these positions depend on the column order of the loaded CSV -
# confirm they point at the intended feature columns (and never at 'fare_amount').
columns = [3, 4, 5, 6, 7, 12, 13, 14]

# Recompute the distance columns from the coordinates currently in the frame (in place).
calculate_distances(test_data_predicted);
# Fit the forest on a stratified training split of this same frame.
rf, column_list = get_rf(test_data_predicted, columns);
# Overwrite 'fare_amount' and 'fare-bin' with the model's predictions (in place).
# NOTE(review): predictions are made for every row, including the rows the model
# was trained on - confirm this is intended.
make_predictions(test_data_predicted, rf, column_list);
# Summary statistics of the predicted frame, for comparison with test_data.describe()
test_data_predicted.describe()

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.7s finished
  warn("Some inputs do not have OOB scores. "
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.1s finished


Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0
mean,11.464511,-73.405764,41.168624,-73.784214,40.911038,1.525848,0.477235,0.620803,1.098038,0.928599,88.993813
std,8.315991,0.777091,0.707658,0.524023,0.482172,1.075825,0.721388,0.771115,1.160519,0.930521,90.74641
min,3.723034,-74.114971,40.605766,-74.291611,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.961751,-73.971405,40.758457,-73.986252,40.74482,1.0,0.009991,0.013535,0.035782,0.027429,2.712997
50%,8.629477,-73.936836,40.773994,-73.972313,40.761391,1.0,0.02932,0.144598,0.747171,0.680548,63.456275
75%,12.265787,-72.871257,41.363823,-73.948036,40.78093,2.0,0.830872,1.180048,1.96325,1.744995,164.230273
max,76.254749,-71.480958,43.247995,-71.485099,43.249845,5.0,2.624142,2.541841,4.927316,3.484203,345.932963


In [145]:
test_data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0
mean,11.454982,-73.405764,41.168624,-73.784214,40.911038,1.525848,0.02124,0.023582,0.044822,0.034686,3.375043
std,9.02868,0.777091,0.707658,0.524023,0.482172,1.075825,0.023491,0.035598,0.052494,0.040288,3.743458
min,2.7446,-74.114971,40.605766,-74.291611,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.406104,-73.971405,40.758457,-73.986252,40.74482,1.0,0.006844,0.006172,0.016125,0.012629,1.237824
50%,8.546159,-73.936836,40.773994,-73.972313,40.761391,1.0,0.013702,0.012627,0.027596,0.021433,2.112695
75%,12.473678,-72.871257,41.363823,-73.948036,40.78093,2.0,0.026649,0.024117,0.050274,0.038433,3.891768
max,84.65871,-71.480958,43.247995,-71.485099,43.249845,5.0,0.281437,0.377335,0.495525,0.380997,34.818911


In [146]:
# SAVE DATASET: write the predicted frame to `target_file` without the index column
test_data_predicted.to_csv(target_file, index=False)