In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Use the 'fivethirtyeight' matplotlib style sheet and a default font size of 14 for all figures.
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 14

import seaborn as sns
# Ten colors taken from seaborn's 'Paired' palette.
palette = sns.color_palette('Paired', 10)

# Seed value for random number generation, kept as a named constant so it is defined in one place.
RSEED = 100

In [2]:
# Read the raw 2015 data set; this frame is kept unmodified as the reference copy.

data = pd.read_csv(
    "../00_Data_Sets/100_k_dataset_2015.csv"
)

In [3]:
# Work on an independent (deep) copy so the loaded frame `data` is never altered.

data_manipulated = data.copy(deep=True)

In [4]:
# Parse pickup_datetime, use it as the (time-sorted) index and keep it as a column.
# The date-string slicing in the split step below relies on this DatetimeIndex.

data_manipulated = (
    data_manipulated
    .assign(pickup_datetime=lambda frame: pd.to_datetime(frame['pickup_datetime']))
    .pipe(lambda frame: frame.set_index(frame['pickup_datetime']))
    .sort_index()
)

In [17]:
# Partition the time-sorted frame into the four drift phases. Date-string slices
# on the DatetimeIndex include both end dates in full.
# Note the coverage behind the names: `feb` spans 2015-02-01 .. 2015-04-30 and
# `may` spans 2015-05-01 .. 2015-06-01.
# The *_temp slices are never written to afterwards and serve as the pre-drift reference.
data_2015_jan_1_temp = data_manipulated.loc['2015-01-01':'2015-01-05']
data_2015_jan_2_temp = data_manipulated.loc['2015-01-06':'2015-01-31']
data_2015_feb_temp = data_manipulated.loc['2015-02-01':'2015-04-30']
data_2015_may_temp = data_manipulated.loc['2015-05-01':'2015-06-01']

# Independent copies that the drift injection is allowed to overwrite.
data_2015_jan_1, data_2015_jan_2, data_2015_feb, data_2015_may = (
    phase.copy()
    for phase in (
        data_2015_jan_1_temp,
        data_2015_jan_2_temp,
        data_2015_feb_temp,
        data_2015_may_temp,
    )
)

In [18]:
from random import random, uniform, sample

def find_indices_for_drift(data):
    """Return the index labels of every row whose 'haversine' value is below 2.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric 'haversine' column.

    Returns
    -------
    list
        Index labels in row order. A label that occurs on several matching
        rows appears once per matching row. Rows with a missing 'haversine'
        value are never selected.
    """
    # Vectorized comparison instead of a Python-level iterrows() loop:
    # same labels, same order, but a single pass in compiled code.
    short_trip_mask = (data['haversine'] < 2).to_numpy()
    return data.index[short_trip_mask].tolist()

def generate_outlier_coordinates():
    """Draw one random set of pickup/dropoff coordinates from fixed ranges.

    Returns
    -------
    tuple
        (pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude),
        each drawn with `uniform` from its own hard-coded range.
    """
    # One (first bound, second bound) pair per coordinate, listed in the order
    # the values are drawn and returned.
    coordinate_bounds = (
        (-73.98, -73.99),  # pickup longitude
        (40.73, 40.75),    # pickup latitude
        (-73.96, -73.98),  # dropoff longitude
        (40.75, 40.76),    # dropoff latitude
    )
    return tuple(uniform(first, second) for first, second in coordinate_bounds)
    
def update_sample(drift_level, dataset, index, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude):
    """Randomly overwrite the four coordinate columns of the row(s) labelled `index`.

    One value is drawn with `random()`; the overwrite happens only when that
    value is greater than ``1 - drift_level``, so a larger `drift_level` makes
    an overwrite more likely. `dataset` is modified in place via `.loc`, and
    nothing is returned.
    """
    draw = random()
    if not draw > (1 - drift_level):
        # Row is left as it is.
        return

    replacements = (
        ('pickup_longitude', pickup_longitude),
        ('pickup_latitude', pickup_latitude),
        ('dropoff_longitude', dropoff_longitude),
        ('dropoff_latitude', dropoff_latitude),
    )
    for column, value in replacements:
        dataset.loc[index, column] = value

In [24]:
# Inject coordinate drift into the sub-datasets.
#
# Seed Python's `random` module with RSEED first: both
# generate_outlier_coordinates() and update_sample() draw from it, so without
# a seed every execution of this cell would produce a different drifted data set.
from random import seed

seed(RSEED)

# Candidate rows (short trips) per sub-dataset. The first January slice is
# currently excluded from drift; to include it, add
#   drift_indices_1 = find_indices_for_drift(data_2015_jan_1)
# and a `(data_2015_jan_1, drift_indices_1, 0.4)` entry to the plan below.
drift_indices_2 = find_indices_for_drift(data_2015_jan_2)
drift_indices_3 = find_indices_for_drift(data_2015_feb)
drift_indices_4 = find_indices_for_drift(data_2015_may)

# (sub-dataset, candidate index labels, drift level) in processing order.
# A higher drift level makes update_sample() overwrite a candidate row more often.
drift_plan = [
    (data_2015_jan_2, drift_indices_2, 0.9),
    (data_2015_feb, drift_indices_3, 0.7),
    (data_2015_may, drift_indices_4, 0.3),
]

for dataset, drift_indices, drift_level in drift_plan:
    for index in drift_indices:
        # Fresh outlier coordinates are drawn for every candidate, whether or
        # not update_sample() ends up applying them.
        pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude = generate_outlier_coordinates()
        update_sample(drift_level, dataset, index, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude)

In [25]:
# CREATE absolute difference column in latitude and longitude
# data_manipulated['abs_lat_diff'] = (data_manipulated['dropoff_latitude'] - data_manipulated['pickup_latitude']).abs()
# data_manipulated['abs_lon_diff'] = (data_manipulated['dropoff_longitude'] - data_manipulated['pickup_longitude']).abs()

In [26]:
# DEFINE Minkowski distance: p=1 gives the Manhattan distance, p=2 the Euclidean distance
def minkowski_distance(x1, x2, y1, y2, p):
    """Minkowski distance of order `p` between the points (x1, y1) and (x2, y2).

    Note the argument order: both x values first, then both y values.
    Works element-wise on scalars as well as on array-like inputs that
    support `abs`, `-` and `**`.
    """
    delta_x = abs(x2 - x1)
    delta_y = abs(y2 - y1)
    return (delta_x ** p + delta_y ** p) ** (1 / p)

In [27]:
# Defining Haversine distance - great circle distance, taking into account the spheric surface of the earth

# Earth radius in kilometers used to scale the central angle into a distance
R = 6378

def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance in kilometers between two points
    given in decimal degrees.

    Accepts scalars or array-likes; array arguments must all have the
    same length and are processed element-wise.

    source: https://stackoverflow.com/a/29546836

    """
    # Degrees -> radians for all four inputs
    lon1 = np.radians(lon1)
    lat1 = np.radians(lat1)
    lon2 = np.radians(lon2)
    lat2 = np.radians(lat2)

    # Squared sines of the half differences
    sin_half_dlat_sq = np.sin((lat2 - lat1) / 2.0) ** 2
    sin_half_dlon_sq = np.sin((lon2 - lon1) / 2.0) ** 2

    # Haversine term
    cos_product = np.cos(lat1) * np.cos(lat2)
    a = sin_half_dlat_sq + cos_product * sin_half_dlon_sq

    # Central angle in radians, scaled by the earth radius
    central_angle = 2 * np.arcsin(np.sqrt(a))
    return R * central_angle

In [28]:
# CREATE MANHATTAN: Calculate relative distances between rides
def calculate_manhatten_distance(dataset):
    """Write the 'manhattan' column of `dataset` in place.

    The value is the order-1 Minkowski distance between the pickup and
    dropoff points, computed directly on the raw coordinate columns.
    Returns nothing.
    """
    pickup_lon, dropoff_lon = dataset['pickup_longitude'], dataset['dropoff_longitude']
    pickup_lat, dropoff_lat = dataset['pickup_latitude'], dataset['dropoff_latitude']
    dataset['manhattan'] = minkowski_distance(pickup_lon, dropoff_lon, pickup_lat, dropoff_lat, 1)
    
# CREATE EUCLIDEAN: Calculate relative distances between rides
def calculate_euclidean_distance(dataset):
    """Write the 'euclidean' column of `dataset` in place.

    The value is the order-2 Minkowski distance between the pickup and
    dropoff points, computed directly on the raw coordinate columns.
    Returns nothing.
    """
    pickup_lon, dropoff_lon = dataset['pickup_longitude'], dataset['dropoff_longitude']
    pickup_lat, dropoff_lat = dataset['pickup_latitude'], dataset['dropoff_latitude']
    dataset['euclidean'] = minkowski_distance(pickup_lon, dropoff_lon, pickup_lat, dropoff_lat, 2)

# CREATING Haversine distance
def calculate_haversine_distance(dataset):
    """Write the 'haversine' column of `dataset` in place.

    The value is the result of haversine_np() for the pickup and dropoff
    coordinate columns. Returns nothing.
    """
    pickup_lon, pickup_lat = dataset['pickup_longitude'], dataset['pickup_latitude']
    dropoff_lon, dropoff_lat = dataset['dropoff_longitude'], dataset['dropoff_latitude']
    dataset['haversine'] = haversine_np(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat)


# Recompute all three distance columns on every sub-dataset so they reflect the
# coordinates currently stored in the frames. Each metric is applied to all
# sub-datasets before moving on to the next metric.
for calculate_distance in (
    calculate_manhatten_distance,
    calculate_euclidean_distance,
    calculate_haversine_distance,
):
    for dataset in (data_2015_jan_1, data_2015_jan_2, data_2015_feb, data_2015_may):
        calculate_distance(dataset)

In [34]:
# Summary statistics of the coordinate and haversine columns for one working
# sub-dataset (the copies that the drift step may have overwritten).
# Uncomment one of the other lines to inspect a different sub-dataset.
# data_2015_jan_1[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine']].describe()
# data_2015_jan_2[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine']].describe()
# data_2015_feb[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine']].describe()
data_2015_may[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine']].describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,haversine
count,17390.0,17390.0,17390.0,17390.0,17390.0
mean,-73.974928,40.747522,-73.970741,40.751808,3.762561
std,0.03665,0.026469,0.034345,0.030731,3.755222
min,-74.177849,40.60495,-74.183258,40.540001,0.0
25%,-73.989455,40.734775,-73.987577,40.741013,1.772492
50%,-73.983326,40.746403,-73.97493,40.754848,2.568887
75%,-73.972376,40.763962,-73.962826,40.764594,4.06539
max,-73.776382,40.889828,-73.70517,40.963894,32.418389


In [32]:
# Same summary for a *_temp slice. These slices are taken straight from
# data_manipulated and are not written to by the drift step, so they show the
# values before drift for comparison with the working copies.
# data_2015_jan_1_temp[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine']].describe()
data_2015_jan_2_temp[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine']].describe()
# data_2015_feb_temp[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine']].describe()
# data_2015_may_temp[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine']].describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,haversine
count,13905.0,13905.0,13905.0,13905.0,13905.0
mean,-73.974502,40.75131,-73.974327,40.751738,3.208145
std,0.0359,0.026969,0.033223,0.03052,3.589491
min,-74.186302,40.604408,-74.186302,40.575096,0.0
25%,-73.991814,40.737148,-73.991364,40.736122,1.191038
50%,-73.98185,40.75457,-73.980263,40.754772,2.03047
75%,-73.967796,40.768421,-73.963982,40.769157,3.663081
max,-73.532593,40.85738,-73.532593,41.005966,32.31477


In [35]:
# Stack the four sub-datasets back into one frame, in chronological order.
# ignore_index=True discards the pickup_datetime index in favour of a fresh
# 0..n-1 index.

drifted_data = pd.concat(
    objs=[
        data_2015_jan_1,
        data_2015_jan_2,
        data_2015_feb,
        data_2015_may,
    ],
    ignore_index=True,
)
drifted_data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0
mean,11.454982,-73.976423,40.745117,-73.969681,40.752185,1.525848,0.02124,0.023582,0.050958,0.038973,3.805766
std,9.02868,0.034553,0.024545,0.031766,0.028477,1.075825,0.023491,0.035598,0.049455,0.038065,3.51502
min,2.7446,-74.177849,40.470993,-74.239204,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.406104,-73.988813,40.733934,-73.982559,40.747746,1.0,0.006844,0.006172,0.028202,0.021483,2.130205
50%,8.546159,-73.984165,40.742975,-73.972419,40.755122,1.0,0.013702,0.012627,0.035989,0.027008,2.682323
75%,12.473678,-73.977333,40.758854,-73.963006,40.759926,2.0,0.026649,0.024117,0.050581,0.038249,3.868295
max,84.65871,-73.719551,40.970356,-73.587708,41.057537,5.0,0.281437,0.377335,0.495525,0.380997,34.818911


In [52]:
# Summary statistics of the full, unsplit frame, for comparison with the merged drifted data.
data_manipulated.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0
mean,12.898841,-73.974159,40.750822,-73.973617,40.751591,1.526672,0.021339,0.02376,0.045099,0.034898,3.394644
std,10.835837,0.036963,0.027705,0.034712,0.031731,1.075492,0.023589,0.035833,0.052817,0.040527,3.764279
min,2.5,-74.291595,40.470993,-74.291611,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.5,-73.99205,40.736729,-73.991379,40.73574,1.0,0.00687,0.006187,0.016201,0.012678,1.242527
50%,9.5,-73.981918,40.753759,-73.980019,40.754349,1.0,0.01376,0.012672,0.027721,0.021516,2.120602
75%,14.5,-73.967588,40.768227,-73.963333,40.769345,2.0,0.02673,0.024231,0.050611,0.038623,3.907561
max,100.0,-73.532593,40.970356,-73.532593,41.057537,5.0,0.281437,0.377335,0.495525,0.380997,34.818911


In [36]:
# Write the merged drifted data set to disk without the index column.
#
# Per-sub-dataset exports, currently disabled:
# data_2015_jan_1.to_csv('03_Distance/distance_drift_jan_1_01.csv', index=False)
# data_2015_jan_2.to_csv('03_Distance/distance_drift_jan_2_01.csv', index=False)
# data_2015_feb.to_csv('03_Distance/distance_drift_feb_01.csv', index=False)
# data_2015_may.to_csv('03_Distance/distance_drift_may_01.csv', index=False)
drifted_data.to_csv(
    path_or_buf='03_Distance/distance_drift_total_02.csv',
    index=False,
)

In [55]:
# Read back the monthly passenger-count drift files and summarise the April one
# as a quick sanity check.
data_check_jan = pd.read_csv('02_Passenger Count/passenger count_drift_jan_01.csv')
data_check_feb = pd.read_csv('02_Passenger Count/passenger count_drift_feb_01.csv')
data_check_mar = pd.read_csv('02_Passenger Count/passenger count_drift_mar_01.csv')
data_check_apr = pd.read_csv('02_Passenger Count/passenger count_drift_apr_01.csv')

data_check_apr.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0
mean,13.147689,-73.974358,40.750763,-73.973634,40.751618,1.749896,0.021834,0.024233,0.046067,0.035636,3.467226
std,10.988736,0.036788,0.02785,0.034339,0.031787,1.091114,0.023995,0.036289,0.053451,0.04107,3.817332
min,2.5,-74.186302,40.586666,-74.218719,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.5,-73.992134,40.736412,-73.991295,40.736198,1.0,0.007122,0.006378,0.01664,0.013004,1.283343
50%,9.5,-73.981972,40.753849,-73.979729,40.754513,1.0,0.01405,0.012978,0.028336,0.021901,2.162592
75%,15.0,-73.967476,40.768356,-73.963654,40.769314,2.0,0.027382,0.024666,0.052189,0.039848,4.033403
max,98.75,-73.776184,40.909973,-73.587708,41.043755,5.0,0.277458,0.296982,0.396683,0.302198,30.891908
