In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Plot styling: apply the style sheet first, then set the font size on top of it.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 14})

import seaborn as sns
# Ten colours taken from seaborn's 'Paired' palette.
palette = sns.color_palette(palette='Paired', n_colors=10)

# Seed the random number generators so the drift sampling is repeatable.
# Defining RSEED alone does not seed anything; the seed() calls below are what
# fix the sequence produced by the stdlib `random` helpers used for the drift draws.
# The module is aliased because a later cell binds the bare name `random` to the
# function via `from random import random`.
import random as py_random

RSEED = 100
py_random.seed(RSEED)
np.random.seed(RSEED)

In [10]:
# Read the source CSV that the later cells derive their frames from.
data = pd.read_csv(filepath_or_buffer="../00_Data_Sets/100_k_dataset_2015.csv")

In [11]:
# Work on an independent deep copy so the loaded `data` frame stays untouched.
data_manipulated = data.copy(deep=True)

In [4]:
# Parse the pickup timestamps and use them as a sorted index.
# The 'pickup_datetime' column is kept alongside the index (drop=False).
# The date-string slices applied to data_manipulated in the split cell rely on this index.
data_manipulated = (
    data_manipulated
    .assign(pickup_datetime=pd.to_datetime(data_manipulated['pickup_datetime']))
    .set_index('pickup_datetime', drop=False)
    .sort_index()
)

In [6]:
# Carve the timeline into consecutive drift periods using date-string label
# slices on the datetime index (label slices include both end points).
data_2015_jan_1_temp = data_manipulated.loc['2015-01-01':'2015-01-15']
data_2015_jan_2_temp = data_manipulated.loc['2015-01-16':'2015-01-31']
data_2015_feb_temp = data_manipulated.loc['2015-02-01':'2015-04-30']  # covers Feb through Apr despite the name
data_2015_may_temp = data_manipulated.loc['2015-05-01':'2015-06-01']

# Independent copies of each slice; the *_temp slices themselves are kept
# because a later cell describes one of them.
data_2015_jan_1, data_2015_jan_2, data_2015_feb, data_2015_may = (
    period.copy()
    for period in (
        data_2015_jan_1_temp,
        data_2015_jan_2_temp,
        data_2015_feb_temp,
        data_2015_may_temp,
    )
)

In [8]:
from random import random, uniform, sample

def find_indices_for_drift(data, limit=20, max_haversine=2):
    """Return the index labels of short trips that are candidates for drift.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric 'haversine' column.
    limit : int or None, default 20
        Only the first ``limit`` rows of ``data`` are inspected. The default
        keeps the original 20-row cap; pass ``None`` to scan the whole frame.
    max_haversine : float, default 2
        A row qualifies when its 'haversine' value is strictly below this.

    Returns
    -------
    list
        Index labels of the qualifying rows, in frame order. Rows whose
        'haversine' is NaN never qualify.

    Raises
    ------
    KeyError
        If ``data`` has no 'haversine' column.
    """
    candidates = data if limit is None else data.head(limit)

    # One vectorised comparison instead of a Python-level iterrows() loop.
    is_short_trip = (candidates['haversine'] < max_haversine).to_numpy()

    return candidates.index[is_short_trip].tolist()

def generate_outlier_coordinates():
    """Draw one random set of pickup/dropoff coordinates.

    Returns a 4-tuple ``(pickup_longitude, pickup_latitude,
    dropoff_longitude, dropoff_latitude)``, each value drawn with
    ``uniform`` from its own fixed range, in that order.
    """
    # Bounds are passed to uniform() exactly as listed; it accepts them in
    # either order, so the two longitude ranges are valid as written.
    coordinate_bounds = (
        (-74.2, -74.9),  # pickup longitude
        (37.9, 40.4),    # pickup latitude
        (-72.9, -73.2),  # dropoff longitude
        (40.1, 42.6),    # dropoff latitude
    )
    return tuple(uniform(first, second) for first, second in coordinate_bounds)
    
def update_sample(drift_level, index, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, frame=None):
    """Overwrite one trip's coordinates with probability ``drift_level``.

    A single ``random()`` draw decides whether the row is touched: the update
    happens when the draw is greater than ``1 - drift_level``.

    Parameters
    ----------
    drift_level : float
        Chance of applying the update (0 never updates; values above 1
        always update).
    index : label
        Row label to overwrite. If the label occurs several times in the
        index, every matching row is overwritten.
    pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude : float
        Replacement values for the columns of the same name.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level
        ``data_manipulated`` frame, which is what the original always used.

    Raises
    ------
    KeyError
        If an update is due but ``index`` is not a label of the target frame.
    """
    random_float = random()
    if random_float > (1 - drift_level):
        target = data_manipulated if frame is None else frame

        # .loc with an unknown label would silently append a new, mostly empty
        # row, so refuse instead of corrupting the frame.
        if index not in target.index:
            raise KeyError(index)

        replacements = {
            'pickup_longitude': pickup_longitude,
            'pickup_latitude': pickup_latitude,
            'dropoff_longitude': dropoff_longitude,
            'dropoff_latitude': dropoff_latitude,
        }
        # A single .loc[row, column] write goes straight to the frame; the
        # chained form frame[column][row] = value may write to a temporary
        # object and leave the frame unchanged.
        for column, value in replacements.items():
            target.loc[index, column] = value

In [None]:
# Collect the candidate rows of every period first, then push each candidate
# through update_sample with that period's drift level.
# NOTE(review): update_sample writes into data_manipulated, while the labels
# come from the per-period copies; the copies themselves are not changed by
# this cell. Confirm that this is intended.
drift_indices_1 = find_indices_for_drift(data_2015_jan_1)
drift_indices_2 = find_indices_for_drift(data_2015_jan_2)
drift_indices_3 = find_indices_for_drift(data_2015_feb)
drift_indices_4 = find_indices_for_drift(data_2015_may)

for period_indices, period_level in (
    (drift_indices_1, 0.2),
    (drift_indices_2, 0.9),
    (drift_indices_3, 0.8),
    (drift_indices_4, 0.7),
):
    for index in period_indices:
        # One fresh set of outlier coordinates per candidate row.
        outlier_coordinates = generate_outlier_coordinates()
        update_sample(period_level, index, *outlier_coordinates)

In [43]:
# Create Data Drift
# NOTE(review): drift_data is not defined in the visible part of this notebook,
# so this cell depends on a definition from elsewhere; the meaning of the two
# numeric arguments is whatever drift_data defines.
drift_frames = (data_2015_jan_1, data_2015_jan_2, data_2015_feb, data_2015_may)

# One entry per coordinate column:
# (column, third argument, fourth argument for each frame in drift_frames order).
drift_plan = (
    ('pickup_longitude', -73.98, (0.5, 0.9, 0.9, 0.7)),     # Start-Long
    ('pickup_latitude', 40.75, (0.3, 0.8, 0.9, 0.5)),       # Start-Lat
    ('dropoff_longitude', -73.98, (0.05, 0.1, 0.3, 0.5)),   # End-Long
    ('dropoff_latitude', 40.75, (0.05, 0.1, 0.3, 0.5)),     # End-Lat
)

# Same call order as writing the sixteen calls out: column by column, and
# within a column period by period.
for column, reference_value, period_values in drift_plan:
    for frame, period_value in zip(drift_frames, period_values):
        drift_data(frame, column, reference_value, period_value)

In [61]:
# Summary statistics of the four coordinate columns for the last period.
# Swap in data_2015_jan_1, data_2015_jan_2 or data_2015_feb to inspect the others.
data_2015_may.loc[
    :, ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
].describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,17390.0,17390.0,17390.0,17390.0
mean,-73.503391,41.021376,-73.661465,41.018033
std,0.741385,0.606554,0.639158,0.598828
min,-74.064796,40.60495,-74.291611,40.540001
25%,-73.975914,40.751247,-73.979841,40.751541
50%,-73.954697,40.765785,-73.962563,40.766701
75%,-73.158632,40.791804,-73.840668,40.801714
max,-71.481947,43.242846,-71.482467,43.24275


In [54]:
# Summary statistics of the first period's slice of data_manipulated, i.e. the
# *_temp frame that the working copy data_2015_jan_1 was taken from.
data_2015_jan_1_temp.describe()
# Alternatives for the other periods:
# data_2015_jan_2_temp.describe()
# data_2015_feb_temp.describe()
# data_2015_may_temp.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,7862.0,7862.0,7862.0,7862.0,7862.0,7862.0,7862.0,7862.0,7862.0,7862.0,7862.0
mean,12.044651,-73.973518,40.751521,-73.974219,40.7522,1.522132,0.02054,0.023013,0.043553,0.033665,3.273233
std,10.326657,0.037078,0.027212,0.03298,0.030648,1.064373,0.023298,0.035121,0.05219,0.03993,3.713134
min,2.5,-74.017899,40.604408,-74.182556,40.575096,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.5,-73.991407,40.737835,-73.991264,40.736844,1.0,0.006639,0.006159,0.015641,0.012267,1.192593
50%,9.0,-73.98156,40.755245,-73.980007,40.755344,1.0,0.013103,0.012436,0.026596,0.020701,2.026225
75%,13.0,-73.967428,40.768007,-73.963991,40.769217,2.0,0.025442,0.023413,0.047794,0.036563,3.715469
max,92.75,-73.532593,40.867558,-73.532593,41.023895,5.0,0.253963,0.304008,0.495525,0.350499,34.818911


In [51]:
# Stack the four period frames back into one dataset, in chronological order.
# ignore_index=True replaces the datetime index with a fresh 0..n-1 index.
drifted_data = pd.concat(
    objs=[
        data_2015_jan_1,
        data_2015_jan_2,
        data_2015_feb,
        data_2015_may,
    ],
    ignore_index=True,
)
drifted_data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0
mean,12.81297,-73.417152,41.15526,-73.785292,40.911448,1.525848,0.02124,0.023582,0.044822,0.034686,3.375043
std,10.765255,0.77119,0.701693,0.521857,0.482379,1.075825,0.023491,0.035598,0.052494,0.040288,3.743458
min,2.5,-74.183128,40.599318,-74.291611,40.540001,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.5,-73.971924,40.75782,-73.986336,40.744904,1.0,0.006844,0.006172,0.016125,0.012629,1.237824
50%,9.5,-73.942833,40.773594,-73.972305,40.76141,1.0,0.013702,0.012627,0.027596,0.021433,2.112695
75%,14.5,-72.896732,41.29665,-73.948029,40.780994,2.0,0.026649,0.024117,0.050274,0.038433,3.891768
max,100.0,-71.481947,43.249069,-71.482467,43.247347,5.0,0.281437,0.377335,0.495525,0.380997,34.818911


In [52]:
# Summary statistics of the complete data_manipulated frame (all periods).
data_manipulated.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0
mean,12.898841,-73.974159,40.750822,-73.973617,40.751591,1.526672,0.021339,0.02376,0.045099,0.034898,3.394644
std,10.835837,0.036963,0.027705,0.034712,0.031731,1.075492,0.023589,0.035833,0.052817,0.040527,3.764279
min,2.5,-74.291595,40.470993,-74.291611,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.5,-73.99205,40.736729,-73.991379,40.73574,1.0,0.00687,0.006187,0.016201,0.012678,1.242527
50%,9.5,-73.981918,40.753759,-73.980019,40.754349,1.0,0.01376,0.012672,0.027721,0.021516,2.120602
75%,14.5,-73.967588,40.768227,-73.963333,40.769345,2.0,0.02673,0.024231,0.050611,0.038623,3.907561
max,100.0,-73.532593,40.970356,-73.532593,41.057537,5.0,0.281437,0.377335,0.495525,0.380997,34.818911


In [62]:
# Write every period frame and the merged frame to CSV, without the index column.
# Insertion order of the mapping is the write order.
csv_targets = {
    '01_Location/location_drift_jan_1_01.csv': data_2015_jan_1,
    '01_Location/location_drift_jan_2_01.csv': data_2015_jan_2,
    '01_Location/location_drift_feb_01.csv': data_2015_feb,
    '01_Location/location_drift_may_01.csv': data_2015_may,
    '01_Location/location_drift_total.csv': drifted_data,
}

for csv_path, frame_to_save in csv_targets.items():
    frame_to_save.to_csv(csv_path, index=False)

In [55]:
# Read previously written monthly CSVs back in and summarise the April one.
# NOTE(review): these paths point at '02_Passenger Count/', not at the
# '01_Location/' files written by this notebook; confirm this is the intended check.
data_check_jan, data_check_feb, data_check_mar, data_check_apr = (
    pd.read_csv(csv_path)
    for csv_path in (
        '02_Passenger Count/passenger count_drift_jan_01.csv',
        '02_Passenger Count/passenger count_drift_feb_01.csv',
        '02_Passenger Count/passenger count_drift_mar_01.csv',
        '02_Passenger Count/passenger count_drift_apr_01.csv',
    )
)
data_check_apr.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0
mean,13.147689,-73.974358,40.750763,-73.973634,40.751618,1.749896,0.021834,0.024233,0.046067,0.035636,3.467226
std,10.988736,0.036788,0.02785,0.034339,0.031787,1.091114,0.023995,0.036289,0.053451,0.04107,3.817332
min,2.5,-74.186302,40.586666,-74.218719,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.5,-73.992134,40.736412,-73.991295,40.736198,1.0,0.007122,0.006378,0.01664,0.013004,1.283343
50%,9.5,-73.981972,40.753849,-73.979729,40.754513,1.0,0.01405,0.012978,0.028336,0.021901,2.162592
75%,15.0,-73.967476,40.768356,-73.963654,40.769314,2.0,0.027382,0.024666,0.052189,0.039848,4.033403
max,98.75,-73.776184,40.909973,-73.587708,41.043755,5.0,0.277458,0.296982,0.396683,0.302198,30.891908
