In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Apply the plotting theme first, then override its default font size.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 14})

import seaborn as sns
# Ten colours taken from seaborn's 'Paired' palette.
palette = sns.color_palette('Paired', 10)

# Seed value for random number generators. Defining the constant does not
# seed anything by itself; it has to be passed to a seeding call.
RSEED = 100

In [2]:
# Load the source dataset that the drift will be applied to.
# The path is relative to the notebook's working directory.
data = pd.read_csv(
    "../00_Data_Sets/100_k_dataset_2015.csv"
)

In [3]:
# Work on an independent deep copy so that changes made to
# `data_manipulated` do not touch the loaded frame `data`.
data_manipulated = data.copy(deep=True)

In [4]:
# Parse pickup_datetime and use it as a sorted DatetimeIndex.
# NOTE: although this step was labelled "OPTIONAL", the date-string slices
# used to split the dataset further down rely on this index.
data_manipulated['pickup_datetime'] = pd.to_datetime(data_manipulated['pickup_datetime'])

# drop=False keeps pickup_datetime available as a regular column as well.
data_manipulated = (
    data_manipulated
    .set_index('pickup_datetime', drop=False)
    .sort_index()
)

In [6]:
# Split Dataset into drift steps.
# Each subset is a date-string slice of the index; both end dates are included.
# NOTE(review): the names do not match the ranges exactly -- "feb" spans
# 2015-02-01..2015-04-30 and "may" runs through 2015-06-01 -- and no subset
# selects 2015-06-02 onwards except the (empty) July slice. Confirm intended.
data_2015_jan_1_temp = data_manipulated.loc['2015-01-01':'2015-01-05']
data_2015_jan_2_temp = data_manipulated.loc['2015-01-06':'2015-01-31']
data_2015_feb_temp = data_manipulated.loc['2015-02-01':'2015-04-30']
data_2015_may_temp = data_manipulated.loc['2015-05-01':'2015-06-01']

data_2015_jul = data_manipulated.loc['2015-07-01':'2015-07-31']  # EMPTY

# Independent copies: these are the frames the drift is applied to, while the
# *_temp slices stay available for before/after comparison.
data_2015_jan_1 = data_2015_jan_1_temp.copy()
data_2015_jan_2 = data_2015_jan_2_temp.copy()
data_2015_feb = data_2015_feb_temp.copy()
data_2015_may = data_2015_may_temp.copy()

# Report the size of every subset (Dataset 7 is the empty July slice).
for _label, _subset in (
    ('Dataset 1:', data_2015_jan_1),
    ('Dataset 2:', data_2015_jan_2),
    ('Dataset 3:', data_2015_feb),
    ('Dataset 4:', data_2015_may),
    ('Dataset 7:', data_2015_jul),
):
    print(_label, _subset.shape)

Dataset 1: (2281, 15)
Dataset 2: (13905, 15)
Dataset 3: (49853, 15)
Dataset 4: (17390, 15)
Dataset 7: (0, 15)


In [7]:
from random import random

def drift_data(data, column, threshold, drift_level):
    """Apply ``change_data`` to every value of one column, in place.

    Args:
        data: DataFrame whose column is overwritten with the drifted values.
        column: Label of the column to drift.
        threshold: Forwarded to ``change_data``; only values below it may shift.
        drift_level: Forwarded to ``change_data``; controls how often a
            qualifying value is shifted.

    Returns:
        None. ``data`` is modified directly.
    """
    current_values = data.loc[:, column]
    drifted_values = current_values.apply(change_data, args=(threshold, drift_level))
    data.loc[:, column] = drifted_values
    
def change_data(location, threshold, drift_level):
    """Randomly shift a single coordinate value upwards.

    A value is only eligible when it is strictly below ``threshold``. An
    eligible value is shifted when a uniform draw from ``random()`` exceeds
    ``1 - drift_level``; the shift is a second uniform draw scaled by 2.5.

    Args:
        location: The coordinate value to (possibly) shift.
        threshold: Values at or above this are returned unchanged.
        drift_level: Higher values make a shift more likely.

    Returns:
        ``location`` plus the random shift, or ``location`` unchanged.
    """
    # Both draws are taken on every call, whether or not they end up being
    # used, so the random stream advances by two values per call.
    trigger = random()
    shift = random() * 2.5

    if not location < threshold:
        return location
    if not trigger > (1 - drift_level):
        return location
    return location + shift
            

In [8]:
# Create Data Drift
#
# Seed Python's global random generator before drifting: change_data draws
# from it, and without a seed every run of this cell produces a different
# drifted dataset (RSEED was defined but never applied).
# NOTE: drift_data modifies the frames in place, so re-run the split cell
# before re-running this one to start again from undrifted copies.
from random import seed

seed(RSEED)

# The calls for data_2015_jan_1 are disabled, so that subset is left undrifted.

# Start-Long
# drift_data(data_2015_jan_1, 'pickup_longitude', -73.98, 0.5)
drift_data(data_2015_jan_2, 'pickup_longitude', -73.98, 0.9)
drift_data(data_2015_feb, 'pickup_longitude', -73.98, 0.9)
drift_data(data_2015_may, 'pickup_longitude', -73.98, 0.7)

# Start-Lat
# drift_data(data_2015_jan_1, 'pickup_latitude', 40.75, 0.3)
drift_data(data_2015_jan_2, 'pickup_latitude', 40.75, 0.8)
drift_data(data_2015_feb, 'pickup_latitude', 40.75, 0.9)
drift_data(data_2015_may, 'pickup_latitude', 40.75, 0.5)

# End-Long
# drift_data(data_2015_jan_1, 'dropoff_longitude', -73.98, 0.05)
drift_data(data_2015_jan_2, 'dropoff_longitude', -73.98, 0.1)
drift_data(data_2015_feb, 'dropoff_longitude', -73.98, 0.3)
drift_data(data_2015_may, 'dropoff_longitude', -73.98, 0.5)

# End-Lat
# drift_data(data_2015_jan_1, 'dropoff_latitude', 40.75, 0.05)
drift_data(data_2015_jan_2, 'dropoff_latitude', 40.75, 0.1)
drift_data(data_2015_feb, 'dropoff_latitude', 40.75, 0.3)
drift_data(data_2015_may, 'dropoff_latitude', 40.75, 0.5)

In [14]:
# Summary statistics of one drifted subset; swap which line is uncommented to
# inspect a different subset.
# data_2015_jan_1.describe()
data_2015_jan_2.describe()
# data_2015_feb.describe()
# data_2015_may[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']].describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0
mean,11.083812,-73.353307,41.174466,-73.907198,40.802013,1.524056,0.020209,0.022404,0.042613,0.032966,3.208145
std,8.631789,0.793797,0.713841,0.324602,0.282156,1.080152,0.022561,0.034019,0.050397,0.038599,3.589491
min,3.27158,-74.065991,40.605766,-74.186302,40.575096,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.329454,-73.968437,40.758888,-73.990097,40.738869,1.0,0.006565,0.005966,0.015606,0.012248,1.191038
50%,8.384255,-73.870834,40.774162,-73.978043,40.756729,1.0,0.013123,0.012321,0.026455,0.020682,2.03047
75%,12.001717,-72.746707,41.386326,-73.958946,40.772816,2.0,0.025127,0.02317,0.047401,0.03642,3.663081
max,73.5185,-71.48507,43.24535,-71.496791,43.22629,5.0,0.271149,0.304008,0.408257,0.304479,32.31477


In [12]:
# Summary statistics of the matching *_temp slice, i.e. the same date range
# before any drift was applied, for comparison with the drifted subset.
# data_2015_jan_1_temp.describe()
data_2015_jan_2_temp.describe()
# data_2015_feb_temp.describe()
# data_2015_may_temp.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0,13905.0
mean,11.083812,-73.974502,40.75131,-73.974327,40.751738,1.524056,0.020209,0.022404,0.042613,0.032966,3.208145
std,8.631789,0.0359,0.026969,0.033223,0.03052,1.080152,0.022561,0.034019,0.050397,0.038599,3.589491
min,3.27158,-74.186302,40.604408,-74.186302,40.575096,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.329454,-73.991814,40.737148,-73.991364,40.736122,1.0,0.006565,0.005966,0.015606,0.012248,1.191038
50%,8.384255,-73.98185,40.75457,-73.980263,40.754772,1.0,0.013123,0.012321,0.026455,0.020682,2.03047
75%,12.001717,-73.967796,40.768421,-73.963982,40.769157,2.0,0.025127,0.02317,0.047401,0.03642,3.663081
max,73.5185,-73.532593,40.85738,-73.532593,41.005966,5.0,0.271149,0.304008,0.408257,0.304479,32.31477


In [15]:
# Merge Sub-Datasets
# Stack the four subsets in date order; ignore_index=True replaces the
# existing index with a fresh 0..n-1 range index.
drifted_data = pd.concat(
    [
        data_2015_jan_1,
        data_2015_jan_2,
        data_2015_feb,
        data_2015_may,
    ],
    ignore_index=True,
)
drifted_data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0,83429.0
mean,11.454982,-73.405764,41.168624,-73.784214,40.911038,1.525848,0.02124,0.023582,0.044822,0.034686,3.375043
std,9.02868,0.777091,0.707658,0.524023,0.482172,1.075825,0.023491,0.035598,0.052494,0.040288,3.743458
min,2.7446,-74.114971,40.605766,-74.291611,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.406104,-73.971405,40.758457,-73.986252,40.74482,1.0,0.006844,0.006172,0.016125,0.012629,1.237824
50%,8.546159,-73.936836,40.773994,-73.972313,40.761391,1.0,0.013702,0.012627,0.027596,0.021433,2.112695
75%,12.473678,-72.871257,41.363823,-73.948036,40.78093,2.0,0.026649,0.024117,0.050274,0.038433,3.891768
max,84.65871,-71.480958,43.247995,-71.485099,43.249845,5.0,0.281437,0.377335,0.495525,0.380997,34.818911


In [16]:
# Summary statistics of the full, undrifted dataset for comparison with the
# merged drifted data.
data_manipulated.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0
mean,11.501462,-73.974159,40.750822,-73.973617,40.751591,1.526672,0.021339,0.02376,0.045099,0.034898,3.394644
std,9.072024,0.036963,0.027705,0.034712,0.031731,1.075492,0.023589,0.035833,0.052817,0.040527,3.764279
min,2.7446,-74.291595,40.470993,-74.291611,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.42054,-73.99205,40.736729,-73.991379,40.73574,1.0,0.00687,0.006187,0.016201,0.012678,1.242527
50%,8.563286,-73.981918,40.753759,-73.980019,40.754349,1.0,0.01376,0.012672,0.027721,0.021516,2.120602
75%,12.514358,-73.967588,40.768227,-73.963333,40.769345,2.0,0.02673,0.024231,0.050611,0.038623,3.907561
max,84.65871,-73.532593,40.970356,-73.532593,41.057537,5.0,0.281437,0.377335,0.495525,0.380997,34.818911


In [17]:
# SAVE DATASETS
# Only the merged frame is written; the per-subset exports are disabled.
# index=False leaves the row index out of the CSV.

# data_2015_jan_1.to_csv('01_Location/location_drift_jan_1_01.csv', index = False)
# data_2015_jan_2.to_csv('01_Location/location_drift_jan_2_01.csv', index = False)
# data_2015_feb.to_csv('01_Location/location_drift_feb_01.csv', index = False)
# data_2015_may.to_csv('01_Location/location_drift_may_01.csv', index = False)
drifted_data.to_csv(
    path_or_buf='01_Location/location_drift_total_02.csv',
    index=False,
)

In [55]:
# Reload previously saved drift files for a quick sanity check.
# NOTE(review): these files live under '02_Passenger Count/', which none of
# the visible cells in this notebook write to -- confirm this check belongs
# here.
data_check_jan = pd.read_csv('02_Passenger Count/passenger count_drift_jan_01.csv')
data_check_feb = pd.read_csv('02_Passenger Count/passenger count_drift_feb_01.csv')
data_check_mar = pd.read_csv('02_Passenger Count/passenger count_drift_mar_01.csv')
data_check_apr = pd.read_csv('02_Passenger Count/passenger count_drift_apr_01.csv')

# Only the April file is summarised here.
data_check_apr.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0
mean,13.147689,-73.974358,40.750763,-73.973634,40.751618,1.749896,0.021834,0.024233,0.046067,0.035636,3.467226
std,10.988736,0.036788,0.02785,0.034339,0.031787,1.091114,0.023995,0.036289,0.053451,0.04107,3.817332
min,2.5,-74.186302,40.586666,-74.218719,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.5,-73.992134,40.736412,-73.991295,40.736198,1.0,0.007122,0.006378,0.01664,0.013004,1.283343
50%,9.5,-73.981972,40.753849,-73.979729,40.754513,1.0,0.01405,0.012978,0.028336,0.021901,2.162592
75%,15.0,-73.967476,40.768356,-73.963654,40.769314,2.0,0.027382,0.024666,0.052189,0.039848,4.033403
max,98.75,-73.776184,40.909973,-73.587708,41.043755,5.0,0.277458,0.296982,0.396683,0.302198,30.891908
