In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Global plotting defaults: FiveThirtyEight look with a slightly larger font.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 14})

# Ten-colour 'Paired' palette for any seaborn-based figures.
import seaborn as sns
palette = sns.color_palette('Paired', n_colors=10)

# Set random seed
RSEED = 100

# Actually apply the seed. The drift step draws from the stdlib `random`
# module, so without this every run produces a different drifted dataset.
# The module is aliased because a later cell rebinds the bare name `random`
# to the function `random.random`.
import random as py_random
py_random.seed(RSEED)
np.random.seed(RSEED)  # NumPy's legacy global RNG, seeded for completeness

In [2]:
# Read the raw 2015 sample; every later step works on a copy of this frame.
raw_csv_path = "../00_Data_Sets/100_k_dataset_2015.csv"

data = pd.read_csv(raw_csv_path)

In [3]:
# Work on an independent (deep) copy so the raw `data` frame stays untouched.

data_manipulated = data.copy(deep=True)

In [4]:
# Parse the pickup timestamps and index the frame by them in chronological order.
# Despite the former "OPTIONAL" label this step is required: the split cell
# slices the frame with date strings, which needs a sorted DatetimeIndex.
# The 'pickup_datetime' column is kept alongside the index (drop=False).

data_manipulated = (
    data_manipulated
    .assign(pickup_datetime=pd.to_datetime(data_manipulated['pickup_datetime']))
    .set_index('pickup_datetime', drop=False)
    .sort_index()
)

In [5]:
# Split Dataset into drift steps
#
# NOTE: the variable names are only approximate labels. `data_2015_jan` holds
# just 1-5 January, while `data_2015_feb` spans 6 January to 28 February.
#
# Each step is an explicit .copy(). drift_data() later overwrites
# passenger_count in place; writing into a bare slice of data_manipulated
# raises SettingWithCopyWarning and can leak the drift back into the parent
# frame. Date-string slices on .loc include both end points.
data_2015_jan = data_manipulated.loc['2015-01-01':'2015-01-05'].copy()
data_2015_feb = data_manipulated.loc['2015-01-06':'2015-02-28'].copy()
data_2015_mar = data_manipulated.loc['2015-03-01':'2015-03-31'].copy()
data_2015_apr = data_manipulated.loc['2015-04-01':'2015-04-30'].copy()
data_2015_may = data_manipulated.loc['2015-05-01':'2015-05-31'].copy()
data_2015_jun = data_manipulated.loc['2015-06-01':'2015-06-30'].copy()
data_2015_jul = data_manipulated.loc['2015-07-01':'2015-07-31'].copy() # EMPTY

# Row/column counts per step, as a quick sanity check on the split.
print('Dataset 1:',data_2015_jan.shape)
print('Dataset 2:',data_2015_feb.shape)
print('Dataset 3:',data_2015_mar.shape)
print('Dataset 4:',data_2015_apr.shape)
print('Dataset 5:',data_2015_may.shape)
print('Dataset 6:',data_2015_jun.shape)
print('Dataset 7:',data_2015_jul.shape) # EMPTY

Dataset 1: (2281, 15)
Dataset 2: (29871, 15)
Dataset 3: (17030, 15)
Dataset 4: (16857, 15)
Dataset 5: (16874, 15)
Dataset 6: (16031, 15)
Dataset 7: (0, 15)


In [6]:
from random import random

def drift_data(data, drift_level):
    """Apply passenger-count drift to ``data`` in place.

    Every value of the 'passenger_count' column is passed through
    ``change_data`` together with ``drift_level`` and the column is
    overwritten with the results.

    Parameters
    ----------
    data : DataFrame with a 'passenger_count' column; modified in place.
    drift_level : value forwarded unchanged to ``change_data``.

    Returns
    -------
    None
    """
    current_counts = data.loc[:, 'passenger_count']
    drifted_counts = current_counts.apply(change_data, args=(drift_level,))
    data.loc[:, 'passenger_count'] = drifted_counts
    
def change_data(passenger_count, drift_level):
    """Return a possibly drifted passenger count for a single ride.

    Counts of 3 or more are returned unchanged. For counts below 3, the
    value is bumped by a random increment of 0 or 1 whenever a uniform draw
    from [0, 1) exceeds ``1 - drift_level``. With ``drift_level`` 0.0 the
    draw can never exceed 1.0, so nothing changes. Because the increment is
    0 about half the time, the chance of an actual +1 is roughly
    ``drift_level / 2``.

    Parameters
    ----------
    passenger_count : numeric passenger count of one ride.
    drift_level : number, meaningful in [0, 1]; higher means more drift.

    Returns
    -------
    ``passenger_count`` itself, or ``passenger_count`` plus 0 or 1.
    """
    # Both draws are taken on every call, in this order, whichever branch is
    # used afterwards, so the random stream advances by exactly two values.
    gate_draw = random()
    bump = int(random() * 2)  # 0 or 1

    eligible = passenger_count < 3
    if eligible and gate_draw > (1 - drift_level):
        return passenger_count + bump
    return passenger_count
            

In [14]:
# Create Data Drift
# The first step is the undrifted reference (level 0.0); the next three get
# drift level 0.8. drift_data() changes each frame in place, so running this
# cell again stacks further drift on top of the previous run.

drift_plan = (
    (data_2015_jan, 0.0),
    (data_2015_feb, 0.8),
    (data_2015_mar, 0.8),
    (data_2015_apr, 0.8),
)
for step_frame, step_level in drift_plan:
    drift_data(step_frame, step_level)

In [23]:
# Summary statistics for one drift step at a time; change which line is
# uncommented to inspect a different step.
#data_2015_jan.describe()
#data_2015_feb.describe()
#data_2015_mar.describe()
data_2015_apr.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0
mean,11.666791,-73.974358,40.750763,-73.973634,40.751618,2.074806,0.021834,0.024233,0.046067,0.035636,3.467226
std,9.18474,0.036788,0.02785,0.034339,0.031787,1.070007,0.023995,0.036289,0.053451,0.04107,3.817332
min,2.7446,-74.186302,40.586666,-74.218719,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.498166,-73.992134,40.736412,-73.991295,40.736198,1.0,0.007122,0.006378,0.01664,0.013004,1.283343
50%,8.64756,-73.981972,40.753849,-73.979729,40.754513,2.0,0.01405,0.012978,0.028336,0.021901,2.162592
75%,12.769786,-73.967476,40.768356,-73.963654,40.769314,3.0,0.027382,0.024666,0.052189,0.039848,4.033403
max,79.49,-73.776184,40.909973,-73.587708,41.043755,5.0,0.277458,0.296982,0.396683,0.302198,30.891908


In [13]:
# Summary statistics of the full working frame, for comparison with the
# per-step statistics.
data_manipulated.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0,98944.0
mean,11.501462,-73.974159,40.750822,-73.973617,40.751591,1.645476,0.021339,0.02376,0.045099,0.034898,3.394644
std,9.072024,0.036963,0.027705,0.034712,0.031731,1.0839,0.023589,0.035833,0.052817,0.040527,3.764279
min,2.7446,-74.291595,40.470993,-74.291611,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.42054,-73.99205,40.736729,-73.991379,40.73574,1.0,0.00687,0.006187,0.016201,0.012678,1.242527
50%,8.563286,-73.981918,40.753759,-73.980019,40.754349,1.0,0.01376,0.012672,0.027721,0.021516,2.120602
75%,12.514358,-73.967588,40.768227,-73.963333,40.769345,2.0,0.02673,0.024231,0.050611,0.038623,3.907561
max,84.65871,-73.532593,40.970356,-73.532593,41.057537,5.0,0.281437,0.377335,0.495525,0.380997,34.818911


In [17]:
# Merge Sub-Datasets
# Stack the four drift steps in chronological order. ignore_index=True
# replaces the datetime index with a fresh 0..n-1 index.

drift_steps = [data_2015_jan, data_2015_feb, data_2015_mar, data_2015_apr]
drifted_data = pd.concat(drift_steps, ignore_index=True)
drifted_data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0
mean,11.359241,-73.97445,40.750903,-73.973871,40.751652,2.026333,0.021013,0.023253,0.044266,0.034255,3.334214
std,8.925246,0.036203,0.027514,0.034092,0.031439,1.070907,0.023293,0.035208,0.051959,0.039888,3.70783
min,2.7446,-74.193726,40.470993,-74.239204,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.376187,-73.991974,40.736666,-73.991325,40.73587,1.0,0.006771,0.006111,0.015968,0.012485,1.223678
50%,8.496203,-73.981972,40.753761,-73.980057,40.754478,2.0,0.013542,0.012497,0.027248,0.02121,2.088833
75%,12.375512,-73.967705,40.768049,-73.96347,40.76931,3.0,0.026312,0.023796,0.049652,0.037952,3.843814
max,84.65871,-73.532593,40.970356,-73.532593,41.057537,5.0,0.281437,0.304008,0.495525,0.350499,34.818911


In [24]:
# Merge Sub-Datasets
# This repeats the merge from the earlier cell, rebuilding `drifted_data`
# from the current state of the four step frames before it is saved.

steps_to_merge = [data_2015_jan, data_2015_feb, data_2015_mar, data_2015_apr]
drifted_data = pd.concat(steps_to_merge, ignore_index=True)
drifted_data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0,66039.0
mean,11.359241,-73.97445,40.750903,-73.973871,40.751652,2.026333,0.021013,0.023253,0.044266,0.034255,3.334214
std,8.925246,0.036203,0.027514,0.034092,0.031439,1.070907,0.023293,0.035208,0.051959,0.039888,3.70783
min,2.7446,-74.193726,40.470993,-74.239204,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.376187,-73.991974,40.736666,-73.991325,40.73587,1.0,0.006771,0.006111,0.015968,0.012485,1.223678
50%,8.496203,-73.981972,40.753761,-73.980057,40.754478,2.0,0.013542,0.012497,0.027248,0.02121,2.088833
75%,12.375512,-73.967705,40.768049,-73.96347,40.76931,3.0,0.026312,0.023796,0.049652,0.037952,3.843814
max,84.65871,-73.532593,40.970356,-73.532593,41.057537,5.0,0.281437,0.304008,0.495525,0.350499,34.818911


In [25]:
# SAVE DATASETS
# Only the merged frame is written; the per-step exports below are disabled.
# index=False keeps the row index out of the CSV.

# data_2015_jan.to_csv('02_Passenger Count/passenger count_drift_jan_01.csv', index = False)
# data_2015_feb.to_csv('02_Passenger Count/passenger count_drift_feb_01.csv', index = False)
# data_2015_mar.to_csv('02_Passenger Count/passenger count_drift_mar_01.csv', index = False)
# data_2015_apr.to_csv('02_Passenger Count/passenger count_drift_apr_01.csv', index = False)
merged_csv_path = '02_Passenger Count/passenger count_drift_total_03.csv'
drifted_data.to_csv(merged_csv_path, index=False)

In [6]:
# Read back the per-step '_01' exports and summarise the April step.
# NOTE(review): no active line in this notebook writes these '_01' files (the
# matching to_csv calls are commented out), so they must already exist on disk.
data_check_jan, data_check_feb, data_check_mar, data_check_apr = [
    pd.read_csv(step_csv)
    for step_csv in (
        '02_Passenger Count/passenger count_drift_jan_01.csv',
        '02_Passenger Count/passenger count_drift_feb_01.csv',
        '02_Passenger Count/passenger count_drift_mar_01.csv',
        '02_Passenger Count/passenger count_drift_apr_01.csv',
    )
]
data_check_apr.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_lat_diff,abs_lon_diff,manhattan,euclidean,haversine
count,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0,16857.0
mean,13.147689,-73.974358,40.750763,-73.973634,40.751618,1.749896,0.021834,0.024233,0.046067,0.035636,3.467226
std,10.988736,0.036788,0.02785,0.034339,0.031787,1.091114,0.023995,0.036289,0.053451,0.04107,3.817332
min,2.5,-74.186302,40.586666,-74.218719,40.535355,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.5,-73.992134,40.736412,-73.991295,40.736198,1.0,0.007122,0.006378,0.01664,0.013004,1.283343
50%,9.5,-73.981972,40.753849,-73.979729,40.754513,1.0,0.01405,0.012978,0.028336,0.021901,2.162592
75%,15.0,-73.967476,40.768356,-73.963654,40.769314,2.0,0.027382,0.024666,0.052189,0.039848,4.033403
max,98.75,-73.776184,40.909973,-73.587708,41.043755,5.0,0.277458,0.296982,0.396683,0.302198,30.891908
