In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

In [None]:
# Read the training and test tables from the CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [None]:
# Preview the first rows of the training data to see its columns and values.
train.head()

In [None]:
# Preview the first rows of the test data for comparison with the training data.
test.head()

In [None]:
# Print the summary pandas provides for the training frame.
train.info()

# Data Preparation
## Trip Duration Clean-up
Deal with **outliers** associated with the `trip_duration` variable  
Exclude data that lies outside 2 standard deviations from the mean. 

In [None]:
# Keep only trips whose duration lies within two standard deviations of the
# mean. Both statistics are taken from the unfiltered column, and both limits
# are inclusive.
duration = train['trip_duration']
m = np.mean(duration)
s = np.std(duration)
train = train[duration.between(m - 2*s, m + 2*s)]

## Check latitude/longitude bounds
Valid ranges: latitude -90 to +90 (about -85 to +85 on Web Mercator maps), longitude -180 to +180.

In [None]:
# Overall latitude range across pickup and dropoff points. The lower bound is
# the smaller of the two column minima (taking max() here would report the
# larger minimum and hide the most extreme value); the upper bound is the
# larger of the two column maxima.
print('Latitude bounds: {} to {}'.format(
    min(train.pickup_latitude.min(), train.dropoff_latitude.min()),
    max(train.pickup_latitude.max(), train.dropoff_latitude.max())
))

In [None]:
# Overall longitude range across pickup and dropoff points. The lower bound is
# the smaller of the two column minima (taking max() here would report the
# larger minimum and hide the most extreme value); the upper bound is the
# larger of the two column maxima.
print('Longitude bounds: {} to {}'.format(
    min(train.pickup_longitude.min(), train.dropoff_longitude.min()),
    max(train.pickup_longitude.max(), train.dropoff_longitude.max())
))

## Latitude and Longitude Clean-up

The images below show the borders of NY City as drawn on Google Maps.  
From them we read off the border ranges in coordinates.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Load the map screenshot 'NYC.png' and draw it on an explicit axes object.
img1 = mpimg.imread('NYC.png')
fig, ax = plt.subplots()
ax.imshow(img1)
plt.show()

In [None]:
# Load the map screenshot 'NYC East.png' and draw it on an explicit axes object.
img2 = mpimg.imread('NYC East.png')
fig, ax = plt.subplots()
ax.imshow(img2)
plt.show()

In [None]:
# Load the map screenshot 'NYC West.png' and draw it on an explicit axes object.
img3 = mpimg.imread('NYC West.png')
fig, ax = plt.subplots()
ax.imshow(img3)
plt.show()

In [None]:
# Load the map screenshot 'NYC North.png' and draw it on an explicit axes object.
img4 = mpimg.imread('NYC North.png')
fig, ax = plt.subplots()
ax.imshow(img4)
plt.show()

The borders of NY City, in coordinates, come out to be:

city_long_border = (-74.255, -73.701)  
city_lat_border = (40.508, 40.916) 

From the coordinates above, we can limit our area of investigation to within the NY City borders.

In [None]:
# Bounding box of NY City in degrees.
city_long_x, city_long_y = -74.255, -73.701   # lower / upper longitude limits
city_lat_x, city_lat_y = 40.508, 40.916       # lower / upper latitude limits

# A trip is kept only when both its pickup and its dropoff point fall inside
# the box; every limit is inclusive.
inside_city = (
    train['pickup_longitude'].between(city_long_x, city_long_y)
    & train['pickup_latitude'].between(city_lat_x, city_lat_y)
    & train['dropoff_longitude'].between(city_long_x, city_long_y)
    & train['dropoff_latitude'].between(city_lat_x, city_lat_y)
)
train = train[inside_city]

## Check Missing Values 
Check whether any data is missing in the train and test sets.

In [None]:
# Per-column count of missing entries in the training set.
print(train.isna().sum())

In [None]:
# Per-column count of missing entries in the test set.
print(test.isna().sum())

**Good! No N/A values.**

## Check for duplicate ids

In [None]:
# Count rows whose 'id' repeats an id seen earlier in the frame, i.e. the rows
# that dropping duplicates on 'id' would remove.
repeated_ids = train['id'].duplicated().sum()
print('No of Duplicates, Trip IDs: {}'.format(repeated_ids))

## Check Number of Passengers
Common sense implies values between 1 and 10; we should also check for trips with 0 passengers.

In [None]:
# Smallest and largest passenger count recorded in the training set.
fewest_passengers = train['passenger_count'].min()
most_passengers = train['passenger_count'].max()
print('Passengers: {} to {}'.format(fewest_passengers, most_passengers))

## Passenger_Count Clean-up

Let's also drop trips with passenger count = 0.

In [None]:
# Report how many trips have a passenger count of zero, then keep only the
# trips with a positive passenger count.
no_passengers = train['passenger_count'] == 0
print('Empty trips: {}'.format(no_passengers.sum()))
train = train.loc[train['passenger_count'] > 0]