In [1]:
# Data processing
import pandas as pd
import numpy as np
import datetime as dt

# Geocoding
import geopy
from geopy.geocoders import Nominatim
from geopy.distance import geodesic as geodesic

# Visualization
import seaborn as sb
import matplotlib.pyplot as plt

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Control Variables

In [4]:
PATH = "./all/basic_filter-"

# read whole file use None
# ROWS = None 
ROWS = 9_000_000 
basic_filter_df = pd.read_feather(PATH+str(ROWS))

LEFT_BOTTOM_CITY = 'Tennessee'
RIGHT_TOP_CITY = 'Maine'

NYC_LONGITUDE = (-74.256436, -73.699733)
NYC_LATITUDE = (40.495029, 40.915592)

JFK_LONGITUDE = (-73.789185, -73.775291)
JFK_LATITUDE = (40.641960, 40.649562)

  return feather.read_dataframe(path, nthreads=nthreads)


In [3]:
geolocator = Nominatim(user_agent='city')
LEFT_BOTTOM_BOUNDARY = geolocator.geocode(LEFT_BOTTOM_CITY)
RIGHT_TOP_BOUNDARY = geolocator.geocode(RIGHT_TOP_CITY)

In [4]:
# Taxicab Rate of Fare
MILEAGE = 0.4 * 5

# Auxiliary Functions

In [5]:
def basic_pearson_corr_result(df, title):
    print('{title}, Pearson correlation coef. of:'.format(title=title))
    format_rule = '    {0:30}'
    # Euclidean distance of the ride and the taxi fare
    print(format_rule.format('dist and fare: '), \
          df['dist'].corr(df['fare_amount']))
    # time of day and distance traveled
    print(format_rule.format('time of day and dist: '), \
          df['hour'].corr(df['dist']))
    # time of day and the taxi fare
    print(format_rule.format('time of day and fare: '), \
          df['hour'].corr(df['fare_amount']))
    
def basic_visualization_result(df):
    df.plot(kind='scatter', x='dist', y='fare_amount')
    df.plot(kind='scatter', x='dist', y='hour')
    df.plot(kind='scatter', x='hour', y='fare_amount')
    
def compare_two_df(df1, title1, df2, title2):
    title_rule = '    {}'
    data_rule = '        {0:30}'

    # Euclidean distance of the ride and the taxi fare
    print('Pearson correlation coef. of:')
    print(title_rule.format('* dist and fare: '))
    print(data_rule.format(title1), \
          df1['dist'].corr(df1['fare_amount']))
    print(data_rule.format(title2), \
          df2['dist'].corr(df2['fare_amount']))

    print(title_rule.format('* time of day and dist: '))
    print(data_rule.format(title1), \
          df1['hour'].corr(df1['dist']))
    print(data_rule.format(title2), \
          df2['hour'].corr(df2['dist']))

    print(title_rule.format('* time of day and fare: '))
    print(data_rule.format(title1), \
          df1['hour'].corr(df1['fare_amount']))
    print(data_rule.format(title2), \
          df2['hour'].corr(df2['fare_amount']))
 
    fig = plt.figure(figsize=(32,32))
    ax1_1 = fig.add_subplot(321)
    ax2_1 = fig.add_subplot(322)
    ax1_2 = fig.add_subplot(323)
    ax2_2 = fig.add_subplot(324)
    ax1_3 = fig.add_subplot(325)
    ax2_3 = fig.add_subplot(326)

    df1.plot(ax=ax1_1, kind='scatter', x='dist', y='fare_amount', title = title1)
    df1.plot(ax=ax1_2, kind='scatter', x='dist', y='hour', title = title1)
    df1.plot(ax=ax1_3, kind='scatter', x='hour', y='fare_amount', title = title1)
    df2.plot(ax=ax2_1, kind='scatter', x='dist', y='fare_amount', title = title2)
    df2.plot(ax=ax2_2, kind='scatter', x='dist', y='hour', title = title2)
    df2.plot(ax=ax2_3, kind='scatter', x='hour', y='fare_amount', title = title2)

## Basic Filter

In [6]:
def valid_fare(row_data):
    """
    this is a soft cut based on initial chare $2.50 and the improvement surcharge 30-cent
    """
    return row_data['fare_amount'] > 2.8

def valid_transportation_region(row_data, left_bottom = LEFT_BOTTOM_BOUNDARY, right_top = RIGHT_TOP_BOUNDARY):
    """
    default region based on the rectangle formed by Tennessee and Maine, 
    since it takes more than 12 hrs to drive from the boundary to NYC or
    from NYC to the boundary
    """
    return (row_data['pickup_longitude'] > left_bottom.longitude) & \
           (row_data['pickup_longitude'] < right_top.longitude) & \
           (row_data['dropoff_longitude'] > left_bottom.longitude) & \
           (row_data['dropoff_longitude'] < right_top.longitude) & \
            (row_data['pickup_latitude'] > left_bottom.latitude) & \
            (row_data['pickup_latitude'] < right_top.latitude) & \
            (row_data['dropoff_latitude'] > left_bottom.latitude) & \
            (row_data['dropoff_latitude'] < right_top.latitude)

def valid_travel_location(row_data):
    """
    pickup and dropoff should not be exactly the same or
    both outside the NYC 
    """
    return (row_data['pickup_longitude'] != row_data['dropoff_longitude']) | \
           (row_data['pickup_latitude'] != row_data['dropoff_latitude']) | \
           (\
            (row_data['pickup_longitude'] >= NYC_LONGITUDE[0]) & \
            (row_data['pickup_longitude'] <= NYC_LONGITUDE[1]) & \
            (row_data['pickup_latitude'] >= NYC_LATITUDE[0]) & \
            (row_data['pickup_latitude'] <= NYC_LATITUDE[1]) \
           ) | (\
            (row_data['dropoff_longitude'] >= NYC_LONGITUDE[0]) & \
            (row_data['dropoff_longitude'] <= NYC_LONGITUDE[1]) & \
            (row_data['dropoff_latitude'] >= NYC_LATITUDE[0]) & \
            (row_data['dropoff_latitude'] <= NYC_LATITUDE[1]) \
           )

def valid_passenger_count(row_data):
    """
    by Taxi rule, the maximum capacity is 6
    """
    return (row_data['passenger_count'] > 0) & (row_data['passenger_count'] <= 6)

# def datetime_parser(x):
#     print(x['pickup_datetime'])
#     try:
#         datetime_object = datetime.strptime(x['pickup_datetime'], '%Y-%m-%d %H:%M:%S')
#         return datetime_object.hours*60 + datetime_object.minutes
#     except:
#         return pd.NaT

## Advanced Filter

In [7]:
def valid_fare_amount_judge_by_travel_distance(row_data):
    return valid_fare_amount_with_long_distance(row_data) & \
           valid_fare_amount_with_short_distance(row_data)

def valid_fare_amount_with_long_distance(row_data):
    return row_data['fare_amount'] >= (row_data['dist'] * MILEAGE + 2.8)

def valid_fare_amount_with_short_distance(row_data):
    return (row_data['dist'] >= 0.2) | (row_data['fare_amount'] <= 10)

def to_jfk(row_data):
    return (row_data['dropoff_longitude'] >= JFK_LONGITUDE[0]) & \
           (row_data['dropoff_longitude'] <= JFK_LONGITUDE[1]) & \
           (row_data['dropoff_latitude'] >= JFK_LATITUDE[0]) & \
           (row_data['dropoff_latitude'] <= JFK_LATITUDE[1])

def remove_outliers_Tukey(usefulData, attr):
    thirdQuartile = usefulData.quantile(.75)[attr]
    firstQuartile = usefulData.quantile(.25)[attr]
    IQR = thirdQuartile - firstQuartile
    return usefulData[usefulData[attr].between(firstQuartile - (IQR * 1.5), thirdQuartile + (IQR * 1.5))]

## Adjust or Create Feature

In [8]:
from math import sin, cos, sqrt, atan2, radians

def dist(row_data):
    R = 6373.0
    s_lon = radians(row_data['pickup_longitude'])
    s_lat = radians(row_data['pickup_latitude'])
    e_lon = radians(row_data['dropoff_longitude'])
    e_lat = radians(row_data['dropoff_latitude'])
    diff_lon = e_lon - s_lon
    diff_lat = e_lat - s_lat

    a = sin(diff_lat / 2)**2 + cos(s_lat) * cos(e_lat) * sin(diff_lon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c * 0.621371

def dist_by_geopy(row_data):
    s_lon = row_data['pickup_longitude']
    s_lat = row_data['pickup_latitude']
    e_lon = row_data['dropoff_longitude']
    e_lat = row_data['dropoff_latitude']
    return geodesic((s_lon, s_lat), (e_lon, e_lat)).miles

# clean_data['dist'] = np.vectorize(dist)(clean_data['pickup_longitude'], clean_data['pickup_latitude'], clean_data['dropoff_longitude'], clean_data['dropoff_latitude'])
# https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude/19412565

def insert_est_or_edt_time(df):
    df.insert(df.shape[1],'new_york_time',
              df.pickup_datetime.dt.tz_localize('utc').\
              dt.tz_convert('America/New_York'))
    
def insert_year(df):
    df.insert(df.shape[1], 'year', df.new_york_time.dt.year)

def insert_weekday(df):
    df.insert(df.shape[1], 'weekday', df.new_york_time.dt.weekday)

def insert_time_of_day(df):
    df.insert(df.shape[1], 'hour', df.new_york_time.dt.hour + df.new_york_time.dt.minute/60)

# def insert_dist(df, func = dist):
#     euclidean_dist = df.apply(func, axis = 1)
#     df.insert(df.shape[1], 'dist', euclidean_dist)
    
def insert_feature(df, func = None, feature_name = None):
    if not func or not feature_name:
        raise InputError

    new_data = df.apply(func, axis = 1)

    df.insert(df.shape[1], feature_name, new_data)

# LinearRegression

In [None]:
def simple_linear_regression(df):
    train, test = train_test_split(df, test_size=0.3)
    train_x = list(map(lambda x: [x], list(train.dist)))
    train_y = list(map(lambda x: [x], list(train.fare_amount)))
    test_x = list(map(lambda x: [x], list(test.dist)))
    test_y = list(map(lambda x: [x], list(test.fare_amount)))

    regr = linear_model.LinearRegression()
    regr.fit(train_x, train_y)
    fare_predict = regr.predict(test_x)

    # The coefficients
    print('Coefficients: \n', regr.coef_)
    # The mean squared error
    print("Mean squared error: %.2f"
          % mean_squared_error(test_y, fare_predict))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % r2_score(test_y, fare_predict))

    # Plot outputs
    plt.scatter(test_x, test_y,  color='blue')
    plt.plot(test_x, fare_predict, color='red', linewidth=3)

    plt.xticks(())
    plt.yticks(())

    plt.show()

# Adjust Fare Amout By Rules (http://www.nyc.gov/html/tlc/html/passenger/taxicab_rate.shtml)
```
Metered Fare Information
Onscreen rate is ‘Rate #01 – Standard City Rate.’
The initial charge is $2.50.
Plus 50 cents per 1/5 mile or 50 cents per 60 seconds in slow traffic or when the vehicle is stopped.
In moving traffic on Manhattan streets, the meter should “click” approximately every four downtown blocks, or one block going cross-town (East-West).
There is a 50-cent MTA State Surcharge for all trips that end in New York City or Nassau, Suffolk, Westchester, Rockland, Dutchess, Orange or Putnam Counties.
There is a 30-cent Improvement Surcharge. 
There is a daily 50-cent surcharge from 8pm to 6am.
There is a $1 surcharge from 4pm to 8pm on weekdays, excluding holidays.
Passengers must pay all bridge and tunnel tolls.
Your receipt will show your total fare including tolls. Please take your receipt.
The driver is not required to accept bills over $20.
Please tip your driver for safety and good service.
There are no charges for extra passengers or bags.

To/From JFK and any location in Manhattan:
Onscreen rate is ‘Rate #02 – JFK Airport.’
This is a flat fare of $52 plus tolls, the 50-cent MTA State Surcharge, the 30-cent Improvement Surcharge, and $4.50 rush hour surcharge (4 PM to 8 PM weekdays, excluding legal holidays).
Passenger is responsible for paying all tolls.
Please tip your driver for safety and good service.
```

In [9]:
def adjust_fare(row_data):
    fare = row_data['fare_amount']
    
    if to_jfk(row_data): # should consider from_jfk too 
        # flat fare of $52
        fare -= 52
        # MTA State Surcharge
        fare -= 0.5
        # 30-cent Improvement Surcharge,
        fare -= 0.3
        
        if is_rush(row_data):
            fare -= 4.5
    else:
        pass

# Q1 Take a look at the training data ...

## There may be anomalies in the data that you may need to factor in before you start on the other tasks. Clean the data first to handle these issues. Explain what you did to clean the data (in bulleted form).

# Data Cleaning

# Data Cleaning and Visualization
---

## Remove trivial anomaly as follows:
* data['fare_amount'] <= 2.5
* data['longitude'] >= -69.445473 or data['longitude'] <= -86.580444
* data['latitude'] <= 35.517490 or data['latitude'] >= 45.253784
* pickup location == dropoff location
* data['passenger_count'] < 0 or data['passenger_count'] > 6

>[How many people can fit into a yellow taxicab?](#http://www.nyc.gov/html/tlc/html/faq/faq_pass.shtml)

> From Driver Rule 54-15(g) (in PDF):
The maximum amount of passengers allowed in a yellow taxicab by law is four (4) in a four (4) passenger taxicab or five (5) passengers in a five (5) passenger taxicab, except that an additional passenger must be accepted if such passenger is under the age of seven (7) and is held on the lap of an adult passenger seated in the rear.

In [None]:
format_rule = '{0:54}'

print(format_rule.format('Size before data cleaning:'), basic_filter_df.shape[0])
basic_filter_df = basic_filter_df.dropna(how = 'any', axis = 'rows')
print(format_rule.format('Size after removing N/A row data:'), basic_filter_df.shape[0])

# remove invalid fare
basic_filter_df = basic_filter_df.loc[valid_fare(basic_filter_df)]
print(format_rule.format('Size after removing invalid fare:'), basic_filter_df.shape[0])

# remove invalid transportation region
basic_filter_df = basic_filter_df.loc[valid_transportation_region(basic_filter_df)]
print(format_rule.format('Size after removing invalid longitude and latitude:'), basic_filter_df.shape[0])

# remove invalid pickup and dropoff location
basic_filter_df = basic_filter_df.loc[valid_travel_location(basic_filter_df)]
print(format_rule.format('Size after removing invalid travel location:'), basic_filter_df.shape[0])

# remove invalid passenger count
basic_filter_df = basic_filter_df.loc[valid_passenger_count(basic_filter_df)]
print(format_rule.format('Size after removing invalid passenger count:'), basic_filter_df.shape[0])

basic_filter_df.describe()

Size before data cleaning:                             9000000
Size after removing N/A row data:                      8999943
Size after removing invalid fare:                      8962440
Size after removing invalid longitude and latitude:    8780330
Size after removing invalid travel location:           8777255
Size after removing invalid passenger count:           8746356


Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,8746356.0,8746356.0,8746356.0,8746356.0,8746356.0,8746356.0
mean,11.35814,-73.97553,40.75083,-73.97464,40.75118,1.691232
std,9.694958,0.0432581,0.03297281,0.04198436,0.03603798,1.30633
min,2.82,-85.52603,35.79768,-85.96667,35.82801,1.0
25%,6.0,-73.99229,40.73655,-73.99159,40.73559,1.0
50%,8.5,-73.98211,40.75335,-73.98062,40.75386,1.0
75%,12.5,-73.96838,40.76753,-73.96543,40.7684,2.0
max,952.0,-68.95357,45.67449,-68.89554,45.67527,6.0


# Data Preprocessing

In [None]:
insert_year(basic_filter_df)
insert_weekday(basic_filter_df)
insert_time_of_day(basic_filter_df)
insert_feature(basic_filter_df, feature_name = 'dist', func = dist)

basic_filter_df.head()

## Pearson Correlation Coefficients, 
## and Visualization of the Relation between the Variables

In [None]:
# remove anomaly by using advanced filter
advanced_filter_df = base_filter_df[valid_fare_amount_judge_by_travel_distance(base_filter_df)]

# advanced_filter_df = removeOutliersTukey(advanced_filter_df,'pickup_longitude')
# advanced_filter_df = removeOutliersTukey(advanced_filter_df,'pickup_latitude')
# advanced_filter_df = removeOutliersTukey(advanced_filter_df,'dropoff_longitude')
# advanced_filter_df = removeOutliersTukey(advanced_filter_df,'dropoff_latitude')
# print('Data after 2nd cleaning:')
# advanced_filter_df.describe()

compare_two_df(base_filter_df, 'Applied basic filter', advanced_filter_df, 'Applied advanced filter')

In [None]:
basic_pearson_corr_result(advanced_filter_df, 'Applied advanced filter')
basic_visualization_result(advanced_filter_df)

# Simple Linear Regression Model

# Prediction Baseline

In [None]:
# test_df_dist = test_df.apply(dist, axis=1)
# test_df.insert(test_df.shape[1], 'dist', test_df_dist)

In [None]:
# test_X = list(map(lambda x: [x], list(test_df.dist)))

In [None]:
# fare_predict = regr.predict(test_X).round(decimals = 2)

# Create submission file

In [None]:
# submission = pd.DataFrame({'key': test_df.key,\
#                            'fare_amount': fare_predict.ravel()},\
#                           columns = ['key', 'fare_amount'])
# submission.to_csv('submission.csv', index = False)


In [None]:
# submission.shape

# Extra Features 
## By year
## By weekday
## By time_of_day (hour)

## Extra Features - Adjusted_Fare_Amount_By_Rules

In [None]:
base_filter_df.reset_index().to_feather('./all/basic_filter-' + str(ROWS))
advanced_filter_df.reset_index().to_feather('./all/advanced_filter-' + str(ROWS))

In [None]:
headers = list(advanced_filter_df.columns.values)

for header in headers:
    print(header)

In [None]:
headers = zip(list(advanced_filter_df.columns.values), list(base_filter_df.columns.values))

In [None]:
for h1, h2 in headers:
    print('{:30}, {:30}'.format(h1, h2))

In [None]:
# years_df = [None for _ in range(min(clean_data.year), max(clean_data.year) + 1)]
# minimum_year = min(clean_data.year)

# years = max(clean_data.year) + 1 - minimum_year
# for year in range(years):
#     years_df[year] = clean_data[clean_data.year == minimum_year+year]
      
#     train, test = train_test_split(years_df[year], test_size=0.2)
#     train_x = list(map(lambda x: [x], list(train.dist)))
#     train_y = list(map(lambda x: [x], list(train.fare_amount)))
#     test_x = list(map(lambda x: [x], list(test.dist)))
#     test_y = list(map(lambda x: [x], list(test.fare_amount)))
#     regr.fit(train_x, train_y)
#     fare_predict = regr.predict(test_x)

#     print(years_df[year]['dist'].corr(years_df[year]['fare_amount']))
#     # The coefficients
#     print('Year: {}'.format(minimum_year + year), 'Coeff: ', regr.coef_, \
#           "M.S.E: %.2f" % (mean_squared_error(test_y, fare_predict)), \
#           'Variance score: %.2f' % r2_score(test_y, fare_predict))
#     print('\n')

    # Plot outputs
#     plt.scatter(test_x, test_y,  color='blue')
#     plt.plot(test_x, fare_predict, color='red', linewidth=3)

#     plt.xticks(())
#     plt.yticks(())

    
# plt.show()

# Night Surcharge

In [None]:
# def night_surcharge(row_data, night = True):
#     if night:
#         return (row_data['hour'] >= 20) | (row_data['hour'] <= 6)
#     else:
#         return (row_data['hour'] < 20) & (row_data['hour'] > 6)
    
# def weekday_filter(row_data):
#     return row_data.weekday < 5 


In [None]:
# for year in range(years):
#     night = None
#     night = years_df[year][night_surcharge(years_df[year], True)]
#     train, test = train_test_split(night, test_size=0.2)
#     train_x = list(map(lambda x: [x], list(train.dist)))
#     train_y = list(map(lambda x: [x], list(train.fare_amount)))
#     test_x = list(map(lambda x: [x], list(test.dist)))
#     test_y = list(map(lambda x: [x], list(test.fare_amount)))
#     regr.fit(train_x, train_y)
#     fare_predict = regr.predict(test_x)

#     print(night['dist'].corr(night['fare_amount']))
#     # The coefficients
#     print('Year: {}'.format(minimum_year + year), 'Coeff: ', regr.coef_, \
#           "M.S.E: %.2f" % (mean_squared_error(test_y, fare_predict)), \
#           'Variance score: %.2f' % r2_score(test_y, fare_predict))
#     print('\n')

    # Plot outputs
#     plt.scatter(test_x, test_y,  color='blue')
#     plt.plot(test_x, fare_predict, color='red', linewidth=3)

#     plt.xticks(())
#     plt.yticks(())

    
# plt.show()

In [None]:
# for year in range(years):
#     flag = [True, False]
#     weekday = None
#     weekday = years_df[year][weekday_filter(years_df[year])]
#     train, test = train_test_split(weekday, test_size=0.2)
#     train_x = list(map(lambda x: [x], list(train.dist)))
#     train_y = list(map(lambda x: [x], list(train.fare_amount)))
#     test_x = list(map(lambda x: [x], list(test.dist)))
#     test_y = list(map(lambda x: [x], list(test.fare_amount)))
#     regr.fit(train_x, train_y)
#     fare_predict = regr.predict(test_x)

#     print(weekday['dist'].corr(weekday['fare_amount']))
#     # The coefficients
#     print('Year: {}'.format(minimum_year + year), 'Coeff: ', regr.coef_, \
#           "M.S.E: %.2f" % (mean_squared_error(test_y, fare_predict)), \
#           'Variance score: %.2f' % r2_score(test_y, fare_predict))
#     print('\n')

#     # Plot outputs
#     plt.scatter(test_x, test_y,  color='blue')
#     plt.plot(test_x, fare_predict, color='red', linewidth=3)

#     plt.xticks(())
#     plt.yticks(())

    
#     plt.show()