This hackathon addresses the challenge taxi operators face in quoting the right fare to customers before a trip starts. Although the trip details are shared with the taxi drivers or operators, they find it difficult to quote the right fare because of uncertainties and calculation complexities. 

Passengers face the same issue, because the fares quoted to them are often inaccurate or irrelevant. To help solve this, the hackathon provides participants with a historical dataset containing taxi trip details and the fares charged for those trips. 

**Using this dataset, the participants need to build machine learning models that predict the trip fare from the other useful features given for each trip.**

* Distance
* Time of Travel
* Duration of Travel
* Tolls
* Location
* Type of Car
* Price
* Day or Night
* Demand and Supply
* Rating of the Driver
* Payment Type (may or may not be predictive)
* Working Day or Weekend
* Cab Sharing/Pooled Car
* Tip (expected to have an impact)


In [22]:
# Import the Libraries and the Data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [23]:
# Load the training and test sets from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Stack train on top of test with a fresh 0..n-1 index, so that the first
# train.shape[0] rows of `combined` are the training rows.
combined = pd.concat(objs=[train, test], ignore_index=True)

In [None]:
# (rows, columns) of the train, test and stacked frames.
train.shape, test.shape, combined.shape

### Univariate Analysis

In [None]:
# List of the Numerical Vars...

#combined.select_dtypes(include = np.number).columns

In [None]:
# Numerical columns whose distributions are inspected (the last one is the target).
num_cols = ['trip_distance', 'fare_amount', 'extra',
            'mta_tax', 'tip_amount', 'tolls_amount', 'imp_surcharge',
            'total_amount', 'trip_duration', 'calculated_total_amount']

# Subplot grid: 5 x 2 = one panel per entry of num_cols.
nrows = 5
ncols = 2

plt.figure(figsize=(10, 7))
for position, column in enumerate(num_cols, start=1):
    plt.subplot(nrows, ncols, position)
    # sns.distplot is deprecated (and removed in newer seaborn releases);
    # histplot with a KDE overlay on a density scale is its documented
    # replacement. bins=50 bounds the number of bins for long-tailed columns.
    sns.histplot(x=combined[column], kde=True, stat="density", bins=50)

plt.tight_layout()
plt.show()

In [None]:
# Categorical / discrete columns inspected with count plots.
cat_cols = ['store_and_fwd_flag', 'rate_code',
            'payment_type', 'pickup_location_id',
            'dropoff_location_id', 'year',
            'month', 'day', 'day_of_week', 'hour_of_day']

# Subplot grid: 5 x 2 = one panel per entry of cat_cols.
nrows = 5
ncols = 2

plt.figure(figsize=(10, 7))
for position, column in enumerate(cat_cols, start=1):
    plt.subplot(nrows, ncols, position)
    # Pass the data by keyword: recent seaborn versions no longer accept
    # the plotted vector as a positional `x`.
    sns.countplot(x=combined[column])

plt.tight_layout()
plt.show()

### Bivariate Analysis

* Numerical vs target: scatterplot
* Categorical vs target: boxplot

In [None]:
# Numerical variables vs the target: one scatter panel per column of num_cols.
nrows = 5
ncols = 2

plt.figure(figsize=(10, 7))
for position, column in enumerate(num_cols, start=1):
    plt.subplot(nrows, ncols, position)
    # x/y must be keywords: seaborn >= 0.12 accepts at most one positional
    # argument (`data`), so two positional vectors raise a TypeError there.
    sns.scatterplot(x=combined[column],
                    y=combined["calculated_total_amount"],
                    color="hotpink")

plt.tight_layout()
plt.show()

In [None]:
# Fare amount against the target, coloured by rate code.
sns.scatterplot(
    data=combined,
    x="fare_amount",
    y="calculated_total_amount",
    hue="rate_code",
)
plt.show()

In [None]:
# Categorical variables vs the target: one box-plot panel per column of cat_cols.
nrows = 5
ncols = 2

plt.figure(figsize=(10, 7))
for position, column in enumerate(cat_cols, start=1):
    plt.subplot(nrows, ncols, position)
    # x/y must be keywords: seaborn >= 0.12 accepts at most one positional
    # argument (`data`), so two positional vectors raise a TypeError there.
    sns.boxplot(x=combined[column],
                y=combined["calculated_total_amount"])

plt.tight_layout()
plt.show()

In [None]:
# Missing values: per-column null counts, showing only columns that have any.
null_counts = combined.isnull().sum()
null_counts[null_counts != 0]

In [None]:
# Box plot of the target. The vector is passed as `x=` because recent seaborn
# versions interpret a lone positional argument as `data`, not as `x`.
sns.boxplot(x=combined.calculated_total_amount)

### Feature Engineering

In [None]:
# Preview the stacked frame before engineering features.
combined.head()

In [None]:
# Rescale trip_duration by 1/3600 (seconds -> hours, per the cell title).
# NOTE: this overwrites the column, so re-running the cell divides again.
combined["trip_duration"] = combined["trip_duration"].div(3600)

In [None]:
# The five most frequent pickup location ids.
combined["pickup_location_id"].value_counts().index[:5]

In [None]:
# One-column frame "Loc" holding "<pickup id>_<dropoff id>" for every row.
localdf = (
    combined["pickup_location_id"].astype(str)
    + "_"
    + combined["dropoff_location_id"].astype(str)
).to_frame(name="Loc")

In [None]:
# Route key "<pickup id>_<dropoff id>", stored directly as a column.
pickup_as_text = combined["pickup_location_id"].astype(str)
dropoff_as_text = combined["dropoff_location_id"].astype(str)
combined["pick_&_drop"] = pickup_as_text + "_" + dropoff_as_text

In [None]:
# Summary statistics of the target per pickup/dropoff pair.
# `localdf` only has the "Loc" column, so the target is attached as "tgt"
# (aligned on the shared index) before grouping; without it the lookup of
# "tgt" raises a KeyError.
localdf.assign(tgt=combined["calculated_total_amount"])\
    .groupby("Loc")["tgt"].describe()

In [None]:
# Per-route ("pick_&_drop") aggregates of the target, broadcast back to
# every row of that route: count, mean, median, min and max.
route_target = combined.groupby("pick_&_drop")["calculated_total_amount"]

for feature_name, statistic in (
    ("magic1", "count"),
    ("magic2", "mean"),
    ("magic3", "median"),
    ("magic4", "min"),
    ("magic5", "max"),
):
    combined[feature_name] = route_target.transform(statistic)

In [None]:
# Show up to 50 columns when displaying frames, then preview the result.
pd.options.display.max_columns = 50
combined.head()

In [None]:
# Hour of the day: horizontal box plot of the target for each hour.
sns.boxplot(
    data=combined,
    y="hour_of_day",
    x="calculated_total_amount",
    orient="h",
)

# Busy Hours: 5,7,10,11,13,14,15,19,20,21,23

In [None]:
# Busy Hour...
# Hours labelled as peak hours. The collection has its own name because a
# later cell defines a function called `busy`, which rebinds the module-level
# name `busy`; reading the hours from `busy` would then make imbusy() fail.
# NOTE(review): 0 is included here although the note under the hour-of-day
# box plot lists only 5,7,10,11,13,14,15,19,20,21,23 — confirm.
busy_hours = (0, 5, 7, 10, 11, 13, 14, 15, 19, 20, 21, 23)


def imbusy(x):
    """Classify an hour of the day.

    Parameters
    ----------
    x : hour value to classify.

    Returns
    -------
    str
        "Peak_Hour" if `x` is one of `busy_hours`, otherwise "Ease_Hour".
    """
    return "Peak_Hour" if x in busy_hours else "Ease_Hour"

In [None]:
# magic6: peak/ease label for each row's hour of the day.
combined["magic6"] = combined["hour_of_day"].map(imbusy)

In [None]:
# Target distribution for peak vs ease hours. x/y are passed by keyword:
# seaborn >= 0.12 rejects two positional data vectors with a TypeError.
sns.boxplot(x=combined.magic6, y=combined.calculated_total_amount)

In [None]:
# Taxi zone lookup table.
# NOTE(review): absolute path on one specific machine — the notebook cannot
# run elsewhere until this points at a shared/relative location.
zones = pd.read_csv("C:/Users/IT/Downloads/taxi+_zone_lookup.csv")

In [None]:
# Show the first row of the lookup table to see its columns.
zones.head(1)

In [None]:
# Distinct pickup location ids present in the data.
pickup_zones = combined["pickup_location_id"].unique()

In [None]:
# For each row of the lookup table, print its zone name when the LocationID
# occurs among the pickup ids, otherwise "No Zone".
# The original printed the entire `zones.Zone` column on every match instead
# of the zone belonging to the current LocationID.
known_pickups = set(pickup_zones)  # built once: O(1) membership per row
for location_id, zone_name in zip(zones.LocationID, zones.Zone):
    if location_id in known_pickups:
        print(zone_name)
    else:
        print("No Zone")

In [None]:
# Preview the frame with the features added so far.
combined.head()

In [None]:
# Horizontal box plot of the target for each day value.
sns.boxplot(
    data=combined,
    y="day",
    x="calculated_total_amount",
    orient="h",
)
plt.show()

In [None]:
# Day values labelled as busy days.
busy_days = [4, 5, 6, 8, 15, 16, 22, 24, 27]


def busy(x):
    """Return "Busy_Days" when `x` is listed in `busy_days`, else "Lean_Days"."""
    return "Busy_Days" if x in busy_days else "Lean_Days"

In [None]:
# magic7: busy/lean label for each row's day value.
combined["magic7"] = combined["day"].map(busy)

In [None]:
# Target distribution for busy vs lean days. x/y are passed by keyword:
# seaborn >= 0.12 rejects two positional data vectors with a TypeError.
sns.boxplot(x=combined.magic7, y=combined.calculated_total_amount)

In [None]:
# Day-wise fare aggregates: count, mean, median, min and max of the target
# per `day`, broadcast back to every row of that day.
fare_by_day = combined.groupby("day")["calculated_total_amount"]

# Not the last expression of the cell, so this summary is computed but not shown.
fare_by_day.describe()

for feature_name, statistic in (
    ("magic8", "count"),
    ("magic9", "mean"),
    ("magic10", "median"),
    ("magic11", "min"),
    ("magic12", "max"),
):
    combined[feature_name] = fare_by_day.transform(statistic)

In [None]:
# Month-wise fare aggregates: count, mean, median, min and max of the target
# per `month`, broadcast back to every row of that month.
fare_by_month = combined.groupby("month")["calculated_total_amount"]

# Not the last expression of the cell, so this summary is computed but not shown.
fare_by_month.describe()

for feature_name, statistic in (
    ("magic13", "count"),
    ("magic14", "mean"),
    ("magic15", "median"),
    ("magic16", "min"),
    ("magic17", "max"),
):
    combined[feature_name] = fare_by_month.transform(statistic)

In [None]:
# Remove the raw location/date columns and the route key from `combined`, in place.
# NOTE(review): later cells still group by 'pick_&_drop' on the split frames
# and drop these same column names again; if this cell runs first, those
# cells raise KeyError. Confirm the intended execution order (this cell
# looks redundant with the later drop).
combined.drop(['pickup_location_id', 'dropoff_location_id',
              'year', 'month', 'day', 
              'hour_of_day', 'pick_&_drop'], axis = 1, 
             inplace = True)

In [None]:
# magic18: distance per unit of duration (element-wise ratio of the two columns).
# NOTE(review): rows with trip_duration == 0 produce inf/NaN here — confirm
# they are handled before modelling.
combined["magic18"] = combined["trip_distance"].div(combined["trip_duration"])

In [None]:
# Split the data back into train and test. `combined` was built with a fresh
# 0..n-1 index, so labels 0 .. len(train)-1 are the training rows and the
# remaining labels are the test rows.
# .copy() makes both frames independent of `combined`: later cells add and
# drop columns on them in place, which on a bare slice is a chained
# assignment (SettingWithCopyWarning, result not guaranteed).
n_train_rows = train.shape[0]
newtrain = combined.loc[0:n_train_rows - 1, ].copy()
newtest = combined.loc[n_train_rows:, ].copy()

In [None]:
# The test frame must not carry the target column.
newtest.drop(columns=["calculated_total_amount"], inplace=True)

In [None]:
# Apply magic 1 to 5 again, this time from the training rows only:
# per-route count, mean, median, min and max of the target.
train_route_target = newtrain.groupby("pick_&_drop")["calculated_total_amount"]

# Not the last expression of the cell, so this summary is computed but not shown.
train_route_target.describe()

for feature_name, statistic in (
    ("magic1", "count"),
    ("magic2", "mean"),
    ("magic3", "median"),
    ("magic4", "min"),
    ("magic5", "max"),
):
    newtrain[feature_name] = train_route_target.transform(statistic)

In [None]:
# Route key -> median training target for that route, rounded to 2 decimals.
mapped = (
    newtrain.groupby("pick_&_drop")["calculated_total_amount"]
    .median()
    .round(2)
    .to_dict()
)

In [None]:
# Look up each test row's route in `mapped`; routes absent from the dict
# become NaN (Series.map with a dict).
newtest["magic3"] = newtest["pick_&_drop"].map(mapped)

In [None]:
# List the columns currently present in the training frame.
newtrain.columns

In [None]:
# Drop the raw id/date columns, the route key and the route-level magic
# features from both frames.
# The same list is used for train and test: the original test list left
# 'magic4' and 'magic5' in newtest while removing them from newtrain, so the
# two frames ended up with different feature columns before prediction.
helper_columns = ['pickup_location_id',
                  'dropoff_location_id', 'year',
                  'month', 'day', 'day_of_week',
                  'hour_of_day', 'magic1', 'magic2',
                  'magic3', 'magic4', 'magic5', 'pick_&_drop']

newtrain.drop(helper_columns, axis=1, inplace=True)
newtest.drop(helper_columns, axis=1, inplace=True)

#### Statistical Testing

In [None]:
# Numerical features used in the test loop of the next cell
# (rebinds the `num_cols` list that was defined for the plots).
num_cols = ['trip_distance','fare_amount','extra',
            'mta_tax','tip_amount','tolls_amount',
            'imp_surcharge','total_amount','trip_duration',
            'magic8', 'magic9','magic10', 'magic11', 
            'magic12', 'magic13', 'magic14', 'magic15',
            'magic16', 'magic17', 'magic18']

In [None]:
import scipy.stats as stats

# Keep every numerical feature whose two-sample t-test against the target
# column yields p < 0.05.
# NOTE(review): ttest_ind compares the means of the two samples; it does not
# measure association between a feature and the target — confirm this is the
# intended screening criterion.
imp_feats = [
    column
    for column in num_cols
    if stats.ttest_ind(newtrain.loc[:, column],
                       newtrain.calculated_total_amount)[1] < 0.05
]

In [None]:
# One-way ANOVA of the target on payment_type (only payment_type is in the
# formula, although the original cell title also mentions rate code).

import statsmodels.formula.api as sfa
from statsmodels.stats.anova import anova_lm

model = sfa.ols(
    formula="calculated_total_amount~payment_type",
    data=newtrain,
).fit()

anova_lm(model)

In [None]:
# One-hot encode the non-numeric columns of both frames, dropping the first
# level of each.
# NOTE(review): the frames are encoded independently, so a level that is
# missing from one of them yields different columns — check the shapes below.
dummytrain, dummytest = (
    pd.get_dummies(frame, drop_first=True) for frame in (newtrain, newtest)
)

In [None]:
# Shapes of the encoded train and test frames.
dummytrain.shape, dummytest.shape

In [None]:
# Column names of the encoded training frame.
dummytrain.columns

In [None]:
# Remove total_amount, magic14 and magic9 from both encoded frames, in place.
for encoded_frame in (dummytrain, dummytest):
    encoded_frame.drop(columns=["total_amount", "magic14", "magic9"],
                       inplace=True)

### Model Building

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Shuffled 5-fold splitter with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Target vector and feature matrix taken from the encoded training frame.
y = dummytrain["calculated_total_amount"]
X = dummytrain.drop(columns=["calculated_total_amount"])

In [None]:
# Fit the gradient-boosting model on the training part of every fold and
# collect its predictions for the test frame (one array per fold).
# `lr` is instantiated but not used in this loop, and the held-out part of
# each fold is not evaluated.
lr = LinearRegression()
gbm = GradientBoostingRegressor()
pred = []
for fit_rows, _held_out_rows in kfold.split(X, y):
    gbm.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    pred.append(gbm.predict(dummytest))

In [None]:
# Average the per-fold predictions row by row.
# NOTE(review): the cell is titled "Linear Regression", but `pred` as filled
# by the fold loop holds gradient-boosting predictions — confirm.
final = pd.DataFrame(pred).mean(axis=0)


In [None]:
# GBM: mean of the per-fold test predictions, one value per test row.
fold_predictions = pd.DataFrame(pred)
final_gbm = fold_predictions.mean(axis=0)

In [None]:
# `submission` is not created anywhere in this notebook, so the assignment
# below raised NameError. Start from an empty frame unless one already
# exists (e.g. loaded from a sample-submission file in another cell).
if "submission" not in globals():
    submission = pd.DataFrame()
submission["calculated_total_amount"] = final_gbm

In [None]:
# Write the submission file without the index column.
submission.to_csv("GBM_Taxi.csv", index=False) # 19.18 — presumably the score obtained by this submission; TODO confirm

In [None]:
cd