In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

from subprocess import check_output

In [2]:
train = pd.read_csv("../input/train.zip")

In [3]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
trip_duration_min = \
train.groupby('store_and_fwd_flag')['trip_duration'].min()["Y"]
trip_duration_max = \
train.groupby('store_and_fwd_flag')['trip_duration'].max()["Y"]

In [5]:
def drop_duration(df):
    df.loc[df.trip_duration < trip_duration_min,'trip_duration'] = np.nan
    value=df.trip_duration.min()
    df.trip_duration.fillna(value=value, inplace=True) 

    df.loc[df.trip_duration > trip_duration_max,'trip_duration'] = np.nan
    value=df.trip_duration.max()
    df.trip_duration.fillna(value=value, inplace=True)
    return

In [6]:
drop_duration(train)

In [7]:
def conf_int_duration(df):
    
    conf_int_duration = np.percentile(df.trip_duration, [2.5,97.5])    
    df.loc[df.trip_duration < conf_int_duration[0],'trip_duration'] = np.nan
    value=df.trip_duration.min()
    df.trip_duration.fillna(value=value, inplace=True) 
    
    df.loc[df.trip_duration > conf_int_duration[1],'trip_duration'] = np.nan
    value=df.trip_duration.max()
    df.trip_duration.fillna(value=value, inplace=True)
        
    return

In [8]:
conf_int_duration(train)

In [9]:
test = pd.read_csv("../input/test.zip")

In [10]:
result = pd.concat([train, test])

In [11]:
pickup_long_min = \
result.groupby('store_and_fwd_flag')['pickup_longitude'].min()["Y"]
pickup_long_max = \
result.groupby('store_and_fwd_flag')['pickup_longitude'].max()["Y"]

dropoff_long_min = \
result.groupby('store_and_fwd_flag')['dropoff_longitude'].min()["Y"]
dropoff_long_max = \
result.groupby('store_and_fwd_flag')['dropoff_longitude'].max()["Y"]

pickup_lat_min = \
result.groupby('store_and_fwd_flag')['pickup_latitude'].min()["Y"]
pickup_lat_max = \
result.groupby('store_and_fwd_flag')['pickup_latitude'].max()["Y"]

dropoff_lat_min = \
result.groupby('store_and_fwd_flag')['dropoff_latitude'].min()["Y"]
dropoff_lat_max = \
result.groupby('store_and_fwd_flag')['dropoff_latitude'].max()["Y"]

In [12]:
result['pickup_longitude'] = result.pickup_longitude.round(5)
result['pickup_latitude'] = result.pickup_latitude.round(5)


result.loc[result.pickup_latitude <  pickup_lat_min, 'pickup_latitude'] = np.nan
value=result.pickup_latitude.min()
result.pickup_latitude.fillna(value=value, inplace=True)

result.loc[result.pickup_latitude > pickup_lat_max, 'pickup_latitude'] = np.nan
value=result.pickup_latitude.max()
result.pickup_latitude.fillna(value=value, inplace=True)

result.loc[result.pickup_longitude < pickup_long_min, 'pickup_longitude'] = np.nan
value=result.pickup_longitude.min()
result.pickup_longitude.fillna(value=value, inplace=True)

result.loc[result.pickup_longitude > pickup_long_max, 'pickup_longitude'] = np.nan
value=result.pickup_longitude.max()
result.pickup_longitude.fillna(value=value, inplace=True)


result['dropoff_longitude'] = result.dropoff_longitude.round(5)
result['dropoff_latitude'] = result.dropoff_latitude.round(5)


result.loc[result.dropoff_latitude < dropoff_lat_min, 'dropoff_latitude'] = np.nan
value=result.dropoff_latitude.min()
result.dropoff_latitude.fillna(value=value, inplace=True)

result.loc[result.dropoff_latitude > dropoff_lat_max, 'dropoff_latitude'] = np.nan
value=result.dropoff_latitude.max()
result.dropoff_latitude.fillna(value=value, inplace=True)

result.loc[result.dropoff_longitude < dropoff_long_min, 'dropoff_longitude'] = np.nan
value=result.dropoff_longitude.min()
result.dropoff_longitude.fillna(value=value, inplace=True)

result.loc[result.dropoff_longitude > dropoff_long_max, 'dropoff_longitude'] = np.nan
value=result.dropoff_longitude.max()
result.dropoff_longitude.fillna(value=value, inplace=True)

In [13]:
AVG_EARTH_RADIUS = 6371 
def haversine(df, miles=True):
    """ Get the distance of routes by  the haversinus formula"""
    lat1, lng1, lat2, lng2 = (df.pickup_latitude[:], 
                              df.pickup_longitude[:], 
                              df.dropoff_latitude[:], 
                              df.dropoff_longitude[:])

    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))

    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat*0.5)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng*0.5)**2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    if miles:
        df['trip_distance'] = h * 0.621371  
        df['trip_distance'] = df.trip_distance.round(2)
        print(df.trip_distance.describe())
        return 
    else:
        df['trip_distance'] = h  
        df['trip_distance'] = df.trip_distance.round(2)
        print(df.trip_distance.describe())
        return

In [14]:
haversine(result, miles=True) 

count    2.083778e+06
mean     2.134070e+00
std      2.443593e+00
min      0.000000e+00
25%      7.700000e-01
50%      1.300000e+00
75%      2.410000e+00
max      5.067000e+01
Name: trip_distance, dtype: float64


In [15]:
def arrays_bearing(df):

    lats1, lngs1, lats2, lngs2 = (df['pickup_latitude'][:], 
                                  df['pickup_longitude'][:], 
                                  df['dropoff_latitude'][:], 
                                  df['dropoff_longitude'][:])
    lats1_rads = np.radians(lats1)
    lats2_rads = np.radians(lats2)
    lngs_delta_rads = np.radians(lngs2 - lngs1)
    
    y = np.sin(lngs_delta_rads) * np.cos(lats2_rads)
    x = np.cos(lats1_rads) * np.sin(lats2_rads) -                      \
    np.sin(lats1_rads) * np.cos(lats2_rads) * np.cos(lngs_delta_rads)
    df['bearing'] = np.degrees(np.arctan2(y, x))
    df['bearing'] = df.bearing.round(0)
    return

In [16]:
arrays_bearing(result)

In [17]:
result.drop('dropoff_datetime', axis=1, inplace=True)

In [18]:
result['pickup_datetime'] = pd.to_datetime(result.pickup_datetime)

In [19]:
result['days_in_month'] = result['pickup_datetime'][:].dt.days_in_month

In [20]:
result['weekday'] = result['pickup_datetime'].dt.weekday

In [21]:
result['hour'] = result['pickup_datetime'][:].dt.hour

In [22]:
result['minute'] = result['pickup_datetime'][:].dt.minute

In [23]:
result['month'] = result['pickup_datetime'][:].dt.month

In [24]:
result.drop(['pickup_datetime', 'store_and_fwd_flag', 'id'], axis=1, inplace=True)

In [25]:
test= result[result.trip_duration.isnull()]

In [26]:
test.drop('trip_duration', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [27]:
train = result[result.trip_duration.notnull()]

In [28]:
train.trip_duration = (train['trip_duration']+1).apply(np.log)
trip_duration = train.trip_duration.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [29]:
train.drop('trip_duration', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
print(test.shape, train.shape, trip_duration.shape)

((625134, 13), (1458644, 13), (1458644,))


In [31]:
X = train.values

In [32]:
y = trip_duration

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [35]:
X_train,X_val,y_train,y_val = \
train_test_split(X_train,y_train, random_state=41)

In [36]:
import xgboost as xgb

In [37]:
train_xgb  = xgb.DMatrix(X_train, label=y_train)
cv_xgb  = xgb.DMatrix(X_val , label=y_val)
evallist = [(train_xgb, 'train'), (cv_xgb, 'valid')]

In [40]:
param = {'eta': .5,
         'max_depth':6,
         'objective':'reg:linear',
         'subsample':1,
         'lambda '  :1,
         'colsample_bytree ':1,
         'tree_method': "gpu_hist"}  

model = xgb.train(param, train_xgb, num_boost_round=500, evals = evallist, maximize=False, 
                  verbose_eval=False)


In [41]:
X_real_test = test.values

In [42]:
test_xgb = xgb.DMatrix(X_real_test)
y_pred = model.predict(test_xgb)

In [43]:
y_pred[:10]

array([6.758745 , 6.352933 , 6.049654 , 6.7883134, 6.062416 , 6.789539 ,
       7.0310283, 6.4393992, 7.789947 , 6.3780966], dtype=float32)

In [44]:
y_pred = np.exp(y_pred[:]) - 1

In [45]:
print(y_pred[:10])

[ 860.5604   573.17426  422.9663   886.4156   428.41168  887.5037
 1130.1932   625.0306  2415.1897   587.8059 ]


In [47]:
submission = pd.read_csv('../input/sample_submission.csv', index_col=0, header=0)

In [110]:
submission.shape

(625134, 1)

In [111]:
y_pred.shape

(625134,)

In [112]:
submission.trip_duration = y_pred
submission.head(10)

Unnamed: 0_level_0,trip_duration
id,Unnamed: 1_level_1
id3004672,860.0
id3505355,710.0
id1217141,483.0
id2150126,1044.0
id1598245,368.0
id0668992,1076.0
id1765014,1102.0
id0898117,759.0
id3905224,2182.0
id1543102,600.0


In [114]:
submission.to_csv('submission.csv')