In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge, SGDRegressor, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor,AdaBoostRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
 
from tpot import TPOTRegressor
    
    
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

In [2]:
import zipfile
sample_zip = zipfile.ZipFile('/kaggle/input/nyc-taxi-trip-duration/sample_submission.zip')
train_zip = zipfile.ZipFile('/kaggle/input/nyc-taxi-trip-duration/train.zip')
test_zip = zipfile.ZipFile('/kaggle/input/nyc-taxi-trip-duration/test.zip')

train_df = pd.read_csv(train_zip.open("train.csv"))
test_df = pd.read_csv(test_zip.open("test.csv"))
sample_df = pd.read_csv(sample_zip.open("sample_submission.csv"))

In [3]:
train = train_df.copy()
test = test_df.copy()

In [4]:
train_id = train[["id"]]
test_id = test[["id"]]

In [5]:
train=train.set_index("id")
test=test.set_index("id")

In [6]:
train["pickup_datetime"] = pd.to_datetime(train["pickup_datetime"])
test["pickup_datetime"] = pd.to_datetime(test["pickup_datetime"])

In [7]:
train["month"] = train["pickup_datetime"].dt.month
train["day"] = train["pickup_datetime"].dt.day
train["hour"] = train["pickup_datetime"].dt.hour
train["minute"] = train["pickup_datetime"].dt.minute
train["second"] = train["pickup_datetime"].dt.second

In [8]:
test["month"] = test["pickup_datetime"].dt.month
test["day"] = test["pickup_datetime"].dt.day
test["hour"] = test["pickup_datetime"].dt.hour
test["minute"] = test["pickup_datetime"].dt.minute
test["second"] = test["pickup_datetime"].dt.second

In [9]:
# encoder = LabelEncoder()
# train["store_and_fwd_flag"] = encoder.fit_transform(train["store_and_fwd_flag"])
# test["store_and_fwd_flag"] = encoder.transform(test["store_and_fwd_flag"])

In [10]:
import pandas as pd
import math

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points 
    on the earth (specified in decimal degrees)
    """
    # Convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])

    # Haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.asin(math.sqrt(a)) 

    # Radius of earth in kilometers is 6371
    km = 6371 * c
    return km

In [11]:
train['distance'] = train.apply(lambda row: haversine(row['pickup_longitude'],row['pickup_latitude'],
                                                      row['dropoff_longitude'],row['dropoff_latitude']),
                                axis=1)

In [12]:
test['distance'] = test.apply(lambda row: haversine(row['pickup_longitude'],row['pickup_latitude'],
                                                      row['dropoff_longitude'],row['dropoff_latitude']),
                                axis=1)

In [13]:
train = train.dropna()

In [14]:
train = train.drop(columns=["dropoff_datetime","pickup_datetime",
                            'pickup_longitude', 'pickup_latitude',
                            'dropoff_longitude', 'dropoff_latitude'])

In [15]:
test = test.drop(columns=["pickup_datetime",
                            'pickup_longitude', 'pickup_latitude',
                            'dropoff_longitude', 'dropoff_latitude'])

In [16]:
X = train.drop(columns="trip_duration")
y = train[["trip_duration"]]

In [17]:
encoder = LabelEncoder()

In [18]:
X["store_and_fwd_flag"] = encoder.fit_transform(X["store_and_fwd_flag"])
test["store_and_fwd_flag"] = encoder.transform(test["store_and_fwd_flag"])

In [19]:
scaler = StandardScaler()

In [20]:
column_X = X.columns

In [21]:
column_X


Index(['vendor_id', 'passenger_count', 'store_and_fwd_flag', 'month', 'day',
       'hour', 'minute', 'second', 'distance'],
      dtype='object')

In [22]:
X = pd.DataFrame(scaler.fit_transform(X), columns=column_X)

In [23]:
test = pd.DataFrame(scaler.transform(test), columns=column_X)

In [24]:
X

Unnamed: 0,vendor_id,passenger_count,store_and_fwd_flag,month,day,hour,minute,second,distance
0,0.932380,-0.505637,-0.074471,-0.307440,-0.172813,0.530263,-0.322670,1.473824,-0.452072
1,-1.072524,-0.505637,-0.074471,1.477173,-0.402616,-2.126116,0.774030,0.319080,-0.380622
2,0.932380,-0.505637,-0.074471,-1.497182,0.401692,-0.407283,0.312262,-0.316030,0.685258
3,0.932380,-0.505637,-0.074471,0.287431,-1.092023,0.842778,0.139099,0.088131,-0.455103
4,0.932380,-0.505637,-0.074471,-0.307440,1.206001,-0.094768,0.023657,1.473824,-0.524207
...,...,...,...,...,...,...,...,...,...
1458639,0.932380,1.777048,-0.074471,0.287431,-0.862220,-0.094768,0.081378,-1.470775,-0.515714
1458640,-1.072524,-0.505637,-0.074471,-1.497182,-0.632418,-1.032313,0.312262,-0.835665,0.607227
1458641,0.932380,-0.505637,-0.074471,0.287431,0.746396,-1.188570,1.582124,0.665503,1.020297
1458642,-1.072524,-0.505637,-0.074471,-1.497182,-1.206924,0.217747,1.524403,-0.200556,-0.546557


In [25]:
X = X.drop(columns=["vendor_id","store_and_fwd_flag"])

In [26]:
X

Unnamed: 0,passenger_count,month,day,hour,minute,second,distance
0,-0.505637,-0.307440,-0.172813,0.530263,-0.322670,1.473824,-0.452072
1,-0.505637,1.477173,-0.402616,-2.126116,0.774030,0.319080,-0.380622
2,-0.505637,-1.497182,0.401692,-0.407283,0.312262,-0.316030,0.685258
3,-0.505637,0.287431,-1.092023,0.842778,0.139099,0.088131,-0.455103
4,-0.505637,-0.307440,1.206001,-0.094768,0.023657,1.473824,-0.524207
...,...,...,...,...,...,...,...
1458639,1.777048,0.287431,-0.862220,-0.094768,0.081378,-1.470775,-0.515714
1458640,-0.505637,-1.497182,-0.632418,-1.032313,0.312262,-0.835665,0.607227
1458641,-0.505637,0.287431,0.746396,-1.188570,1.582124,0.665503,1.020297
1458642,-0.505637,-1.497182,-1.206924,0.217747,1.524403,-0.200556,-0.546557


In [27]:
test = test.drop(columns=["vendor_id","store_and_fwd_flag"])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
linear = LinearRegression()

linear.fit(X_train,y_train)
print(f"Training score {linear.score(X_train,y_train)}")
y_pred = linear.predict(X_test)
print(f"R2: {r2_score(y_test,y_pred)}")

Training score 0.007767402711363336
R2: 0.024904681556639052


In [30]:
cat_model = CatBoostRegressor()
cat_model.fit(X_train, y_train)

y_pred = cat_model.predict(X_test)
print(f"Training score {cat_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

Learning rate set to 0.12496
0:	learn: 5618.8276823	total: 170ms	remaining: 2m 49s
1:	learn: 5613.9313929	total: 276ms	remaining: 2m 17s
2:	learn: 5610.2169223	total: 377ms	remaining: 2m 5s
3:	learn: 5607.1308465	total: 495ms	remaining: 2m 3s
4:	learn: 5604.8228868	total: 586ms	remaining: 1m 56s
5:	learn: 5602.9007984	total: 683ms	remaining: 1m 53s
6:	learn: 5601.3367488	total: 776ms	remaining: 1m 50s
7:	learn: 5600.2360957	total: 867ms	remaining: 1m 47s
8:	learn: 5598.4370227	total: 954ms	remaining: 1m 45s
9:	learn: 5597.5793780	total: 1.05s	remaining: 1m 43s
10:	learn: 5596.8480932	total: 1.15s	remaining: 1m 43s
11:	learn: 5596.2702999	total: 1.25s	remaining: 1m 42s
12:	learn: 5595.7461867	total: 1.34s	remaining: 1m 41s
13:	learn: 5594.8351310	total: 1.43s	remaining: 1m 40s
14:	learn: 5584.9319788	total: 1.51s	remaining: 1m 39s
15:	learn: 5584.4141670	total: 1.6s	remaining: 1m 38s
16:	learn: 5576.8358874	total: 1.69s	remaining: 1m 37s
17:	learn: 5576.5970869	total: 1.78s	remaining: 1

In [31]:
xg_model = XGBRegressor()
xg_model.fit(X_train, y_train)

y_pred = xg_model.predict(X_test)
print(f"Training score {xg_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

Training score 0.5346903874821604
R2: -0.2218986519506101


In [32]:
LG_model = LGBMRegressor()
LG_model.fit(X_train, y_train)

y_pred = LG_model.predict(X_test)
print(f"Training score {LG_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 1166915, number of used features: 7
[LightGBM] [Info] Start training from score 959.273585
Training score 0.18774022500838716
R2: -0.026016004834064077


In [38]:
cat_model = CatBoostRegressor()
cat_model.fit(X, y)

y_pred = cat_model.predict(test)
print(f"Training score {cat_model.score(X,y)}")


Learning rate set to 0.129444
0:	learn: 5230.5320251	total: 133ms	remaining: 2m 13s
1:	learn: 5225.2215445	total: 253ms	remaining: 2m 6s
2:	learn: 5221.1027634	total: 373ms	remaining: 2m 4s
3:	learn: 5217.7982373	total: 489ms	remaining: 2m 1s
4:	learn: 5215.4277656	total: 607ms	remaining: 2m
5:	learn: 5213.4310642	total: 733ms	remaining: 2m 1s
6:	learn: 5211.8703935	total: 848ms	remaining: 2m
7:	learn: 5210.6706952	total: 960ms	remaining: 1m 59s
8:	learn: 5209.7343020	total: 1.07s	remaining: 1m 58s
9:	learn: 5208.4430406	total: 1.18s	remaining: 1m 56s
10:	learn: 5207.8735683	total: 1.29s	remaining: 1m 56s
11:	learn: 5207.3287039	total: 1.41s	remaining: 1m 56s
12:	learn: 5205.3424077	total: 1.52s	remaining: 1m 55s
13:	learn: 5205.0803990	total: 1.64s	remaining: 1m 55s
14:	learn: 5204.5680667	total: 1.76s	remaining: 1m 55s
15:	learn: 5204.2718815	total: 1.87s	remaining: 1m 55s
16:	learn: 5203.0084104	total: 1.99s	remaining: 1m 54s
17:	learn: 5202.5862668	total: 2.1s	remaining: 1m 54s
18:

In [36]:
# xg_model = XGBRegressor()
# xg_model.fit(X, y)

# y_pred = xg_model.predict(test)
# print(f"Training score {xg_model.score(X,y)}")

Training score 0.44317546322181534


In [37]:
# linear = LinearRegression()

# linear.fit(X, y)
# print(f"Training score {linear.score(X, y)}")
# y_pred = linear.predict(test)

Training score 0.009094076272950113


In [39]:
output = test_id
output["trip_duration"] = pd.DataFrame(y_pred)
output = output.set_index("id")


In [40]:
output.to_csv("trip_duration_XG_8.csv")