In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
 
# Suppress warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation and data-conversion warnings raised by the
# libraries used below; consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score 

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(0)

In [2]:
# Load the labelled training trips, using the trip id column as the index.
joined_df = pd.read_csv('train.csv' , index_col= 'tripid')
# Report (rows, columns) as a quick sanity check on the load.
print('joined_df.shape : ', joined_df.shape)

joined_df.shape :  (17176, 13)


In [3]:
# Preview the first rows of the raw training data.
joined_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [4]:
# Encode the target as booleans: 'correct' -> True, 'incorrect' -> False.
# The dtype guard makes this cell safe to re-run: mapping an already-boolean
# column through the string-keyed dict would turn every label into NaN.
if not pd.api.types.is_bool_dtype(joined_df['label']):
    joined_df['label'] = joined_df['label'].map({'correct': True, 'incorrect': False})
joined_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,True
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,True
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,True
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,True
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,True


In [5]:
def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance between two points, in miles.

    Coordinates are given in degrees. Scalars as well as NumPy arrays or
    pandas Series are accepted; array-like inputs are combined element-wise.
    """
    deg_to_rad = 0.017453292519943295  # Pi/180
    # 12742 km is the Earth's diameter; 0.6213712 converts kilometres to miles.
    earth_diameter_miles = 0.6213712 * 12742

    lat_term = np.cos((lat2 - lat1) * deg_to_rad) / 2
    lon_term = (
        np.cos(lat1 * deg_to_rad)
        * np.cos(lat2 * deg_to_rad)
        * (1 - np.cos((lon2 - lon1) * deg_to_rad))
        / 2
    )
    haversine = 0.5 - lat_term + lon_term
    return earth_diameter_miles * np.arcsin(np.sqrt(haversine))

In [6]:
# Feature matrix: every column of the training frame except the target.
features_df = joined_df.drop(columns=['label'])

In [7]:
# Trip length computed from the pickup and drop coordinates.
features_df["distance"] = distance(
    features_df["pick_lat"],
    features_df["pick_lon"],
    features_df["drop_lat"],
    features_df["drop_lon"],
)

In [8]:
# Preview the features with the new distance column appended.
features_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,3.164501
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,1.96854
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,3.917991
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,0.535588
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,5.062797


In [9]:
# Remove the four raw coordinate columns in a single pass.
features_df = features_df.drop(
    columns=['pick_lat', 'drop_lat', 'pick_lon', 'drop_lon']
)

In [10]:
# Preview the features after the coordinate columns were removed.
features_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,270.32,3.164501
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,197.85,1.96854
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,301.64,3.917991
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,82.3,0.535588
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,358.39,5.062797


In [11]:
# Parse pickup timestamps; errors="coerce" turns unparseable values into NaT instead of raising.
features_df["pickup_time"] = pd.to_datetime(features_df["pickup_time"],errors = "coerce")

In [12]:
# Parse drop timestamps; errors="coerce" turns unparseable values into NaT instead of raising.
features_df["drop_time"] = pd.to_datetime(features_df["drop_time"],errors = "coerce")

In [13]:
# Hour-of-day component of the pickup timestamp.
features_df["pickup_time_hour"] = features_df["pickup_time"].dt.hour

In [14]:
# Minute component of the pickup timestamp.
features_df["pickup_time_minute"] = features_df["pickup_time"].dt.minute

In [15]:
# Hour and minute components of the drop timestamp.
features_df["drop_time_hour"] =features_df["drop_time"].dt.hour
features_df["drop_time_minute"] =features_df["drop_time"].dt.minute

In [16]:
# Day-of-month component of the pickup and drop timestamps.
features_df["pickup_time_day"] = features_df["pickup_time"].dt.day
features_df["drop_time_day"] = features_df["drop_time"].dt.day

In [17]:
# Trip duration with the metered waiting time subtracted; NaN when either input is missing.
features_df["effective_time"] = features_df["duration"]-features_df["meter_waiting"]

In [18]:
# Remove the source columns now that their derived features exist.
features_df = features_df.drop(
    columns=['duration', 'pickup_time', 'drop_time']
)

In [19]:
# Preview the fully engineered feature frame.
features_df.head()

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance,pickup_time_hour,pickup_time_minute,drop_time_hour,drop_time_minute,pickup_time_day,drop_time_day,effective_time
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,56.0,0.0,64.0,270.32,3.164501,0,20,0,34,1,1,778.0
189125358,10.5,47.0,0.0,134.0,197.85,1.96854,0,56,1,9,1,1,744.0
189125719,10.5,80.0,0.0,61.0,301.64,3.917991,1,8,1,26,1,1,1007.0
189127273,10.5,271.0,15.6638,68.0,82.3,0.535588,2,27,2,37,1,1,327.0
189128020,,,,,358.39,5.062797,3,34,3,51,1,1,


In [20]:
# Target frame: a single 'label' column indexed by tripid.
# Selecting the column directly replaces twelve chained drops, which copied
# the frame each time and broke (or leaked extra columns) whenever the
# input schema differed from the hard-coded list.
labels = joined_df[['label']].copy()

In [21]:
# Preview the target frame.
labels.head()

Unnamed: 0_level_0,label
tripid,Unnamed: 1_level_1
189123628,True
189125358,True
189125719,True
189127273,True
189128020,True


In [22]:
# Columns routed through the numeric preprocessing branch.
numeric_features = [
    'additional_fare',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'fare',
    'distance',
    'pickup_time_hour',
    'pickup_time_minute',
    'drop_time_hour',
    'drop_time_minute',
    'pickup_time_day',
    'drop_time_day',
    'effective_time',
]

# Fill missing values with the column median, then standardise each column.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [23]:
# Apply the numeric branch to the listed feature columns.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)],
)

In [24]:
RANDOM_SEED = 0

# Hold out 30% of the rows for evaluation. Stratifying on the label keeps the
# class proportions aligned between the two splits, and the fixed seed makes
# the shuffle repeatable.
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df, labels,
    test_size=0.3,
    random_state=RANDOM_SEED,
    shuffle=True,
    stratify=labels,
)

In [25]:
# Preview the training split.
X_train.head()

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance,pickup_time_hour,pickup_time_minute,drop_time_hour,drop_time_minute,pickup_time_day,drop_time_day,effective_time
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
201666125,10.5,150.0,8.67,47.0,106.97,1.088325,13,23,13,33,18,18,434.0
191960496,10.5,165.0,9.6195,78.0,161.11,0.112979,16,47,16,59,14,14,571.0
195867417,10.5,169.0,9.7682,74.0,185.31,1.154363,9,1,9,17,29,29,795.0
196298365,40.5,339.0,0.0,27.0,276.02,2.544655,19,12,19,38,29,29,1247.0
207320819,10.5,341.0,19.8803,85.0,173.5,0.013009,11,18,11,33,6,6,574.0


In [26]:
# Full model: preprocessing followed by a support vector classifier with
# scikit-learn's default hyper-parameters.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', svm.SVC()),
    ]
)

In [27]:
# Fit the preprocessing + classifier pipeline on the training split.
clf.fit(X_train, y_train)
# NOTE: this score is computed on the same rows the model was fitted on
# (X_train), so it is a training score, not a held-out estimate.
print("model score: %.3f" % clf.score(X_train, y_train))

model score: 0.918


In [28]:
# Predict class labels for the held-out evaluation split.
val_pred = clf.predict(X_eval)
val_pred

array([ True,  True,  True, ...,  True,  True,  True])

In [29]:
# `clf.predict` already returns boolean class labels, so comparing them with
# the threshold leaves them unchanged; `t` is still defined here because the
# test-set prediction cell reuses it.
t = 0.5
val_pred = val_pred > t
# The former `val_pred.astype(np.int)` call was removed: its result was
# discarded, and the `np.int` alias no longer exists in NumPy >= 1.24, so the
# cell raised AttributeError on current NumPy.
y_pred = pd.DataFrame(val_pred)
y_pred

Unnamed: 0,0
0,True
1,True
2,True
3,False
4,True
...,...
5148,True
5149,True
5150,True
5151,True


In [30]:
# Score the held-out predictions.
# NOTE(review): with a single-label target, micro-averaged F1 is the same
# number as plain accuracy; use average='binary' or 'macro' if a
# class-sensitive F1 is intended.
f1_score(y_eval, y_pred, average='micro')

0.9153890937318068

In [31]:
# Load the unlabelled test trips and derive the same distance feature that
# was added to the training data.
test_df = pd.read_csv('test.csv', index_col='tripid')
test_df["distance"] = distance(
    test_df["pick_lat"],
    test_df["pick_lon"],
    test_df["drop_lat"],
    test_df["drop_lon"],
)

In [32]:
# Remove the four raw coordinate columns in a single pass.
test_df = test_df.drop(
    columns=['pick_lat', 'drop_lat', 'pick_lon', 'drop_lon']
)

In [33]:
# Mirror the training feature engineering on the test frame:
#   1. parse both timestamp columns (unparseable values become NaT),
#   2. derive hour / minute / day parts and the effective travel time,
#   3. drop the source columns that the model does not consume.
test_df = (
    test_df
    .assign(
        pickup_time=lambda frame: pd.to_datetime(frame["pickup_time"], errors="coerce"),
        drop_time=lambda frame: pd.to_datetime(frame["drop_time"], errors="coerce"),
    )
    .assign(
        pickup_time_hour=lambda frame: frame["pickup_time"].dt.hour,
        pickup_time_minute=lambda frame: frame["pickup_time"].dt.minute,
        drop_time_hour=lambda frame: frame["drop_time"].dt.hour,
        drop_time_minute=lambda frame: frame["drop_time"].dt.minute,
        pickup_time_day=lambda frame: frame["pickup_time"].dt.day,
        drop_time_day=lambda frame: frame["drop_time"].dt.day,
        effective_time=lambda frame: frame["duration"] - frame["meter_waiting"],
    )
    .drop(columns=["duration", "pickup_time", "drop_time"])
)

In [34]:
# Preview the engineered test features.
test_df.head()

Unnamed: 0_level_0,additional_fare,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance,pickup_time_hour,pickup_time_minute,drop_time_hour,drop_time_minute,pickup_time_day,drop_time_day,effective_time
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
213284604,10.5,42,2.4486,148,289.27,4.16673,0,38,0,53,1,1,882
213286352,10.5,20,0.0,91,1912.7,25.823263,1,2,2,13,1,1,4229
213293973,10.5,255,2.6588,23,394.0,3.676453,5,2,5,28,1,1,1297
213294622,10.5,16,0.0,198,154.32,2.051619,5,30,5,38,1,1,446
213298687,10.5,392,12.3692,69,147.47,1.608445,7,0,7,14,1,1,422


In [35]:
# NOTE: despite its name, `test_probas` holds hard class predictions from
# `predict`, not probabilities, so the threshold comparison below leaves the
# boolean labels unchanged.
test_probas = clf.predict(test_df)
result = test_probas > t
# The former `result.astype(np.int)` call was removed: its result was
# discarded, and the `np.int` alias no longer exists in NumPy >= 1.24.
# `result` therefore stays boolean, exactly as before; assign
# `result.astype(int)` instead if the submission must contain 0/1.
result

array([ True,  True,  True, ...,  True,  True,  True])

In [36]:
# Load the submission template, keyed by trip id like the test frame.
submission_df = pd.read_csv("sample_submission.csv", index_col="tripid")

In [37]:
# Guard: the test rows and the submission template must list the same trip ids
# in the same order, because predictions are attached positionally.
np.testing.assert_array_equal(test_df.index.values, 
                              submission_df.index.values)

In [38]:
# Attach the test-set predictions to the submission template, row by row.
submission_df["prediction"] = result

In [39]:
# Preview the filled-in submission.
submission_df.head()

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,True
213286352,True
213293973,True
213294622,True
213298687,True


In [40]:
# Write the submission to disk, keeping the tripid index as a column in the file.
submission_df.to_csv('results5.csv', index=True)