In [33]:
 from pathlib import Path
import pandas as pd
import numpy as np
 
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score 

# Seed NumPy's global random generator with a fixed value so that any
# NumPy-driven randomness in the cells below repeats from run to run.
np.random.seed(0)

In [34]:
# Read the training trips, using the `tripid` column as the row index,
# then report the (rows, columns) shape of the loaded frame.
TRAIN_CSV = 'train.csv'
joined_df = pd.read_csv(TRAIN_CSV, index_col='tripid')
print('joined_df.shape : ', joined_df.shape)

joined_df.shape :  (17176, 13)


In [35]:
joined_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [36]:
# Encode the textual target as booleans: 'correct' -> True, 'incorrect' -> False.
#
# The mapping is guarded so this cell can be re-run safely. Without the guard,
# a second execution would push the already-boolean column through the
# string-keyed dict, find no matching keys, and replace every label with NaN.
if not pd.api.types.is_bool_dtype(joined_df['label']):
    joined_df['label'] = joined_df['label'].map({'correct': True, 'incorrect': False})
joined_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,True
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,True
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,True
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,True
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,True


In [37]:
# Model inputs: everything in the training frame except the target column
# and the two pickup/drop timestamp columns.
features_df = joined_df.drop(columns=['label', 'pickup_time', 'drop_time'])

In [38]:
# "distance" is the sum of the squared latitude and longitude differences
# between pickup and drop-off. No square root is taken and the coordinates
# are used as-is, so the value is in squared coordinate units.
lat_delta = features_df["pick_lat"] - features_df["drop_lat"]
lon_delta = features_df["pick_lon"] - features_df["drop_lon"]
features_df["distance"] = (lat_delta ** 2) + (lon_delta ** 2)

In [39]:
features_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
189123628,10.5,834.0,56.0,0.0,64.0,6.86252,79.8993,6.9033,79.8783,270.32,0.002104
189125358,10.5,791.0,47.0,0.0,134.0,6.88589,79.8984,6.91373,79.8923,197.85,0.000812
189125719,10.5,1087.0,80.0,0.0,61.0,6.90839,79.8651,6.93669,79.9146,301.64,0.003251
189127273,10.5,598.0,271.0,15.6638,68.0,6.9257,79.8895,6.92748,79.8971,82.3,6.1e-05
189128020,,,,,,6.87441,79.8615,6.84478,79.929,358.39,0.005434


In [40]:
# The raw coordinates have been folded into "distance"; remove them from the inputs.
features_df = features_df.drop(columns=['pick_lat', 'drop_lat', 'pick_lon', 'drop_lon'])

In [41]:
features_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
189123628,10.5,834.0,56.0,0.0,64.0,270.32,0.002104
189125358,10.5,791.0,47.0,0.0,134.0,197.85,0.000812
189125719,10.5,1087.0,80.0,0.0,61.0,301.64,0.003251
189127273,10.5,598.0,271.0,15.6638,68.0,82.3,6.1e-05
189128020,,,,,,358.39,0.005434


In [42]:
# Target frame: a single-column DataFrame holding only `label`, indexed like joined_df.
#
# The column is selected directly rather than by dropping every other column
# by name, so an extra column in the input file can never leak into the target.
labels = joined_df[['label']]

In [43]:
labels.head()

Unnamed: 0_level_0,label
tripid,Unnamed: 1_level_1
189123628,True
189125358,True
189125719,True
189127273,True
189128020,True


In [44]:
# Columns that are routed through the numeric preprocessing branch.
numeric_features = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'distance',
    'fare',
]

# Numeric branch: fill missing values with the column median, then standardise.
median_imputer = SimpleImputer(strategy='median')
standardiser = StandardScaler()
numeric_transformer = Pipeline(steps=[
    ('imputer', median_imputer),
    ('scaler', standardiser),
])

In [45]:
# Apply the numeric branch to the listed numeric columns; it is the only
# transformer registered on this ColumnTransformer.
numeric_branch = ('num', numeric_transformer, numeric_features)
preprocessor = ColumnTransformer(transformers=[numeric_branch])

In [46]:
# Hold out 10% of the rows for evaluation. The split is shuffled, stratified
# on the label, and seeded so that it is repeatable.
RANDOM_SEED = 0
split_options = dict(
    test_size=0.1,
    shuffle=True,
    stratify=labels,
    random_state=RANDOM_SEED,
)
X_train, X_eval, y_train, y_eval = train_test_split(features_df, labels, **split_options)

In [47]:
X_train.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
195426968,10.5,435.0,101.0,0.0,10.0,397.96,0.0001420201
207461765,10.5,742.0,294.0,17.1402,6.0,121.32,0.0001622601
212629700,10.5,2353.0,520.0,30.056,22.0,486.04,1.01e-08
198439131,10.5,1520.0,490.0,9.0746,211.0,221.46,0.00176533
205426320,10.5,4619.0,971.0,56.1238,163.0,804.21,0.00177849


In [48]:
# End-to-end model: preprocessing followed by a logistic-regression
# classifier with its default settings.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
clf = Pipeline(steps=pipeline_steps)

In [49]:
# Fit the pipeline on the training split, then report its score on those
# same training rows (this is not a held-out estimate).
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
print("model score: %.3f" % train_score)

model score: 0.909


In [50]:
# Predicted class probabilities for the held-out rows. The array displayed
# below has two columns per row, one per class.
val_pred = clf.predict_proba(X_eval)
val_pred

array([[0.06472681, 0.93527319],
       [0.06882696, 0.93117304],
       [0.09188537, 0.90811463],
       ...,
       [0.09805344, 0.90194656],
       [0.05969608, 0.94030392],
       [0.08706466, 0.91293534]])

In [51]:
# Decision threshold applied to the predicted probabilities; the same `t`
# is reused when thresholding the test-set probabilities further down.
t = 0.7

# Element-wise comparison turns the probability array into a boolean array.
# The former `val_pred.astype(np.int)` call has been removed: its result was
# never stored, so it had no effect, and the `np.int` alias no longer exists
# in NumPy >= 1.24, where the call raises AttributeError.
val_pred = val_pred > t

# Keep only the second column (index 1) as the hold-out prediction.
y_pred = pd.DataFrame(val_pred[:, 1])
y_pred

Unnamed: 0,0
0,True
1,True
2,True
3,True
4,True
...,...
1713,True
1714,True
1715,True
1716,True


In [52]:
# Score the thresholded hold-out predictions against the true labels.
# NOTE(review): with average='micro' the score pools both classes together,
# which for a single-label problem should coincide with plain accuracy —
# confirm this is the intended metric rather than the binary F1.
f1_score(y_eval, y_pred, average='micro')

0.9109429569266589

In [53]:
# Read the test trips, indexed by `tripid`, and discard the two timestamp
# columns, mirroring what was done to the training features.
test_df = (
    pd.read_csv('test.csv', index_col='tripid')
    .drop(columns=['pickup_time', 'drop_time'])
)
test_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
213284604,10.5,924,42,2.44860,148,6.83454,79.8750,6.77490,79.8840,289.27
213286352,10.5,4249,20,0.00000,91,6.91168,79.8723,6.55091,79.9706,1912.70
213293973,10.5,1552,255,2.65880,23,6.92145,79.8478,6.90539,79.8989,394.00
213294622,10.5,462,16,0.00000,198,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.36920,69,6.97968,79.9130,6.98875,79.8914,147.47
...,...,...,...,...,...,...,...,...,...,...
222856243,10.5,1723,429,24.83332,3,6.85103,79.9567,6.85588,79.9214,388.48
222857785,10.5,1378,80,0.00000,125,6.91293,79.9656,6.92112,79.8980,379.85
222858416,10.5,418,56,3.28440,93,6.85718,79.9081,6.83868,79.9083,112.79
222858691,10.5,1604,548,31.67440,17,6.91289,79.8846,6.93159,79.9145,248.46


In [54]:
# Same "distance" feature as for the training data: the sum of the squared
# latitude and longitude differences, with no square root taken.
lat_delta = test_df["pick_lat"] - test_df["drop_lat"]
lon_delta = test_df["pick_lon"] - test_df["drop_lon"]
test_df["distance"] = (lat_delta ** 2) + (lon_delta ** 2)

In [55]:
# The raw coordinates are no longer needed once "distance" has been derived.
test_df = test_df.drop(columns=['pick_lat', 'drop_lat', 'pick_lon', 'drop_lon'])

In [56]:
# Predicted class probabilities for the test trips, thresholded with the
# same `t` used on the hold-out split. `result` is a boolean array with one
# column per class.
#
# The former `result.astype(np.int)` call has been removed: its return value
# was discarded, so `result` stayed boolean regardless, and the `np.int`
# alias no longer exists in NumPy >= 1.24, where the call raises AttributeError.
test_probas = clf.predict_proba(test_df)
result = test_probas > t
result

array([[False,  True],
       [False,  True],
       [False,  True],
       ...,
       [False,  True],
       [False,  True],
       [False,  True]])

In [57]:
# Load the submission template, indexed by `tripid` like the test frame.
SUBMISSION_TEMPLATE = "sample_submission.csv"
submission_df = pd.read_csv(SUBMISSION_TEMPLATE, index_col="tripid")

In [58]:
# Guard against misaligned rows: the test frame and the submission template
# must list the same trip ids in the same order, because the predictions are
# assigned positionally in the next step. Raises AssertionError otherwise.
np.testing.assert_array_equal(test_df.index.values, 
                              submission_df.index.values)

In [59]:
# Store the second column (index 1) of the thresholded test predictions.
# The values written are booleans, as the preview below shows.
# NOTE(review): if the expected submission format is 0/1 integers, convert
# explicitly here (e.g. with .astype(int)) — confirm against the template.
submission_df["prediction"] = result[:, 1]

In [60]:
submission_df.head()

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,True
213286352,True
213293973,True
213294622,True
213298687,True


In [61]:
# Write the predictions to disk, keeping the `tripid` index as the first column.
submission_df.to_csv('results3.csv', index=True)