In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
 
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

In [2]:
joined_df = pd.read_csv('train.csv' , index_col= 'tripid')
print('joined_df.shape : ', joined_df.shape)

joined_df.shape :  (17176, 13)


In [3]:
joined_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [4]:
joined_df['label'] = joined_df['label'].map( {'correct':1, 'incorrect':0} )
joined_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,1
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,1
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,1
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,1
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,1


In [5]:
features_df = joined_df.drop(['label'], axis = 1)
features_df = features_df.drop(['pickup_time'], axis = 1)
features_df = features_df.drop(['drop_time'], axis = 1)
features_df = features_df.drop(['meter_waiting_fare'], axis = 1)

In [6]:
labels=joined_df.drop(['additional_fare'], axis = 1)
labels=labels.drop(['duration'], axis = 1)
labels=labels.drop(['meter_waiting'], axis = 1)
labels=labels.drop(['meter_waiting_fare'], axis = 1)
labels=labels.drop(['meter_waiting_till_pickup'], axis = 1)
labels=labels.drop(['pick_lat'], axis = 1)
labels=labels.drop(['pick_lon'], axis = 1)
labels=labels.drop(['drop_lat'], axis = 1)
labels=labels.drop(['drop_lon'], axis = 1)
labels=labels.drop(['fare'], axis = 1)
labels=labels.drop(['pickup_time'], axis = 1)
labels=labels.drop(['drop_time'], axis = 1)

In [7]:
features_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
189123628,10.5,834.0,56.0,64.0,6.86252,79.8993,6.9033,79.8783,270.32
189125358,10.5,791.0,47.0,134.0,6.88589,79.8984,6.91373,79.8923,197.85
189125719,10.5,1087.0,80.0,61.0,6.90839,79.8651,6.93669,79.9146,301.64
189127273,10.5,598.0,271.0,68.0,6.9257,79.8895,6.92748,79.8971,82.3
189128020,,,,,6.87441,79.8615,6.84478,79.929,358.39


In [8]:
labels.head()

Unnamed: 0_level_0,label
tripid,Unnamed: 1_level_1
189123628,1
189125358,1
189125719,1
189127273,1
189128020,1


In [9]:
numeric_features = ['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_till_pickup', 'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

In [11]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

In [12]:
clf.fit(features_df,labels)
print("model score: %.3f" % clf.score(features_df, labels))

model score: 0.908


In [13]:
test_df = pd.read_csv('test.csv' , index_col= 'tripid')
test_df = test_df.drop(['pickup_time'], axis = 1)
test_df = test_df.drop(['drop_time'], axis = 1)
test_df = test_df.drop(['meter_waiting_fare'], axis = 1)
test_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
213284604,10.5,924,42,148,6.83454,79.8750,6.77490,79.8840,289.27
213286352,10.5,4249,20,91,6.91168,79.8723,6.55091,79.9706,1912.70
213293973,10.5,1552,255,23,6.92145,79.8478,6.90539,79.8989,394.00
213294622,10.5,462,16,198,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,69,6.97968,79.9130,6.98875,79.8914,147.47
...,...,...,...,...,...,...,...,...,...
222856243,10.5,1723,429,3,6.85103,79.9567,6.85588,79.9214,388.48
222857785,10.5,1378,80,125,6.91293,79.9656,6.92112,79.8980,379.85
222858416,10.5,418,56,93,6.85718,79.9081,6.83868,79.9083,112.79
222858691,10.5,1604,548,17,6.91289,79.8846,6.93159,79.9145,248.46


In [14]:
test_probas = clf.predict_proba(test_df)
result = test_probas > 0.7
result.astype(np.int)
result

array([[False,  True],
       [False, False],
       [False,  True],
       ...,
       [False,  True],
       [False,  True],
       [False,  True]])

In [15]:
submission_df = pd.read_csv("sample_submission.csv", 
                            index_col="tripid")

In [16]:
submission_df.head()

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,0
213294622,1
213298687,1


In [17]:
np.testing.assert_array_equal(test_df.index.values, 
                              submission_df.index.values)

In [18]:
submission_df["prediction"] = result[:, 1]

In [19]:
submission_df.head()

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,True
213286352,False
213293973,True
213294622,True
213298687,True


In [20]:
submission_df.to_csv('results2.csv', index=True)