## 1. Setup


In [23]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
#from sklearn.naive_bayes import GaussianNB


from google.colab import drive

# Mount Google Drive so the CSV files stored there can be read from this runtime.
drive.mount('/content/gdrive')

# Load the training and test tables (in that order); each uses its "tripid"
# column as the row index.
train_set, test_set = (
    pd.read_csv(csv_path, index_col="tripid")
    for csv_path in (
        '/content/gdrive/My Drive/Ride_Fare/train.csv',
        '/content/gdrive/My Drive/Ride_Fare/test.csv',
    )
)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## 2. Feature Preprocessing

In [24]:
# Label encoding: "incorrect" -> 0, "correct" -> 1.
label_codes = {"incorrect": 0, "correct": 1}
train_set["label"] = train_set["label"].map(label_codes)

# Fill missing values of the 'duration' column with the elapsed time, in seconds,
# between 'pickup_time' and 'drop_time'.
# The helper columns 'pickup', 'drop' and 'difference' are stored on the frame
# because the later "dropping features" step removes them by name.
train_set["pickup"] = pd.to_datetime(train_set["pickup_time"])
train_set["drop"] = pd.to_datetime(train_set["drop_time"])
elapsed = train_set["drop"] - train_set["pickup"]
train_set["difference"] = elapsed / np.timedelta64(1, "s")
train_set["duration"] = train_set["duration"].fillna(train_set["difference"])

# define new feature 'distance'

def haversine_vectorize(lon1, lat1, lon2, lat2, radius=6367):
    """Great-circle distance between two sets of points (haversine formula).

    The coordinate arguments are given in decimal degrees and may be scalars,
    sequences, NumPy arrays or pandas Series; they are combined element-wise.

    Parameters
    ----------
    lon1, lat1 : scalar or array-like
        Longitude and latitude of the starting point(s), in degrees.
    lon2, lat2 : scalar or array-like
        Longitude and latitude of the end point(s), in degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6367 (kilometres); use 3958 for miles.

    Returns
    -------
    scalar or array-like
        Distance(s) along the sphere's surface. NaN coordinates yield NaN.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    newlon = lon2 - lon1
    newlat = lat2 - lat1

    haver_formula = np.sin(newlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(newlon / 2.0) ** 2
    # Floating-point rounding can push the term marginally outside [0, 1]
    # (e.g. for near-antipodal points), which would make sqrt/arcsin return NaN.
    # Clamp it; genuine NaN inputs still propagate through minimum/maximum.
    haver_formula = np.minimum(1.0, np.maximum(0.0, haver_formula))

    dist = 2 * np.arcsin(np.sqrt(haver_formula))
    return radius * dist

# New feature: haversine distance between the pickup and drop-off coordinates.
train_set['distance'] = haversine_vectorize(
    train_set['pick_lon'],
    train_set['pick_lat'],
    train_set['drop_lon'],
    train_set['drop_lat'],
)

# Dropping features: the raw coordinates and timestamps, the helper columns
# created while filling 'duration', and 'meter_waiting_till_pickup'.
unused_columns = [
    'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon',
    'pickup_time', 'drop_time',
    'pickup', 'drop', 'difference',
    'meter_waiting_till_pickup',
]
train_set = train_set.drop(columns=unused_columns)

train_set.info()

# Separate the target column from the predictors.
Y = train_set['label']
X = train_set.drop(columns=['label'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17176 entries, 189123628 to 213817296
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   additional_fare     16974 non-null  float64
 1   duration            17176 non-null  float64
 2   meter_waiting       16974 non-null  float64
 3   meter_waiting_fare  16974 non-null  float64
 4   fare                17039 non-null  float64
 5   label               17176 non-null  int64  
 6   distance            17176 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 1.0 MB


## 3. Model Creation and Training


In [25]:
# Columns passed to the model; any other column is discarded by the
# ColumnTransformer below (remainder="drop").
numeric_cols = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'fare',
    'distance',
]

# Per-column numeric treatment: standardise first, then mean-impute.
# NOTE(review): scaling before imputing relies on StandardScaler passing NaN
# through untouched; the more common order is impute -> scale. Confirm this
# ordering is intentional before changing it, since it affects the fitted scale.
scale_step = ('standard_scaler', StandardScaler())
impute_step = ('imputer', SimpleImputer(strategy='mean'))
numeric_preprocessing_steps = Pipeline(steps=[scale_step, impute_step])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_preprocessing_steps, numeric_cols)],
    remainder="drop",
)

In [26]:
from sklearn.ensemble import StackingClassifier

# Seed handed to every stochastic base learner so that repeated runs of the
# notebook give the same fitted ensemble and the same evaluation score.
# (Same value as the random_state used for the train/eval split.)
MODEL_SEED = 6

# Level-0 (base) models whose cross-validated predictions feed the meta learner.
# NOTE: the key 'dt' labels a random forest, not a single decision tree; the key
# is kept unchanged so the fitted estimator names stay the same.
level0 = [
    ('xg', XGBClassifier(n_estimators=500, subsample=0.14, random_state=MODEL_SEED)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(50, 100, 50), max_iter=1000, random_state=MODEL_SEED)),
    ('dt', RandomForestClassifier(n_estimators=100, max_features='log2', random_state=MODEL_SEED)),
]

# Level-1 (meta) learner: L2-regularised logistic regression over the base outputs.
level1 = LogisticRegression(penalty="l2", C=3)

# cv=10: the base-model predictions used to train the meta learner come from
# 10-fold cross-validation.
estimator = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)

# Full pipeline: numeric preprocessing followed by the stacked classifier.
fullPipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', estimator)])

In [27]:
from sklearn.metrics import f1_score

# Hold out 33% of the labelled rows for evaluation; the split is shuffled,
# stratified on the label and seeded so it is the same on every run.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    Y,
    test_size=0.33,
    shuffle=True,
    stratify=Y,
    random_state=6,
)

# Train model on the training portion only.
fullPipe.fit(X_train, np.ravel(y_train))

# Predict the held-out rows and keep the predictions aligned to their trip ids.
preds = fullPipe.predict(X_eval)
y_preds = pd.DataFrame({"label": preds}, index=y_eval.index)

# F1 score of the predictions against the held-out labels.
eval_f1 = f1_score(y_eval, y_preds)
print(eval_f1)

0.9750820938767626


In [28]:
# Refit the whole pipeline on every labelled row (training + hold-out portions)
# before predicting the test set. Left as the last expression so the fitted
# pipeline's repr is shown as the cell output.
fullPipe.fit(X, y=Y)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True)),
                                                                  ('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                            

## 4. Test Data Preprocessing


In [29]:
# Check missing values per column. The counts are printed explicitly: a bare
# expression in the middle of a cell is evaluated but its value is never shown,
# so the check would otherwise be invisible.
print(test_set.isnull().sum())

# Define new feature 'distance', computed the same way as for the training set.
test_set['distance'] = haversine_vectorize(
    test_set['pick_lon'],
    test_set['pick_lat'],
    test_set['drop_lon'],
    test_set['drop_lat'],
)

# Dropping features: raw coordinates, timestamps and 'meter_waiting_till_pickup',
# leaving the columns the fitted pipeline expects.
test_set = test_set.drop(
    columns=[
        'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon',
        'pickup_time', 'drop_time',
        'meter_waiting_till_pickup',
    ]
)

test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8576 entries, 213284604 to 222860703
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   additional_fare     8576 non-null   float64
 1   duration            8576 non-null   int64  
 2   meter_waiting       8576 non-null   int64  
 3   meter_waiting_fare  8576 non-null   float64
 4   fare                8576 non-null   float64
 5   distance            8576 non-null   float64
dtypes: float64(4), int64(2)
memory usage: 469.0 KB


## 5. Make Predictions


In [30]:
# Predict a label for every test trip and attach it to the test frame.
result = fullPipe.predict(test_set)
test_set['prediction'] = result

# The submission holds only the prediction, indexed by trip id. Selecting the
# column directly avoids maintaining a hard-coded list of feature columns to drop.
submission_df = test_set[['prediction']]

# Write the submission next to the input data; index=True keeps the trip id column.
datapath = '/content/gdrive/My Drive/Ride_Fare/'
submission_df.to_csv(datapath + 'submission.csv', index=True)

# Preview goes last: only a cell's final expression is displayed.
submission_df.head()