In [None]:
import math
import csv
import numpy as np
import pandas as pd
from sklearn import metrics
from datetime import datetime
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split


In [None]:
# Preparing training data
def compute_column(csv_file, output_file='refined_train.csv', trailing_columns=slice(12, 14)):
    """Replace raw trip coordinates with movement features and write a refined CSV.

    Every input row must carry four coordinate fields (in degrees) at
    positions 8-11; they are read as lat1, lon1, lat2, lon2. In the output
    those four fields are replaced by three derived values: the absolute
    latitude difference and absolute longitude difference (both in radians)
    and the haversine great-circle distance ``r * c``.

    Args:
        csv_file: Path of the input CSV. It must not contain a header row,
            because every row's coordinate fields are parsed as floats.
        output_file: Path of the CSV to write (overwritten if it exists).
            Defaults to ``'refined_train.csv'``.
        trailing_columns: Slice of the input row appended after the derived
            values. Defaults to ``slice(12, 14)``.

    Raises:
        ValueError: If a coordinate field cannot be parsed as a float.
        IndexError: If a non-empty row has fewer than 12 fields.
    """
    # Sphere radius for the haversine formula (presumably Earth's radius in km,
    # which would make `distance` kilometres).
    r = 6373.0
    with open(csv_file, newline='') as f, open(output_file, 'w', newline='') as f2:
        writer = csv.writer(f2)
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to convert.
                continue
            lat1 = math.radians(float(row[8]))
            lat2 = math.radians(float(row[10]))
            delta_lat = abs(lat1 - lat2)
            delta_lon = abs(math.radians(float(row[9])) - math.radians(float(row[11])))
            a = math.sin(delta_lat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(delta_lon / 2) ** 2
            # Rounding can push `a` marginally above 1 for near-antipodal points;
            # clamp so the square root never sees a negative argument.
            c = 2 * math.atan2(math.sqrt(a), math.sqrt(max(0.0, 1 - a)))
            distance = r * c

            writer.writerow(row[0:8] + [delta_lat, delta_lon, distance] + row[trailing_columns])


# Write refined_train.csv (relative to the working directory) from the raw training file.
# NOTE(review): the absolute /content path looks Colab-specific — confirm before running elsewhere.
compute_column('/content/train.csv')

In [None]:
# Preparing testing data
def compute_column(csv_file, output_file='refined_test.csv', trailing_columns=slice(12, 13)):
    """Convert raw trip coordinates into movement features and write a refined CSV.

    Fields 8-11 of each input row are parsed as degrees and read as
    lat1, lon1, lat2, lon2. The output row keeps fields 0-7, then carries the
    absolute latitude difference, the absolute longitude difference (both in
    radians) and the haversine distance ``r * c``, followed by the fields
    selected by ``trailing_columns``.

    Args:
        csv_file: Path of the input CSV. A header row is not supported, since
            the coordinate fields of every row are converted with ``float``.
        output_file: Destination CSV path (overwritten if present).
            Defaults to ``'refined_test.csv'``.
        trailing_columns: Slice of the input row appended after the derived
            values. Defaults to ``slice(12, 13)``.

    Raises:
        ValueError: If a coordinate field is not a valid float.
        IndexError: If a non-empty row has fewer than 12 fields.
    """
    # Sphere radius for the haversine formula (presumably Earth's radius in km,
    # which would make `distance` kilometres).
    r = 6373.0
    with open(csv_file, newline='') as f, open(output_file, 'w', newline='') as f2:
        writer = csv.writer(f2)
        for row in csv.reader(f):
            if not row:
                # Blank lines come back from csv.reader as []; skip them.
                continue
            lat1 = math.radians(float(row[8]))
            lat2 = math.radians(float(row[10]))
            delta_lat = abs(lat1 - lat2)
            delta_lon = abs(math.radians(float(row[9])) - math.radians(float(row[11])))
            a = math.sin(delta_lat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(delta_lon / 2) ** 2
            # Guard against `a` exceeding 1 by a rounding error, which would make
            # the second square root fail with a math domain error.
            c = 2 * math.atan2(math.sqrt(a), math.sqrt(max(0.0, 1 - a)))
            distance = r * c

            writer.writerow(row[0:8] + [delta_lat, delta_lon, distance] + row[trailing_columns])


# Write refined_test.csv (relative to the working directory) from the raw test file.
# NOTE(review): the absolute /content path looks Colab-specific — confirm before running elsewhere.
compute_column('/content/test.csv')

In [None]:
# Load the refined training file that compute_column() wrote earlier in this
# notebook. That file has no header row, so the column names are supplied here:
# 8 pass-through columns, the 3 derived movement features, then fare and label.
# NOTE(review): this replaces a read of 'refined_train_header.csv', a file no cell
# in this notebook creates; it assumes that file was just refined_train.csv with
# this header added by hand — confirm.
refined_train_columns = [
    'tripid', 'additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
    'meter_waiting_till_pickup', 'pickup_time', 'drop_time',
    'lat_move', 'lon_move', 'distance',
    'fare', 'label',
]
data = pd.read_csv('refined_train.csv', header=None, names=refined_train_columns,
                   parse_dates=['pickup_time', 'drop_time'])

In [None]:
# Load the refined test file that compute_column() wrote earlier in this
# notebook. It has no header row, so the column names are supplied here:
# 8 pass-through columns, the 3 derived movement features, then fare.
# NOTE(review): this replaces a read of 'refined_test_header.csv', a file no cell
# in this notebook creates; it assumes that file was just refined_test.csv with
# this header added by hand — confirm.
refined_test_columns = [
    'tripid', 'additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
    'meter_waiting_till_pickup', 'pickup_time', 'drop_time',
    'lat_move', 'lon_move', 'distance',
    'fare',
]
data1 = pd.read_csv('refined_test.csv', header=None, names=refined_test_columns,
                    parse_dates=['pickup_time', 'drop_time'])

In [None]:
# Missing values per column in the training frame.
data.isna().sum()

tripid                         0
additional_fare              202
duration                     202
meter_waiting                202
meter_waiting_fare           202
meter_waiting_till_pickup    202
pickup_time                    0
drop_time                      0
lat_move                       0
lon_move                       0
distance                       0
fare                         137
label                          0
dtype: int64

In [None]:
# Missing values per column in the test frame.
data1.isna().sum()

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
lat_move                     0
lon_move                     0
distance                     0
fare                         0
dtype: int64

In [None]:
# Keep only the training rows in which every column is populated.
data = data.dropna(axis=0, how='any')

In [None]:
# Separate the training frame into the feature matrix and the target vector.
feature_columns = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'lat_move',
    'lon_move',
    'distance',
]
labelToBinary = {'correct': 1, 'incorrect': 0}

x_train_Full = data[feature_columns]

# Encode the textual label as 1 / 0 directly in `data`.
# NOTE: this makes the cell non-re-runnable — a second run looks up the
# already-numeric labels in labelToBinary and raises KeyError.
data['label'] = [labelToBinary[raw_label] for raw_label in data['label']]
y_train_Full = data['label']


In [None]:
# Columns fed to the model at prediction time.
feature_test_columns = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'lat_move',
    'lon_move',
    'distance',
]
                   

In [None]:
# Feature matrix for the rows of data1, restricted to the model's input columns.
x_valid = data1.loc[:, feature_test_columns]

In [None]:
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Randomly duplicate minority-class rows until the minority/majority ratio is 0.5.
# RandomOverSampler must be imported here: its import was previously commented out,
# so this cell raised NameError on a fresh kernel.
# random_state is pinned so repeated runs duplicate the same rows.
# (RandomUnderSampler stays imported for the under-sampling alternative.)
oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)



In [None]:
# Resample the full training features/labels with the sampler configured above.
# NOTE(review): the next cell reassigns X_over / y_over to the un-resampled data,
# so this result is discarded when the notebook is run top to bottom.
X_over, y_over = oversample.fit_resample(x_train_Full, y_train_Full)



In [None]:
# Point X_over / y_over back at the original (un-resampled) training data,
# replacing whatever the resampling cell produced.
X_over, y_over = x_train_Full, y_train_Full

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X_over,
    y_over,
    test_size=0.3,
    random_state=1,
)

In [None]:
from collections import Counter

# Class balance of the labels that go into the train/test split.
class_counts = Counter(y_over)
print(class_counts)

Counter({1: 15260, 0: 1526})


In [None]:
# Preview the first four feature rows.
X_over.head(4)

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,lat_move,lon_move,distance
0,10.5,834.0,56.0,0.0,64.0,0.000712,0.000367,5.094369
1,10.5,791.0,47.0,0.0,134.0,0.000486,0.000106,3.169052
2,10.5,1087.0,80.0,0.0,61.0,0.000494,0.000864,6.307375
3,10.5,598.0,271.0,15.6638,68.0,3.1e-05,0.000133,0.862217


In [None]:
# Hyper-parameters for the gradient-boosted classifier, gathered in one place.
xgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 4500,
    'max_depth': 6,
    'min_child_weight': 6,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'reg_alpha': 0.004,
    'objective': 'binary:logistic',
    'scale_pos_weight': 1,
    'seed': 22,
}
clf1 = XGBClassifier(**xgb_params)

In [None]:
# Fit the classifier on the training split.
clf1 = clf1.fit(x_train, y_train)

# Predict the label of every held-out row.
y_predict = clf1.predict(x_test)

# Score the held-out predictions and report both metrics.
accuracy = metrics.accuracy_score(y_test, y_predict)
f1 = metrics.f1_score(y_test, y_predict)
print('accuracy_score:', accuracy)
print('f1_score:', f1)

accuracy_score: 0.9217633042096902
f1_score: 0.9582095884599068


In [None]:
# Predict labels for the rows of data1 (x_valid holds their feature columns).
# No ground-truth labels are used here, so nothing is scored.
y_valid = clf1.predict(x_valid)

In [None]:
# One prediction per trip, indexed by trip id, written out as the submission file.
trip_ids = pd.Index(data1['tripid'], name='tripid')
df = pd.DataFrame(y_valid, index=trip_ids, columns=['prediction'])

df.to_csv('submission_XGB1.csv')