In [0]:
first_time = False

if(first_time):
  from google.colab import drive
  drive.mount('/content/drive')

In [0]:
folderPath =  'drive/My Drive/University/SEM7/CS4622_Machine_Learning/Project_1/Fair_Classification/'

# Import Statements

In [0]:
if(first_time):
  !pip install geopy
  !pip install folium
  !pip install catboost

In [0]:
from pathlib import Path

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score,mean_squared_error,accuracy_score,f1_score
from sklearn import preprocessing

from catboost import CatBoostClassifier

pd.set_option("display.max_columns", 100)

# Load Dataset

In [0]:
features_df = pd.read_csv(
    folderPath+"train.csv", 
    index_col="tripid"
)

test_features_df = pd.read_csv(folderPath+"test.csv", 
                               index_col="tripid")

# Feature Engineering

## Flag location Outliers


In [97]:
print("Range of pickup Longitude is ", (min(test_features_df['pick_lon']),max(test_features_df['pick_lon'])))
print("Range of Pickup Latitude is ", (min(test_features_df['pick_lat']),max(test_features_df['pick_lat'])))

Range of pickup Longitude is  (79.8177, 80.7764)
Range of Pickup Latitude is  (5.94313, 8.42501)


In [0]:
boundary={'min_lng':79.818,
              'min_lat':5.94313,
              'max_lng':80.8055, 
              'max_lat':8.42501}

In [0]:
def flag_location_outlier(dataset, boundary):
  dataset.loc[~((dataset.pick_lon >= boundary['min_lng'] ) & (dataset.pick_lon <= boundary['max_lng']) &
            (dataset.pick_lat >= boundary['min_lat']) & (dataset.pick_lat <= boundary['max_lat']) &
            (dataset.drop_lon >= boundary['min_lng']) & (dataset.drop_lon <= boundary['max_lng']) &
            (dataset.drop_lat >=boundary['min_lat']) & (dataset.drop_lat <= boundary['max_lat'])),'is_outlier_loc']=1
  dataset.loc[((dataset.pick_lon >= boundary['min_lng'] ) & (dataset.pick_lon <= boundary['max_lng']) &
            (dataset.pick_lat >= boundary['min_lat']) & (dataset.pick_lat <= boundary['max_lat']) &
            (dataset.drop_lon >= boundary['min_lng']) & (dataset.drop_lon <= boundary['max_lng']) &
            (dataset.drop_lat >=boundary['min_lat']) & (dataset.drop_lat <= boundary['max_lat'])),'is_outlier_loc']=0
  is_outlier_loc = dataset['is_outlier_loc'].astype(int)
  dataset['is_outlier_loc'] = is_outlier_loc
  return dataset


In [0]:
features_df = flag_location_outlier(features_df,boundary)
test_features_df = flag_location_outlier(test_features_df,boundary)

## Trip Distance/Speed

In [0]:
def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295 # Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a)) 

features_df["distance"] =  distance(features_df["pick_lat"],features_df["pick_lon"],features_df["drop_lat"],features_df["drop_lon"])
test_features_df["distance"] =  distance(test_features_df["pick_lat"],test_features_df["pick_lon"],test_features_df["drop_lat"],test_features_df["drop_lon"])

## Find mean Values

In [0]:
features_df['effective_fare'] = features_df['fare'] - features_df['meter_waiting_fare']
test_features_df['effective_fare'] = test_features_df['fare'] - test_features_df['meter_waiting_fare']

additional_fare_mean = features_df['additional_fare'].mean()
meter_waiting_rate_mean = features_df['meter_waiting_fare'].mean()/features_df['meter_waiting'].mean()
effective_fare_rate = features_df['effective_fare'].mean()/features_df['distance'].mean()
effective_fare_rate_test = test_features_df['effective_fare'].mean()/test_features_df['distance'].mean()

## Fill null train set

### Seperate correct/incorrect

In [0]:
features_df_correct = features_df[features_df.label=='correct']
features_df_incorrect = features_df[features_df.label=='incorrect']

### Fill Null Seperately

In [0]:
features_df_correct['meter_waiting'] = features_df_correct['meter_waiting'].fillna(features_df_correct['meter_waiting'].mean())
features_df_incorrect['meter_waiting'] = features_df_incorrect['meter_waiting'].fillna(features_df_incorrect['meter_waiting'].mean())

features_df_correct['meter_waiting_fare'] = features_df_correct['meter_waiting_fare'].fillna(features_df_correct['meter_waiting_fare'].mean())
features_df_incorrect['meter_waiting_fare'] = features_df_incorrect['meter_waiting_fare'].fillna(features_df_incorrect['meter_waiting_fare'].mean())

features_df_correct['fare'] = features_df_correct['fare'].fillna(features_df_correct['distance']*effective_fare_rate+features_df_correct['additional_fare']+features_df_correct['meter_waiting_fare'])
features_df_incorrect['fare'] = features_df_incorrect['fare'].fillna(features_df_incorrect['distance']*effective_fare_rate_test+features_df_incorrect['additional_fare']+features_df_incorrect['meter_waiting_fare'])

### Concat correct/Incorrect

In [0]:
features_df = pd.concat([features_df_correct,features_df_incorrect],sort=True)

### Fill null commonly

In [0]:
features_df['additional_fare'] = features_df['additional_fare'].fillna(features_df['additional_fare'].mode().iloc[0])

features_df["pickup_time"] = pd.to_datetime(features_df["pickup_time"],errors = "coerce")
features_df["drop_time"] = pd.to_datetime(features_df["drop_time"],errors = "coerce")
features_df['duration'] = features_df['duration'].fillna((features_df['drop_time'] - features_df['pickup_time']).astype('timedelta64[s]'))



##Fill Null if Any remaining

In [0]:
features_df = features_df.fillna(features_df.mean())
test_features_df = test_features_df.fillna(test_features_df.mean())

## Remove Duplicates

In [0]:
features_df = features_df.drop_duplicates()

## Flag Outliers

In [109]:
mean, std  =features_df.fare.mean() , features_df.fare.std()
features_df['fare_outlier'] = features_df['fare'].apply(lambda x: 0 if ((np.abs(x-mean) <= (3*std)) or x<=0 or x is None) else 1)
test_features_df['fare_outlier'] = test_features_df['fare'].apply(lambda x: 0 if ((np.abs(x-mean) <= (3*std)) or x<=0 or x is None ) else 1)

print (features_df[features_df.fare_outlier == 1].shape[0])

88


## Set features for Time/Duration 

In [0]:
features_df["pickup_time"] = pd.to_datetime(features_df["pickup_time"],errors = "coerce")
features_df["drop_time"] = pd.to_datetime(features_df["drop_time"],errors = "coerce")
features_df["pickup_time_hour"] = features_df["pickup_time"].dt.hour
features_df["pickup_time_minute"] = features_df["pickup_time"].dt.minute
features_df["drop_time_hour"] =features_df["drop_time"].dt.hour
features_df["drop_time_minute"] =features_df["drop_time"].dt.minute
features_df["pickup_time_day"] = features_df["pickup_time"].dt.day
features_df["drop_time_day"] = features_df["drop_time"].dt.day
features_df["effective_time"] = features_df["duration"]-features_df["meter_waiting"]

test_features_df["pickup_time"] = pd.to_datetime(test_features_df["pickup_time"],errors = "coerce")
test_features_df["drop_time"] = pd.to_datetime(test_features_df["drop_time"],errors = "coerce")
test_features_df["pickup_time_hour"] = test_features_df["pickup_time"].dt.hour
test_features_df["pickup_time_minute"] = test_features_df["pickup_time"].dt.minute
test_features_df["drop_time_hour"] =test_features_df["drop_time"].dt.hour
test_features_df["drop_time_minute"] =test_features_df["drop_time"].dt.minute
test_features_df["pickup_time_day"] = test_features_df["pickup_time"].dt.day
test_features_df["drop_time_day"] = test_features_df["drop_time"].dt.day
test_features_df["effective_time"] = test_features_df["duration"]-test_features_df["meter_waiting"]


## Speed

In [0]:
features_df['speed'] = features_df['distance']/features_df["effective_time"]
test_features_df['speed'] = test_features_df['distance']/test_features_df["effective_time"]

## Fare

In [0]:
features_df['meter_waiting_fare_diff'] = features_df['meter_waiting_fare'] - features_df['meter_waiting']*meter_waiting_rate_mean
features_df['effective_duration'] = features_df['duration'] - features_df['meter_waiting']
features_df['fare_mean'] = features_df['effective_fare']/features_df['effective_duration']
features_df['calculated_fare'] = features_df['distance']*effective_fare_rate + features_df['meter_waiting']*meter_waiting_rate_mean + additional_fare_mean

test_features_df['meter_waiting_fare_diff'] = test_features_df['meter_waiting_fare'] - test_features_df['meter_waiting']*meter_waiting_rate_mean
test_features_df['effective_duration'] = test_features_df['duration'] - test_features_df['meter_waiting']
test_features_df['fare_mean'] = test_features_df['effective_fare']/test_features_df['effective_duration']
test_features_df['calculated_fare'] = test_features_df['distance']*effective_fare_rate + test_features_df['meter_waiting']*meter_waiting_rate_mean + additional_fare_mean

## Seperate features/Label

In [0]:
y= features_df["label"]
y =y.replace(to_replace="correct",value=1)
y =y.replace(to_replace="incorrect",value=0)

X = features_df.drop(columns=["label"], axis=1)

In [114]:
# X.describe()
test_features_df.isnull().sum(axis = 0)

additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
is_outlier_loc               0
distance                     0
effective_fare               0
fare_outlier                 0
pickup_time_hour             0
pickup_time_minute           0
drop_time_hour               0
drop_time_minute             0
pickup_time_day              0
drop_time_day                0
effective_time               0
speed                        1
meter_waiting_fare_diff      0
effective_duration           0
fare_mean                    1
calculated_fare              0
dtype: int64

## Set Categorical Variable

In [0]:
column_titles = []
for col in X.columns: 
    column_titles.append(col)

test_features_df=test_features_df.reindex(columns=column_titles)

In [0]:
categorical_var = ['is_outlier_loc','fare_outlier']
categorical_var_indices = [X.columns.get_loc(c) for c in categorical_var]

## Drop Columns

## Set Train/Test

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
print("Number of records in training data ",X_train.shape[0])
print("Number of records in validation data ",X_test.shape[0])

Number of records in training data  13737
Number of records in validation data  3435


# Classifiers


## Catboost

In [0]:
model = CatBoostClassifier(iterations=100000, 
                          #  cat_features=categorical_var_indices, 
                          #  task_type="GPU",
                           devices='0:1',
                          nan_mode='Max',
                          eval_metric = 'F1')
# model.fit(X_train,
#           y_train,
#           verbose=True,plot=False, eval_set = (X_test,y_test))

In [90]:
print(model.get_best_iteration())

None


In [0]:
predictions = model.predict(X_test)

predictions = [round(value) for value in predictions]
np.unique(predictions , return_counts=True) 

In [0]:
accuracy = accuracy_score(y_test, predictions)
f1_macro = f1_score(y_test, predictions, average='macro')
f1_binary = f1_score(y_test, predictions, average='binary')
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Macro F1: %.2f%%" % (f1_macro * 100.0))
print("Binary F1: %.2f%%" % (f1_binary * 100.0))

filled all null_1227 epochs
Accuracy: 96.13%
Macro F1: 88.09%
Binary F1: 97.87%

11_P_Original_100000ep_with_newest_features_removed_hardcoded
Accuracy: 96.22%
Macro F1: 88.44%
Binary F1: 97.92%
Subission score = 98.154%

9_P_Original_100000ep_with_newes_features shrink to best model
Accuracy: 96.22%
Macro F1: 88.44%
Binary F1: 97.92%

After fixing major issue 460/6000 epochs with cat
Accuracy: 96.16%
Macro F1: 87.84%
Binary F1: 97.90%

6_P_Original_100000Epochs_all_dataset_with_catVar_Submmit
Accuracy: 98.69%
Macro F1: 96.33%
Binary F1: 99.27%

Piyumal Model - 1000 epochs_Splited_data_with_cat_var
Accuracy: 96.10%
Macro F1: 87.54%
Binary F1: 97.87%

Piyumal Model - 100000 epochs(5290)_AllData
Accuracy: 98.92%
Macro F1: 96.67%
Binary F1: 99.41%

Piyumal Model - 100000 epochs(5290)
Accuracy: 96.42%
Macro F1: 88.61%
Binary F1: 98.04%
Submission_Score : 0.97863

Piyumal Model - 100000 epochs(5290)
Accuracy: 96.42%
Macro F1: 88.61%
Binary F1: 98.04%

Piyumal Model - 1000 epochs
Accuracy: 95.90%
Macro F1: 86.83%
Binary F1: 97.76%

Accuracy: 95.02%
Macro F1: 84.69%
Binary F1: 97.27%

Add fare rate
Accuracy: 95.87%
Macro F1: 86.15%
Binary F1: 97.75%

removed fare outlier
Accuracy: 94.97%
Macro F1: 83.32%
Binary F1: 97.26%

Accuracy: 95.14%
Macro F1: 84.35%
Binary F1: 97.34%

# Save Predictions 

In [119]:
model_new = model
model_new.fit(X,
          y,
          verbose=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
95001:	learn: 0.9931065	total: 20m 1s	remaining: 1m 3s
95002:	learn: 0.9931065	total: 20m 1s	remaining: 1m 3s
95003:	learn: 0.9931065	total: 20m 1s	remaining: 1m 3s
95004:	learn: 0.9931065	total: 20m 1s	remaining: 1m 3s
95005:	learn: 0.9931065	total: 20m 1s	remaining: 1m 3s
95006:	learn: 0.9931065	total: 20m 1s	remaining: 1m 3s
95007:	learn: 0.9931065	total: 20m 1s	remaining: 1m 3s
95008:	learn: 0.9931065	total: 20m 1s	remaining: 1m 3s
95009:	learn: 0.9931065	total: 20m 2s	remaining: 1m 3s
95010:	learn: 0.9931065	total: 20m 2s	remaining: 1m 3s
95011:	learn: 0.9931065	total: 20m 2s	remaining: 1m 3s
95012:	learn: 0.9931065	total: 20m 2s	remaining: 1m 3s
95013:	learn: 0.9931065	total: 20m 2s	remaining: 1m 3s
95014:	learn: 0.9931065	total: 20m 2s	remaining: 1m 3s
95015:	learn: 0.9931065	total: 20m 2s	remaining: 1m 3s
95016:	learn: 0.9931065	total: 20m 2s	remaining: 1m 3s
95017:	learn: 0.9931065	total: 20m 2s	remaining: 1m 3s


<catboost.core.CatBoostClassifier at 0x7f9e06927e10>

In [120]:
predictions_1 = model.predict(test_features_df)
np.unique(predictions_1 , return_counts=True)

(array([0, 1]), array([ 422, 8154]))

In [0]:
submission_df = pd.read_csv(folderPath+"predications_submission.csv", 
                            index_col="tripid")

np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)

submission_df['prediction'] = predictions_1

submission_df.to_csv(folderPath+'predictions/new/13_all_null_filled_100000epochs.csv')