In [95]:
import pandas as pd
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler

df = pd.read_csv('/home/muaz/Desktop/ML_assignment/train.csv')
df.head(5)

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
1,189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
2,189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
3,189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
4,189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [96]:
df = df.drop(['tripid', 'pickup_time', 'drop_time', 'meter_waiting'], axis=1)
df.head(5)

Unnamed: 0,additional_fare,duration,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,10.5,834.0,0.0,64.0,6.86252,79.8993,6.9033,79.8783,270.32,correct
1,10.5,791.0,0.0,134.0,6.88589,79.8984,6.91373,79.8923,197.85,correct
2,10.5,1087.0,0.0,61.0,6.90839,79.8651,6.93669,79.9146,301.64,correct
3,10.5,598.0,15.6638,68.0,6.9257,79.8895,6.92748,79.8971,82.3,correct
4,,,,,6.87441,79.8615,6.84478,79.929,358.39,correct


In [97]:
def calculate_displacement(row):
    x1,y1,x2,y2 = row['pick_lat'],row['pick_lon'],row['drop_lat'],row['drop_lon']
    return sqrt((x1-x2)**2 + (y1-y2)**2)

dis = df.apply(lambda row: calculate_displacement(row), axis=1)
df.insert(5, 'distance', dis)
df = df.drop(['pick_lat','pick_lon','drop_lat','drop_lon'], axis=1)
df.head(5)

Unnamed: 0,additional_fare,duration,meter_waiting_fare,meter_waiting_till_pickup,distance,fare,label
0,10.5,834.0,0.0,64.0,0.045869,270.32,correct
1,10.5,791.0,0.0,134.0,0.0285,197.85,correct
2,10.5,1087.0,0.0,61.0,0.057019,301.64,correct
3,10.5,598.0,15.6638,68.0,0.007806,82.3,correct
4,,,,,0.073717,358.39,correct


In [98]:
df = df.replace({'label' :{'correct':1, 'incorrect':0}})
df.head(5)

Unnamed: 0,additional_fare,duration,meter_waiting_fare,meter_waiting_till_pickup,distance,fare,label
0,10.5,834.0,0.0,64.0,0.045869,270.32,1
1,10.5,791.0,0.0,134.0,0.0285,197.85,1
2,10.5,1087.0,0.0,61.0,0.057019,301.64,1
3,10.5,598.0,15.6638,68.0,0.007806,82.3,1
4,,,,,0.073717,358.39,1


In [99]:
df = df.fillna(df.mean())
df.head(5)

Unnamed: 0,additional_fare,duration,meter_waiting_fare,meter_waiting_till_pickup,distance,fare,label
0,10.5,834.0,0.0,64.0,0.045869,270.32,1
1,10.5,791.0,0.0,134.0,0.0285,197.85,1
2,10.5,1087.0,0.0,61.0,0.057019,301.64,1
3,10.5,598.0,15.6638,68.0,0.007806,82.3,1
4,13.719651,1702.858077,32.057666,112.466832,0.073717,358.39,1


In [100]:
test_train_ratio = 0.2
features = df.loc[:,'additional_fare':'fare']
labels = df.loc[:,'label']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=test_train_ratio)

# from imblearn.over_sampling import SMOTE, ADASYN

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# X_resampled, y_resampled = X_train, y_train
from collections import Counter
print(sorted(Counter(y_resampled).items()))


model = RandomForestClassifier().fit(X_resampled, y_resampled)

train_score = model.score(X_resampled, y_resampled)

y_pred = model.predict(X_test)
test_score = model.score(X_test, y_pred)

print(y_test.dtype, y_pred.dtype)
f1_macro_score = f1_score(y_test, y_pred, average='macro')
print('Accuracy of training set: {:.2f}'.format(train_score))
print('Accuracy of test set: {:.2f}'.format(test_score))
print('F1 Score: {:.2f}'.format(f1_macro_score))

[(0, 12371), (1, 12371)]
int64 int64
Accuracy of training set: 1.00
Accuracy of test set: 1.00
F1 Score: 0.83


In [101]:
test_df = pd.read_csv('/home/muaz/Desktop/ML_assignment/test.csv')

tripid = test_df['tripid']
test_df = test_df.drop(['tripid', 'pickup_time', 'drop_time', 'meter_waiting'], axis=1)
dis = test_df.apply(lambda row: calculate_displacement(row), axis=1)
test_df.insert(5, 'distance', dis)
test_df = test_df.drop(['pick_lat','pick_lon','drop_lat','drop_lon'], axis=1)
test_df = test_df.fillna(test_df.mean())
df.head(5)

Unnamed: 0,additional_fare,duration,meter_waiting_fare,meter_waiting_till_pickup,distance,fare,label
0,10.5,834.0,0.0,64.0,0.045869,270.32,1
1,10.5,791.0,0.0,134.0,0.0285,197.85,1
2,10.5,1087.0,0.0,61.0,0.057019,301.64,1
3,10.5,598.0,15.6638,68.0,0.007806,82.3,1
4,13.719651,1702.858077,32.057666,112.466832,0.073717,358.39,1


In [102]:
test_result = model.predict(test_df)

In [104]:
out_df = pd.DataFrame()
out_df['tripid'] = tripid
out_df['prediction'] = test_result
out_df.to_csv('/home/muaz/Desktop/ML_assignment/prediction.csv', index=False)
out_df.head(5)

Unnamed: 0,tripid,prediction
0,213284604,1
1,213286352,0
2,213293973,1
3,213294622,1
4,213298687,1


In [94]:
df['label'].value_counts()

1    15495
0     1681
Name: label, dtype: int64