In [None]:
# settings
# Number of clusters MiniBatchKMeans is asked to fit over pickup/dropoff coordinates.
global_num_of_clusters = 225
# Maximum number of training rows read from train.csv (passed as `nrows`).
global_num_of_rows = 20_000_000

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Compact dtypes keep the multi-million-row training frame small in memory.
# fare_amount is read as float32 rather than float16: half precision rounds
# fares onto a coarse grid (52.33 would be stored as 52.34375) and overflows
# to inf above 65504.
types = {'fare_amount': 'float32',
         'pickup_longitude': 'float32',
         'pickup_latitude': 'float32',
         'dropoff_longitude': 'float32',
         'dropoff_latitude': 'float32',
         'passenger_count': 'uint8'}
# Columns read from the training file; every other column (e.g. `key`) is skipped.
cols = ['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
def load_data(Nrows, train_path='../input/train.csv', test_path='../input/test.csv'):
    """Read the training and test CSV files.

    Parameters
    ----------
    Nrows : int or None
        Maximum number of rows to read from the training file.
    train_path, test_path : str
        Locations of the two CSV files. The defaults are the paths that were
        previously hard-coded.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` holds only the columns in ``cols`` with the dtypes in
        ``types``. ``test`` is read with every column and inferred dtypes.
        ``pickup_datetime`` is parsed as a datetime in both frames.
    """
    date_kwargs = {'parse_dates': ['pickup_datetime']}
    # `infer_datetime_format` is deprecated since pandas 2.0, where format
    # inference is the default. Only pass it to older releases, which need
    # it to get the fast parsing path.
    if int(pd.__version__.split('.')[0]) < 2:
        date_kwargs['infer_datetime_format'] = True

    train = pd.read_csv(train_path, nrows=Nrows, dtype=types, usecols=cols, **date_kwargs)
    test = pd.read_csv(test_path, **date_kwargs)
    return train, test


In [None]:
# Handling missing data and outliers; for feature engineering refer to the Pipe_line
def clean_data(train_data):
    """Remove incomplete and out-of-range rows from ``train_data`` in place.

    A row is removed when it has any missing value, or when any of these hold:
      * fare_amount is <= 0 or > 150
      * a pickup/dropoff longitude is outside [-74.03, -73.75]
      * a pickup/dropoff latitude is outside [40.63, 40.85]
      * passenger_count is > 8

    The frame's shape is printed before and after. The same (mutated) frame
    object is returned.
    """
    print('shape of before FE: ', train_data.shape)

    # Missing values go first so the range checks below only see real numbers.
    train_data.dropna(inplace=True)

    fare = train_data['fare_amount']
    is_outlier = (fare <= 0) | (fare > 150)

    for lon_col in ('pickup_longitude', 'dropoff_longitude'):
        lon = train_data[lon_col]
        is_outlier = is_outlier | (lon < -74.03) | (lon > -73.75)

    for lat_col in ('pickup_latitude', 'dropoff_latitude'):
        lat = train_data[lat_col]
        is_outlier = is_outlier | (lat < 40.63) | (lat > 40.85)

    is_outlier = is_outlier | (train_data['passenger_count'] > 8)

    # One label-based drop removes everything flagged above.
    train_data.drop(train_data.index[is_outlier.values], inplace=True)

    print('shape of after FE: ', train_data.shape)
    return train_data

In [None]:
# Read up to `global_num_of_rows` training rows plus the full test file.
train_data, test_data = load_data(global_num_of_rows)
# Drop incomplete / out-of-range training rows (clean_data prints the shape before and after).
train_data = clean_data(train_data)
# The test frame is not cleaned; print its shape for reference.
print(test_data.shape)

In [None]:
def dist(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    """Return the L1 (Manhattan) distance between two coordinate pairs.

    The result is ``|dropoff_lat - pickup_lat| + |dropoff_long - pickup_long|``,
    in the same units as the inputs. Arguments may be scalars, NumPy arrays or
    pandas Series; array-like inputs are combined element-wise.
    """
    lat_gap = np.abs(dropoff_lat - pickup_lat)
    long_gap = np.abs(dropoff_long - pickup_long)
    return lat_gap + long_gap
    
def transform_data(df):
    """Add trip-distance, passenger and calendar features to ``df`` in place.

    Expects the pickup/dropoff longitude and latitude columns,
    ``passenger_count`` and a datetime-typed ``pickup_datetime`` column.

    Columns added, in this order: ``distance`` (L1 distance between pickup and
    dropoff coordinates), ``is_hov`` (1 when passenger_count > 4, else 0),
    ``week_of_year``, ``hour``, ``week``, ``month``, ``year``, ``day_of_year``,
    ``weekday``, ``quarter``, ``day_of_month`` and ``week_hour``
    (``weekday * 24 + hour``).

    Returns the same (mutated) frame.
    """
    df['distance'] = ((df['pickup_longitude'] - df['dropoff_longitude']).abs()
                      + (df['pickup_latitude'] - df['dropoff_latitude']).abs())

    df['is_hov'] = (df['passenger_count'] > 4).astype('int64')

    pickup = df['pickup_datetime'].dt
    # `.dt.weekofyear` / `.dt.week` were deprecated in pandas 1.1 and removed
    # in 2.0. `isocalendar().week` is the replacement; it returns a nullable
    # UInt32 column, so cast back to the plain dtype the old accessors gave.
    if hasattr(pickup, 'isocalendar'):
        iso_week = pickup.isocalendar().week
        iso_week = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        # pandas < 1.1 has no isocalendar(); the legacy accessor still exists there.
        iso_week = pickup.weekofyear

    # `week_of_year` and `week` hold the same values; both are kept so the
    # output columns stay unchanged.
    df['week_of_year'] = iso_week
    df['hour'] = pickup.hour
    df['week'] = iso_week
    df['month'] = pickup.month
    df["year"] = pickup.year
    df['day_of_year'] = pickup.dayofyear
    df["weekday"] = pickup.weekday
    df["quarter"] = pickup.quarter
    df["day_of_month"] = pickup.day
    df['week_hour'] = df['weekday'] * 24 + df['hour']
    return df

def airport_feats(train,test_df):
    """Add distance-to-landmark columns to both frames in place.

    For each frame this writes ``distance_to_center`` (pickup point to the
    ``nyc`` reference point) and, for each of ``jfk``, ``ewr`` and ``lgr``, a
    ``pickup_distance_to_<name>`` and ``dropoff_distance_to_<name>`` column.
    All distances come from ``dist``. Returns ``(train, test_df)``.
    """
    # Reference points stored as (longitude, latitude).
    nyc = (-74.0063889, 40.7141667)
    jfk = (-73.7822222222, 40.6441666667)
    ewr = (-74.175, 40.69)
    lgr = (-73.87, 40.77)

    # Trip endpoint -> (latitude column, longitude column).
    endpoint_cols = {
        'pickup': ('pickup_latitude', 'pickup_longitude'),
        'dropoff': ('dropoff_latitude', 'dropoff_longitude'),
    }
    # (output column, reference point, trip endpoint), in output-column order.
    feature_specs = (
        ('distance_to_center', nyc, 'pickup'),
        ('pickup_distance_to_jfk', jfk, 'pickup'),
        ('dropoff_distance_to_jfk', jfk, 'dropoff'),
        ('pickup_distance_to_ewr', ewr, 'pickup'),
        ('dropoff_distance_to_ewr', ewr, 'dropoff'),
        ('pickup_distance_to_lgr', lgr, 'pickup'),
        ('dropoff_distance_to_lgr', lgr, 'dropoff'),
    )

    for frame in (train, test_df):
        for out_col, (ref_long, ref_lat), endpoint in feature_specs:
            lat_col, long_col = endpoint_cols[endpoint]
            frame[out_col] = dist(ref_lat, ref_long, frame[lat_col], frame[long_col])
    return train, test_df

# Add the engineered columns to both frames (transform_data mutates its argument and returns it).
train_data = transform_data(train_data)
test_data = transform_data(test_data)

# The airport / city-centre distance features are currently disabled.
#train_data, test_data = airport_feats(train_data,test_data)

# NOTE: this is not the last expression of the cell, so the preview is not rendered.
train_data.head(10)
print(test_data.shape)


In [None]:
# Clustering
def clustering(train_data, nrows, n_clusters=None, random_state=42):
    """Fit a MiniBatchKMeans model on all pickup and dropoff coordinates.

    Pickup and dropoff points are stacked into one (latitude, longitude)
    array, so a single set of clusters covers both ends of a trip.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain the pickup/dropoff latitude and longitude columns.
    nrows : int
        Unused; kept so existing callers keep working.
    n_clusters : int, optional
        Number of clusters. Defaults to the notebook-level
        ``global_num_of_clusters`` setting.
    random_state : int or None, optional
        Seed for MiniBatchKMeans. The fixed default makes cluster ids, and
        every feature derived from them, reproducible between runs. Pass
        ``None`` to restore the previous unseeded behaviour.

    Returns
    -------
    sklearn.cluster.MiniBatchKMeans
        The fitted model.
    """
    from sklearn.cluster import MiniBatchKMeans

    if n_clusters is None:
        n_clusters = global_num_of_clusters

    coords = np.vstack((train_data[['pickup_latitude', 'pickup_longitude']].values,
                        train_data[['dropoff_latitude', 'dropoff_longitude']].values))
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=random_state).fit(coords)
    return kmeans

kmeans = clustering(train_data, train_data.shape[0])

# Label both ends of every trip with their nearest cluster. Plain arrays are
# passed because `clustering` fits the model on a bare NumPy array; handing
# DataFrames to predict() makes recent scikit-learn warn about feature names.
pickup_cols = ['pickup_latitude', 'pickup_longitude']
dropoff_cols = ['dropoff_latitude', 'dropoff_longitude']
for frame in (train_data, test_data):
    frame['pickup_cluster'] = kmeans.predict(frame[pickup_cols].values)
    frame['dropoff_cluster'] = kmeans.predict(frame[dropoff_cols].values)

# Median / mean training fare per (pickup cluster, dropoff cluster) pair.
# fare_amount is selected before aggregating so the other columns are not
# aggregated only to be thrown away.
# NOTE(review): each training row's own fare is part of its group statistic.
cluster_keys = ['pickup_cluster', 'dropoff_cluster']
fare_by_route = train_data.groupby(cluster_keys)['fare_amount']
median_fare = fare_by_route.median().to_frame('median_fare').reset_index()
mean_fare = fare_by_route.mean().to_frame('mean_fare').reset_index()

train_data = train_data.merge(median_fare, on=cluster_keys, how='left')
train_data = train_data.merge(mean_fare, on=cluster_keys, how='left')

# Left joins keep exactly the original test rows, in their original order.
# An outer join would add an all-NaN row for every route absent from the test
# set, which upcasts integer columns to float and re-sorts rows by join key.
# Routes never seen in training simply get NaN fare statistics.
n_test_rows = len(test_data)
print(test_data.shape)
test_data = test_data.merge(median_fare, on=cluster_keys, how='left')
test_data = test_data.merge(mean_fare, on=cluster_keys, how='left')
print(test_data.shape)
assert len(test_data) == n_test_rows, 'merging fare statistics changed the number of test rows'
print(test_data.key.isnull().sum())


In [None]:
#kmeans.cluster_centers_

In [None]:
# Preview the per-(pickup_cluster, dropoff_cluster) median fares.
median_fare.head()


In [None]:
# Report how many test rows have no median / mean fare after the merge.
total_test_rows = len(test_data)
missing_median = total_test_rows - test_data['median_fare'].count()
missing_mean = total_test_rows - test_data['mean_fare'].count()
print('no. of rows with NaN median fare = {} out of {}'.format(missing_median, total_test_rows))
print('no. of rows with NaN mean fare = {} out of {}'.format(missing_mean, total_test_rows))


# Check which are the NaN values in median_fare and mean_fare
# test_index = test_data.loc[pd.isnull(test_data['median_fare']), 'median_fare'].index[0]
# print(train_data.pickup_cluster.unique())
# print(train_data.dropoff_cluster.unique())
# print(set(train_data.pickup_cluster.unique() + train_data.dropoff_cluster.unique()))

# print(test_data.pickup_cluster.unique())
# print(test_data.dropoff_cluster.unique())

# print(train_data.loc[(train_data['pickup_cluster'] == test_data.loc[test_index, 'pickup_cluster']) & (train_data['dropoff_cluster'] == test_data.loc[test_index, 'dropoff_cluster'])][['pickup_cluster', 'dropoff_cluster', 'median_fare']].head())
# print(test_data.loc[(test_data['pickup_cluster'] == test_data.loc[test_index, 'pickup_cluster']) & (test_data['dropoff_cluster'] == test_data.loc[test_index, 'dropoff_cluster'])][['pickup_cluster', 'dropoff_cluster', 'median_fare']].head())
test_data.head()

In [None]:
# fig, ax = plt.subplots(ncols=1, nrows=1)
# N = 100000
# ax.scatter(train_data.pickup_longitude.values[:N], train_data.pickup_latitude.values[:N], s=10, lw=0,
#            c=train_data.pickup_cluster[:N].values, cmap='tab20', alpha=0.2)
# ax.set_xlabel('Longitude')
# ax.set_ylabel('Latitude')
# plt.show()

In [None]:
def drop_axis(df):
    """Delete the cluster-id and raw pickup-datetime columns from ``df`` in place.

    The columns are removed one at a time, in order; a missing column raises
    ``KeyError``. Returns ``None``.
    """
    unwanted = ('pickup_cluster', 'dropoff_cluster', 'pickup_datetime')
    for name in unwanted:
        del df[name]
        
# Remove the cluster-id and raw datetime columns from both frames (in place).
drop_axis(train_data)
drop_axis(test_data)
# Summarise the remaining training columns and their dtypes.
train_data.info()

In [None]:
# Write the model-ready frames to the working directory without the row index.
# `index=False` is the documented way to omit it; `index=None` only worked
# because None is falsy.
train_data.to_csv('train_ready_to_model.csv', index=False)
test_data.to_csv('test_ready_to_model.csv', index=False)
print('Done.')