In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/training.csv', sep=',', parse_dates=['trip_start_timestamp', 'trip_end_timestamp'])

# clean the dataset
def clean_dataset(x):
    # drop data containing NaN
    x = x.dropna(subset=['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude'])
    # payment_type -> save "Cash" or "Credit Card" only
    x = x[(x['payment_type'] == 'Cash') | (x['payment_type'] == 'Credit Card')]
    # filter "fare"
    x = x[x['fare']<=80]
    # filter "tips"
    x = x[x['tips']<=16]
    return x

# data preprocessing
def preprocess(df):
    # Fill payment with Cash
    df = df.fillna({'payment_type':'Cash'})
    df_transform = StandardScaler().fit_transform(
        df[['pickup_pca0','pickup_pca1','dropoff_pca0','dropoff_pca1','distance','pickup_cluster','dropoff_cluster']])
    df[['pickup_pca0','pickup_pca1','dropoff_pca0','dropoff_pca1','distance','pickup_cluster',
        'dropoff_cluster']] = df_transform
    # one-hot encoding
    payment_type_arr = ['Cash', 'Credit Card']
    weekday_arr = ['1', '2', '3', '4', '5', '6', '7']
    hour_arr = [str(i) for i in range(0, 24)]
    for i in payment_type_arr:
        class_str = 'payment_type_%s' % i
        df[class_str] = df['payment_type'].apply(lambda x: 1 if x == i else 0)
    for i in weekday_arr:
        class_str = 'weekday_%s' % i
        df[class_str] = df['weekday'].apply(lambda x: 1 if x == i else 0)
    for i in hour_arr:
        class_str = 'hour_%s' % i
        df[class_str] = df['hour'].apply(lambda x: 1 if x == i else 0)

    df.fillna(0, inplace=True)
    return df

# clean data
df = clean_dataset(df)

# PCA process latitude and longitude
from sklearn.decomposition import PCA
coords = np.vstack((df[['pickup_latitude', 'pickup_longitude']].values,
                    df[['dropoff_latitude', 'dropoff_longitude']].values))

pca = PCA().fit(coords)
df['pickup_pca0'] = pca.transform(df[['pickup_latitude', 'pickup_longitude']])[:, 0]
df['pickup_pca1'] = pca.transform(df[['pickup_latitude', 'pickup_longitude']])[:, 1]
df['dropoff_pca0'] = pca.transform(df[['dropoff_latitude', 'dropoff_longitude']])[:, 0]
df['dropoff_pca1'] = pca.transform(df[['dropoff_latitude', 'dropoff_longitude']])[:, 1]

# calculate distance
df['distance'] = np.sqrt(np.square(np.abs(df['pickup_latitude'] - df['dropoff_latitude'])) +
                 np.square(np.abs(df['pickup_longitude'] - df['dropoff_longitude'])))

# cluster locations
from sklearn.cluster import MiniBatchKMeans
coords = np.vstack((df[['pickup_latitude', 'pickup_longitude']].values,
                    df[['dropoff_latitude', 'dropoff_longitude']].values))
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords)
df.loc[:, 'pickup_cluster'] = kmeans.predict(df[['pickup_latitude', 'pickup_longitude']])
df.loc[:, 'dropoff_cluster'] = kmeans.predict(df[['dropoff_latitude', 'dropoff_longitude']])

# add features of date
df['weekday'] = df['trip_start_timestamp'].dt.dayofweek
df['hour'] = df['trip_start_timestamp'].dt.hour

# preprocess
df = preprocess(df)

features = ['pickup_pca0','pickup_pca1','dropoff_pca0','dropoff_pca1','payment_type_Cash','payment_type_Credit Card',
            'distance', 'pickup_cluster', 'dropoff_cluster',
            'weekday_1','weekday_2','weekday_3','weekday_4','weekday_5','weekday_6','weekday_7',
            'hour_0','hour_1','hour_2','hour_3','hour_4','hour_5','hour_6','hour_7','hour_8','hour_9','hour_10','hour_11',
            'hour_12','hour_13','hour_14','hour_15','hour_16','hour_17','hour_18','hour_19','hour_20','hour_21',
            'hour_22','hour_23']

# separate dataset
X_train, X_test, y_train, y_test = train_test_split(df[features], df['tips'], test_size=0.2,
                                                    shuffle=True,
                                                    random_state=1)

# try different regression methods
def try_different_model(model, x_train,y_train, x_vali,y_vali, model_name):
    model.fit(x_train, y_train)     # training process
    result = model.predict(x_vali)  # predicting process
    score = model.score(x_vali, y_vali) # evaluation
    # visualization
    plt.figure()
    plt.plot(np.arange(len(result)), y_vali, 'go-', label='true value')
    plt.plot(np.arange(len(result)), result, 'ro-', label='predict value')
    plt.title(model_name + '  score: %f' % score)
    plt.legend()
    plt.show()
    # self evaluation
    self_score = model.score(x_train, y_train)
    return [score, self_score]


# compare and select the best regression model
def model_selection(x_train, y_train, x_vali, y_vali):
    '''
        Regression Model:
        Singular model: Decision tree, KNN
        Assemble model: random forest, Adaboost, GradientBoosting, Bagging, ExtraTrees
    '''
    score_list = []
    # Decision Tree
    from sklearn import tree
    model_DecisionTreeRegressor = tree.DecisionTreeRegressor()
    score_list.append(try_different_model(model_DecisionTreeRegressor, x_train, y_train, x_vali, y_vali, 'DT'))

    # KNN
    from sklearn import neighbors
    model_KNeighborsRegressor = neighbors.KNeighborsRegressor(n_neighbors=10)
    score_list.append(try_different_model(model_KNeighborsRegressor, x_train, y_train, x_vali, y_vali, 'KNN'))

    # random forest
    from sklearn import ensemble
    model_RandomForestRegressor = ensemble.RandomForestRegressor(n_estimators=20)  # 20 trees
    score_list.append(
        try_different_model(model_RandomForestRegressor, x_train, y_train, x_vali, y_vali, 'RandomForest'))

    # Adaboost
    from sklearn import ensemble
    model_AdaBoostRegressor = ensemble.AdaBoostRegressor(n_estimators=50)  # 50 trees
    score_list.append(try_different_model(model_AdaBoostRegressor, x_train, y_train, x_vali, y_vali, 'Adaboost'))

    # GBRT
    from sklearn import ensemble
    model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(n_estimators=100)  # 100 trees
    score_list.append(try_different_model(model_GradientBoostingRegressor, x_train, y_train, x_vali, y_vali, 'GBRT'))

    # Bagging
    from sklearn.ensemble import BaggingRegressor
    model_BaggingRegressor = BaggingRegressor()
    score_list.append(try_different_model(model_BaggingRegressor, x_train, y_train, x_vali, y_vali, 'Bagging'))

    # ExtraTree
    from sklearn.tree import ExtraTreeRegressor
    model_ExtraTreeRegressor = ExtraTreeRegressor()
    score_list.append(try_different_model(model_ExtraTreeRegressor, x_train, y_train, x_vali, y_vali, 'ExtraTree'))

    return np.argmax(np.array(score_list).mean())

# model selection by comparing scores
# model = model_selection(X_train, y_train, X_test, y_test)

from sklearn import ensemble
model = ensemble.RandomForestRegressor(n_estimators=20)  # 20 trees
model.fit(X_train, y_train)
print("Model Score:", model.score(X_test, y_test))

Model Score: 0.8314062800325408


In [89]:
#------------------------------------- prediction --------------------------------------
test_data = pd.read_csv('data/5_2_testing.csv', sep=',', parse_dates=['trip_start_timestamp'])

def preprocess_prediction(df):
    # Fill payment with Cash
    df = df.fillna({'payment_type':'Cash'})

    # Fill location with dropoff or pickup location, otherwise 0
    df['order1_dropoff_latitude'].fillna(df['order1_pickup_latitude'], inplace=True)
    df['order1_dropoff_longitude'].fillna(df['order1_pickup_longitude'], inplace=True)
    df['order2_dropoff_latitude'].fillna(df['order2_pickup_latitude'], inplace=True)
    df['order2_dropoff_longitude'].fillna(df['order2_pickup_longitude'], inplace=True)
    df['order3_dropoff_latitude'].fillna(df['order3_pickup_latitude'], inplace=True)
    df['order3_dropoff_longitude'].fillna(df['order3_pickup_longitude'], inplace=True)

    # PCA
    coords1 = np.vstack((df[['order1_pickup_latitude', 'order1_pickup_longitude']].values,
                        df[['order1_dropoff_latitude','order1_dropoff_longitude']].values))
    coords2 = np.vstack((
                        df[['order2_pickup_latitude', 'order2_pickup_longitude']].values,
                        df[['order2_dropoff_latitude', 'order2_dropoff_longitude']].values))
    coords3 = np.vstack((
                        df[['order3_pickup_latitude', 'order3_pickup_longitude']].values,
                        df[['order3_dropoff_latitude', 'order3_dropoff_longitude']].values,
                        ))
    pca1 = PCA().fit(coords1)
    pca2 = PCA().fit(coords2)
    pca3 = PCA().fit(coords3)
    df['order1_pickup_pca0'] = pca1.transform(df[['order1_pickup_latitude', 'order1_pickup_longitude']])[:, 0]
    df['order1_pickup_pca1'] = pca1.transform(df[['order1_pickup_latitude', 'order1_pickup_longitude']])[:, 1]
    df['order1_dropoff_pca0'] = pca1.transform(df[['order1_dropoff_latitude','order1_dropoff_longitude']])[:, 0]
    df['order1_dropoff_pca1'] = pca1.transform(df[['order1_dropoff_latitude','order1_dropoff_longitude']])[:, 1]
    df['order2_pickup_pca0'] = pca2.transform(df[['order2_pickup_latitude', 'order2_pickup_longitude']])[:, 0]
    df['order2_pickup_pca1'] = pca2.transform(df[['order2_pickup_latitude', 'order2_pickup_longitude']])[:, 1]
    df['order2_dropoff_pca0'] = pca2.transform(df[['order2_dropoff_latitude', 'order2_dropoff_longitude']])[:, 0]
    df['order2_dropoff_pca1'] = pca2.transform(df[['order2_dropoff_latitude', 'order2_dropoff_longitude']])[:, 1]
    df['order3_pickup_pca0'] = pca3.transform(df[['order3_pickup_latitude', 'order3_pickup_longitude']])[:, 0]
    df['order3_pickup_pca1'] = pca3.transform(df[['order3_pickup_latitude', 'order3_pickup_longitude']])[:, 1]
    df['order3_dropoff_pca0'] = pca3.transform(df[['order3_dropoff_latitude', 'order3_dropoff_longitude']])[:, 0]
    df['order3_dropoff_pca1'] = pca3.transform(df[['order3_dropoff_latitude', 'order3_dropoff_longitude']])[:, 1]

    # 计算距离
    df['order1_distance'] = np.sqrt(np.square(np.abs(df['order1_pickup_latitude'] - df['order1_dropoff_latitude'])) +
                             np.square(np.abs(df['order1_pickup_longitude'] - df['order1_dropoff_longitude'])))
    df['order2_distance'] = np.sqrt(np.square(np.abs(df['order2_pickup_latitude'] - df['order2_dropoff_latitude'])) +
                                    np.square(np.abs(df['order2_pickup_longitude'] - df['order2_dropoff_longitude'])))
    df['order3_distance'] = np.sqrt(np.square(np.abs(df['order3_pickup_latitude'] - df['order3_dropoff_latitude'])) +
                                    np.square(np.abs(df['order3_pickup_longitude'] - df['order3_dropoff_longitude'])))

    # 聚类
    df.loc[:, 'order1_pickup_cluster'] = kmeans.predict(df[['order1_pickup_latitude', 'order1_pickup_longitude']])
    df.loc[:, 'order1_dropoff_cluster'] = kmeans.predict(df[['order1_dropoff_latitude', 'order1_dropoff_longitude']])
    df.loc[:, 'order2_pickup_cluster'] = kmeans.predict(df[['order2_pickup_latitude', 'order2_pickup_longitude']])
    df.loc[:, 'order2_dropoff_cluster'] = kmeans.predict(df[['order2_dropoff_latitude', 'order2_dropoff_longitude']])
    df.loc[:, 'order3_pickup_cluster'] = kmeans.predict(df[['order3_pickup_latitude', 'order3_pickup_longitude']])
    df.loc[:, 'order3_dropoff_cluster'] = kmeans.predict(df[['order3_dropoff_latitude', 'order3_dropoff_longitude']])

    # 添加日期相关特征
    df['weekday'] = df['trip_start_timestamp'].dt.dayofweek
    df['hour'] = df['trip_start_timestamp'].dt.hour

    # df_transform = StandardScaler().fit_transform(
    #     df[['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']])
    df_transform = StandardScaler().fit_transform(
        df[['order1_pickup_pca0','order1_pickup_pca1','order1_dropoff_pca0','order1_dropoff_pca1','order1_distance',
            'order1_pickup_cluster','order1_dropoff_cluster',
            'order2_pickup_pca0','order2_pickup_pca1','order2_dropoff_pca0','order2_dropoff_pca1','order2_distance',
            'order2_pickup_cluster','order2_dropoff_cluster',
            'order3_pickup_pca0','order3_pickup_pca1','order3_dropoff_pca0','order3_dropoff_pca1','order3_distance',
            'order3_pickup_cluster','order3_dropoff_cluster']])
    df[['order1_pickup_pca0','order1_pickup_pca1','order1_dropoff_pca0','order1_dropoff_pca1','order1_distance',
            'order1_pickup_cluster','order1_dropoff_cluster',
            'order2_pickup_pca0','order2_pickup_pca1','order2_dropoff_pca0','order2_dropoff_pca1','order2_distance',
            'order2_pickup_cluster','order2_dropoff_cluster',
            'order3_pickup_pca0','order3_pickup_pca1','order3_dropoff_pca0','order3_dropoff_pca1','order3_distance',
            'order3_pickup_cluster','order3_dropoff_cluster']] = df_transform

    # one-hot encoding
    payment_type_arr = ['Cash', 'Credit Card', 'No Charge', 'Dispute', 'Unknown', 'Pcard', 'Prcard']
    weekday_arr = ['1', '2', '3', '4', '5', '6', '7']
    hour_arr = [str(i) for i in range(0, 24)]

    for j in range(1, 4):
        for i in payment_type_arr:
            class_str = 'order%s_payment_type_%s' % (j, i)
            df_payment_str = 'order%s_payment_type' % (j)
            df[class_str] = df[df_payment_str].apply(lambda x: 1 if x == i else 0)
        for i in weekday_arr:
            class_str = 'order%s_weekday_%s' % (j, i)
            df[class_str] = df['weekday'].apply(lambda x: 1 if x == i else 0)
        for i in hour_arr:
            class_str = 'order%s_hour_%s' % (j, i)
            df[class_str] = df['hour'].apply(lambda x: 1 if x == i else 0)

    df.fillna(0, inplace=True)
    return df

test_df = preprocess_prediction(test_data)

# predict tips of each order
for i in range(1, 4):
    new_features = ['order%s_%s' % (i, x) for x in features]
    tip_str = 'order%s_tips' % (i)
    test_data[tip_str] = model.predict(test_df[new_features])

# get best result
test_data['best'] = np.argmax(test_data.loc[:,('order1_tips','order2_tips','order3_tips')].values,axis=1) + 1


In [90]:
test_data.head(10)

Unnamed: 0.1,Unnamed: 0,trip_start_timestamp,order1_pickup_latitude,order1_pickup_longitude,order2_pickup_latitude,order2_pickup_longitude,order3_pickup_latitude,order3_pickup_longitude,order1_dropoff_latitude,order1_dropoff_longitude,...,order1_fare,order1_tips,order1_payment_type,order2_fare,order2_tips,order2_payment_type,order3_fare,order3_tips,order3_payment_type,best
0,0,2016-02-04 19:00:00,686.0,500.0,395.0,408.0,225.0,6.0,173.0,203.0,...,10.25,2.443869,Credit Card,15.0,0.0,Cash,45.75,6.454641,Credit Card,3
1,1,2016-02-04 19:00:00,18.0,610.0,728.0,277.0,210.0,470.0,294.0,113.0,...,10.25,2.810523,Credit Card,4.5,1.720153,Credit Card,11.25,5.058077,Credit Card,3
2,2,2016-02-04 19:00:00,527.0,24.0,18.0,610.0,744.0,605.0,527.0,24.0,...,3.25,0.002,Cash,4.25,9.3e-05,Cash,9.5,2.282168,Credit Card,3
3,3,2016-02-04 19:00:00,419.0,615.0,411.0,545.0,411.0,545.0,225.0,6.0,...,40.5,0.0,Cash,9.0,0.0,Cash,9.75,2.835948,Credit Card,3
4,4,2016-02-04 19:00:00,18.0,610.0,688.0,206.0,660.0,120.0,355.0,333.0,...,19.5,0.0,Cash,7.75,0.0,Cash,6.25,0.0,Cash,1
5,5,2016-02-04 19:00:00,173.0,203.0,754.0,410.0,18.0,610.0,109.0,708.0,...,11.25,0.0,Cash,6.25,2.125773,Credit Card,15.75,2.009749,Credit Card,2
6,6,2016-02-04 19:00:00,167.0,754.0,294.0,113.0,161.0,649.0,744.0,605.0,...,6.0,0.0,Cash,4.5,0.0,Cash,7.0,0.0,Cash,1
7,7,2016-02-04 19:00:00,18.0,610.0,754.0,410.0,18.0,610.0,18.0,610.0,...,6.75,0.019888,Cash,7.25,3.154382,Credit Card,6.0,4.381294,Credit Card,3
8,8,2016-02-04 19:00:00,18.0,610.0,225.0,6.0,130.0,532.0,294.0,113.0,...,6.75,0.000203,Cash,42.0,6.641605,Credit Card,4.75,0.0,Cash,2
9,9,2016-02-04 19:00:00,158.0,270.0,419.0,615.0,45.0,163.0,604.0,668.0,...,19.25,2.610487,Credit Card,5.75,4.253394,Credit Card,9.75,0.0,Cash,2
