In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline 
import matplotlib.pyplot as plt

In [None]:
# Explicit dtypes and column selection for the taxi-fare CSV.
coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
types = {
    'fare_amount': 'float32',
    **{name: 'float64' for name in coord_cols},
    'passenger_count': 'uint8',
}
cols = ['fare_amount', 'pickup_datetime', *coord_cols, 'passenger_count']

# Only the first 100000 rows are read here (total nrows = 55423855).
train_data = pd.read_csv(
    '../input/train.csv',
    usecols=cols,
    dtype=types,
    parse_dates=["pickup_datetime"],
    infer_datetime_format=True,
    nrows=100000,
)
train_data.head()


In [None]:
print(train_data.shape)

# Handle missing data: discard every row that has at least one missing value.
train_data = train_data.dropna()

# Remove outliers: build one boolean mask of the rows worth keeping instead of
# dropping the offending rows one condition at a time.
keep_rows = (
    train_data.fare_amount.gt(0) & train_data.fare_amount.le(60)
    & train_data.pickup_longitude.between(-75, -73)
    & train_data.dropoff_longitude.between(-75, -73)
    & train_data.pickup_latitude.between(40, 41.5)
    & train_data.dropoff_latitude.between(40, 41.5)
    & train_data.passenger_count.le(7)
)
train_data = train_data.loc[keep_rows]

print(train_data.shape)

In [None]:
# Train/test split: hold out 20% of the rows as `test_set`, with a fixed seed.
from sklearn.model_selection import train_test_split

train_data, test_set = train_test_split(
    train_data,
    random_state=23,
    test_size=0.2,
)

In [None]:
# Transformation pipeline
from sklearn.base import BaseEstimator, TransformerMixin
# Positional column indices used by ModifyAttributes.transform.
# NOTE(review): these presumably match the order of `cols` once 'fare_amount'
# has been dropped (datetime, pickup lon/lat, dropoff lon/lat) — confirm
# against the frame that is actually passed to the pipeline.
timeix = 0
lon1ix = 1
lat1ix = 2
lon2ix = 3
lat2ix = 4


class ModifyAttributes(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds trip-distance and calendar features.

    The input is a 2-D array in which column ``timeix`` holds the pickup
    timestamps and columns ``lat1ix``/``lon1ix``/``lat2ix``/``lon2ix`` hold the
    two coordinate pairs (degrees). The time column is removed from the output
    and five engineered columns are appended:
    ``distance_km, year, month, hour, weekday``.
    """

    def distance(self, lat1, lon1, lat2, lon2):
        """Return the haversine distance between two points given in degrees.

        Works element-wise on NumPy arrays as well as on scalars. The result
        is ``12742 * asin(sqrt(a))``, i.e. it is expressed in the unit of the
        12742 constant (2*R).
        """
        p = 0.017453292519943295  # Pi/180
        a = (
            0.5
            - np.cos((lat2 - lat1) * p) / 2
            + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
        )
        # Rounding can push `a` marginally outside [0, 1], which would make
        # sqrt/arcsin return NaN; clamp it to the mathematically valid range.
        a = np.clip(a, 0.0, 1.0)
        return 12742 * np.arcsin(np.sqrt(a))  # 2*R*asin...

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Append the engineered features and drop the raw timestamp column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Rows laid out according to the module-level ``*ix`` indices.
        y : ignored

        Returns
        -------
        numpy.ndarray
            The non-time input columns followed by
            ``distance_km, year, month, hour, weekday``.
        """
        X = np.asarray(X)

        # Compute the distance for all rows in one vectorised call rather than
        # one Python-level call per row. The coordinate columns are cast to
        # float because X is typically an object array (it carries timestamps).
        lat1, lon1, lat2, lon2 = (
            X[:, ix].astype(float) for ix in (lat1ix, lon1ix, lat2ix, lon2ix)
        )
        distance_km = self.distance(lat1, lon1, lat2, lon2)

        # Parse the time column once and read every calendar part from it.
        pickup = pd.DatetimeIndex(X[:, timeix])
        year = np.asarray(pickup.year)
        month = np.asarray(pickup.month)
        hour = np.asarray(pickup.hour)
        weekday = np.asarray(pickup.dayofweek)  # Monday == 0, like datetime.weekday()

        # Possible extra features, not implemented: is_weekend, is_rush_hour.
        passthrough = np.delete(X, timeix, axis=1)
        return np.c_[passthrough, distance_km, year, month, hour, weekday]

#train_data.drop(train_data.loc[(train_data.distance_km<=0.1)].index, inplace=True)


In [None]:
# Separate the regression target from the feature columns.
train_data_labels = train_data['fare_amount'].copy()
train_data = train_data.drop(columns=['fare_amount'])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Feature engineering first, then standardisation of every resulting column.
pipeline = Pipeline(steps=[
    ('attribs_adder', ModifyAttributes()),
    ('std_scaler', StandardScaler()),
])

# Names for the transformed columns: every input column except the first,
# followed by the engineered features in the order the transformer emits them.
engineered_cols = ['distance_km', 'year', 'month', 'hour', 'weekday']
final_cols = [*train_data.columns[1:], *engineered_cols]

train_data_tr = pd.DataFrame(
    pipeline.fit_transform(train_data.values),
    columns=final_cols,
)
train_data_tr.head()

# Model

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    """Print the given scores, then their mean and standard deviation.

    `scores` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    report = (
        ('Scores:', lambda s: s),
        ('Mean:', lambda s: s.mean()),
        ('STD:', lambda s: s.std()),
    )
    # Each statistic is computed lazily, right before its line is printed.
    for label, compute in report:
        print(label, compute(scores))

In [None]:
# Linear Regression: fit on the full training set, then report 10-fold CV RMSE.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression().fit(train_data_tr, train_data_labels)
lin_scores = cross_val_score(
    lin_reg,
    train_data_tr,
    train_data_labels,
    cv=10,
    scoring='neg_mean_squared_error',
)
# The scorer returns negated MSE, so flip the sign before taking the root.
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
# Decision Tree model: fit on the full training set, then report 10-fold CV RMSE.
from sklearn.tree import DecisionTreeRegressor

# Seed the estimator (same seed as the train/test split) so the fitted tree and
# its cross-validation scores are reproducible from one run to the next.
tree_reg = DecisionTreeRegressor(random_state=23)
tree_reg.fit(train_data_tr, train_data_labels)
tree_scores = cross_val_score(
    tree_reg,
    train_data_tr,
    train_data_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
# The scorer returns negated MSE, so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores)

In [None]:
# Random Forest model: fit on the full training set, then report 10-fold CV RMSE.
from sklearn.ensemble import RandomForestRegressor

# Seed the forest (same seed as the train/test split): bootstrap sampling and
# feature selection are random, so without a seed the scores below and the
# final predictions would change on every run.
forest_reg = RandomForestRegressor(random_state=23)
forest_reg.fit(train_data_tr, train_data_labels)
forest_scores = cross_val_score(
    forest_reg,
    train_data_tr,
    train_data_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
# The scorer returns negated MSE, so flip the sign before taking the root.
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
# Fine-tune random forest
# from sklearn.model_selection import GridSearchCV
# param_grid = [
#     {'n_estimators': [3, 10, 30, 100], 'max_features': [2, 4, 6, 8]}
# ]
# grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(train_data_tr, train_data_labels)
# grid_search.best_params_

In [None]:
# cvres = grid_search.cv_results_
# for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
#     print(np.sqrt(-mean_score), params)

In [None]:
# Model used for the final evaluation and the submission: the random forest
# fitted above. The alternative below applies only if the (currently
# commented-out) grid search has been run.
best_model = forest_reg
# best_model = grid_search.best_estimator_

# Evaluate on the Test Set

In [None]:
# Score the chosen model on the held-out split, reusing the already-fitted pipeline.
X_test = test_set.drop(['fare_amount'], axis=1)
y_test = test_set.fare_amount.values

# The model was fitted on a DataFrame with named columns; give the prepared
# test matrix the same column names so scikit-learn can validate the features
# instead of warning about missing feature names.
X_test_prepared = pd.DataFrame(pipeline.transform(X_test.values), columns=final_cols)

final_predictions = best_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

# Launch! (predict test.csv)

In [None]:
# Predict fares for test.csv and write the submission file.
test_csv = pd.read_csv('../input/test.csv', dtype=types, infer_datetime_format=True, parse_dates=["pickup_datetime"])
X_test_csv_keys = test_csv.key.copy()
X_test_csv = test_csv.drop('key', axis=1)

# NOTE(review): the pipeline indexes columns by position, so this assumes the
# remaining test.csv columns are in the same order as the training features.
# The prepared matrix gets the training column names back so the model, which
# was fitted on a named DataFrame, can validate the features.
X_test_csv_prepared = pd.DataFrame(pipeline.transform(X_test_csv.values), columns=final_cols)

final_predictions_csv = best_model.predict(X_test_csv_prepared)
submission = pd.DataFrame(data={'key': X_test_csv_keys, 'fare_amount': final_predictions_csv})
# index=False is the documented way to leave the row index out of the file.
submission.to_csv('submission_1_with_100000training_wo_fine_tuning.csv', index=False)