In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
df_train = pd.read_csv("CleanTraing.csv", parse_dates=["date", "visitStartTime"], dtype={'fullVisitorId': 'str',
                                                                                        'visitId': 'int64',
                                                                                        'visitNumber': 'int64',
                                                                                        'totals_transactionRevenue': 'float64'})
df_test = pd.read_csv("CleanTest.csv", parse_dates=["date", "visitStartTime"], dtype={'fullVisitorId': 'str',
                                                                                     'visitId': 'int64',
                                                                                     'visitNumber': 'int64'})

In [16]:
to_categorical = ['channelGrouping',
 'device_browser',
 'device_operatingSystem',
 'device_isMobile',
 'device_deviceCategory',
 'geoNetwork_continent',
 'geoNetwork_subContinent',
 'trafficSource_campaign',
 'trafficSource_medium',
 'trafficSource_adwordsClickInfo_page',
 'trafficSource_adwordsClickInfo_slot',
 'trafficSource_adwordsClickInfo_adNetworkType',
 'trafficSource_adwordsClickInfo_isVideoAd']
df_train[to_categorical] = df_train[to_categorical].astype('category')
df_test[to_categorical] = df_test[to_categorical].astype('category')

In [None]:
# list(df_train.columns[df_train.columns.dtype != 'O'])

In [None]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

# Replace categorical data with one-hot encoded data
features_df = pd.get_dummies(df_train, columns = ['channelGrouping', 'date', 'fullVisitorId',
                                                   'visitStartTime', 'device_browser', 'device_operatingSystem',
                                                   'device_isMobile', 'device_deviceCategory', 'geoNetwork_continent',
                                                   'geoNetwork_subContinent', 'geoNetwork_country', 'geoNetwork_region',
                                                   'geoNetwork_metro', 'geoNetwork_city', 'geoNetwork_networkDomain',
                                                   'totals_hits', 'totals_pageviews', 
                                                   'trafficSource_referralPath', 'trafficSource_campaign',
                                                   'trafficSource_source', 'trafficSource_medium',
                                                   'trafficSource_adwordsClickInfo_page',
                                                   'trafficSource_adwordsClickInfo_slot',
                                                   'trafficSource_adwordsClickInfo_gclId',
                                                   'trafficSource_adwordsClickInfo_adNetworkType',
                                                   'trafficSource_adwordsClickInfo_isVideoAd'])
del features_df['totals_transactionRevenue']

X = features_df.as_matrix()
y = df_train['totals_transactionRevenue'].as_matrix()

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create the model
model = ensemble.GradientBoostingRegressor()

# Parameters we want to try
param_grid = {
    'n_estimators': [10, 12],
    'max_depth': [14, 16],
    'min_samples_leaf': [19, 27],
    'learning_rate': [0.1, 0.01],
    'max_features': [0.3, 0.1],
    'loss': ['ls', 'lad', 'huber']
}

# Define the grid search we want to run. Run it with four cpus in parallel.
gs_cv = GridSearchCV(model, param_grid, n_jobs=4)

# Run the grid search - on only the training data!
gs_cv.fit(X_train, y_train)

# Print the parameters that gave us the best result!
print(gs_cv.best_params_)

# After running a .....long..... time, the output will be something like
# {'loss': 'huber', 'learning_rate': 0.1, 'min_samples_leaf': 9, 'n_estimators': 3000, 'max_features': 0.1, 'max_depth': 6}

# That is the combination that worked best.

# Find the error rate on the training set using the best parameters
mse = mean_absolute_error(y_train, gs_cv.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set using the best parameters
mse = mean_absolute_error(y_test, gs_cv.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)



In [31]:
features_df.head()

Unnamed: 0,channelGrouping_(Other),channelGrouping_Affiliates,channelGrouping_Direct,channelGrouping_Display,channelGrouping_Organic Search,channelGrouping_Paid Search,channelGrouping_Referral,channelGrouping_Social,date_1970-01-01 00:00:00.020160801,date_1970-01-01 00:00:00.020160802,...,trafficSource_adwordsClickInfo_gclId_EAIaIQobChMIhcS6hsvK1AIVzLbACh0B_wAqEAAYASAAEgJv2fD_BwE,trafficSource_adwordsClickInfo_gclId_EAIaIQobChMIjJ7P87HX1AIVUm5-Ch1BoAjcEAAYASAAEgKoM_D_BwE,trafficSource_adwordsClickInfo_gclId_EAIaIQobChMIlKyoocis1QIVhGt-Ch23UwC5EAAYASAAEgLjZvD_BwE,trafficSource_adwordsClickInfo_gclId_EAIaIQobChMIsbbo-s6F1QIVxV5-Ch0eWgx5EAAYASAAEgJntfD_BwE,trafficSource_adwordsClickInfo_gclId_EAIaIQobChMIsci43t310wIV0oR-Ch3uRQnWEAAYASAAEgKaivD_BwE,trafficSource_adwordsClickInfo_gclId_EAIaIQobChMItKHZw8na1AIVD1p-Ch0hUwdkEAAYASAAEgIqcfD_BwE,trafficSource_adwordsClickInfo_gclId_EAIaIQobChMIuurV2oC61AIVDH9-Ch3xUgdlEAAYASAAEgJebfD_BwE,trafficSource_adwordsClickInfo_gclId_EAIaIQobChMIw5iP5ri71AIVFz2BCh0twgrgEAAYASAAEgIhDvD_BwE,trafficSource_adwordsClickInfo_adNetworkType_Google Search,trafficSource_adwordsClickInfo_isVideoAd_0.0
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
list(df_train.columns[df_train.columns.dtype == 'object'][0])

['channelGrouping',
 'date',
 'fullVisitorId',
 'visitId',
 'visitNumber',
 'visitStartTime',
 'device_browser',
 'device_operatingSystem',
 'device_isMobile',
 'device_deviceCategory',
 'geoNetwork_continent',
 'geoNetwork_subContinent',
 'geoNetwork_country',
 'geoNetwork_region',
 'geoNetwork_metro',
 'geoNetwork_city',
 'geoNetwork_networkDomain',
 'totals_hits',
 'totals_pageviews',
 'totals_transactionRevenue',
 'trafficSource_referralPath',
 'trafficSource_campaign',
 'trafficSource_source',
 'trafficSource_medium',
 'trafficSource_adwordsClickInfo_page',
 'trafficSource_adwordsClickInfo_slot',
 'trafficSource_adwordsClickInfo_gclId',
 'trafficSource_adwordsClickInfo_adNetworkType',
 'trafficSource_adwordsClickInfo_isVideoAd']

In [36]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11515 entries, 0 to 11514
Data columns (total 29 columns):
channelGrouping                                 11515 non-null category
date                                            11515 non-null datetime64[ns]
fullVisitorId                                   11515 non-null object
visitId                                         11515 non-null int64
visitNumber                                     11515 non-null int64
visitStartTime                                  11515 non-null datetime64[ns]
device_browser                                  11515 non-null category
device_operatingSystem                          11515 non-null category
device_isMobile                                 11515 non-null category
device_deviceCategory                           11515 non-null category
geoNetwork_continent                            11515 non-null category
geoNetwork_subContinent                         11515 non-null category
geoNetwork_country     