In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [3]:
train = pd.read_csv("../data/tables/agg_transaction_train_sdf.csv")
test = pd.read_csv("../data/tables/agg_transaction_pred_sdf.csv")

In [5]:
X_train = train.iloc[:,2:-1].astype(float)
y_train = train.iloc[:,-1].astype(float)
X_test = test.iloc[:,2:].astype(float)

In [8]:
X_train = X_train.fillna(0)
y_train = y_train.fillna(0)
X_test = X_test.fillna(0)

In [9]:
X_test

Unnamed: 0,distinct_monthly_consumer_count,monthly_order_count,monthly_revenue,dollar_value_standard_dev,dollar_per_order,median_dollar_value
0,2943.0,4968.0,327429.385720,65.839486,65.907686,45.969011
1,3.0,4.0,16538.731756,2543.507988,4134.682939,2883.341620
2,38.0,58.0,55589.869552,637.118388,958.446027,861.687281
3,8.0,12.0,105269.442537,7913.006782,8772.453545,3896.208341
4,67.0,99.0,771700.751401,5544.280890,7794.957085,5929.815410
...,...,...,...,...,...,...
388,15.0,24.0,65846.563525,2030.061109,2743.606814,1759.919108
389,390.0,620.0,889802.323859,748.648506,1435.165038,1437.415746
390,3.0,6.0,83590.948032,5618.316112,13931.824672,16752.205962
391,62.0,102.0,898497.851369,6310.198484,8808.802464,6671.890638


In [31]:
X_test.describe()

Unnamed: 0,distinct_monthly_consumer_count,monthly_order_count,monthly_revenue,dollar_value_standard_dev,dollar_per_order
count,393.0,393.0,393.0,393.0,393.0
mean,743.643766,1727.201018,604100.3,2319.011587,4018.232898
std,2466.772975,8731.182781,1247298.0,2995.827738,5848.576287
min,1.0,1.0,9288.467,0.0,20.361114
25%,14.0,22.0,48665.03,321.028255,633.389481
50%,60.0,98.0,178054.1,1246.689338,1918.93399
75%,342.0,525.0,819906.3,2292.172393,4187.301283
max,23473.0,136179.0,10192090.0,15942.146001,56552.394443


In [10]:
X_test.isnull().count()

distinct_monthly_consumer_count    393
monthly_order_count                393
monthly_revenue                    393
dollar_value_standard_dev          393
dollar_per_order                   393
median_dollar_value                393
dtype: int64

In [11]:
rf_param_group = {
    "n_estimators": (50,60),
    "max_depth": range(10, 80, 10),
#    "min_samples_leaf": range(1, 10, 1),
#    "min_samples_split": range(2, 10, 1),
    "max_features": (0.5,0.75),
    "max_samples" :(0.5,0.75),
    "ccp_alpha" :(0,0.01,0.001,1,10)
}

rf_estimator = RandomForestRegressor()
grid_search = GridSearchCV(estimator=rf_estimator, param_grid=rf_param_group, cv=5, n_jobs=-1)

In [12]:
grid_search.fit(X_train, y_train)
print("Best params: ", grid_search.best_params_)

Best params:  {'ccp_alpha': 10, 'max_depth': 70, 'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 50}


In [13]:
clf_rf = RandomForestRegressor(n_estimators=50, 
                                max_depth=70, max_features=0.5,
                                ccp_alpha = 10, max_samples = 0.5).fit(X_train, y_train)

In [14]:
from sklearn import metrics

rf_train_predict = clf_rf.predict(X_train)
print('Mean Absolute Error For Training Set:{:.3f}'.format(metrics.mean_absolute_error(y_train, rf_train_predict)))
print('Mean Squared Error For Training Set:{:.3f}'.format(metrics.mean_squared_error(y_train, rf_train_predict)))
print('Root Mean Squared Error For Training Set:{:.3f}'.format(np.sqrt(metrics.mean_squared_error(y_train, rf_train_predict))))

Mean Absolute Error For Training Set:1.545
Mean Squared Error For Training Set:3.177
Root Mean Squared Error For Training Set:1.782
