In [2]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import pandas as pd

In [3]:
data = pd.read_csv('data/cleaned_data.csv')
data = data.iloc[:, 1:]
data.head()

Unnamed: 0,acct_type,channels,delivery_method,fb_published,gts,has_analytics,has_logo,num_order,show_map,user_age,...,MXN,NZD,USD,0.0,1.0,3.0,4.0,ACH,CHECK,No payout_type listed
0,1,5,0.0,0,0.0,0,0,0,1,36,...,0,0,1,1,0,0,0,0,0,1
1,0,0,1.0,0,868.02,0,1,23,0,149,...,0,0,1,0,1,0,0,0,1,0
2,0,8,1.0,0,3500.0,0,0,19,0,214,...,0,0,1,0,1,0,0,0,1,0
3,0,6,1.0,0,1167.35,0,1,39,0,889,...,0,0,0,0,1,0,0,1,0,0
4,0,11,0.0,1,2313.15,0,0,30,0,35,...,0,0,1,1,0,0,0,0,1,0


In [4]:
y = data['acct_type']
X = data.drop('acct_type', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
gbm = GradientBoostingClassifier()
gbm.fit(X_train, y_train)

gbm_f1 = cross_val_score(gbm, X, y, scoring='f1', cv=5)
gbm_recall = cross_val_score(gbm, X, y, scoring='recall', cv=5)
gbm_precision = cross_val_score(gbm, X, y, scoring='precision', cv=5)
gbm_accuracy = cross_val_score(gbm, X, y, scoring='accuracy', cv=5)

scores = [gbm_f1, gbm_recall, gbm_precision, gbm_accuracy]

for scorer in scores:
    print(scorer.mean())

0.70334693710953
0.5575310276782469
0.9528612131092918
0.9356209643885887


### Grid Search of gradient boosted model

In [16]:
#recall as score
grid_params = {'min_samples_split': [3],
                'max_depth': [2,4,6],
                'n_estimators': [100,200],
                'max_features': [2,4,6],
                'min_samples_leaf': [2,4],
                'subsample': [0.5, 1.0]
                }

gbm = GradientBoostingClassifier()

gbm_fit = GridSearchCV(gbm, grid_params, scoring='recall')

gbm_fit.fit(X, y)

print(gbm_fit.best_params_)
print(gbm_fit.best_estimator_)
print(gbm_fit.best_score_)

{'max_depth': 6, 'max_features': 6, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 200, 'subsample': 1.0}
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=6, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=2,
              min_samples_split=3, min_weight_fraction_leaf=0.0,
              n_estimators=200, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
0.6023253604788866


In [14]:
#f1 as score
grid_params = {'min_samples_split': [2,3,4],
                'max_depth': [2,3],
                'n_estimators': [50,100,200,300],
                'max_features': [2,3,4,5],
                'min_samples_leaf': [1,2,3],
                'subsample': [0.4, 0.7, 1.0]
                }

gbm = GradientBoostingClassifier()

gbm_fit = GridSearchCV(gbm, grid_params, scoring='f1')

gbm_fit.fit(X, y)

print(gbm_fit.best_params_)
print(gbm_fit.best_estimator_)
print(gbm_fit.best_score_)

KeyboardInterrupt: 