In [360]:
%pylab inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier

from sklearn import model_selection, datasets, metrics
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction import DictVectorizer as DV

from sklearn.tree import export_graphviz
from sklearn.utils import shuffle

from sklearn.feature_selection import RFE


# Градиентный бустинг
import xgboost as xgb

Populating the interactive namespace from numpy and matplotlib


In [361]:
%matplotlib inline

In [528]:
data = pd.read_csv('./kaggle/orange_small_churn_train_data.csv', sep = ',', header = 0, index_col=0)
test_data = pd.read_csv('./kaggle/orange_small_churn_test_data.csv', sep = ',', header = 0, index_col=0)

## Подготовка данных

In [529]:
# т.к. в последней строке тренировочных данных ответа нету, то удаляем её
train_data = data.iloc[:-1,:-1]
train_labels = data.iloc[:-1,-1:]

print train_data.shape
print train_labels.shape
print test_data.shape

(18298, 230)
(18298, 1)
(10000, 230)


In [530]:
# отделяем числовые и категориальные признаки и удаляем полностью пустые признаки
data_numb = train_data.iloc[:,0:190].dropna(axis=1, how='all')
data_categ = train_data.iloc[:,190:].dropna(axis=1, how='all')

In [531]:
columns_name_numb = data_numb.columns.to_list()

In [532]:
# заменим NONE на среднее значение колонки
# Посчитаем средние по колонкам
numeric_means = data_numb.mean(axis=0, skipna=True)

# Заполним пропущенные численные значения средними
data_numb = data_numb.fillna(numeric_means, axis=0)

In [533]:
# ВЕЩЕСТВЕННЫЕ признаки
# NOTE: сделаем скелинг численных признаков с D=1 (подробнее см. неделю 1)
# NOTE2: такая запись по созданию нового DataFrame нужна из-за тоог что при fit_transform слетают индексы и потом
# при контатенации у нас получается каша.
scaler = StandardScaler()
data_numb=pd.DataFrame(scaler.fit_transform(data_numb.values), index=data_numb.index, columns=data_numb.columns)

# меняем пустые на 0 (0 это среднее так как признаки масштабированы)
# data_numb.fillna(0, inplace=True)


# КАТЕГОРИАЛЬНЫЕ признаки
# заменим пустые значения на NA (будет как доп.признак)
data_categ = data_categ.fillna('NA').applymap(lambda s: str(s))

# удалим те колонки где больше 20 категорий и меньше 2 (эти колонки не информативны)
name_del = [name for name, var in data_categ.iteritems() if var.value_counts(dropna=True).shape[0] > 20 or var.value_counts(dropna=True).shape[0] < 2]
data_categ = data_categ.drop(labels=name_del, axis=1)

In [534]:
# ВАРИАНТ 1 кодируем категориальные признаки
# data_dummies = pd.get_dummies(data_categ)


# ВАРИАНТ 2 one-hote-encoder
encoder = DV(sparse = False)
data_dummies = encoder.fit_transform(data_categ.T.to_dict().values())

# тут хранятся имена новых колонок
col_names_categ=encoder.feature_names_

In [535]:
data_dummies.shape

(18298, 101)

In [536]:
# при ВАРИАНТЕ 1
# объединяем числовые признаки и закодированные категориальные
# train_data = pd.concat([data_numb, data_dummies], axis=1)
# train_data.fillna(0, inplace=True)


# при ВАРИАНТЕ 2
# train_data = np.hstack((data_numb.values, data_dummies))

# train_data.fillna(0, inplace=True)

In [537]:
print data_numb.shape
print data_categ.shape

print train_data.shape
print train_labels.shape

(18298, 174)
(18298, 20)
(18298, 230)
(18298, 1)


## Балансировка классов

In [538]:
# 1 соответствует классу отток, -1 - классу не отток
# как видим у нас оч большая расбалансировка классов, попробуем её уменьшить
print train_labels["labels"].value_counts()/train_labels.shape[0]

-1.0    0.924746
 1.0    0.075254
Name: labels, dtype: float64


In [539]:
# Попробуем отбалансировать классы, сгенеририрем отдельно индексы для вещественных и категориальных классов
# чтобы у нас получились как будто новые непохожие элементы.

cnt_el = 15000

# получаем только класс 1
only_class_1_numb = data_numb[train_labels['labels']==1]
only_class_1_categ = pd.DataFrame(data_dummies[train_labels['labels']==1])

# сгенерировали индексы значения которых будем дублировать (добавим 15к новых элементов класса 1)
ind_class_1_numb_to_add = np.random.randint(0, only_class_1_numb.shape[0]-1, size=cnt_el)
ind_class_1_categ_to_add = np.random.randint(0, only_class_1_categ.shape[0]-1, size=cnt_el)

# сопоставляем индексы и значения
X_train_to_add_numb = only_class_1_numb.iloc[ind_class_1_numb_to_add]
X_train_to_add_categ = only_class_1_categ.iloc[ind_class_1_categ_to_add]

new_data_class_1 = np.hstack((X_train_to_add_numb.values, X_train_to_add_categ))
new_labels_class_1 = pd.DataFrame(np.ones(cnt_el))

In [540]:
print new_data_class_1.shape
print new_labels_class_1.shape
print train_labels.shape

(15000, 275)
(15000, 1)
(18298, 1)


In [541]:
# названия новых колонок
columns_name = data_numb.columns.to_list() + encoder.feature_names_

# вещественные+категориальные признаки
train_data = np.hstack((data_numb.values, data_dummies))

# + новые строки класса 1
train_data = np.vstack((train_data, new_data_class_1))

# + новые ответы для класса 1
train_labels = np.vstack((train_labels, new_labels_class_1))

In [542]:
print train_data.shape
print train_labels.shape

(33298, 275)
(33298, 1)


In [543]:
# посмотрим на баланс классов
print len(train_labels[train_labels==1])
print len(train_labels[train_labels==-1])

16377
16921


In [544]:
# перемешаем данные
new_train_data = shuffle(np.hstack((train_data, train_labels)))

In [545]:
train_labels

array([[-1.],
       [-1.],
       [-1.],
       ...,
       [ 1.],
       [ 1.],
       [ 1.]])

In [546]:
train_data = new_train_data[:,:-1]
train_labels = new_train_data[:,-1:]

In [547]:
print train_data.shape
print train_labels.shape

(33298, 275)
(33298, 1)


In [548]:
train_labels

array([[ 1.],
       [ 1.],
       [-1.],
       ...,
       [-1.],
       [ 1.],
       [-1.]])

## Поиск важных признаков

In [471]:
# попробуем обучить признаки чтобы понять какие признакие самые важные
select = RFE(GradientBoostingClassifier(n_estimators = 125, random_state=0, learning_rate=0.12, max_depth=6),
            n_features_to_select=50)

In [None]:
%%time
select.fit(train_data, train_labels)

In [521]:
select_xgb = RFE(xgb.XGBClassifier(learning_rate=0.1, max_depth=6, n_estimators=75, min_child_weight=2, seed=0),
            n_features_to_select=100)

In [522]:
%%time
select_xgb.fit(train_data, train_labels)

CPU times: user 50min 33s, sys: 14.6 s, total: 50min 47s
Wall time: 50min 52s


RFE(estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=2, missing=None, n_estimators=75,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1),
  n_features_to_select=100, step=1, verbose=0)

In [527]:
select_xgb.n_features_to_select

100

In [523]:
select_xgb.support_

array([False, False, False, False,  True,  True,  True, False, False,
       False, False,  True, False,  True, False, False, False,  True,
       False, False,  True,  True, False,  True,  True, False,  True,
       False, False,  True, False, False,  True, False, False, False,
        True, False, False, False, False, False,  True,  True, False,
       False,  True, False, False, False, False, False, False,  True,
        True, False, False, False,  True, False, False,  True,  True,
        True, False,  True, False,  True, False,  True, False,  True,
       False,  True, False, False,  True, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False,  True, False,
        True,  True,  True, False, False, False,  True, False,  True,
       False, False, False,  True,  True,  True,  True, False, False,
       False, False, False,  True,  True,  True, False, False, False,
        True, False,

In [549]:
columns_top100 = [columns_name[ind] for ind, col in enumerate(select_xgb.support_) if col]
ind_top100= [ind for ind, col in enumerate(select_xgb.support_) if col]

In [550]:
train_data = train_data[:, ind_top100]

In [551]:
train_data.shape

(33298, 100)

## Градиентный бустинг деревьев

https://dyakonov.org/2017/06/09/%D0%B3%D1%80%D0%B0%D0%B4%D0%B8%D0%B5%D0%BD%D1%82%D0%BD%D1%8B%D0%B9-%D0%B1%D1%83%D1%81%D1%82%D0%B8%D0%BD%D0%B3/

In [282]:
# max_depth - максимальная глубина
# learning_rate - насколько сильно каждое дерево будет пытаться исправить ошибки предыдущих деревьев.
parameters_grid = {
    'n_estimators' : range(85, 130, 5),
    'learning_rate' : [0.08, 0.085, 0.09, 0.095 , 0.1, 0.105, 0.11, 0.115, 0.12],
    'max_depth': range(1, 7, 1)
}

# Будем использовать метод стратификации который делит соотношение классов в обучающей выборке на равное количество
skf = model_selection.StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
classifier = GradientBoostingClassifier(random_state=0)
grid_gbc = model_selection.GridSearchCV(classifier, parameters_grid, scoring = 'roc_auc', cv = skf)

In [283]:
%%time
grid_rfc.fit(train_data, train_labels)

CPU times: user 5h 12min 59s, sys: 2.36 s, total: 5h 13min 1s
Wall time: 5h 13min 2s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [85, 90, 95, 100, 105, 110, 115, 120, 125], 'learning_rate': [0.08, 0.085, 0.09, 0.095, 0.1, 0.105, 0.11, 0.115, 0.12], 'max_depth': [1, 2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [284]:
grid_rfc.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.12, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=125,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [285]:
print grid_rfc.best_score_
print grid_rfc.best_params_

0.9573896065946136
{'n_estimators': 125, 'learning_rate': 0.12, 'max_depth': 6}


In [358]:
# вывести 5 лучших результатов
pd.DataFrame(grid_rfc.cv_results_).sort_values(by = ['mean_test_score'], ascending = [False]).head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_n_estimators,params,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
341,1.811287,0.110419,0.892488,0.902574,gini,7,15,1,120,"{u'max_features': 15, u'n_estimators': 120, u'...",...,0.889195,0.90524,0.891336,0.901799,0.891492,0.901468,0.234767,0.002839,0.002382,0.00141
340,2.25752,0.159539,0.892409,0.902353,gini,7,15,1,110,"{u'max_features': 15, u'n_estimators': 110, u'...",...,0.889159,0.905136,0.891037,0.901514,0.891418,0.901296,0.633057,0.042878,0.002425,0.00143
338,1.360242,0.158142,0.892244,0.902092,gini,7,15,1,90,"{u'max_features': 15, u'n_estimators': 90, u'c...",...,0.888494,0.904592,0.890898,0.901091,0.891957,0.901484,0.066677,0.002878,0.002472,0.001298
347,1.722359,0.115429,0.892205,0.902219,gini,7,15,2,120,"{u'max_features': 15, u'n_estimators': 120, u'...",...,0.890335,0.905915,0.891106,0.901407,0.891459,0.901505,0.10717,0.005763,0.001753,0.001951
339,1.300365,0.151131,0.892064,0.90207,gini,7,15,1,100,"{u'max_features': 15, u'n_estimators': 100, u'...",...,0.888496,0.904749,0.890673,0.901019,0.891347,0.901177,0.017586,0.014723,0.002504,0.001384


-----

In [552]:
# строим модель с оптимальными параметрами которые удалось подобрать
clf = GradientBoostingClassifier(n_estimators = 125, learning_rate=0.12, max_depth=6, loss = 'exponential')
clf.fit(train_data, train_labels)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.12, loss='exponential', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=125,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [553]:
# проверяем метрики на тренировочном наборе
actual_labels = clf.predict(train_data)
# actual_labels_proba = clf.predict_proba(train_data)

In [554]:
print "AUC_ROC =", metrics.roc_auc_score(train_labels, actual_labels)
print "accuracy =", clf.score(train_data, train_labels)
print "precision =", metrics.precision_score(train_labels, actual_labels)
print "recall =", metrics.recall_score(train_labels, actual_labels)
print "f1 =", metrics.f1_score(train_labels, actual_labels)

AUC_ROC = 0.9219837898688904
accuracy = 0.9222175506036399
precision = 0.932384118421878
recall = 0.9076753984246199
f1 = 0.9198638613861386


In [555]:
# AUC_ROC = 0.5021491001564492
# accuracy = 0.9250191277735271
# precision = 0.8571428571428571
# recall = 0.004357298474945534

In [556]:
# важность признаков
clf.feature_importances_

array([4.54284531e-05, 1.21862668e-02, 5.07090427e-04, 2.20298645e-02,
       1.20098555e-03, 5.88355047e-03, 1.92129214e-03, 3.45027991e-03,
       0.00000000e+00, 2.15564017e-02, 9.95565737e-04, 1.27381165e-03,
       1.05834310e-02, 1.18279009e-03, 4.03879505e-03, 1.60025400e-03,
       2.47491740e-02, 4.06139517e-04, 3.26227311e-03, 5.73597924e-04,
       4.29518114e-03, 1.45100370e-01, 2.04494731e-02, 7.03916762e-03,
       1.67933163e-03, 2.26691970e-02, 6.18175506e-03, 2.36787060e-03,
       1.62720015e-03, 1.01380409e-02, 4.09130512e-04, 4.08880464e-03,
       2.65768091e-03, 4.65833081e-03, 2.97450141e-02, 6.27821101e-05,
       9.93185935e-03, 5.49567259e-03, 3.80804477e-04, 1.32507280e-02,
       2.02990061e-01, 1.75778724e-03, 1.32369889e-02, 7.44742448e-03,
       6.07777986e-04, 8.85020208e-03, 4.22874211e-04, 3.23222214e-03,
       9.65514393e-03, 1.52618183e-02, 7.76517098e-03, 8.17391701e-03,
       6.07350642e-05, 1.55055269e-03, 3.96114116e-04, 5.59073274e-04,
      

## Тестовый набор

In [557]:
columns_top100

['Var5',
 'Var6',
 'Var7',
 'Var13',
 'Var16',
 'Var21',
 'Var24',
 'Var25',
 'Var27',
 'Var28',
 'Var30',
 'Var35',
 'Var38',
 'Var44',
 'Var51',
 'Var53',
 'Var57',
 'Var64',
 'Var65',
 'Var69',
 'Var72',
 'Var73',
 'Var74',
 'Var76',
 'Var78',
 'Var81',
 'Var83',
 'Var85',
 'Var88',
 'Var94',
 'Var106',
 'Var109',
 'Var111',
 'Var112',
 'Var113',
 'Var117',
 'Var119',
 'Var123',
 'Var124',
 'Var125',
 'Var126',
 'Var132',
 'Var133',
 'Var134',
 'Var138',
 'Var140',
 'Var143',
 'Var144',
 'Var149',
 'Var153',
 'Var160',
 'Var163',
 'Var173',
 'Var177',
 'Var180',
 'Var181',
 'Var188',
 'Var189',
 'Var191=NA',
 'Var194=NA',
 'Var196=z3mO',
 'Var203=9_Y1',
 'Var203=HLqf',
 'Var203=NA',
 'Var205=VpdQ',
 'Var205=sJzTlal',
 'Var207=5iay',
 'Var207=7M47J5GA0pTYIFxg5uy',
 'Var207=DHn_WUyBhW_whjA88g9bvA64_',
 'Var207=me75fM6ugJ',
 'Var210=3av_',
 'Var210=g5HH',
 'Var210=oT7d',
 'Var210=uKAI',
 'Var211=L84s',
 'Var213=KdSa',
 'Var215=NA',
 'Var218=NA',
 'Var218=UYBR',
 'Var218=cJvF',
 'Var219

In [559]:
# получаем нужные признаки из обучения
name_col_numb = columns_top100[:58]
name_col_categ = columns_top100[58:]

# Предобрабатываем тестовый набор
print test_data.shape

# отделяем числовые и категориальные признаки и удаляем полностью пустые признаки
# data_numb_test = test_data.iloc[:,0:190].dropna(axis=1, how='all')
# data_numb_test = test_data[name_col_numb]
data_numb_test = test_data[data_numb.columns]
data_categ_test = test_data.iloc[:,190:]

# data_dummies = pd.get_dummies(data_categ)


(10000, 230)


In [560]:
data_numb_test.shape

(10000, 174)

In [561]:
# ВЕЩЕСТВЕННЫЕ признаки
numeric_means_test = data_numb_test.mean(axis=0, skipna=True)
# Заполним пропущенные численные значения средними
data_numb_test = data_numb_test.fillna(numeric_means_test, axis=0)

data_numb_test =pd.DataFrame(scaler.transform(data_numb_test.values), index=data_numb_test.index, columns=data_numb_test.columns)

# data_numb_test.fillna(0, inplace=True)

In [562]:
# оставляем только важные признаки
data_numb_test = data_numb_test[name_col_numb]

In [563]:
# КАТЕГОРИАЛЬНЫЕ ПРИЗНАКИ
# берем только колонки используемые в обучении
# data_categ_test = data_categ_test[data_categ.columns]
data_categ_test = data_categ_test.fillna('NA').applymap(lambda s: str(s))


# удалим те колонки где больше 20 категорий и меньше 2 (эти колонки не информативны)
# name_del = [name for name, var in data_categ_test.iteritems() if var.value_counts(dropna=True).shape[0] > 20 or var.value_counts(dropna=True).shape[0] < 2]
# data_categ_test = data_categ_test.drop(labels=name_del, axis=1)

In [564]:
print data_numb_test.shape
print data_categ_test.shape

(10000, 58)
(10000, 40)


In [565]:
# кодируем категориальные признаки
# data_dummies_test = pd.get_dummies(data_categ_test)

data_dummies_test = encoder.transform(data_categ_test.T.to_dict().values())
# data_dummies_test = np.where(data_dummies_test == np.nan, data_dummies_test, 0)

In [566]:
data_dummies_test

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.]])

In [567]:
list_ind = [ind for ind, el in enumerate(encoder.feature_names_) if el in name_col_categ]

In [568]:
# берём только колонки которые вошли в топ 100
data_dummies_test = data_dummies_test[:, list_ind]

In [569]:
print data_numb_test.shape
print data_dummies_test.shape

(10000, 58)
(10000, 42)


In [570]:
# объединяем числовые признаки и закодированные категориальные
test_data = np.hstack((data_numb_test.values, data_dummies_test))

In [571]:
print test_data.shape

(10000, 100)


In [572]:
data_dummies_test[:1]

array([[1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 1., 0., 0., 0., 0., 0., 1., 0.]])

In [573]:
test_labels = clf.predict(test_data)
test_labels_proba = clf.predict_proba(test_data)

## Сохраняем результат для Kaggle

In [574]:
df = pd.DataFrame(test_labels_proba[:,1], columns=['result'])  
df.index.name = 'ID'
# df['index'] = df.index
# df.astype({"ID": int, "result": float})

In [575]:
# np.savetxt('output.csv', df, delimiter=',', fmt='%f', header='ID, result')
df.to_csv('output_gb.csv', index=True)

## Случайный лес (не использовался, пропускать)

In [329]:
from sklearn.ensemble import RandomForestClassifier

# max_depth - максимальная глубина
# learning_rate - насколько сильно каждое дерево будет пытаться исправить ошибки предыдущих деревьев.
# min_samples_leaf - Ограничение на число объектов в листьях
parameters_grid = {
    'n_estimators' : range(70, 130, 10),
    'max_depth': range(2, 8, 1),
    'max_features': ['sqrt', 'log2', 10, 15, 20],
    'min_samples_leaf': [1, 2],
    'criterion': ['gini', 'entropy']
}
# n_jobs=-1: строить на максимально возможном числе процессоров
# Будем использовать метод стратификации который делит соотношение классов в обучающей выборке на равное количество
skf = model_selection.StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
classifier = RandomForestClassifier(random_state=0, class_weight = 'balanced', n_jobs=-1)
grid_rfc = model_selection.GridSearchCV(classifier, parameters_grid, scoring = 'roc_auc', cv = skf)

In [330]:
%%time
grid_rfc.fit(train_data, train_labels)

  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


CPU times: user 53min 6s, sys: 2min 21s, total: 55min 27s
Wall time: 1h 41min 2s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
       error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators='warn', n_jobs=-1, oob_score=False,
            random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [70, 80, 90, 100, 110, 120], 'max_features': ['sqrt', 'log2', 10, 15, 20], 'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5, 6, 7], 'min_samples_leaf': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [331]:
grid_rfc.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=7, max_features=15,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=120, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [332]:
print grid_rfc.best_score_
print grid_rfc.best_params_

0.8924879337816848
{'max_features': 15, 'n_estimators': 120, 'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 1}


In [351]:
# вывести 5 лучших результатов
pd.DataFrame(grid_rfc.cv_results_).sort_values(by = ['mean_test_score'], ascending = [False]).head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_n_estimators,params,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
341,1.811287,0.110419,0.892488,0.902574,gini,7,15,1,120,"{u'max_features': 15, u'n_estimators': 120, u'...",...,0.889195,0.90524,0.891336,0.901799,0.891492,0.901468,0.234767,0.002839,0.002382,0.00141
340,2.25752,0.159539,0.892409,0.902353,gini,7,15,1,110,"{u'max_features': 15, u'n_estimators': 110, u'...",...,0.889159,0.905136,0.891037,0.901514,0.891418,0.901296,0.633057,0.042878,0.002425,0.00143
338,1.360242,0.158142,0.892244,0.902092,gini,7,15,1,90,"{u'max_features': 15, u'n_estimators': 90, u'c...",...,0.888494,0.904592,0.890898,0.901091,0.891957,0.901484,0.066677,0.002878,0.002472,0.001298
347,1.722359,0.115429,0.892205,0.902219,gini,7,15,2,120,"{u'max_features': 15, u'n_estimators': 120, u'...",...,0.890335,0.905915,0.891106,0.901407,0.891459,0.901505,0.10717,0.005763,0.001753,0.001951
339,1.300365,0.151131,0.892064,0.90207,gini,7,15,1,100,"{u'max_features': 15, u'n_estimators': 100, u'...",...,0.888496,0.904749,0.890673,0.901019,0.891347,0.901177,0.017586,0.014723,0.002504,0.001384


In [352]:
# строим модель с оптимальными параметрами которые удалось подобрать
clf2 = RandomForestClassifier(max_features= 15, 
                              n_estimators= 90, 
                              criterion= 'gini', 
                              max_depth= 7, 
                              min_samples_leaf= 1, 
                              class_weight = 'balanced', 
                              n_jobs=-1)

clf2.fit(train_data, train_labels)

  # Remove the CWD from sys.path while we load stuff.


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=7, max_features=15,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=90, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [353]:
# проверяем метрики на тренировочном наборе
actual_labels2 = clf2.predict(train_data)

In [354]:
print "AUC_ROC =", metrics.roc_auc_score(train_labels, actual_labels2)
print "accuracy =", clf2.score(train_data, train_labels)
print "precision =", metrics.precision_score(train_labels, actual_labels2)
print "recall =", metrics.recall_score(train_labels, actual_labels2)
print "f1 =", metrics.f1_score(train_labels, actual_labels2)

AUC_ROC = 0.8080813007103829
accuracy = 0.8082767733797825
precision = 0.8106696511844805
recall = 0.7961165048543689
f1 = 0.8033271719038817


In [355]:
# важность признаков
clf2.feature_importances_

array([1.22415677e-02, 1.76032893e-03, 6.33748705e-04, 3.27039865e-04,
       4.33493921e-02, 2.14904315e-03, 9.48536278e-03, 1.75662062e-04,
       8.75451571e-04, 6.98228478e-03, 2.13701875e-01, 4.38740465e-02,
       7.75976167e-03, 1.53237631e-02, 3.98628724e-03, 3.29728433e-03,
       1.85406399e-03, 2.20321031e-02, 2.04275010e-02, 2.65876504e-01,
       3.77895374e-03, 7.89218908e-03, 9.68395427e-04, 2.97005635e-02,
       2.29238042e-03, 1.65723673e-03, 6.90042658e-05, 9.72867813e-03,
       8.96927545e-03, 1.30630978e-03, 4.69150155e-03, 4.91811871e-03,
       5.44928196e-05, 4.82206898e-03, 3.98901665e-02, 3.18548579e-02,
       4.15367198e-03, 3.07792343e-04, 3.09219303e-04, 1.33696018e-02,
       6.23673508e-03, 1.62859069e-05, 9.11715459e-04, 1.96676299e-04,
       7.88442874e-02, 4.92098494e-03, 7.31778818e-03, 3.11854800e-04,
       0.00000000e+00, 5.43961224e-02])

In [356]:
# ТЕСТОВЫЙ НАБОР
test_labels2 = clf2.predict(test_data)
test_labels_proba2 = clf2.predict_proba(test_data)

In [357]:
df = pd.DataFrame(test_labels_proba2[:,1], columns=['result'])  
df.index.name = 'ID'
df.to_csv('output.csv', index=True)

## xgboost
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [579]:
%%time
xgb_scoring = []
n_trees = [70, 75, 80, 85, 90]
list_max_depth = [4, 5, 6]
# list_n_child = [1, 2, 3]
list_learning_rate = [0.09, 0.1, 0.11]
for n_tree in n_trees:
    for n_depth in list_max_depth:
        for learn_rate in list_learning_rate:
            estimator = xgb.XGBClassifier(learning_rate=learn_rate, max_depth=n_depth, n_estimators=n_tree, min_child_weight=2, n_jobs=2, seed=0)
            score = model_selection.cross_val_score(estimator, train_data, train_labels, 
                                                     scoring = 'roc_auc', cv = 5)    
            xgb_scoring.append(score)

            print n_tree, n_depth, learn_rate, np.array(score).mean()
    
xgb_scoring = np.asmatrix(xgb_scoring)

SyntaxError: invalid syntax (<unknown>, line 8)

In [477]:
# NOTE: если использовать параметры больше, то всё переобучается
xgb_scoring.mean(axis=1).max()

0.9308558058712322

In [576]:
clf3 = xgb.XGBClassifier(learning_rate=0.1, max_depth=6, n_estimators=75, min_child_weight=2, n_jobs=2, seed=0)
clf3.fit(train_data, train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=2, missing=None, n_estimators=75,
       n_jobs=2, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1)

In [577]:
# проверяем метрики на тренировочном наборе
actual_labels3 = clf3.predict(train_data)

In [578]:
print "AUC_ROC =", metrics.roc_auc_score(train_labels, actual_labels3)
# print "accuracy =", clf3.score(train_data, train_labels)
print "precision =", metrics.precision_score(train_labels, actual_labels3)
print "recall =", metrics.recall_score(train_labels, actual_labels3)
print "f1 =", metrics.f1_score(train_labels, actual_labels3)

AUC_ROC = 0.866813160245906
precision = 0.8786189420335762
recall = 0.8468583989741711
f1 = 0.8624463652757913


In [438]:
# test_labels = clf.predict(test_data)
test_labels_proba3 = clf3.predict_proba(test_data)

In [439]:
df = pd.DataFrame(test_labels_proba3[:,1], columns=['result'])  
df.index.name = 'ID'
df.to_csv('output.csv', index=True)

In [440]:
clf3.classes_

array([-1.,  1.])