In [30]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns', 1000)

train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

print('The train data has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
print('The test data has {} rows and {} columns'.format(test.shape[0], test.shape[1]))

## check target class
train['target'].value_counts(normalize=True)

from sklearn.model_selection import train_test_split
import xgboost as xgb

feature_names = [x for x in train.columns if x not in ['connection_id','target']]
target = train['target']

X = train[feature_names]
X_submission = test[feature_names]
y = target


np.random.seed(0)  # seed to shuffle the train set

n_folds = 10
verbose = True
shuffle = False
skf = list(StratifiedKFold(y, n_folds))
k = [0,1,2]

clfs = [ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]
print ("Creating train and test sets for blending.")

dataset_blend_train = np.zeros((X.shape[0], 3*len(clfs)))
dataset_blend_test = np.zeros((X_submission.shape[0], 3*len(clfs)))

for j, clf in enumerate(clfs):
        print (j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], 5*len(skf)))
        for i, (train0, test0) in enumerate(skf):
            print ("Fold", i)
            print(train0,test0)
            X_train = X.iloc[train0]
            y_train = y.iloc[train0]
            X_test = X.iloc[test0]
            y_test = y.iloc[test0]
            clf.fit(X_train, y_train)
            for q in k:
                y_submission = clf.predict_proba(X_test)[:, q]
                dataset_blend_train[test0, (3*j)+q] = y_submission
                dataset_blend_test_j[:, (3*i)+q] = clf.predict_proba(X_submission)[:, q]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    
print ("Blending.")
clf = LogisticRegression()
clf.fit(dataset_blend_train, y)
clf.score(dataset_blend_train,y)
# y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

The train data has 169307 rows and 43 columns
The test data has 91166 rows and 42 columns
Creating train and test sets for blending.
0 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Fold 0
[ 16743  16746  16750 ..., 169304 169305 169306] [    0     1     2 ..., 17045 17046 17047]
Fold 1
[     0      1      2 ..., 169304 169305 169306] [16743 16746 16750 ..., 34046 34047 34049]
Fold 2
[     0      1      2 ..., 169304 169305 169306] [33321 33322 33331 ..., 51046 51048 51051]
Fold 3
[     0      1      2 ..., 169304 169305 169306] [50148 50153 50156 ..., 68124 68125 68126]
Fold 4
[     0      1      2 ..., 169304 169305 169306] [66793 6

ValueError: X has 41 features per sample; expecting 15

In [34]:
newt = pd.DataFrame(dataset_blend_train)
print(newt)

              0    1         2         3     4         5         6    7   \
0       0.729047  0.0  0.270953  0.729331  0.00  0.270669  0.728850  0.0   
1       0.724915  0.0  0.275085  0.725297  0.00  0.274703  0.724357  0.0   
2       0.729047  0.0  0.270953  0.729331  0.00  0.270669  0.728850  0.0   
3       0.729047  0.0  0.270953  0.729331  0.00  0.270669  0.728850  0.0   
4       0.729047  0.0  0.270953  0.729331  0.00  0.270669  0.728850  0.0   
5       0.729047  0.0  0.270953  0.729331  0.00  0.270669  0.728850  0.0   
6       0.000000  1.0  0.000000  0.000000  1.00  0.000000  0.000000  1.0   
7       0.729047  0.0  0.270953  0.729331  0.00  0.270669  0.728850  0.0   
8       0.790000  0.0  0.210000  0.699643  0.00  0.300357  0.798333  0.0   
9       0.000000  1.0  0.000000  0.000000  1.00  0.000000  0.000000  1.0   
10      0.729047  0.0  0.270953  0.729331  0.00  0.270669  0.728850  0.0   
11      0.000000  1.0  0.000000  0.000000  1.00  0.000000  0.000000  1.0   
12      0.74

In [36]:
newtest = pd.DataFrame(dataset_blend_test_j)
print(newtest)

             0         1         2         3         4         5         6   \
0      0.020510  0.962695  0.016794  0.020288  0.963301  0.016410  0.020298   
1      0.708905  0.034664  0.256431  0.708913  0.034679  0.256408  0.710044   
2      0.708905  0.034664  0.256431  0.709164  0.034649  0.256187  0.710090   
3      0.709273  0.034622  0.256105  0.712229  0.034523  0.253248  0.709672   
4      0.708905  0.034664  0.256431  0.709164  0.034649  0.256187  0.710090   
5      0.708905  0.034664  0.256431  0.709164  0.034649  0.256187  0.710090   
6      0.705910  0.034471  0.259619  0.691644  0.034800  0.273556  0.700978   
7      0.020510  0.962695  0.016794  0.020288  0.963301  0.016410  0.020298   
8      0.708905  0.034664  0.256431  0.709164  0.034649  0.256187  0.710090   
9      0.708905  0.034664  0.256431  0.709164  0.034649  0.256187  0.710090   
10     0.709273  0.034622  0.256105  0.709164  0.034649  0.256187  0.709672   
11     0.020510  0.962695  0.016794  0.020288  0.963

In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier

pd.set_option('display.max_columns', 1000)

train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

print('The train data has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
print('The test data has {} rows and {} columns'.format(test.shape[0], test.shape[1]))

## check target class
train['target'].value_counts(normalize=True)

from sklearn.model_selection import train_test_split
import xgboost as xgb

feature_names = [x for x in train.columns if x not in ['connection_id','target']]
target = train['target']

X = train[feature_names]
X_submission = test[feature_names]
y = target


np.random.seed(0)  # seed to shuffle the train set

n_folds = 10
verbose = True
shuffle = False
skf = list(StratifiedKFold(y, n_folds))
k = [0,1,2]

clfs = [ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]
print ("Creating train and test sets for blending.")

dataset_blend_train = np.zeros((X.shape[0], 3*len(clfs)))
dataset_blend_test = np.zeros((X_submission.shape[0], 10*3*len(clfs)))

for j, clf in enumerate(clfs):
        print (j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], 3*len(skf)))
        for i, (train0, test0) in enumerate(skf):
            print ("Fold", i)
            print(train0,test0)
            X_train = X.iloc[train0]
            y_train = y.iloc[train0]
            X_test = X.iloc[test0]
            y_test = y.iloc[test0]
            clf.fit(X_train, y_train)
            for q in k:
                y_submission = clf.predict_proba(X_test)[:, q]
                dataset_blend_train[test0, (3*j)+q] = y_submission
                dataset_blend_test_j[:,(3*i)+q] = clf.predict_proba(X_submission)[:, q]
            if i==9:
                list_col = [x for x in range(30*j,(30*j)+30,1)]
                dataset_blend_test[:,list_col] = dataset_blend_test_j

import pickle 
pickle.dump(dataset_blend_train,open('dataset_blend_train.pickle','wb'))
pickle.dump(dataset_blend_test,open('dataset_blend_test.pickle','wb'))
    
print ("Blending.")
clf = XGBClassifier()
clf.fit(dataset_blend_train, y)
clf.score(X,y)
# y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

The train data has 169307 rows and 43 columns
The test data has 91166 rows and 42 columns
Creating train and test sets for blending.
0 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Fold 0
[ 16743  16746  16750 ..., 169304 169305 169306] [    0     1     2 ..., 17045 17046 17047]
Fold 1
[     0      1      2 ..., 169304 169305 169306] [16743 16746 16750 ..., 34046 34047 34049]
Fold 2
[     0      1      2 ..., 169304 169305 169306] [33321 33322 33331 ..., 51046 51048 51051]
Fold 3
[     0      1      2 ..., 169304 169305 169306] [50148 50153 50156 ..., 68124 68125 68126]
Fold 4
[     0      1      2 ..., 169304 169305 169306] [66793 6

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14'] ['cont_1', 'cont_2', 'cont_3', 'cont_4', 'cont_5', 'cont_6', 'cont_7', 'cont_8', 'cont_9', 'cont_10', 'cont_11', 'cont_12', 'cont_13', 'cont_14', 'cont_15', 'cont_16', 'cont_17', 'cont_18', 'cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_20', 'cat_21', 'cat_22', 'cat_23']
expected f11, f12, f0, f14, f6, f7, f5, f3, f1, f9, f4, f10, f13, f2, f8 in input data
training data did not have the following fields: cat_19, cont_16, cont_6, cat_12, cat_23, cont_9, cat_9, cont_10, cont_18, cat_7, cat_5, cat_13, cont_2, cont_15, cat_2, cat_17, cat_18, cat_15, cont_5, cat_3, cont_13, cont_7, cont_17, cat_21, cat_14, cat_1, cat_16, cont_1, cont_4, cat_6, cat_20, cat_11, cont_3, cat_8, cat_4, cont_8, cont_12, cont_14, cat_22, cont_11, cat_10

In [3]:
clf.score(dataset_blend_train,y)

0.78120810125984164

In [4]:
xgc = XGBClassifier()
xgc.fit(train[feature_names],target)
xgc.score(train[feature_names],target)

0.7801862888126303

In [8]:
we = pd.DataFrame(dataset_blend_test)
print(we)

            0    1         2         3    4         5         6    7    \
0      0.000000  1.0  0.000000  0.000000  1.0  0.000000  0.000000  1.0   
1      0.724915  0.0  0.275085  0.725166  0.0  0.274834  0.725521  0.0   
2      0.729047  0.0  0.270953  0.729268  0.0  0.270732  0.729061  0.0   
3      0.743590  0.0  0.256410  0.750000  0.0  0.250000  0.742857  0.0   
4      0.729047  0.0  0.270953  0.729268  0.0  0.270732  0.729061  0.0   
5      0.729047  0.0  0.270953  0.729268  0.0  0.270732  0.729061  0.0   
6      0.860000  0.0  0.140000  0.910000  0.0  0.090000  0.900000  0.0   
7      0.000000  1.0  0.000000  0.000000  1.0  0.000000  0.000000  1.0   
8      0.729047  0.0  0.270953  0.729268  0.0  0.270732  0.729061  0.0   
9      0.729047  0.0  0.270953  0.729268  0.0  0.270732  0.729061  0.0   
10     0.724593  0.0  0.275407  0.727137  0.0  0.272863  0.726629  0.0   
11     0.000000  1.0  0.000000  0.000000  1.0  0.000000  0.000000  1.0   
12     0.729047  0.0  0.270953  0.7292

In [9]:
xgc.get_params()

{'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'nthread': -1,
 'objective': 'multi:softprob',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 1}