In [1]:
import os
import time
import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, ShuffleSplit, RandomizedSearchCV
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import linear_model
from sklearn import svm
from sklearn.decomposition import PCA
from xgboost.sklearn import XGBRegressor

In [3]:
def rmsle(h, y):
    return np.sqrt(np.square(np.log(h + 1) - np.log(y + 1)).mean())
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False)

In [4]:
df_train = pd.read_csv("../data/train_additional.csv")
df_test = pd.read_csv("../data/test_additional.csv")

In [5]:
features = list(set(df_train.columns.tolist()) - set(['formation_energy_ev_natom',
                                                'bandgap_energy_ev', 
                                                'id',
                                                'lattice_angle_alpha_degree_r',
                                                'lattice_angle_beta_degree_r',
                                                'lattice_angle_alpha_degree_r']))
targets = ['formation_energy_ev_natom', 'bandgap_energy_ev']

y_e = df_train['formation_energy_ev_natom']
y_be = df_train['bandgap_energy_ev']

X = df_train[features]

In [6]:
dist_features = [x for x in df_train.columns.tolist() if x.startswith('dist')]
path_features = [x for x in df_train.columns.tolist() if x.startswith('path')]
angle_features = [x for x in df_train.columns.tolist() if x.startswith('angle')]
dihedral_features = [x for x in df_train.columns.tolist() if x.startswith('dihedral')]
percent_features = [x for x in df_train.columns.tolist() if x.startswith('percent')]
avg_features = [x for x in df_train.columns.tolist() if x.startswith('avg')]
lattice_angle_features = [x for x in df_train.columns.tolist() if x.startswith('lattice_angle')]

In [7]:
fe_be=['avg_HOMO',
 'avg_mass',
 'avg_IP',
 'percent_atom_in',
 'percent_atom_al',
 'percent_atom_ga',
 'dist_16',
 'dist_17',
 'dist_21',
 'dist_11',
 'dist_15',
 'dist_42',
 'dist_18',
 'dist_19',
 'path_20',
 'path_0',
 'path_40',
 'path_30',
 'path_49',
 'path_10',
 'angle_84',
 'angle_102',
 'angle_95',
 'angle_123',
 'angle_118',
 'angle_126',
 'angle_78',
 'angle_97',
 'angle_86',
 'angle_101',
 'angle_94',
 'angle_169',
 'angle_179',
 'dihedral_39',
 'dihedral_130',
 'dihedral_48',
 'dihedral_118',
 'dihedral_49',
 'dihedral_40',
 'dihedral_129',
 'dihedral_89',
 'dihedral_76',
 'dihedral_99',
 'dihedral_79',
 'dihedral_29',
 'dihedral_2',
 'dihedral_132',
 'dihedral_17',
 'dihedral_135',
 'dihedral_107',
 'dihedral_114',
 'A_8',
 'A_0',
 'A_2',
 'A_4',
 'A_1',
 'A_5',
 'A_7',
 'B_0',
 'B_8',
 'B_4',
 'B_2',
 'B_1',
 'B_3',
 'R_2',
 'R_1',
 'R_0',
 'r_2',
 'r_0',
 'r_1']

In [8]:
# features_be = ['lattice_vector_1_ang',
#                'lattice_vector_2_ang',
#                'lattice_vector_3_ang']
# X_be = df_train[features_be]

In [9]:
# features_be = ['lattice_vector_1_ang',
#                'lattice_vector_2_ang',
#                'lattice_vector_3_ang']
# X_be = df_train[features_be]

# X = StandardScaler().fit_transform(X_be)
# db = DBSCAN(eps=0.14, min_samples=10).fit(X)
# core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True
# labels = db.labels_

# # Number of clusters in labels, ignoring noise if present.
# n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# labels = db.labels_
# print('Estimated number of clusters: %d' % n_clusters_)
# print("Silhouette Coefficient: %0.3f"
#       % metrics.silhouette_score(X, labels))

In [10]:
df_train = pd.read_csv("../data/train_additional.csv")
df_test = pd.read_csv("../data/test_additional.csv")

features_be = ['lattice_vector_1_ang',
               'lattice_vector_2_ang',
               'lattice_vector_3_ang']
X_be = df_train[features_be]
X_test_be = df_test[features_be]
frames = [X_be, X_test_be]
result = pd.concat(frames)

result_s = StandardScaler().fit_transform(result)
db = DBSCAN(eps=0.14, min_samples=10).fit(result_s)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels_both = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels_both)) - (1 if -1 in labels_both else 0)
labels_both = db.labels_
print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(result, labels_both))

train_labels = labels_both[:2400]
test_labels = labels_both[2400:]

Estimated number of clusters: 11
Silhouette Coefficient: 0.674


In [11]:
vector = np.vstack((df_train[['lattice_vector_1_ang', 'lattice_vector_2_ang','lattice_vector_3_ang']].values,
                    df_test[['lattice_vector_1_ang', 'lattice_vector_2_ang','lattice_vector_3_ang']].values))

pca = PCA().fit(vector)
df_train['xtrain_1'] = pca.transform(df_train[['lattice_vector_1_ang', 'lattice_vector_2_ang','lattice_vector_3_ang']])[:, 0]
df_test['xtrain_1'] = pca.transform(df_test[['lattice_vector_1_ang', 'lattice_vector_2_ang','lattice_vector_3_ang']])[:, 0]

df_train['ytrain_1'] = pca.transform(df_train[['lattice_vector_1_ang', 'lattice_vector_2_ang','lattice_vector_3_ang']])[:, 1]
df_test['ytrain_1'] = pca.transform(df_test[['lattice_vector_1_ang', 'lattice_vector_2_ang','lattice_vector_3_ang']])[:, 1]

df_train['ztrain_1'] = pca.transform(df_train[['lattice_vector_1_ang', 'lattice_vector_2_ang','lattice_vector_3_ang']])[:, 2]
df_test['ztrain_1'] = pca.transform(df_test[['lattice_vector_1_ang', 'lattice_vector_2_ang','lattice_vector_3_ang']])[:, 2]


In [12]:
df_test['t_x']= 0.5*(np.divide((df_test['percent_atom_al']+2*df_test['percent_atom_in']),(df_test['percent_atom_al']+df_test['percent_atom_in']+2*df_test['percent_atom_ga'])))
df_test['t_y']= 0.866*(np.divide((df_test['percent_atom_al']),(df_test['percent_atom_al']+df_test['percent_atom_in']+2*df_test['percent_atom_ga'])))

In [13]:
df_train['t_x']= 0.5*(np.divide((df_train['percent_atom_al']+2*df_train['percent_atom_in']),(df_train['percent_atom_al']+df_train['percent_atom_in']+2*df_train['percent_atom_ga'])))
df_train['t_y']= 0.866*(np.divide((df_train['percent_atom_al']),(df_train['percent_atom_al']+df_train['percent_atom_in']+2*df_train['percent_atom_ga'])))

In [17]:
label_dist = [306, 340, 247, 326, 490, 84, 48, 126, 274, 146, 12]

In [14]:
# when labels are in [0,1] we can also use reg:logistic
# Create a parameter grid to search for best parameters for everything in the pipeline
t = time.time()

param_grid = {
                #'min_child_weight': [10, 11], # due to high class imbalance
                'booster':['gbtree'],
                'objective': ['reg:linear'],
                'max_depth': [1,2,3,4],
                'nthread': [12],
                #'max_delta_step': [0.1,0.5,1],
                'subsample': np.linspace(0.8,0.99, 100),
                'learning_rate': np.linspace(0.01,0.2, 100), #0.025
                'gamma': np.linspace(0.01,0.2, 100),
                'n_estimators' : [150],
                #'tree_method': ['exact'],
                'silent': [1],
                'missing':[-999],
                'n_jobs': [12],
                #'updater': 'grow_gpu',
              }

# Normalized Gini Scorer
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False)

# Initialize Grid Search Model
model = RandomizedSearchCV(estimator  = XGBRegressor(),
                           param_distributions = param_grid,
                           n_iter = 50,
                           #param_grid = param_grid,
                           scoring    = rmsle_scorer,
                           verbose    = 2,
                           n_jobs     = 12,
                           iid        = True,
                           refit      = True,
                           cv         = ShuffleSplit(n_splits=5, test_size=.2))
# Fit Grid Search Model
# X_gbx = df_train[['lattice_angle_alpha_degree',
#                  'lattice_angle_gamma_degree',
#                  'lattice_angle_beta_degree',
#                  'vol',
#                  'atomic_density',
#                  'lattice_vector_1_ang',
#                  'lattice_vector_2_ang',
#                  'lattice_vector_3_ang',
#                  't_x',
#                  't_y',
#                  'xtrain_1',
#                  'ytrain_1',
#                  'ztrain_1']
#                  +avg_features
#                  +percent_features
#                  +dist_features
#                  +angle_features
#                  +dihedral_features
#                  +path_features].values
features = ['vol',
            't_x',
            't_y',
            'xtrain_1',
            'ytrain_1',
            'ztrain_1',
            'lattice_vector_2_ang',
            'lattice_vector_3_ang',
            'lattice_vector_1_ang']+fe_be


X_gbx = df_train[features].values
y_gbx = np.log(y_be.values+1)


model.fit(X_gbx[train_labels==10], y_gbx[train_labels==10])
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Get best model
best_model = model.best_estimator_

# Fit model with best parameters optimized for normalized_gini
#best_model.fit(train,labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] subsample=0.849898989899, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0483838383838, gamma=0.0522222222222, booster=gbtree 
[CV] subsample=0.849898989899, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0483838383838, gamma=0.0522222222222, booster=gbtree 
[CV] subsample=0.849898989899, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0483838383838, gamma=0.0522222222222, booster=gbtree 
[CV] subsample=0.849898989899, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0483838383838, gamma=0.0522222222222, booster=gbtree 
[CV] subsample=0.849898989899, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, lea

[CV]  subsample=0.819191919192, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.173131313131, gamma=0.171212121212, booster=gbtree, total=  53.5s
[CV] subsample=0.922828282828, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.130909090909, gamma=0.0483838383838, booster=gbtree 
[CV]  subsample=0.970808080808, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.107878787879, gamma=0.115555555556, booster=gbtree, total= 1.5min
[CV] subsample=0.922828282828, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.130909090909, gamma=0.0483838383838, booster=gbtree 
[CV]  subsample=0.970808080808, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.107878787879, gamma=0.11

[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  2.0min


[CV]  subsample=0.819191919192, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.019595959596, gamma=0.117474747475, booster=gbtree, total= 1.1min
[CV] subsample=0.922828282828, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.130909090909, gamma=0.0483838383838, booster=gbtree 
[CV]  subsample=0.819191919192, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.019595959596, gamma=0.117474747475, booster=gbtree, total= 1.1min
[CV] subsample=0.924747474747, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.0925252525253, gamma=0.188484848485, booster=gbtree 
[CV]  subsample=0.970808080808, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.107878787879, gamma=0.11

[CV]  subsample=0.924747474747, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.0925252525253, gamma=0.188484848485, booster=gbtree, total= 1.5min
[CV] subsample=0.865252525253, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.019595959596, gamma=0.184646464646, booster=gbtree 
[CV]  subsample=0.924747474747, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.0925252525253, gamma=0.188484848485, booster=gbtree, total= 1.5min
[CV] subsample=0.897878787879, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.0752525252525, gamma=0.125151515152, booster=gbtree 
[CV]  subsample=0.924747474747, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.0925252525253, gamma=0.

[CV]  subsample=0.976565656566, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.0311111111111, gamma=0.107878787879, booster=gbtree, total=  54.6s
[CV] subsample=0.945858585859, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.0867676767677, gamma=0.0368686868687, booster=gbtree 
[CV]  subsample=0.976565656566, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.0311111111111, gamma=0.107878787879, booster=gbtree, total=  51.6s
[CV] subsample=0.907474747475, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0119191919192, gamma=0.0810101010101, booster=gbtree 
[CV]  subsample=0.976565656566, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.0311111111111, gamma

[CV]  subsample=0.974646464646, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.10595959596, gamma=0.100202020202, booster=gbtree, total= 1.2min
[CV] subsample=0.899797979798, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0810101010101, gamma=0.196161616162, booster=gbtree 
[CV]  subsample=0.974646464646, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.10595959596, gamma=0.100202020202, booster=gbtree, total= 1.2min
[CV] subsample=0.857575757576, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.161616161616, gamma=0.119393939394, booster=gbtree 
[CV]  subsample=0.974646464646, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.10595959596, gamma=0.100202

[CV]  subsample=0.857575757576, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.161616161616, gamma=0.119393939394, booster=gbtree, total=  53.3s
[CV] subsample=0.872929292929, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.0522222222222, gamma=0.0426262626263, booster=gbtree 
[CV]  subsample=0.857575757576, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.161616161616, gamma=0.119393939394, booster=gbtree, total=  53.0s
[CV] subsample=0.928585858586, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.10404040404, gamma=0.140505050505, booster=gbtree 
[CV]  subsample=0.857575757576, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.161616161616, gamma=0.119

[CV]  subsample=0.955454545455, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.180808080808, gamma=0.173131313131, booster=gbtree, total=  34.0s
[CV] subsample=0.955454545455, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.180808080808, gamma=0.173131313131, booster=gbtree 
[CV]  subsample=0.874848484848, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.138585858586, gamma=0.0272727272727, booster=gbtree, total=  54.4s
[CV] subsample=0.836464646465, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.140505050505, gamma=0.0330303030303, booster=gbtree 
[CV]  subsample=0.874848484848, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.138585858586, gamma=0.02

[CV]  subsample=0.945858585859, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.127070707071, gamma=0.134747474747, booster=gbtree, total= 1.4min
[CV] subsample=0.907474747475, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0886868686869, gamma=0.059898989899, booster=gbtree 


[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed: 12.2min


[CV]  subsample=0.907474747475, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0886868686869, gamma=0.059898989899, booster=gbtree, total=  33.8s
[CV] subsample=0.915151515152, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.132828282828, gamma=0.134747474747, booster=gbtree 
[CV]  subsample=0.945858585859, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.127070707071, gamma=0.134747474747, booster=gbtree, total= 1.4min
[CV] subsample=0.915151515152, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.132828282828, gamma=0.134747474747, booster=gbtree 
[CV]  subsample=0.945858585859, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.127070707071, gamma=0.134

[CV]  subsample=0.915151515152, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.132828282828, gamma=0.134747474747, booster=gbtree, total=  52.7s
[CV] subsample=0.888282828283, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.0829292929293, gamma=0.0963636363636, booster=gbtree 
[CV]  subsample=0.936262626263, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.127070707071, gamma=0.15202020202, booster=gbtree, total=  33.5s
[CV] subsample=0.888282828283, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.0829292929293, gamma=0.0963636363636, booster=gbtree 
[CV]  subsample=0.846060606061, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.0407070707071, gamma=0.

[CV] subsample=0.805757575758, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.01, gamma=0.184646464646, booster=gbtree 
[CV]  subsample=0.949696969697, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.157777777778, gamma=0.0963636363636, booster=gbtree, total=  51.9s
[CV] subsample=0.805757575758, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.01, gamma=0.184646464646, booster=gbtree 
[CV]  subsample=0.842222222222, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.0675757575758, gamma=0.01, booster=gbtree, total=  52.4s
[CV] subsample=0.805757575758, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.01, gamma=0.184646464646, booster=gbtree 
[CV]  subsamp

[CV]  subsample=0.861414141414, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.150101010101, gamma=0.0522222222222, booster=gbtree, total=  33.4s
[CV] subsample=0.888282828283, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.0944444444444, gamma=0.140505050505, booster=gbtree 
[CV]  subsample=0.886363636364, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.0157575757576, gamma=0.161616161616, booster=gbtree, total= 1.1min
[CV] subsample=0.888282828283, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.0944444444444, gamma=0.140505050505, booster=gbtree 
[CV]  subsample=0.886363636364, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.0157575757576, gamma=0

[CV]  subsample=0.957373737374, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.188484848485, gamma=0.175050505051, booster=gbtree, total=  52.5s
[CV] subsample=0.930505050505, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.107878787879, gamma=0.0906060606061, booster=gbtree 
[CV]  subsample=0.844141414141, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=3, learning_rate=0.148181818182, gamma=0.125151515152, booster=gbtree, total= 1.2min
[CV] subsample=0.930505050505, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.107878787879, gamma=0.0906060606061, booster=gbtree 
[CV]  subsample=0.957373737374, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=2, learning_rate=0.188484848485, gamma=0.17

[CV]  subsample=0.890202020202, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learning_rate=0.192323232323, gamma=0.0790909090909, booster=gbtree, total= 1.4min
[CV]  subsample=0.865252525253, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0886868686869, gamma=0.0733333333333, booster=gbtree, total=  27.0s
[CV]  subsample=0.865252525253, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0886868686869, gamma=0.0733333333333, booster=gbtree, total=  28.4s
[CV]  subsample=0.865252525253, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=1, learning_rate=0.0886868686869, gamma=0.0733333333333, booster=gbtree, total=  23.6s
[CV]  subsample=0.890202020202, silent=1, objective=reg:linear, nthread=12, n_jobs=12, n_estimators=150, missing=-999, max_depth=4, learn

[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed: 20.9min finished


In [15]:
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Get best model
best_model = model.best_estimator_

Best score: -0.081
Best parameters set:
	booster: 'gbtree'
	gamma: 0.052222222222222225
	learning_rate: 0.1501010101010101
	max_depth: 1
	missing: -999
	n_estimators: 150
	n_jobs: 12
	nthread: 12
	objective: 'reg:linear'
	silent: 1
	subsample: 0.86141414141414141


In [16]:
(0.03+0.034+0.018+0.017+0.082+0.052+0.025+0.032+0.145)/10

0.043500000000000004

In [48]:
label_dist = [306, 340, 247, 326, 490, 84, 48, 126, 274, 146, 12]
score_dist = [0.016, 0.050, 0.078, 0.013, 0.015, 0.034, 0.019, 0.021, 0.026, 0.131, 0.081]

In [49]:
label_dist = np.divide(label_dist,[sum(label_dist)]*11)

In [50]:
sum(label_dist*score_dist)

0.030861192163401416

In [None]:
0 306 -0.0322860777739
1 340 -0.108012857757
2 247 -0.175791198151
3 326 -0.0251724518879
4 490 -0.0361405124695
5 84 -0.0609741645374
6 48 -0.0364656765402
7 126 -0.0337857595975
8 274 -0.0584636221721
9 146 -0.225781049921
10 12 -0.22090655717