In [1]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits

from sklearn.model_selection import cross_val_score
# from xgboost import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor




In [2]:

# get some data
digits = load_digits()
X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
pra

param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

RandomizedSearchCV took 5.49 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.920 (std: 0.018)
Parameters: {'min_samples_leaf': 4, 'criterion': 'gini', 'min_samples_split': 7, 'bootstrap': False, 'max_depth': None, 'max_features': 6}

Model with rank: 2
Mean validation score: 0.919 (std: 0.021)
Parameters: {'min_samples_leaf': 3, 'criterion': 'gini', 'min_samples_split': 3, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 3
Mean validation score: 0.918 (std: 0.011)
Parameters: {'min_samples_leaf': 3, 'criterion': 'gini', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 5}

GridSearchCV took 55.57 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.932 (std: 0.016)
Parameters: {'min_samples_leaf': 3, 'criterion': 'entropy', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 2
Mean validation score: 0.92

In [2]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [16]:
param_dist = {
    'randomForest': {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]},
    'gradientBoosting': {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}    
}

In [17]:
param_dist

{'gradientBoosting': {'bootstrap': [True, False],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, None],
  'max_features': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5359e8>,
  'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen at 0x107689a58>,
  'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f535c88>},
 'randomForest': {'bootstrap': [True, False],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, None],
  'max_features': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5354e0>,
  'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f535860>,
  'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5355f8>}}

In [19]:
for key, val in param_dist.items():
    print(key)
    print(val)
    
    grid_search = GridSearchCV(clf, param_grid=param_grid)
    grid_search.fit(X, y)

gradientBoosting
{'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x107689a58>, 'criterion': ['gini', 'entropy'], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f535c88>, 'bootstrap': [True, False], 'max_depth': [3, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5359e8>}
randomForest
{'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f535860>, 'criterion': ['gini', 'entropy'], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5355f8>, 'bootstrap': [True, False], 'max_depth': [3, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5354e0>}


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier(nthread = -1)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="rmse", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)




In [None]:
model = XGBClassifier(nthread=-1)
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='rmse', n_jobs=-1)
print(results.mean())

In [5]:

X_train = pd.read_csv("../train_x2.csv")
y_train = pd.read_csv("../train_y.csv")
X_test = pd.read_csv("../test_x.csv")

FileNotFoundError: File b'../train_x2.csv' does not exist

In [39]:
num_folds = 10
num_instances = len(X_train)
seed = 10

In [48]:
model = RandomForestRegressor()
kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='mean_squared_error', n_jobs=-1)
print(results.mean())

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


-0.000201395318266


In [None]:
clf = RandomForestRegressor(n_estimators=500)
param_grid = {"max_depth": [3, 5, 10, 15, 20],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["mse"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = -1, cv= 8)
start = time()
grid_search.fit(X_train.values, np.concatenate(y_train.values))



print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)
grid_search.best_score_

In [57]:
np.array(y_train)

array([[ 12.24769432],
       [ 12.10901093],
       [ 12.31716669],
       ..., 
       [ 12.49312952],
       [ 11.86446223],
       [ 11.90158345]])

In [62]:
np.concatenate(y_train.values)

array([ 12.24769432,  12.10901093,  12.31716669, ...,  12.49312952,
        11.86446223,  11.90158345])

In [None]:
from sklearn import cross_validation
from sklearn.ensemble import GradientBoostingClassifier
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
num_folds = 10
num_instances = len(X)
seed = 7
num_trees = 100
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
results = cross_validation.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())