In [1]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits

from sklearn.model_selection import cross_val_score
# from xgboost import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor




In [2]:

# get some data
digits = load_digits()
X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
pra

param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

RandomizedSearchCV took 5.49 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.920 (std: 0.018)
Parameters: {'min_samples_leaf': 4, 'criterion': 'gini', 'min_samples_split': 7, 'bootstrap': False, 'max_depth': None, 'max_features': 6}

Model with rank: 2
Mean validation score: 0.919 (std: 0.021)
Parameters: {'min_samples_leaf': 3, 'criterion': 'gini', 'min_samples_split': 3, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 3
Mean validation score: 0.918 (std: 0.011)
Parameters: {'min_samples_leaf': 3, 'criterion': 'gini', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 5}

GridSearchCV took 55.57 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.932 (std: 0.016)
Parameters: {'min_samples_leaf': 3, 'criterion': 'entropy', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 2
Mean validation score: 0.92

In [3]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [16]:
param_dist = {
    'randomForest': {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]},
    'gradientBoosting': {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}    
}

In [17]:
param_dist

{'gradientBoosting': {'bootstrap': [True, False],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, None],
  'max_features': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5359e8>,
  'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen at 0x107689a58>,
  'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f535c88>},
 'randomForest': {'bootstrap': [True, False],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, None],
  'max_features': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5354e0>,
  'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f535860>,
  'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5355f8>}}

In [19]:
for key, val in param_dist.items():
    print(key)
    print(val)
    
    grid_search = GridSearchCV(clf, param_grid=param_grid)
    grid_search.fit(X, y)

gradientBoosting
{'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x107689a58>, 'criterion': ['gini', 'entropy'], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f535c88>, 'bootstrap': [True, False], 'max_depth': [3, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5359e8>}
randomForest
{'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f535860>, 'criterion': ['gini', 'entropy'], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5355f8>, 'bootstrap': [True, False], 'max_depth': [3, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5354e0>}


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier(nthread = -1)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="rmse", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)




In [None]:
model = XGBClassifier(nthread=-1)
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='rmse', n_jobs=-1)
print(results.mean())

In [23]:
X_train = pd.read_csv("../train_x.csv")
y_train = pd.read_csv("../train_y.csv")
X_test = pd.read_csv("../test_x.csv")

In [39]:
num_folds = 10
num_instances = len(X_train)
seed = 10

In [48]:
model = RandomForestRegressor()
kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='mean_squared_error', n_jobs=-1)
print(results.mean())

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


-0.000201395318266


In [None]:
clf = RandomForestRegressor(n_estimators=500)
param_grid = {"max_depth": [3, 5, 10, 15, 20],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["mse"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = -1, cv= 8)
start = time()
grid_search.fit(X_train.values, np.concatenate(y_train.values))



print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)
grid_search.best_score_

In [57]:
np.array(y_train)

array([[ 12.24769432],
       [ 12.10901093],
       [ 12.31716669],
       ..., 
       [ 12.49312952],
       [ 11.86446223],
       [ 11.90158345]])

In [62]:
np.concatenate(y_train.values)

array([ 12.24769432,  12.10901093,  12.31716669, ...,  12.49312952,
        11.86446223,  11.90158345])

In [6]:
param = {'criterion': 'mse', 'max_depth': 10, 'min_samples_leaf': 1, 'bootstrap': False, 'min_samples_split': 1}

In [10]:
clf = RandomForestRegressor(n_estimators=500, criterion = 'mse', max_depth = 10, max_features = 'auto', min_samples_leaf = 1, bootstrap = False,  min_samples_split =1)

In [3]:
X = pd.read_csv("../train_x2.csv")
y = pd.read_csv("../train_y.csv")
X_test = pd.read_csv("../test_x.csv")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:



num_folds = 10
num_instances = len(X_train)
seed = 10

# run grid search
# grid_search = GridSearchCV(clf, param_grid=param, n_jobs = -1, cv= 8)
start = time()
clf.fit(X_train.values, np.concatenate(y_train.values))



RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=1, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [13]:
result = clf.predict(X_test)

array([ 36352.2358799 ,  36354.37073754,  37413.98292733, ...,
        35884.69144974,  37331.47252103,  36954.64964979])

In [16]:
sample = pd.read_csv("../rawData/sample_submission.csv")

In [22]:
np.exp(result)

array([ 36352.2358799 ,  36354.37073754,  37413.98292733, ...,
        35884.69144974,  37331.47252103,  36954.64964979])

In [20]:
sample['SalePrice'] = np.exp(result)

In [24]:
sample.to_csv("sixth_submission.csv")

In [36]:
np.concatenate(y_train.values)

array([ 12.03765399,  12.66191396,  11.88793137, ...,  12.49312952,
        11.87756858,  12.3327053 ])

In [12]:
# clf = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01, max_depth=5, random_state=0).fit(X_train, np.concatenate(y_train.values))
clf = RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=10,
            max_leaf_nodes=None, max_features = 2,
min_samples_leaf=10,
 min_weight_fraction_leaf=0.0,
           n_estimators=3000, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [13]:
clf.fit(X_train.values, np.concatenate(y_train.values))


RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=10,
           max_features=2, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
est = clf.predict(X_test)

In [15]:
np.sqrt(np.mean(np.square((est - y_test.values))))

0.47418450600462808

array([ 12.23069555,  12.55672952,  12.5538817 ,  12.32374455,
        11.69107165,  11.36210258,  11.87057003,  11.81303006,
        11.88103479,  12.2713765 ,  11.74006104,  11.77528973,
        11.76745137,  12.06968002,  11.79810441,  11.7905572 ,
        11.6784399 ,  12.04941884,  12.07539432,  12.08953883,
        11.97019203,  11.81303006,  12.32829028,  12.72188581,
        12.5776362 ,  11.84934899,  12.30817787,  12.20667496,
        12.37158708,  12.46843691,  12.6698706 ,  11.77143616,
        12.14685329,  12.38833467,  12.33929149,  11.49272276,
        12.45097769,  11.8277362 ,  11.33260191,  12.25486281,
        11.83132724,  12.27834162,  11.30220443,  11.9249247 ,
        11.6351431 ,  12.14950229,  12.10052689,  11.82020422,
        12.33691811,  12.14153412,  12.77705219,  12.32374455,
        12.10901093,  12.04348899,  12.02718519,  12.13242607,
        12.23069555,  11.90158345,  12.03152079,  11.9639392 ,
        12.29778545,  11.68671249,  12.09737323,  12.06