In [1]:
# !pip install ngboost
# !pip install xgboost

In [2]:
# import packages
import pandas as pd
import numpy as np
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from ngboost.ngboost import NGBoost
from ngboost.learners import default_tree_learner
from ngboost.distns import Normal
from ngboost.scores import MLE
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss,mean_squared_error
from math import sqrt

# for display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
from pathlib import Path
data_path = Path("../src/resources/data/")

In [4]:
train, test = pd.read_csv(data_path/"train.csv"), pd.read_csv(data_path/"train.csv")
train, test = train.drop(['Id'],1), test.drop(['Id'],1)
train = train[train['GrLivArea']<4500]
train['SalePrice'] = np.log1p(train['SalePrice'])
y = train['SalePrice']

train_features = train.drop(['SalePrice'],1)
test_features = test

features = pd.concat([train_features, test_features]).reset_index(drop=True)

objects = []
for i in features.columns:
    if features[i].dtype == object:
        objects.append(i)
features.update(features[objects].fillna('None'))

features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in features.columns:
    if features[i].dtype in numeric_dtypes:
        numerics.append(i)
features.update(features[numerics].fillna(0))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in features.columns:
    if features[i].dtype in numeric_dtypes:
        numerics2.append(i)
skew_features = features[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

for i in skew_index:
    features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))

features = features.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

features['YrBltAndRemod']=features['YearBuilt']+features['YearRemodAdd']
features['TotalSF']=features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']

features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
                                 features['1stFlrSF'] + features['2ndFlrSF'])

features['Total_Bathrooms'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
                               features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))

features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                              features['EnclosedPorch'] + features['ScreenPorch'] +
                              features['WoodDeckSF'])


features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
features['hasgarage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

final_features = pd.get_dummies(features).reset_index(drop=True)

X = final_features.iloc[:len(y), :]
X_sub = final_features.iloc[len(y):, :]

outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

overfit = []
for i in X.columns:
    counts = X[i].value_counts()
    zeros = counts.iloc[0]
    if zeros / len(X) * 100 > 99.94:
        overfit.append(i)

overfit = list(overfit)
X = X.drop(overfit, axis=1)
X_sub = X_sub.drop(overfit, axis=1)

kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X=X):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
    return (rmse)


alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

X['SalePrice'] = y
X = X.dropna()
X_tr, X_val = train_test_split(X,test_size=.2,random_state=42)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


## XGBoost

In [5]:
# XGBoost
params = {'max_depth': 4, 'eta': 0.01, 'objective':'reg:squarederror', 'eval_metric':['rmse'],'booster':'gbtree', 'verbosity':0,'sample_type':'weighted','max_delta_step':4, 'subsample':.5, 'min_child_weight':100,'early_stopping_round':50}
dtr, dte = xgb.DMatrix(X_tr.drop(['SalePrice'],1),label=X_tr.SalePrice), xgb.DMatrix(X_val.drop(['SalePrice'],1),label=X_val.SalePrice)
num_round = 5000
xgbst = xgb.train(params,dtr,num_round,verbose_eval=500)
y_pred = xgbst.predict(dte)
sqrt(mean_squared_error(X_val.SalePrice,y_pred)) # 0.00361

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


0.3807468069400263

## LightGBM

In [6]:
# LightGBM
ltr = lgb.Dataset(X_tr,label=X_tr.SalePrice)
param = {
'bagging_freq': 5,
'bagging_fraction': 0.6,
'bagging_seed': 123,
'boost_from_average':'false',
'boost': 'gbdt',
'feature_fraction': 0.3,
'learning_rate': .01,
'max_depth': 3,
'metric':'rmse',
'min_data_in_leaf': 128,
'min_sum_hessian_in_leaf': 8,
'num_leaves': 128,
'num_threads': 8,
'tree_learner': 'serial',
'objective': 'regression',
'verbosity': -1,
'random_state':123,
'max_bin': 8,
'early_stopping_round':100
}
lgbm = lgb.train(param,ltr,num_boost_round=10000,valid_sets=[(ltr)],verbose_eval=1000)
y_pred = lgbm.predict(X_val.drop(['SalePrice'],1))
sqrt(mean_squared_error(X_val.SalePrice,y_pred)) # 0.00494



Training until validation scores don't improve for 100 rounds
[1000]	training's rmse: 0.166311
[2000]	training's rmse: 0.158481
[3000]	training's rmse: 0.154074
[4000]	training's rmse: 0.1503
[5000]	training's rmse: 0.146989
[6000]	training's rmse: 0.144245
[7000]	training's rmse: 0.141719
[8000]	training's rmse: 0.139307
[9000]	training's rmse: 0.13716
[10000]	training's rmse: 0.135174
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 0.135174


0.5156550735914067

## CatBoost

In [40]:
cat_features  = []
for id, name in enumerate(X_tr.columns.values):
    if X_tr[name].nunique() < 10: 
        cat_features = cat_features+[int(id)]

In [41]:
# https://github.com/catboost/tutorials/blob/master/python_tutorial.ipynb
cat_features = np.where(X_tr.dtypes != np.float)[0]

```python
# cat_features must be integer or string, real number values and NaN values should be converted to string.

CatBoostError: c:/goagent/pipelines/buildmaster/catboost.gittt/catboost/private/libs/target/target_converter.cpp:64: Unknown class name: "12.27022515"
```

产生的两个报错。

In [43]:
from catboost import CatBoostClassifier
ctb = CatBoostClassifier(iterations=10,
                          random_seed=0,
                          learning_rate=0.1,
                          eval_metric = "AUC"
                          )
ctb.fit(X_tr.drop(['SalePrice'],1), X_tr.SalePrice,
#          cat_features=cat_features[0],
         eval_set=(X_val.drop(['SalePrice'],1), X_val.SalePrice)
         )

CatBoostError: c:/goagent/pipelines/buildmaster/catboost.gittt/catboost/private/libs/target/target_converter.cpp:64: Unknown class name: "12.27022515"

## NGBoost

In [None]:
# NGBoost
ngb = NGBoost(Base=default_tree_learner, Dist=Normal, Score=MLE(), natural_gradient=True,verbose=False)
ngboost = ngb.fit(np.asarray(X_tr.drop(['SalePrice'],1)), np.asarray(X_tr.SalePrice))
y_pred = ngb.predict(X_val.drop(['SalePrice'],1))
sqrt(mean_squared_error(X_val.SalePrice,y_pred)) # 0.003389

到目前为止表现最好。

In [None]:
# see the probability distributions by visualising
Y_dists = ngb.pred_dist(X_val.drop(['SalePrice'],1))
y_range = np.linspace(min(X_val.SalePrice), max(X_val.SalePrice), 200)

```python
dist_values = Y_dists.pdf(y_range).transpose()
# plot index 0 and 114
idx = 114
plt.plot(y_range,dist_values[idx])
plt.title(f"idx: {idx}")
plt.tight_layout()
plt.show()
```

这是报错的。