This notebook is the fifth part in the data competition for predicting used car prices. This notebook will focus on the stacking and ensemble models and their respective performances regarding the dataset.

In [31]:
import numpy as np
import pandas as pd
import sklearn
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')
%matplotlib inline

- simple example of averaging

In [2]:
# generate random dataset
test_pre1 = [1.2, 3.2, 2.1, 6.2]
test_pre2 = [0.9, 3.1, 2.0, 5.9]
test_pre3 = [1.1, 2.9, 2.2, 6.0]

y_test_true = [1, 3, 2, 6]

In [3]:
# define a simple weighted average function
def weighted_method(test_pre1,test_pre2,test_pre3,w=[1/3,1/3,1/3]):
    weighted_result = w[0]*pd.Series(test_pre1)+w[1]*pd.Series(test_pre2)+w[2]*pd.Series(test_pre3)
    return weighted_result

In [5]:
from sklearn import metrics

print('Pred1 MAE:',metrics.mean_absolute_error(y_test_true, test_pre1))
print('Pred2 MAE:',metrics.mean_absolute_error(y_test_true, test_pre2))
print('Pred3 MAE:',metrics.mean_absolute_error(y_test_true, test_pre3))

Pred1 MAE: 0.1750000000000001
Pred2 MAE: 0.07499999999999993
Pred3 MAE: 0.10000000000000009


In [6]:
w = [0.3,0.4,0.3] # init weights
weighted_test = weighted_method(test_pre1, test_pre2, test_pre3, w)
print('Weighted_test MAE:', metrics.mean_absolute_error(y_test_true, weighted_test))

Weighted_test MAE: 0.05750000000000027


In [7]:
## similarly, we can define weighted mean/median function

In [8]:
def mean_method(test_pre1,test_pre2,test_pre3):
    mean_result = pd.concat([pd.Series(test_pre1),pd.Series(test_pre2),pd.Series(test_pre3)],axis=1).mean(axis=1)
    return mean_result

In [9]:
def median_method(test_pre1,test_pre2,test_pre3):
    median_result = pd.concat([pd.Series(test_pre1),pd.Series(test_pre2),pd.Series(test_pre3)],axis=1).median(axis=1)
    return median_result

In [10]:
mean_test = mean_method(test_pre1,test_pre2,test_pre3)
print('Mean_test MAE:', metrics.mean_absolute_error(y_test_true, mean_test))

Mean_test MAE: 0.06666666666666693


In [11]:
median_test = median_method(test_pre1,test_pre2,test_pre3)
print('Median_pre MAE:', metrics.mean_absolute_error(y_test_true, median_test))

Median_pre MAE: 0.07500000000000007


Now use sklearn to build a simple stacking model.

In [14]:
from sklearn import linear_model

def stacking_method(train_reg1, train_reg2, train_reg3, y_train_true, 
                    test_pre1, test_pre2, test_pre3, 
                    model_L2 = linear_model.LinearRegression()):
    
    model_L2.fit(pd.concat([pd.Series(train_reg1), pd.Series(train_reg2), 
                            pd.Series(train_reg3)], axis=1).values,y_train_true)
    stacking_result = model_L2.predict(pd.concat([pd.Series(test_pre1), pd.Series(test_pre2),
                                                  pd.Series(test_pre3)], axis=1).values)
    return stacking_result

In [15]:
train_reg1 = [3.2, 8.2, 9.1, 5.2]
train_reg2 = [2.9, 8.1, 9.0, 4.9]
train_reg3 = [3.1, 7.9, 9.2, 5.0]

y_train_true = [3, 8, 9, 5] 

test_pre1 = [1.2, 3.2, 2.1, 6.2]
test_pre2 = [0.9, 3.1, 2.0, 5.9]
test_pre3 = [1.1, 2.9, 2.2, 6.0]

y_test_true = [1, 3, 2, 6] 

In [16]:
model_L2= linear_model.LinearRegression()
stacking_test = stacking_method(train_reg1, train_reg2, train_reg3, y_train_true,
                               test_pre1, test_pre2, test_pre3, model_L2)

print('Stacking_pre MAE:',metrics.mean_absolute_error(y_test_true, stacking_test))

Stacking_pre MAE: 0.04213483146067476


There are different types of ensemble, such as the voting classifier as well as advanced ensemble models,

In [18]:
from sklearn.datasets import make_blobs
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [24]:
iris = datasets.load_iris()
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

#clf1 = XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=3, min_child_weight=2, subsample=0.7,
#                     colsample_bytree=0.6, objective='binary:logistic')
clf2 = RandomForestClassifier(n_estimators=50, max_depth=1, min_samples_split=4,
                              min_samples_leaf=63,oob_score=True)
clf3 = SVC(C=0.1)


In [32]:
# hard voting system 
eclf = VotingClassifier(estimators=[ ('rf', clf2), ('svc', clf3)], voting='hard')


for clf, label in zip([clf2, clf3, eclf], ['Random Forest', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.33 (+/- 0.00) [Random Forest]
Accuracy: 0.95 (+/- 0.03) [SVM]
Accuracy: 0.65 (+/- 0.18) [Ensemble]


In [33]:
# soft voting
clf3 = SVC(C=0.1, probability=True)
eclf = VotingClassifier(estimators=[('rf', clf2), ('svc', clf3)], voting='soft', weights=[1,2])

for clf, label in zip([clf2, clf3, eclf], ['Random Forest', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.33 (+/- 0.00) [Random Forest]
Accuracy: 0.95 (+/- 0.03) [SVM]
Accuracy: 0.95 (+/- 0.03) [Ensemble]


### compare two ways: Stacking & Blending

In [34]:
'''
5-Fold Stacking
'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier

data_0 = iris.data
data = data_0[:100,:]

target_0 = iris.target
target = target_0[:100]

clfs = [LogisticRegression(solver='lbfgs'),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]
 
X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.3, random_state=2020)

dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))


n_splits = 5
skf = StratifiedKFold(n_splits)
skf = skf.split(X, y)

for j, clf in enumerate(clfs):
    
    dataset_blend_test_j = np.zeros((X_predict.shape[0], 5))
    
    for i, (train, test) in enumerate(skf):
        # 5-fold cross training, use ith as prediction, 
        # train with the rest, output as new features to ith  
        X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    
    # for testing, use the pred of k models as new feature
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))

clf = LogisticRegression(solver='lbfgs')
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print("Val auc Score of Stacking: %f" % (roc_auc_score(y_predict, y_submission)))

val auc Score: 1.000000
val auc Score: 0.500000
val auc Score: 0.500000
val auc Score: 0.500000
val auc Score: 0.500000
Val auc Score of Stacking: 1.000000


In [35]:
'''
Blending
'''
 
data_0 = iris.data
data = data_0[:100,:]

target_0 = iris.target
target = target_0[:100]
 
clfs = [LogisticRegression(solver='lbfgs'),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        #ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]
 
X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.3, random_state=2020)

# split training set into d1, d2
X_d1, X_d2, y_d1, y_d2 = train_test_split(X, y, test_size=0.5, random_state=2020)
dataset_d1 = np.zeros((X_d2.shape[0], len(clfs)))
dataset_d2 = np.zeros((X_predict.shape[0], len(clfs)))
 
for j, clf in enumerate(clfs):
    clf.fit(X_d1, y_d1)
    y_submission = clf.predict_proba(X_d2)[:, 1]
    dataset_d1[:, j] = y_submission
    dataset_d2[:, j] = clf.predict_proba(X_predict)[:, 1]
    print("val auc Score: %f" % roc_auc_score(y_predict, dataset_d2[:, j]))

clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
clf.fit(dataset_d1, y_d2)

y_submission = clf.predict_proba(dataset_d2)[:, 1]
print("Val auc Score of Blending: %f" % (roc_auc_score(y_predict, y_submission)))

val auc Score: 1.000000
val auc Score: 1.000000
val auc Score: 1.000000
val auc Score: 1.000000
val auc Score: 1.000000
Val auc Score of Blending: 1.000000


Sometimes, we can also put all features into the model to make predictions, and transform the predicted results and add them back as new features, and then pass to the model to get new predictions.

In [37]:
def Ensemble_add_feature(train,test,target,clfs):
    
    # n_flods = 5
    # skf = list(StratifiedKFold(y, n_folds=n_flods))

    train_ = np.zeros((train.shape[0],len(clfs*2)))
    test_ = np.zeros((test.shape[0],len(clfs*2)))

    for j,clf in enumerate(clfs):

        # print(j, clf)
        # X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]

        clf.fit(train,target)
        y_train = clf.predict(train)
        y_test = clf.predict(test)

        ## create new features
        train_[:,j*2] = y_train**2
        test_[:,j*2] = y_test**2
        train_[:, j+1] = np.exp(y_train)
        test_[:, j+1] = np.exp(y_test)
        # print("val auc Score: %f" % r2_score(y_predict, dataset_d2[:, j]))
        print('Method ',j)
    
    train_ = pd.DataFrame(train_)
    test_ = pd.DataFrame(test_)
    return train_,test_

In [39]:
data_0 = iris.data
data = data_0[:100,:]

target_0 = iris.target
target = target_0[:100]

x_train,x_test,y_train,y_test=train_test_split(data,target,test_size=0.3)
x_train = pd.DataFrame(x_train) ; x_test = pd.DataFrame(x_test)

clfs = [LogisticRegression(),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]

New_train,New_test = Ensemble_add_feature(x_train,x_test,y_train,clfs)

# clf = LogisticRegression()
clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
clf.fit(New_train, y_train)
y_emb = clf.predict_proba(New_test)[:, 1]

print("Val auc Score of stacking: %f" % (roc_auc_score(y_test, y_emb)))

Method  0
Method  1
Method  2
Method  3
Method  4
Val auc Score of stacking: 1.000000


In [40]:
# load the used car dataset
Train_data = pd.read_csv('./data/used_car_train_20200313.csv', sep=' ')
Test_data = pd.read_csv('./data/used_car_testA_20200313.csv', sep=' ')

print(Train_data.shape)
print(Test_data.shape)

(150000, 31)
(50000, 30)


In [53]:
numerical_cols = Train_data.select_dtypes(exclude = 'object').columns
print(numerical_cols)

Index(['SaleID', 'power', 'kilometer', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
       'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14', 'miss_info', 'used_time', 'notRepairedDamage_0.0',
       'notRepairedDamage_1.0', 'notRepairedDamage_nan', 'fuelType_0.0',
       'fuelType_1.0', 'fuelType_2.0', 'fuelType_3.0', 'fuelType_4.0',
       'fuelType_5.0', 'fuelType_6.0', 'fuelType_nan', 'gearbox_0.0',
       'gearbox_1.0', 'gearbox_nan', 'bodyType_0.0', 'bodyType_1.0',
       'bodyType_2.0', 'bodyType_3.0', 'bodyType_4.0', 'bodyType_5.0',
       'bodyType_6.0', 'bodyType_7.0', 'bodyType_nan'],
      dtype='object')


In [54]:
feature_cols = [col for col in numerical_cols if col not in ['SaleID','name','regDate','price']]
print(feature_cols)

['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'miss_info', 'used_time', 'notRepairedDamage_0.0', 'notRepairedDamage_1.0', 'notRepairedDamage_nan', 'fuelType_0.0', 'fuelType_1.0', 'fuelType_2.0', 'fuelType_3.0', 'fuelType_4.0', 'fuelType_5.0', 'fuelType_6.0', 'fuelType_nan', 'gearbox_0.0', 'gearbox_1.0', 'gearbox_nan', 'bodyType_0.0', 'bodyType_1.0', 'bodyType_2.0', 'bodyType_3.0', 'bodyType_4.0', 'bodyType_5.0', 'bodyType_6.0', 'bodyType_7.0', 'bodyType_nan']


In [43]:
def data_processing(df):

    Output = df.copy()

    category_cols = ['brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
    Output.replace('-', np.nan, inplace=True)
    Output.replace({'Not Available': np.nan}, inplace=True)
    Output['miss_info'] = Output.isnull().sum(axis=1)

    for col in list(Output.columns):
        if ('notRepairedDamage' in col):
            Output[col] = Output[col].astype(float)
    
    Output['used_time'] = (pd.to_datetime(Output['creatDate'], format='%Y%m%d', errors='coerce') - 
                            pd.to_datetime(Output['regDate'], format='%Y%m%d', errors='coerce')).dt.days
    
    
    data_notRepairedDamage = pd.get_dummies(Output['notRepairedDamage'], prefix='notRepairedDamage', dummy_na=True)
    data_fuelType = pd.get_dummies(Output['fuelType'], prefix='fuelType', dummy_na=True)
    data_gearbox = pd.get_dummies(Output['gearbox'], prefix='gearbox', dummy_na=True)
    data_bodyType = pd.get_dummies(Output['bodyType'], prefix='bodyType', dummy_na=True)
    
    Output = pd.concat([Output, data_notRepairedDamage, data_fuelType, data_gearbox, data_bodyType], axis=1)
    
    del Output["seller"]
    del Output["offerType"]
    del Output['name']
    del Output['model']
    del Output['regionCode']
    del Output['regDate']
    del Output['brand']
    del Output['bodyType']
    del Output['fuelType']
    del Output['gearbox']
    del Output['notRepairedDamage']
    del Output['creatDate']
    
    return Output

In [44]:
Train_data = data_processing(Train_data)
Test_data = data_processing(Test_data)

print('Train data shape:',Train_data.shape)
print('Test data shape:',Test_data.shape)

Train data shape: (150000, 44)
Test data shape: (50000, 43)


In [45]:
Train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 44 columns):
SaleID                   150000 non-null int64
power                    150000 non-null int64
kilometer                150000 non-null float64
price                    150000 non-null int64
v_0                      150000 non-null float64
v_1                      150000 non-null float64
v_2                      150000 non-null float64
v_3                      150000 non-null float64
v_4                      150000 non-null float64
v_5                      150000 non-null float64
v_6                      150000 non-null float64
v_7                      150000 non-null float64
v_8                      150000 non-null float64
v_9                      150000 non-null float64
v_10                     150000 non-null float64
v_11                     150000 non-null float64
v_12                     150000 non-null float64
v_13                     150000 non-null float64
v_14         

In [46]:
Train_data.head()

Unnamed: 0,SaleID,power,kilometer,price,v_0,v_1,v_2,v_3,v_4,v_5,...,gearbox_nan,bodyType_0.0,bodyType_1.0,bodyType_2.0,bodyType_3.0,bodyType_4.0,bodyType_5.0,bodyType_6.0,bodyType_7.0,bodyType_nan
0,0,60,12.5,1850,43.357796,3.966344,0.050257,2.159744,1.143786,0.235676,...,0,0,1,0,0,0,0,0,0,0
1,1,0,15.0,3600,45.305273,5.236112,0.137925,1.380657,-1.422165,0.264777,...,0,0,0,1,0,0,0,0,0,0
2,2,163,12.5,6222,45.978359,4.823792,1.319524,-0.998467,-0.996911,0.25141,...,0,0,1,0,0,0,0,0,0,0
3,3,193,15.0,2400,45.687478,4.492574,-0.050616,0.8836,-2.228079,0.274293,...,0,1,0,0,0,0,0,0,0,0
4,4,68,5.0,5200,44.383511,2.031433,0.572169,-1.571239,2.246088,0.228036,...,0,0,1,0,0,0,0,0,0,0


In [47]:
Train_data.columns

Index(['SaleID', 'power', 'kilometer', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
       'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14', 'miss_info', 'used_time', 'notRepairedDamage_0.0',
       'notRepairedDamage_1.0', 'notRepairedDamage_nan', 'fuelType_0.0',
       'fuelType_1.0', 'fuelType_2.0', 'fuelType_3.0', 'fuelType_4.0',
       'fuelType_5.0', 'fuelType_6.0', 'fuelType_nan', 'gearbox_0.0',
       'gearbox_1.0', 'gearbox_nan', 'bodyType_0.0', 'bodyType_1.0',
       'bodyType_2.0', 'bodyType_3.0', 'bodyType_4.0', 'bodyType_5.0',
       'bodyType_6.0', 'bodyType_7.0', 'bodyType_nan'],
      dtype='object')

In [49]:
Test_data.columns

Index(['SaleID', 'power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4',
       'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13',
       'v_14', 'miss_info', 'used_time', 'notRepairedDamage_0.0',
       'notRepairedDamage_1.0', 'notRepairedDamage_nan', 'fuelType_0.0',
       'fuelType_1.0', 'fuelType_2.0', 'fuelType_3.0', 'fuelType_4.0',
       'fuelType_5.0', 'fuelType_6.0', 'fuelType_nan', 'gearbox_0.0',
       'gearbox_1.0', 'gearbox_nan', 'bodyType_0.0', 'bodyType_1.0',
       'bodyType_2.0', 'bodyType_3.0', 'bodyType_4.0', 'bodyType_5.0',
       'bodyType_6.0', 'bodyType_7.0', 'bodyType_nan'],
      dtype='object')

In [55]:
feature_cols = [col for col in numerical_cols if col not in ['SaleID','name','price']]

In [63]:
X_data = Train_data[feature_cols]
Y_data = Train_data['price']

X_test  = Test_data[feature_cols]

In [64]:
X_data.head()

Unnamed: 0,power,kilometer,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,...,gearbox_nan,bodyType_0.0,bodyType_1.0,bodyType_2.0,bodyType_3.0,bodyType_4.0,bodyType_5.0,bodyType_6.0,bodyType_7.0,bodyType_nan
0,60,12.5,43.357796,3.966344,0.050257,2.159744,1.143786,0.235676,0.101988,0.129549,...,0,0,1,0,0,0,0,0,0,0
1,0,15.0,45.305273,5.236112,0.137925,1.380657,-1.422165,0.264777,0.121004,0.135731,...,0,0,0,1,0,0,0,0,0,0
2,163,12.5,45.978359,4.823792,1.319524,-0.998467,-0.996911,0.25141,0.114912,0.165147,...,0,0,1,0,0,0,0,0,0,0
3,193,15.0,45.687478,4.492574,-0.050616,0.8836,-2.228079,0.274293,0.1103,0.121964,...,0,1,0,0,0,0,0,0,0,0
4,68,5.0,44.383511,2.031433,0.572169,-1.571239,2.246088,0.228036,0.073205,0.09188,...,0,0,1,0,0,0,0,0,0,0


In [65]:
Y_data.head()

0    1850
1    3600
2    6222
3    2400
4    5200
Name: price, dtype: int64

In [66]:
x_train,x_val,y_train,y_val = train_test_split(X_data, Y_data, test_size=0.3)

To constuct individual model, here in this notebook I just used several basic models ranging from linear models to nonlinear models, perhaps try catboost or other combinations in the future.

In [None]:
def build_model_lr(x_train,y_train):
    reg_model = linear_model.LinearRegression()
    reg_model.fit(x_train,y_train)
    return reg_model

def build_model_ridge(x_train,y_train):
    reg_model = linear_model.Ridge(alpha=0.8) #alphas=range(1,100,5)
    reg_model.fit(x_train,y_train)
    return reg_model

def build_model_lasso(x_train,y_train):
    reg_model = linear_model.LassoCV()
    reg_model.fit(x_train,y_train)
    return reg_model

def build_model_gbdt(x_train,y_train):
    estimator =GradientBoostingRegressor(loss='ls',subsample= 0.85,max_depth= 5,n_estimators = 100)
    param_grid = { 
            'learning_rate': [0.05,0.08,0.1,0.2],
            }
    gbdt = GridSearchCV(estimator, param_grid,cv=3)
    gbdt.fit(x_train,y_train)
    print(gbdt.best_params_)
    # print(gbdt.best_estimator_ )
    return gbdt

def build_model_xgb(x_train,y_train):
    model = xgb.XGBRegressor(n_estimators=120, learning_rate=0.08, gamma=0, subsample=0.8,\
        colsample_bytree=0.9, max_depth=5) #, objective ='reg:squarederror'
    model.fit(x_train, y_train)
    return model

def build_model_lgb(x_train,y_train):
    estimator = lgb.LGBMRegressor(num_leaves=63,n_estimators = 100)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
    }
    gbm = GridSearchCV(estimator, param_grid)
    gbm.fit(x_train, y_train)
    return gbm


In [None]:
# use xgb & 5fold cross validation
xgr = xgb.XGBRegressor(n_estimators=120, learning_rate=0.1, subsample=0.8,\
        colsample_bytree=0.9, max_depth=7) # ,objective ='reg:squarederror'

scores_train = []
scores = []

sk = StratifiedKFold(n_splits=5,shuffle=True,random_state=0)

for train_ind,val_ind in sk.split(X_data,Y_data):
    
    train_x=X_data.iloc[train_ind].values
    train_y=Y_data.iloc[train_ind]
    val_x=X_data.iloc[val_ind].values
    val_y=Y_data.iloc[val_ind]
    
    xgr.fit(train_x,train_y)
    pred_train_xgb=xgr.predict(train_x)
    pred_xgb=xgr.predict(val_x)
    
    score_train = mean_absolute_error(train_y,pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y,pred_xgb)
    scores.append(score)

print('Train mae:',np.mean(score_train))
print('Val mae',np.mean(scores))

In [None]:
## model training 

print('Predict LR...')
model_lr = build_model_lr(x_train,y_train)
val_lr = model_lr.predict(x_val)
subA_lr = model_lr.predict(X_test)

print('Predict Ridge...')
model_ridge = build_model_ridge(x_train,y_train)
val_ridge = model_ridge.predict(x_val)
subA_ridge = model_ridge.predict(X_test)

print('Predict Lasso...')
model_lasso = build_model_lasso(x_train,y_train)
val_lasso = model_lasso.predict(x_val)
subA_lasso = model_lasso.predict(X_test)

print('Predict GBDT...')
model_gbdt = build_model_gbdt(x_train,y_train)
val_gbdt = model_gbdt.predict(x_val)
subA_gbdt = model_gbdt.predict(X_test)


print('predict XGB...')
model_xgb = build_model_xgb(x_train,y_train)
val_xgb = model_xgb.predict(x_val)
subA_xgb = model_xgb.predict(X_test)

print('predict lgb...')
model_lgb = build_model_lgb(x_train,y_train)
val_lgb = model_lgb.predict(x_val)
subA_lgb = model_lgb.predict(X_test)

In [None]:
# compare with weighted averaging
def weighted_method(test_pre1, test_pre2, test_pre3, w=[1/3,1/3,1/3]):
    weighted_result = w[0]*pd.Series(test_pre1)+w[1]*pd.Series(test_pre2)+w[2]*pd.Series(test_pre3)
    return weighted_result

w = [0.3,0.4,0.3]

# test accuracy on validation set
val_pre = weighted_method(val_lgb, val_xgb, val_gbdt, w)
MAE_Weighted = mean_absolute_error(y_val,val_pre)
print('MAE of Weighted of val:',MAE_Weighted)

# test on the predictions
subA = weighted_method(subA_lgb, subA_xgb, subA_gbdt, w)
print('Sta inf:')
Sta_inf(subA)

# save output
sub = pd.DataFrame()
sub['SaleID'] = X_test.index
sub['price'] = subA
sub.to_csv('./sub_Weighted.csv',index=False)

In [67]:
## similarly, we can also use stacking/blending to combine the predicted results together 
## to get the final output to have a better performance