In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model,svm,tree,ensemble,preprocessing,metrics,model_selection,impute
import xgboost as xgb

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
test_data['SalePrice'] = 0
data = pd.concat([train_data,test_data],axis = 0)
data = data.reset_index().drop(['index','Id'],axis = 1)
data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,6,2006,WD,Normal,0
2915,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2006,WD,Abnorml,0
2916,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,9,2006,WD,Abnorml,0
2917,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,0


In [3]:
missing_values = data.isna().sum().sort_values(ascending = False)
missing_list = list(missing_values[missing_values > 0].index)
missing_values[missing_values > 0]

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
FireplaceQu     1420
LotFrontage      486
GarageFinish     159
GarageQual       159
GarageCond       159
GarageYrBlt      159
GarageType       157
BsmtCond          82
BsmtExposure      82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
Functional         2
Utilities          2
BsmtHalfBath       2
BsmtFullBath       2
Exterior2nd        1
Exterior1st        1
BsmtUnfSF          1
TotalBsmtSF        1
GarageArea         1
KitchenQual        1
BsmtFinSF2         1
GarageCars         1
BsmtFinSF1         1
SaleType           1
Electrical         1
dtype: int64

In [4]:
data['PoolQC'] = data['PoolQC'].fillna('None')
data['MiscFeature'] = data['MiscFeature'].fillna('None')
data['Alley'] = data['Alley'].fillna('None')
data['Fence'] = data['Fence'].fillna('None')
data['FireplaceQu'] = data['FireplaceQu'].fillna('None')
data['GarageCond'] = data['GarageCond'].fillna('None')
data['GarageFinish'] = data['GarageFinish'].fillna('None')
data['GarageQual'] = data['GarageQual'].fillna('None')
data['GarageType'] = data['GarageType'].fillna('None')
data['BsmtExposure'] = data['BsmtExposure'].fillna('None')
data['BsmtCond'] = data['BsmtCond'].fillna('None')
data['BsmtQual'] = data['BsmtQual'].fillna('None')
data['BsmtFinType1'] = data['BsmtFinType1'].fillna('None')
data['BsmtFinType2'] = data['BsmtFinType2'].fillna('None')
data['MasVnrType'] = data['MasVnrType'].fillna('None')
data['MasVnrArea'] = data['MasVnrArea'].fillna(0)

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    data[col] = data[col].fillna(0)
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    data[col] = data[col].fillna(0)
for col in ('MSZoning','Functional','Utilities','Exterior1st','Exterior2nd','SaleType','Electrical','KitchenQual'):
    data[col] = data[col].fillna(data[col].mode()[0])

data = data.drop(columns = ['LotFrontage'])

In [5]:
data.isna().sum().sort_values() 

MSSubClass     0
GarageType     0
FireplaceQu    0
Fireplaces     0
Functional     0
              ..
MasVnrType     0
Exterior2nd    0
Exterior1st    0
BsmtQual       0
SalePrice      0
Length: 79, dtype: int64

In [6]:
new_data = pd.get_dummies(data)
x = new_data.drop('SalePrice',axis = 1).values[:train_data.shape[0],:]
y = new_data['SalePrice'].values[:train_data.shape[0]]

skf = model_selection.StratifiedKFold(shuffle = True,random_state = 0)
x.shape,y.shape,skf

((1460, 301),
 (1460,),
 StratifiedKFold(n_splits=5, random_state=0, shuffle=True))

In [7]:
LR_model = model_selection.cross_validate(linear_model.LinearRegression(),x,y,cv = 5)
print(LR_model)

model = linear_model.LinearRegression()
model.fit(x,y)

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

{'fit_time': array([0.18550515, 0.17951655, 0.16156769, 0.12965298, 0.11569118]), 'score_time': array([0.00099659, 0.00200367, 0.00199437, 0.0009973 , 0.        ]), 'test_score': array([0.82410897, 0.81961147, 0.80184095, 0.88741286, 0.65833672])}
0.9352936707520236 0.923780832596351
0.931704004403273 0.9399327001931688
0.9292265851017737 0.9449687500793655
0.9331329236460597 0.9332749294438654
0.9363493239348843 0.9216518674176295




In [8]:
Ridge_model = linear_model.RidgeCV(cv = 5)
Ridge_model.fit(x,y)
Ridge_model.alpha_

10.0

In [9]:
Ridge_model = model_selection.cross_validate(linear_model.Ridge(alpha = 10),x,y,cv = 5)
print(Ridge_model)

model = linear_model.Ridge(alpha = 10)
model.fit(x,y)

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

{'fit_time': array([0.02593112, 0.02992082, 0.02592969, 0.0269208 , 0.02693677]), 'score_time': array([0.00099683, 0.00199485, 0.00099921, 0.00099969, 0.0019958 ]), 'test_score': array([0.90157467, 0.83254012, 0.87813686, 0.87968302, 0.70958689])}
0.9052396209303593 0.8807044769561214
0.9114373199683077 0.8512006844919211
0.8908625691249126 0.9307758953578293
0.8975722921642435 0.9152552894930733
0.8975846728086448 0.9118663104424184




In [10]:
params = {
    'n_estimators' : [100,300],
    'ccp_alpha' : [0,10000,100000]
}
ensemble.RandomForestRegressor()
Forest_model = model_selection.GridSearchCV(ensemble.RandomForestRegressor(),param_grid = params,cv = 5)
Forest_model.fit(x,y)
Forest_model.cv_results_

{'mean_fit_time': array([ 6.4587491 , 18.08989339,  6.61142359, 18.12377725,  6.16916533,
        17.85263686]),
 'std_fit_time': array([0.71134554, 0.98468136, 0.80182195, 0.87533971, 0.35059167,
        0.42726906]),
 'mean_score_time': array([0.02692752, 0.06464295, 0.0259306 , 0.07899604, 0.02394223,
        0.05926099]),
 'std_score_time': array([0.00418512, 0.00297666, 0.00572684, 0.01470588, 0.00324161,
        0.00233535]),
 'param_ccp_alpha': masked_array(data=[0, 0, 10000, 10000, 100000, 100000],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[100, 300, 100, 300, 100, 300],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'ccp_alpha': 0, 'n_estimators': 100},
  {'ccp_alpha': 0, 'n_estimators': 300},
  {'ccp_alpha': 10000, 'n_estimators': 100},
  {'ccp_alpha': 10000, 'n_estimators': 300},
  {

In [11]:
Forest_model = model_selection.cross_validate(ensemble.RandomForestRegressor(n_estimators = 100,criterion = 'mse',ccp_alpha = 0),x,y,cv = 5)
Forest_model

{'fit_time': array([5.47740126, 6.1446166 , 6.57242751, 7.70141172, 6.31018424]),
 'score_time': array([0.0219419 , 0.02193618, 0.03590512, 0.03590298, 0.02392983]),
 'test_score': array([0.87615371, 0.85268851, 0.86950309, 0.87695243, 0.8138749 ])}

In [12]:
params = {
    'n_estimators' : [300],
    'max_depth' : [2,3]
}

ensemble.GradientBoostingRegressor()
GB_model = model_selection.GridSearchCV(ensemble.GradientBoostingRegressor(),param_grid = params,cv = 5)
GB_model.fit(x,y)
print(GB_model.cv_results_)

model = ensemble.GradientBoostingRegressor(n_estimators = 300,max_depth = 2)
model.fit(x,y)

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

{'mean_fit_time': array([5.04875822, 7.04572959]), 'std_fit_time': array([0.42331307, 0.64376461]), 'mean_score_time': array([0.0041801 , 0.00439062]), 'std_score_time': array([0.00097931, 0.00080376]), 'param_max_depth': masked_array(data=[2, 3],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[300, 300],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 2, 'n_estimators': 300}, {'max_depth': 3, 'n_estimators': 300}], 'split0_test_score': array([0.9041614 , 0.90678225]), 'split1_test_score': array([0.84878293, 0.83224467]), 'split2_test_score': array([0.90692946, 0.90985577]), 'split3_test_score': array([0.90874521, 0.91242891]), 'split4_test_score': array([0.92158158, 0.90781622]), 'mean_test_score': array([0.89804012, 0.89382557]), 'std_test_score': array([0.02534388, 0.03085084]), 'rank_test_score': array([1, 2])}
0.9692068234588965 0.96198601572594



In [13]:
# xgb.XGBRegressor()

In [14]:
# estimators = [
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 200),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [15]:
estimators = [
    ('ridge',linear_model.Ridge(alpha = 10)),
    ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
             ]
model = ensemble.StackingRegressor(estimators = estimators,final_estimator = linear_model.LinearRegression(),cv = 5)
model.fit(x,y)
print(model.score(x,y))

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

model_1 = model_selection.cross_validate(model,x,y,cv = 5)
print(model_1)

0.9821658603833068
0.9827939507385239 0.9793715393363175
0.9825328801131044 0.9804517271044414
0.9809782285073617 0.9857261463945577
0.982425487166938 0.9809235330794452
0.9819856095071144 0.9827779714251094




{'fit_time': array([33.14120817, 31.75870562, 32.46113944, 31.74668169, 32.65366054]), 'score_time': array([0.00897765, 0.00498247, 0.00499392, 0.00498939, 0.00498366]), 'test_score': array([0.90882077, 0.85526209, 0.90597889, 0.91145177, 0.85152996])}


In [16]:
# estimators = [
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 300),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [17]:
# estimators = [
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 100),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [18]:
# estimators = [
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 300),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [19]:
# estimators = [
#     ('tree',ensemble.RandomForestRegressor(ccp_alpha = 100000,n_estimators = 300)),
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 300),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [20]:
# estimators = [
#     ('tree',ensemble.RandomForestRegressor(ccp_alpha = 100000,n_estimators = 300)),
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 200),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [21]:
# estimators = [
#     ('tree',ensemble.RandomForestRegressor(ccp_alpha = 100000,n_estimators = 300)),
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 100),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [22]:
# estimators = [
#     ('tree',ensemble.RandomForestRegressor(ccp_alpha = 100000,n_estimators = 300)),
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = tree.DecisionTreeRegressor(),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [23]:
estimators = [
    ('tree',ensemble.RandomForestRegressor(ccp_alpha = 100000,n_estimators = 300)),
    ('ridge',linear_model.Ridge(alpha = 10)),
    ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
             ]
model = ensemble.StackingRegressor(estimators = estimators,final_estimator = linear_model.LinearRegression(),cv = 5)
model.fit(x,y)
print(model.score(x,y))

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

CV = model_selection.cross_validate(model,x,y,cv = 5)
print(CV)



0.9818142392845777




0.9824340786446034 0.979056487808399
0.9821800173509152 0.9801054680216147
0.9806059012985923 0.9854362304102499
0.9821044613213799 0.9804289232437806
0.9816314567977019 0.9824347425083246
{'fit_time': array([125.27425694, 111.70772862,  65.94384456, 102.91580486,
        96.88926435]), 'score_time': array([0.12473655, 0.06248569, 0.08677006, 0.05383492, 0.02888536]), 'test_score': array([0.90803676, 0.83917184, 0.90295657, 0.90673751, 0.83685076])}


In [24]:
x_test = new_data.drop('SalePrice',axis = 1).values[train_data.shape[0]:,:]
y_pred = model.predict(x_test)

submission = pd.DataFrame({
    'Id' : range(1461,1461 + 1459),
    'SalePrice' : y_pred
})
submission
submission.to_csv('submission.csv',index = False)

In [25]:
new_data = data.drop(columns = ['Street','Alley','Utilities','LandSlope','Condition2',
                            'RoofMatl','ExterCond','Heating','CentralAir','Electrical',
                            'LowQualFinSF','BsmtFullBath','BsmtHalfBath','HalfBath',
                            'BedroomAbvGr','KitchenAbvGr','Functional','FireplaceQu',
                            'GarageType','GarageYrBlt','GarageFinish','GarageQual',
                            'GarageCond','PavedDrive','WoodDeckSF','OpenPorchSF',
                            'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','PoolQC',
                            'Fence','MiscFeature','MiscVal','MoSold','SaleType','SaleCondition',
                            'MSSubClass','LotArea','GrLivArea','TotRmsAbvGrd',
                            'BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','GarageArea','YearRemodAdd',
                            'OverallCond'])

def function(x):
    if x == 'None':
        return 0
    elif x == 'Po':
        return 1
    elif x == 'Fa':
        return 2
    elif x == 'TA':
        return 3
    elif x == 'Gd':
        return 4
    elif x == 'Ex':
        return 5
    
new_data['TotalFlrSF'] = new_data['2ndFlrSF'] + new_data['1stFlrSF']
new_data['ExterQual_rank'] = new_data['ExterQual'].apply(function)
new_data['BsmtQual_rank'] = new_data['BsmtQual'].apply(function)
new_data['KitchenQual_rank'] = new_data['KitchenQual'].apply(function)

new_data = new_data.drop(columns = ['1stFlrSF','2ndFlrSF','KitchenQual','BsmtQual','ExterQual'])
new_data = pd.get_dummies(new_data)

x = new_data.drop('SalePrice',axis = 1).values[:train_data.shape[0],:]
y = new_data['SalePrice'].values[:train_data.shape[0]]
x.shape,y.shape

((1460, 153), (1460,))

In [26]:
LR_model = model_selection.cross_validate(linear_model.LinearRegression(),x,y,cv = 5)
print(LR_model)

model = linear_model.LinearRegression()
model.fit(x,y)

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

{'fit_time': array([0.07369518, 0.05784369, 0.05077553, 0.04787254, 0.04685545]), 'score_time': array([0.0009985 , 0.00099754, 0.0009973 , 0.        , 0.00099778]), 'test_score': array([0.86850548, 0.81147249, 0.83362757, 0.82721198, 0.73896047])}
0.8746492671698336 0.8532318792071826
0.8805008922257005 0.82540322925227
0.8571467703799964 0.9120813179465227
0.8696273436728824 0.8755470537768065
0.8705095480605844 0.8712124778989483




In [27]:
Lasso_model = linear_model.LassoCV(cv = 5)
Lasso_model.fit(x,y)
Lasso_model.alpha_

29660.447248571938

In [28]:
Ridge_model = linear_model.RidgeCV(cv = 5)
Ridge_model.fit(x,y)
Ridge_model.alpha_

10.0

In [29]:
Lasso_model = model_selection.cross_validate(linear_model.Lasso(alpha = 10),x,y,cv = 5)
Lasso_model,Lasso_model['test_score'].mean()

({'fit_time': array([0.0738852 , 0.0676198 , 0.06385303, 0.06492043, 0.0534966 ]),
  'score_time': array([0.00099754, 0.00099754, 0.00099826, 0.0009973 , 0.        ]),
  'test_score': array([0.87089518, 0.81598122, 0.84102397, 0.83253985, 0.74393762])},
 0.8208755693469074)

In [30]:
Ridge_model = model_selection.cross_validate(linear_model.Ridge(alpha = 10),x,y,cv = 5)
Ridge_model

{'fit_time': array([0.00697875, 0.00685787, 0.00673318, 0.00698256, 0.00699019]),
 'score_time': array([0.00099707, 0.00099683, 0.00100017, 0.00099611, 0.00099707]),
 'test_score': array([0.87783017, 0.82309797, 0.85238789, 0.84796611, 0.7480761 ])}

In [31]:
Lasso_model = model_selection.cross_validate(linear_model.Lasso(alpha = 29660.4472),x,y,cv = 5)
Lasso_model,Lasso_model['test_score'].mean()

({'fit_time': array([0.00997806, 0.00893903, 0.00902176, 0.00897694, 0.00861454]),
  'score_time': array([0.0010004 , 0.00099587, 0.00095129, 0.00199294, 0.00103736]),
  'test_score': array([0.72836929, 0.70573376, 0.76329635, 0.73839179, 0.51761279])},
 0.6906807937184885)

In [32]:
Forest_model = model_selection.cross_validate(ensemble.RandomForestRegressor(n_estimators = 100,criterion = 'mse',ccp_alpha = 0),x,y,cv = 5)
Forest_model

{'fit_time': array([1.30587935, 1.27998376, 1.29908848, 1.31031466, 1.28456545]),
 'score_time': array([0.00996947, 0.00996923, 0.01107574, 0.00997376, 0.00997376]),
 'test_score': array([0.85960077, 0.82265345, 0.87094106, 0.89852835, 0.81130807])}

In [33]:
params = {
    'n_estimators' : [100,300],
    'ccp_alpha' : [0,10000,100000]
}
ensemble.RandomForestRegressor()
Forest_model = model_selection.GridSearchCV(ensemble.RandomForestRegressor(),param_grid = params,cv = 5)
Forest_model.fit(x,y)
Forest_model.cv_results_

{'mean_fit_time': array([1.29135933, 4.96015468, 3.2345222 , 8.57043791, 3.13033257,
        9.18152623]),
 'std_fit_time': array([0.00919978, 1.69552539, 0.08908359, 0.05153756, 0.23132549,
        0.43968332]),
 'mean_score_time': array([0.00997124, 0.04668202, 0.02274261, 0.05844259, 0.02254248,
        0.0591526 ]),
 'std_score_time': array([1.98102742e-06, 1.72904122e-02, 1.16786346e-03, 1.01810833e-03,
        2.79144021e-03, 4.95348235e-03]),
 'param_ccp_alpha': masked_array(data=[0, 0, 10000, 10000, 100000, 100000],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[100, 300, 100, 300, 100, 300],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'ccp_alpha': 0, 'n_estimators': 100},
  {'ccp_alpha': 0, 'n_estimators': 300},
  {'ccp_alpha': 10000, 'n_estimators': 100},
  {'ccp_alpha': 10000, 'n_esti

In [34]:
params = {
    'n_estimators' : [100,300],
    'max_depth' : [2,3,4,5]
}
ensemble.GradientBoostingRegressor()
GB_model = model_selection.GridSearchCV(ensemble.GradientBoostingRegressor(),param_grid = params,cv = 5)
GB_model.fit(x,y)
GB_model.cv_results_

{'mean_fit_time': array([0.79357982, 2.34072123, 0.96136599, 2.87859759, 1.24407153,
        3.75936108, 1.19241152, 2.1350997 ]),
 'std_fit_time': array([0.01369357, 0.03761995, 0.00488559, 0.01935829, 0.00573357,
        0.01299936, 0.3970802 , 0.01306478]),
 'mean_score_time': array([0.00218482, 0.00337968, 0.00219421, 0.00399017, 0.00239468,
        0.00418787, 0.00219455, 0.00259213]),
 'std_score_time': array([3.85214113e-04, 4.75956022e-04, 4.00642979e-04, 3.98950589e-07,
        4.89103800e-04, 4.00167510e-04, 7.46697462e-04, 4.89337564e-04]),
 'param_max_depth': masked_array(data=[2, 2, 3, 3, 4, 4, 5, 5],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[100, 300, 100, 300, 100, 300, 100, 300],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2, 'n_estim

In [35]:
model = ensemble.GradientBoostingRegressor(max_depth = 2,n_estimators = 300)
model.fit(x,y)

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))
    

0.9534299182166313 0.9509597158079481
0.9531789013705366 0.9520005269540724
0.9503815980708948 0.9606891580510674
0.9541577701393864 0.9473993997472412
0.9535271567666224 0.9509081061596868




In [36]:
estimators = [
    ('ridge',linear_model.Ridge(alpha = 10)),
    ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
             ]
model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 200),cv = 5)
model.fit(x,y)
print(model.score(x,y))

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

0.9147453945948149
0.908922212787362 0.9405930389242156
0.9099781843782027 0.9366740129994813
0.9019697621411668 0.9540075706427469
0.9061191208579646 0.9547013810228653
0.9471938642299361 0.797878304463098




In [37]:
estimators = [
    ('ridge',linear_model.Ridge(alpha = 10)),
    ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
             ]
model = ensemble.StackingRegressor(estimators = estimators,final_estimator = linear_model.LinearRegression(),cv = 5)
model.fit(x,y)
print(model.score(x,y))

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

0.9683116049949786
0.9693641894522642 0.9636283199904485
0.9686374069768232 0.9667731520087274
0.9664110512293217 0.9739806335338445
0.969137447994409 0.9644087119822913
0.9678141794341643 0.9700363077185347




In [38]:
# estimators = [
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 300),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [39]:
# estimators = [
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 100),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [40]:
# estimators = [
#     ('tree',ensemble.RandomForestRegressor(ccp_alpha = 100000,n_estimators = 300)),
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 100),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [41]:
# estimators = [
#     ('tree',ensemble.RandomForestRegressor(ccp_alpha = 100000,n_estimators = 300)),
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 200),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [42]:
# estimators = [
#     ('tree',ensemble.RandomForestRegressor(ccp_alpha = 100000,n_estimators = 300)),
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 300),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [43]:
# estimators = [
#     ('tree',ensemble.RandomForestRegressor(ccp_alpha = 0,n_estimators = 100)),
#     ('ridge',linear_model.Ridge(alpha = 10)),
#     ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
#              ]
# model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 300),cv = 5)
# model.fit(x,y)
# model.score(x,y)

In [44]:
estimators = [
    ('tree',ensemble.RandomForestRegressor(ccp_alpha = 0,n_estimators = 100)),
    ('ridge',linear_model.Ridge(alpha = 10)),
    ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
             ]
model = ensemble.StackingRegressor(estimators = estimators,final_estimator = ensemble.GradientBoostingRegressor(n_estimators = 100),cv = 5)
model.fit(x,y)
print(model.score(x,y))

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

0.9084730765265879
0.9009396820554096 0.9419163594116438




0.9077990589983124 0.9114891627306732
0.8994394305526595 0.9359529781790523
0.8976952007111151 0.9584242786231904
0.9370441632442617 0.8055375092855344


In [45]:
estimators = [
    ('tree',ensemble.RandomForestRegressor(ccp_alpha = 0,n_estimators = 100)),
    ('ridge',linear_model.Ridge(alpha = 10)),
    ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
             ]
model = ensemble.StackingRegressor(estimators = estimators,final_estimator = linear_model.LinearRegression(),cv = 5)
model.fit(x,y)
print(model.score(x,y))

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

0.9692062182524153
0.970220753536924 0.9646921669261959




0.9694262735323319 0.9681571434513786
0.9674122196709967 0.9745494394056848
0.970075666790088 0.965102252752526
0.968711428559339 0.97092325374439


In [46]:
estimators = [
    ('tree',ensemble.RandomForestRegressor(ccp_alpha = 100000,n_estimators = 300)),
    ('ridge',linear_model.Ridge(alpha = 10)),
    ('gb',ensemble.GradientBoostingRegressor(n_estimators = 300))
             ]
model = ensemble.StackingRegressor(estimators = estimators,final_estimator = linear_model.LinearRegression(),cv = 5)
model.fit(x,y)
print(model.score(x,y))

for train_index,test_index in skf.split(x,y):
    x_train,x_test = x[train_index],x[test_index]
    y_train,y_test = y[train_index],y[test_index]
    print(model.score(x_train,y_train), model.score(x_test,y_test))

0.9696782948480677




0.9706733782702643 0.9652507642610789
0.9697707250711649 0.9692192029667819
0.9679635720012448 0.974777370495784
0.9706134302318803 0.9652697296024888
0.9691925323267438 0.971363822174337
