In [19]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [20]:
# Load the competition's train and test tables.
train_csv = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
test_csv = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
data_train, data_test = map(pd.read_csv, (train_csv, test_csv))

In [21]:
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [22]:
def summary(text, df):
    """Print the shape of ``df`` and return a per-column overview table.

    Parameters
    ----------
    text : str
        Label printed in front of the shape.
    df : pandas.DataFrame
        Frame to describe; it may mix numeric and non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``dtypes``, ``null``
        (count of missing values), ``unique`` (count of distinct values),
        ``min``, ``median``, ``max``, ``mean`` and ``std``.  The five
        statistics are computed for numeric columns only and are NaN for
        every other column.
    """
    print(f'{text} shape: {df.shape}')
    summ = df.dtypes.to_frame('dtypes')
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    # numeric_only=True keeps the reductions from raising TypeError on
    # object columns; the result is aligned on the column names, so the
    # non-numeric rows are simply left as NaN.
    for stat in ('min', 'median', 'max', 'mean', 'std'):
        summ[stat] = getattr(df, stat)(numeric_only=True)
    return summ

In [23]:
# Columns dropped because most of their values are missing.
HIGH_NULL_COLS = ['MiscFeature', 'PoolQC', 'Fence', 'Alley']


def _split_and_impute(df, float_fill_values):
    """Split ``df`` by dtype and clean the float and object parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw train or test frame.
    float_fill_values : dict
        Mapping ``column name -> value`` used to fill gaps in float64 columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(int_part, float_part, object_part)``: the int64 columns unchanged,
        the float64 columns with the listed gaps filled, and the object
        columns without ``HIGH_NULL_COLS``.
    """
    int_part = df.select_dtypes('int64')
    float_part = df.select_dtypes('float64').fillna(float_fill_values)
    object_part = df.select_dtypes('object').drop(HIGH_NULL_COLS, axis=1)
    return int_part, float_part, object_part


# The fill values are learned on the training set only and reused for the test
# set, so a missing value is replaced by the same number in both frames.
float_fill_values = {'LotFrontage': data_train['LotFrontage'].mean(),
                     'GarageYrBlt': data_train['GarageYrBlt'].median(),
                     'MasVnrArea': data_train['MasVnrArea'].median()}

# Split the train and test sets by dtype.
data_train_int, data_train_float, data_train_obj = _split_and_impute(data_train, float_fill_values)
data_test_int, data_test_float, data_test_obj = _split_and_impute(data_test, float_fill_values)

In [24]:
column_names = ['GarageCond','GarageFinish','GarageQual','GarageType','Electrical','BsmtFinType2','BsmtFinType1',
                'BsmtExposure','BsmtCond','BsmtQual','MasVnrType']

# Fill the gaps in these categorical columns: forward-fill from the previous
# row, then back-fill whatever is still missing at the top of the column.
# `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)` spelling.
# NOTE(review): the filled value depends on row order — confirm this is the
# intended imputation for these columns.
for col in column_names:
    data_train_obj[col] = data_train_obj[col].ffill().bfill()
    data_test_obj[col] = data_test_obj[col].ffill().bfill()

In [25]:
# Glue the per-dtype pieces back into the train and test frames
# (column order becomes: object, int64, float64).
frames1, frames2 = (
    [data_train_obj, data_train_int, data_train_float],
    [data_test_obj, data_test_int, data_test_float],
)
data_train, data_test = (pd.concat(parts, axis='columns') for parts in (frames1, frames2))

In [26]:
# Absolute correlation of every numeric feature with the others, sorted by its
# correlation with the target (SalePrice), strongest first.
# Numeric columns are selected explicitly: DataFrame.corr() raises on object
# columns in pandas >= 2.0 instead of silently skipping them.
data_train.select_dtypes(include='number').corr().abs().sort_values('SalePrice', ascending=False)

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,LotFrontage,MasVnrArea,GarageYrBlt
SalePrice,0.021917,0.084284,0.263843,0.790982,0.077856,0.522897,0.507101,0.38642,0.011378,0.214479,0.613581,0.605852,0.319334,0.025606,0.708624,0.227122,0.016844,0.560664,0.284108,0.168213,0.135907,0.533723,0.466929,0.640409,0.623431,0.324413,0.315856,0.128578,0.044584,0.111447,0.092404,0.02119,0.046432,0.028923,1.0,0.334901,0.472614,0.466754
OverallQual,0.028365,0.032628,0.105806,1.0,0.091932,0.572323,0.550684,0.239666,0.059119,0.308159,0.537808,0.476224,0.295493,0.030429,0.593007,0.111098,0.04015,0.5506,0.273458,0.101676,0.183882,0.427452,0.396765,0.600671,0.562022,0.238923,0.308819,0.113937,0.030371,0.064886,0.065166,0.031406,0.070815,0.027347,0.790982,0.234196,0.407252,0.514231
GrLivArea,0.008273,0.074853,0.263116,0.593007,0.079686,0.19901,0.287389,0.208171,0.00964,0.240257,0.454868,0.566024,0.687501,0.134683,1.0,0.034836,0.018918,0.630012,0.415772,0.52127,0.100063,0.825489,0.461679,0.467247,0.468997,0.247433,0.330224,0.009113,0.020643,0.10151,0.170205,0.002416,0.05024,0.036526,0.708624,0.368392,0.388052,0.219801
GarageCars,0.01657,0.04011,0.154871,0.600671,0.185758,0.53785,0.420622,0.224054,0.038264,0.214175,0.434585,0.439317,0.183926,0.09448,0.467247,0.131881,0.020891,0.469672,0.219178,0.086106,0.050634,0.362289,0.300789,1.0,0.882475,0.226342,0.213569,0.151434,0.035765,0.050494,0.020934,0.04308,0.040522,0.039117,0.640409,0.269729,0.361945,0.474313
GarageArea,0.017634,0.098672,0.180403,0.562022,0.151521,0.478954,0.3716,0.29697,0.018227,0.183303,0.486665,0.489782,0.138347,0.067601,0.468997,0.179189,0.024536,0.405656,0.163549,0.065253,0.064433,0.337822,0.269141,0.882475,1.0,0.224666,0.241435,0.121777,0.035087,0.051412,0.061047,0.0274,0.027974,0.027378,0.623431,0.323663,0.370884,0.468804
TotalBsmtSF,0.015415,0.238518,0.260833,0.537808,0.171098,0.391452,0.291066,0.522396,0.10481,0.41536,1.0,0.81953,0.174512,0.033245,0.454868,0.307351,0.000315,0.323722,0.048804,0.05045,0.068901,0.285573,0.339519,0.434585,0.486665,0.232019,0.247264,0.095478,0.037384,0.084489,0.126053,0.018479,0.013196,0.014969,0.613581,0.363358,0.360067,0.309386
1stFlrSF,0.010496,0.251758,0.299475,0.476224,0.144203,0.281986,0.240379,0.445863,0.097117,0.317987,0.81953,1.0,0.202646,0.014241,0.566024,0.244671,0.001956,0.380637,0.119916,0.127401,0.068101,0.409516,0.410531,0.439317,0.489782,0.235459,0.211671,0.065292,0.056104,0.088758,0.131525,0.021096,0.031372,0.013604,0.605852,0.414266,0.33985,0.224897
FullBath,0.005587,0.131608,0.126031,0.5506,0.194149,0.468271,0.439046,0.058543,0.076444,0.288886,0.323722,0.380637,0.421378,0.00071,0.630012,0.064512,0.054536,1.0,0.136381,0.363252,0.133115,0.554784,0.243671,0.469672,0.405656,0.187703,0.259977,0.115093,0.035353,0.008106,0.049604,0.01429,0.055872,0.019669,0.560664,0.180424,0.272999,0.466809
TotRmsAbvGrd,0.027239,0.04038,0.190015,0.427452,0.057583,0.095589,0.19174,0.044316,0.035227,0.250647,0.285573,0.409516,0.616423,0.131185,0.825489,0.053275,0.023836,0.554784,0.343415,0.67662,0.256045,1.0,0.326114,0.362289,0.337822,0.165984,0.234192,0.004151,0.006683,0.059383,0.083757,0.024763,0.036907,0.034516,0.533723,0.320146,0.279568,0.139519
YearBuilt,0.012713,0.02785,0.014228,0.572323,0.375983,1.0,0.592855,0.249503,0.049107,0.14904,0.391452,0.281986,0.010308,0.183784,0.19901,0.187599,0.038162,0.468271,0.242656,0.070651,0.1748,0.095589,0.147716,0.53785,0.478954,0.22488,0.188686,0.387268,0.031355,0.050364,0.00495,0.034383,0.012398,0.013618,0.522897,0.117598,0.3116,0.777182


In [27]:
# Drop the features below the 0.05 threshold.
# NOTE(review): Neighborhood / Exterior1st / Exterior2nd are not in the numeric
# correlation table above — confirm why they are dropped here.
weak_features = ['BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'YrSold', '3SsnPorch', 'MoSold',
                 'Neighborhood', 'Exterior1st', 'Exterior2nd']
# 'Id' is removed from train only; the test frame keeps it for the submission.
data_train = data_train.drop(columns=weak_features + ['Id'])
data_test = data_test.drop(columns=weak_features)

In [28]:
# Per-column mapping from category label to integer code, used to replace the
# values of the categorical columns (it is passed to DataFrame.replace below).
# NOTE(review): the codes do not appear to follow any ranking (e.g. ExterQual:
# Gd=1, TA=2, Ex=3, Fa=4) — confirm they are meant as arbitrary identifiers.
cleanup_nums = {"MSZoning":     {"RL": 1, "RM": 2, "C (all)": 3, "FV": 4, "RH": 5},
                "Street":       {"Pave": 1, "Grvl": 2},
                "LotShape":     {"Reg": 1, "IR1": 2, "IR2": 3, "IR3": 4},
                "LandContour":  {"Lvl": 1, "Bnk": 2, "Low": 3, "HLS": 4},
                "Utilities":    {"AllPub": 1, "NoSeWa": 2},
                "LotConfig":    {"Inside": 1, "FR2": 2, "Corner": 3, "CulDSac": 4, "FR3": 5},
                "LandSlope":    {"Gtl": 1, "Mod": 2, "Sev": 3},
                "Condition1":   {"Norm": 1, "Feedr": 2, "PosN": 3, "Artery": 4, "RRAe": 5, "RRNn": 6, "RRAn": 7, "PosA": 8, "RRNe": 9},
                "Condition2":   {"Norm": 1, "Artery": 2, "RRNn": 3, "Feedr": 4, "PosN": 5, "PosA": 6, "RRAn": 7, "RRAe": 8 },
                "BldgType":     {"1Fam": 1, "2fmCon": 2, "Duplex": 3, "TwnhsE": 4, "Twnhs": 5},
                "HouseStyle":   {"2Story": 1, "1Story": 2, "1.5Fin": 3, "1.5Unf": 4, "SFoyer": 5, "SLvl": 6, "2.5Unf": 7, "2.5Fin": 8},
                "RoofStyle":    {"Gable": 1, "Hip": 2, "Gambrel": 3, "Mansard": 4, "Flat": 5, "Shed": 6},
                "RoofMatl":     {"CompShg": 1, "WdShngl": 2, "Metal": 3, "WdShake": 4, "Membran": 5, "Tar&Grv": 6, "Roll": 7, "ClyTile": 8},
                "MasVnrType":   {"BrkFace": 1, "None": 2, "Stone": 3, "BrkCmn": 4},
                "ExterQual":    {"Gd": 1, "TA": 2, "Ex": 3, "Fa": 4},
                "ExterCond":    {"TA": 1, "Gd": 2, "Fa": 3, "Po": 4, "Ex": 5},
                "Foundation":   {"PConc": 1, "CBlock": 2, "BrkTil": 3, "Wood": 4, "Slab": 5, "Stone": 6},
                "BsmtQual":     {"Gd": 1, "TA": 2, "Ex": 3, "Fa": 4},
                "BsmtCond":     {"TA": 1, "Gd": 2, "Fa": 3, "Po": 4},
                "BsmtExposure": {"No": 1, "Gd": 2, "Mn": 3, "Av": 4},
                "BsmtFinType1": {"GLQ": 1, "ALQ": 2, "Unf": 3, "Rec": 4, "BLQ": 5, "LwQ": 6},
                "BsmtFinType2": {"Unf": 1, "BLQ": 2, "ALQ": 3, "Rec": 4, "LwQ": 5, "GLQ": 6},
                "Heating":      {"GasA": 1, "GasW": 2, "Grav": 3, "Wall": 4, "OthW": 5, "Floor": 6},
                "HeatingQC":    {"Ex": 1, "Gd": 2, "TA": 3, "Fa": 4, "Po": 5},
                "CentralAir":   {"Y": 1, "N": 2},
                "Electrical":   {"SBrkr": 1, "FuseF": 2, "FuseA": 3, "FuseP": 4, "Mix": 5},
                "KitchenQual":  {"Gd": 1, "TA": 2, "Ex": 3, "Fa": 4},
                "Functional":   {"Typ": 1, "Min1": 2, "Maj1": 3, "Min2": 4, "Mod": 5, "Maj2": 6, "Sev": 7},
                "FireplaceQu":  {"TA": 1, "Gd": 2, "Fa": 3, "Ex": 4, "Po": 5},
                "GarageType":   {"Attchd": 1, "Detchd": 2, "BuiltIn": 3, "CarPort": 4, "Basment": 5, "2Types": 6},
                "GarageFinish": {"RFn": 1, "Unf": 2, "Fin": 3},
                "GarageQual":   {"TA": 1, "Fa": 2, "Gd": 3, "Ex": 4, "Po": 5},
                "GarageCond":   {"TA": 1, "Fa": 2, "Gd": 3, "Ex": 4, "Po": 5},
                "PavedDrive":   {"Y": 1, "N": 2, "P": 3},
                "SaleType":     {"WD": 1, "New": 2, "COD": 3, "ConLD": 4, "ConLI": 5, "CWD": 6, "ConLw": 7, "Con": 8, "Oth": 9},
                "SaleCondition":{"Normal": 1, "Abnorml": 2, "Partial": 3, "AdjLand": 4, "Alloca": 5, "Family": 6}}

# Swap the category labels for their integer codes in both frames.
data_train, data_test = (frame.replace(cleanup_nums) for frame in (data_train, data_test))

# Final gap fills: the train FireplaceQu column gets its own median,
# every value still missing in the test frame becomes 1.
fireplace_median = data_train['FireplaceQu'].median()
data_train['FireplaceQu'] = data_train['FireplaceQu'].fillna(fireplace_median)
data_test = data_test.fillna(1)

In [29]:
# Remove the features that correlate weakly with the target (same list for train and test).
low_corr_features = [
    'BsmtQual', 'Condition2', 'Utilities', 'MasVnrType', 'RoofMatl', 'Condition1', 'Street',
    'FireplaceQu', 'BsmtCond', 'PoolArea', 'LandContour', 'MSSubClass', 'OverallCond',
    'SaleType', 'GarageQual', 'BsmtFinType2', 'LandSlope',
]
data_train = data_train.drop(columns=low_corr_features)
data_test = data_test.drop(columns=low_corr_features)

In [30]:
# Drop the row labelled 1458, then add a column holding the mean of the target:
# this constant is the model's starting prediction.
# NOTE(review): the reason for removing row 1458 is not stated — confirm.
data_train = (
    data_train
    .drop(index=1458)
    .assign(SalePrice_pred=lambda frame: frame['SalePrice'].mean())
)

In [31]:
n, nu = 100, 0.1       # Number of trees and learning rate
trees = []             # Fitted trees, in boosting order

# Features used by every tree (defined once instead of being repeated per call).
feature_cols = ['MSZoning','LotShape','LotConfig','BldgType','HouseStyle','RoofStyle','ExterQual','GarageArea',
                'ExterCond','Foundation','BsmtExposure','BsmtFinType1','Heating','HeatingQC','CentralAir',
                'Electrical','KitchenQual','Functional','GarageType','GarageFinish','GarageCond','PavedDrive',
                'SaleCondition','LotArea','OverallQual','YearBuilt','YearRemodAdd','1stFlrSF','2ndFlrSF',
                'GrLivArea','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces',
                'WoodDeckSF','OpenPorchSF','EnclosedPorch','ScreenPorch','LotFrontage','MasVnrArea','BsmtFinSF1',
                'BsmtUnfSF','TotalBsmtSF','BsmtFullBath','GarageYrBlt','GarageCars']
X_train = data_train[feature_cols]     # The features do not change inside the loop

# Restart from the constant baseline. `trees` is reset above, so without this a
# re-run of the cell would fit the new trees on residuals of the previous run,
# and "mean + sum of trees" would no longer reproduce the training prediction.
data_train['SalePrice_pred'] = data_train['SalePrice'].mean()

for i in range(n):
    # Residual = what the current prediction still gets wrong.
    data_train['Residual'] = data_train['SalePrice'] - data_train['SalePrice_pred']
    # Depth-5 tree fitted on the residuals. A fixed random_state makes the run
    # reproducible: scikit-learn permutes the features at every split, so
    # equally good splits can otherwise be picked differently between runs.
    tree = DecisionTreeRegressor(max_depth=5, random_state=0)
    tree.fit(X_train, data_train['Residual'])

    # Move the prediction a small step (nu) towards the tree's correction.
    data_train['SalePrice_pred'] += nu * tree.predict(X_train)
    trees.append(tree)
    # Print the mean absolute error to watch it decrease.
    print(mean_absolute_error(data_train['SalePrice'], data_train['SalePrice_pred']))

52592.725720881346
48130.72560960813
44144.60486702908
40535.22360845679
37303.70373000639
34413.79786483729
31870.588028832197
29511.93947085328
27406.227136067246
25491.969736966017
23837.569174031058
22329.91213822445
21003.7344617623
19820.432415783376
18748.518894863748
17774.93334483388
16975.62935242314
16160.922872930056
15441.169894852124
14787.120741433706
14169.091101307638
13584.037505266067
13096.64983333493
12666.494274991855
12297.728697908185
11952.58635557789
11648.511878458396
11395.39460127004
11177.128661002422
10957.405128631555
10733.711694096643
10561.110115431246
10388.665994446641
10214.798116074347
10076.909015978677
9935.530710645646
9794.665545801272
9662.03914533542
9527.752937099236
9428.392659025765
9285.851264191386
9200.517247046828
9116.122791494061
9009.000590820091
8948.266118937054
8851.85455587029
8779.047422434061
8700.329773156192
8641.527385916599
8577.705202035939
8525.581951365952
8416.523273698567
8355.80645123159
8294.827862970884
8235.49607

In [32]:
# Apply the ensemble to the test set: start from the train mean of the target
# and add each tree's correction scaled by the learning rate.
feature_cols = ['MSZoning','LotShape','LotConfig','BldgType','HouseStyle',
                'RoofStyle','ExterQual','GarageArea','ExterCond','Foundation','BsmtExposure','BsmtFinType1','Heating',
                'HeatingQC','CentralAir','Electrical','KitchenQual','Functional','GarageType','GarageFinish',
                'GarageCond','PavedDrive','SaleCondition','LotArea','OverallQual','YearBuilt','YearRemodAdd',
                '1stFlrSF','2ndFlrSF','GrLivArea','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd',
                'Fireplaces','WoodDeckSF','OpenPorchSF','EnclosedPorch','ScreenPorch','LotFrontage','MasVnrArea',
                'BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','GarageYrBlt','GarageCars']
X_test = data_test[feature_cols]

test_pred = pd.Series(data_train['SalePrice'].mean(), index=data_test.index)
for fitted_tree in trees:
    test_pred = test_pred + nu * fitted_tree.predict(X_test)

data_test['SalePrice_pred'] = test_pred

In [33]:
# Submission table: the test Id next to the predicted price, named SalePrice.
df_submit = (
    data_test[['Id', 'SalePrice_pred']]
    .rename(columns={'SalePrice_pred': 'SalePrice'})
)

In [34]:
# Display the submission frame (Id and predicted SalePrice) for a visual check.
df_submit

Unnamed: 0,Id,SalePrice
0,1461,117324.046298
1,1462,155217.034110
2,1463,184396.815509
3,1464,183858.764459
4,1465,193710.502018
...,...,...
1454,2915,80249.642206
1455,2916,86314.716744
1456,2917,164772.410141
1457,2918,117032.107044
