In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [2]:
data = pd.read_csv('train-2.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
train = pd.read_csv('train-2.csv')
test = pd.read_csv('test.csv')

In [4]:
sampsub = pd.read_csv('samplesub.csv')['SalePrice']
sampsub.shape

(1459,)

In [5]:
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)

In [6]:
y_train = train.SalePrice.values
data = pd.concat((train, test)).reset_index(drop=True)
data.drop(['SalePrice'], axis=1, inplace=True)

In [7]:
# находим кол-венные признаки.
num_feat = data.select_dtypes(include=[np.number])
num_feat.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,548.0,0,61,0,0,0,0,0,2,2008
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,460.0,298,0,0,0,0,0,0,5,2007
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,608.0,0,42,0,0,0,0,0,9,2008
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,642.0,0,35,272,0,0,0,0,2,2006
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,836.0,192,84,0,0,0,0,0,12,2008


In [8]:
pd.DataFrame(num_feat.isnull().sum().sort_values(ascending=False)[:10])

Unnamed: 0,0
LotFrontage,486
GarageYrBlt,159
MasVnrArea,23
BsmtHalfBath,2
BsmtFullBath,2
GarageArea,1
BsmtFinSF1,1
BsmtFinSF2,1
BsmtUnfSF,1
TotalBsmtSF,1


In [9]:
# заполняем пропуски медианами.
num_feat = num_feat.fillna(num_feat.median())

In [10]:
# категориальные признаки.
cat_feat = data.select_dtypes(exclude = [np.number])
cat_feat.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [11]:
# проверяем наличие пробелов.
pd.DataFrame(cat_feat.isnull().sum().sort_values(ascending=False)[:20])

Unnamed: 0,0
PoolQC,2909
MiscFeature,2814
Alley,2721
Fence,2348
FireplaceQu,1420
GarageCond,159
GarageQual,159
GarageFinish,159
GarageType,157
BsmtCond,82


In [12]:
# Данные по некоторым признакам отсутствуют более чем на 50%, 
# так что от них можно избавиться.
data.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu'], axis=1, inplace=True)

In [13]:
# заполняем пробелы на значение 'unknown'.
for cat in cat_feat:
    cat_feat[cat].fillna('unknown', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [14]:
cat_feat = pd.get_dummies(cat_feat)

In [15]:
data = num_feat.join(cat_feat)

In [16]:
# пробелов нет.
data.isnull().sum().sum()

0

In [17]:
X_train = data[:1460]
y_train = train['SalePrice']
X_test = data[1460:]
y_test = sampsub

In [21]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [22]:
# строим randomforest.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth=4, random_state=42)
model.fit(X_train_std, y_train)

RandomForestRegressor(max_depth=4, random_state=42)

In [23]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kf = KFold(n_splits=10)
results = cross_val_score(model, X_train_std, y_train, cv=kf)
results.std(), results.mean()

(0.037313100013145896, 0.8003448135047064)

In [24]:
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import StackingRegressor

In [58]:
# обучаем стекинг.
estimators = [('ECV', ElasticNetCV()), ('lr', RidgeCV()),('svr', LinearSVR(random_state=42))]
reg = StackingRegressor(estimators=estimators, 
final_estimator= RandomForestRegressor(n_estimators=10, random_state=42))

In [50]:
results2 = cross_val_score(reg, X_train_std, y_train, cv=kf)
results2.std(), results2.mean()

(0.05296700661293405, 0.8378730660405012)

In [59]:
reg.fit(X_train, y_train).score(X_test, y_test)



-19.877094242334557

In [None]:
# не могу понять, в чем ошибка.