In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [2]:
data = pd.read_csv('train-2.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# находим кол-венные признаки.
num_feat = data.select_dtypes(include=[np.number])
num_feat.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [4]:
# проверяем наличие пробелов.
pd.DataFrame(num_feat.isnull().sum().sort_values(ascending=False)[:10])

Unnamed: 0,0
LotFrontage,259
GarageYrBlt,81
MasVnrArea,8
BsmtFinSF1,0
LowQualFinSF,0
2ndFlrSF,0
1stFlrSF,0
TotalBsmtSF,0
BsmtUnfSF,0
BsmtFinSF2,0


In [74]:
# заполняем пропуски медианами.
data = data.fillna(data.median())

In [6]:
# категориальные признаки.
cat_feat = data.select_dtypes(exclude = [np.number])
cat_feat.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [7]:
# проверяем наличие пробелов.
pd.DataFrame(cat_feat.isnull().sum().sort_values(ascending=False)[:20])

Unnamed: 0,0
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
FireplaceQu,690
GarageCond,81
GarageQual,81
GarageFinish,81
GarageType,81
BsmtFinType2,38


In [8]:
# Данные по некоторым признакам отсутствуют более чем на 50%, 
# так что от них можно избавиться.
cat_feat.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
# заполняем пробелы на значение 'unknown'.
for cat in cat_feat:
    cat_feat[cat].fillna('unknown', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [27]:
cat_feat = pd.get_dummies(cat_feat)

In [30]:
df = num_feat.join(cat_feat)

In [26]:
# пробелов нет.
df.isnull().sum().sum()

0

In [44]:
# удаляем 'ID'.
df = df.drop('Id', 1)

In [46]:
X = df.drop(['SalePrice'], axis=1)
y = df['SalePrice']

In [47]:
# разделяем выборки.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [79]:
# строим randomforest.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth=4, random_state=42)
model.fit(X,y)

RandomForestRegressor(max_depth=4, random_state=42)

In [80]:
# выявляем важность признаков.
feat_imp = pd.DataFrame({'feature': X.columns, 'imp': model.feature_importances_})
feat_imp.sort_values(by='imp', ascending='').head()

Unnamed: 0,feature,imp
3,OverallQual,0.695311
15,GrLivArea,0.11178
13,2ndFlrSF,0.033804
11,TotalBsmtSF,0.031476
25,GarageCars,0.028421


In [77]:
y_pred = model.predict(X_train)

In [78]:
print('R^2:', metrics.r2_score(y_train, y_pred))
print('MAE:', metrics.mean_absolute_error(y_train, y_pred))

R^2: 0.7823329655080782
MAE: 25785.496711992062


In [106]:
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import StackingRegressor

In [120]:
# обучаем стекинг.
estimators = [('ECV', ElasticNetCV()), ('lr', RidgeCV()),('svr', LinearSVR(random_state=42))]
reg = StackingRegressor(estimators=estimators, 
final_estimator=RandomForestRegressor(n_estimators=10, random_state=42))

In [119]:
reg.fit(X_train, y_train).score(X_test, y_test)



0.9038826321258349