In [49]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# df.info()

In [60]:
def prepare_data(df):
    #Clean function to prepare data
    df = df.drop(columns=['Alley', 'PoolQC', 'MiscFeature'])
    df.PoolArea = df.PoolArea.apply(lambda x: 1 if x>0 else 0)
    df.MasVnrType = df.MasVnrType.fillna(value='None')
    df.MasVnrArea = df.MasVnrArea.fillna(value=0)
    df = df.drop(columns='Electrical')
    df = df.drop(columns=['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
                          'BsmtFinSF2', 'BsmtUnfSF'])
    df = df.drop(columns='FireplaceQu')
    df = df.drop(columns=['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond'])
    df.Fence = df.Fence.fillna(value='None')
    df.LotFrontage = df.LotFrontage.fillna(value=df.LotFrontage.median())
    df.MSSubClass = df.MSSubClass.astype(str)
    df.GarageCars = df.GarageCars.fillna(value=0)
    df.GarageArea = df.GarageArea.fillna(value=0)
    df.TotalBsmtSF = df.TotalBsmtSF.fillna(df['1stFlrSF'])
    return df

In [5]:
df = prepare_data(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 63 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null object
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 no

In [6]:
# Preparing answers
y = df['SalePrice'].to_numpy()

In [7]:
# Dropping sale price and creating train feature
df = df.drop(columns='SalePrice')
# df['isTrain'] = 1

dummy_cols = pd.get_dummies(df).columns

length = len(dummy_cols)
i=0
while i < length:
    print(dummy_cols[i:i+10])
    i += 20

In [8]:
ddf = pd.get_dummies(df)

In [9]:
X = ddf.to_numpy()[:,1:]
X

array([[6.500e+01, 8.450e+03, 7.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [8.000e+01, 9.600e+03, 6.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [6.800e+01, 1.125e+04, 7.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [6.600e+01, 9.042e+03, 7.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [6.800e+01, 9.717e+03, 5.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [7.500e+01, 9.937e+03, 5.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Selecting best features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

bestfeatures = SelectKBest(score_func=f_regression, k=20)
fit = bestfeatures.fit(ddf,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(ddf.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(20,'Score'))  #print 10 best features

                    Specs        Score
3             OverallQual  2436.770591
12              GrLivArea  1470.585010
21             GarageCars  1013.705666
22             GarageArea   926.951287
8             TotalBsmtSF   880.341282
9                1stFlrSF   845.524488
179          ExterQual_TA   774.677019
15               FullBath   668.430296
19           TotRmsAbvGrd   580.762801
5               YearBuilt   548.665821
207        KitchenQual_TA   538.358524
6            YearRemodAdd   504.714855
204        KitchenQual_Ex   496.712958
187      Foundation_PConc   480.156815
7              MasVnrArea   419.328468
20             Fireplaces   406.503866
178          ExterQual_Gd   375.329470
176          ExterQual_Ex   372.621674
197          HeatingQC_Ex   339.398933
88   Neighborhood_NridgHt   281.282911


In [12]:
# Lets take following columns: 
# OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF, 1stFlrSF, ExterQual_Ex, ExterQual_Fa, ExterQual_Gd, ExterQual_TA,
# FullBath, TotRmsAbvGrd, YearBuilt, KitchenQual_Ex, KitchenQual_Fa, KitchenQual_Gd, KitchenQual_TA, YearRemodAdd

for cur in dummy_cols:
    if str(cur).find('KitchenQual')>-1:
        print(cur)

In [13]:
use_cols = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'ExterQual_Ex', 'ExterQual_Fa', 'ExterQual_Gd',
            'ExterQual_TA', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'KitchenQual_Ex', 'KitchenQual_Fa', 'KitchenQual_Gd', 'KitchenQual_TA', 'YearRemodAdd']

In [14]:
ddf[use_cols].head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,ExterQual_Ex,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,FullBath,TotRmsAbvGrd,YearBuilt,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,YearRemodAdd
0,7,1710,2,548,856,856,0,0,1,0,2,8,2003,0,0,1,0,2003
1,6,1262,2,460,1262,1262,0,0,0,1,2,6,1976,0,0,0,1,1976
2,7,1786,2,608,920,920,0,0,1,0,2,6,2001,0,0,1,0,2002
3,7,1717,3,642,756,961,0,0,0,1,1,7,1915,0,0,1,0,1970
4,8,2198,3,836,1145,1145,0,0,1,0,2,9,2000,0,0,1,0,2000


In [15]:
X = ddf[use_cols].to_numpy()
X

array([[   7, 1710,    2, ...,    1,    0, 2003],
       [   6, 1262,    2, ...,    0,    1, 1976],
       [   7, 1786,    2, ...,    1,    0, 2002],
       ...,
       [   7, 2340,    1, ...,    1,    0, 2006],
       [   5, 1078,    1, ...,    1,    0, 1996],
       [   5, 1256,    1, ...,    0,    1, 1965]], dtype=int64)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

[0.01, 0.03, 0.1, 0.3, 1, 3, 10]

In [18]:
model = RidgeCV(alphas=[1e-2], normalize=True, cv=10)

In [19]:
model.fit(X_train, y_train)

RidgeCV(alphas=array([0.01]), cv=10, fit_intercept=True, gcv_mode=None,
        normalize=True, scoring=None, store_cv_values=False)

In [20]:
model.alpha_

0.01

In [21]:
# model.cv_values_

In [22]:
#Best score for single model
model.score(X_test, y_test)

0.8277118217753905

In [28]:
num_rows = int(np.ceil(len(ddf[use_cols])/3))
total_len = len(ddf[use_cols])

In [29]:
fold1_X = X[:num_rows]
fold1_y = y[:num_rows]
fold2_X = X[num_rows:2*num_rows]
fold2_y = y[num_rows:2*num_rows]
fold3_X = X[2*num_rows:]
fold3_y = y[2*num_rows:]

In [35]:
fold1_y.shape

(487,)

In [37]:
np.vstack((fold1_X, fold2_X)).shape

(974, 18)

In [39]:
model1 = RidgeCV(alphas=[1e-2], normalize=True, cv=10)
model2 = LassoCV(normalize=True, cv=10)
model3 = RandomForestRegressor()

In [45]:
model1.fit(np.vstack((fold2_X, fold3_X)), np.hstack((fold2_y, fold3_y)))
model2.fit(np.vstack((fold1_X, fold3_X)), np.hstack((fold1_y, fold3_y)))
model3.fit(np.vstack((fold1_X, fold2_X)), np.hstack((fold1_y, fold2_y)))



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [46]:
rg_score = model1.score(fold1_X, fold1_y)
ls_score = model2.score(fold2_X, fold2_y)
rf_score = model2.score(fold3_X, fold3_y)
print('Ridge score: ', rg_score)
print('Lasso score: ', ls_score)
print('Random forest score: ', rf_score)

Ridge score:  0.8462369540202486
Lasso score:  0.7851675501330588
Random forest score:  0.7548435803500417


In [47]:
#Final run on test set
preds = (model1.predict(X_test) + model2.predict(X_test) + model3.predict(X_test))/3

In [50]:
r2_score(y_test, preds)

0.8891347590369738

### Существенный рост оценки с 0.82 до 0.89

In [74]:
# Let's do it for Kaggle
df = pd.read_csv('test.csv')
idx = df.Id
df = prepare_data(df)
df = pd.get_dummies(df)

df = df[use_cols]

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 18 columns):
OverallQual       1459 non-null int64
GrLivArea         1459 non-null int64
GarageCars        1459 non-null float64
GarageArea        1459 non-null float64
TotalBsmtSF       1459 non-null float64
1stFlrSF          1459 non-null int64
ExterQual_Ex      1459 non-null uint8
ExterQual_Fa      1459 non-null uint8
ExterQual_Gd      1459 non-null uint8
ExterQual_TA      1459 non-null uint8
FullBath          1459 non-null int64
TotRmsAbvGrd      1459 non-null int64
YearBuilt         1459 non-null int64
KitchenQual_Ex    1459 non-null uint8
KitchenQual_Fa    1459 non-null uint8
KitchenQual_Gd    1459 non-null uint8
KitchenQual_TA    1459 non-null uint8
YearRemodAdd      1459 non-null int64
dtypes: float64(3), int64(7), uint8(8)
memory usage: 125.5 KB


In [66]:
X_sub = df.to_numpy()
preds = (model1.predict(X_sub) + model2.predict(X_sub) + model3.predict(X_sub))/3

In [76]:
sub_df = pd.DataFrame()

In [79]:
sub_df['Id'] = idx
sub_df['SalePrice'] = preds
sub_df.head()

Unnamed: 0,Id,SalePrice
0,1461,111073.319698
1,1462,155605.811539
2,1463,169544.319948
3,1464,195207.420979
4,1465,218328.204798


In [81]:
sub_df.to_csv('my_sub.csv', index=False)

## Kaggle result - 0.16382, place - 3720. Где то в районе 60% )) Но без серьезного Feature Engineering