In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn_pandas import DataFrameMapper, CategoricalImputer, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import r2_score
import xgboost as xgb
import catboost as cb



from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

sns.set_style('darkgrid')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [6]:
target = 'SalePrice'
y = np.log(df[target])
X = df.drop(target, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
mapper = DataFrameMapper([
    (['MSSubClass'], [SimpleImputer(), StandardScaler()]),
    (['MSZoning'], [CategoricalImputer(), LabelBinarizer()]),
    (['LotFrontage'], [SimpleImputer(), StandardScaler()]),
    (['LotArea'], [SimpleImputer(), StandardScaler()]),
    (['Street'], [CategoricalImputer(), LabelBinarizer()]),
    (['Alley'], [CategoricalImputer(), LabelBinarizer()]),
    (['LotShape'], [CategoricalImputer(), LabelBinarizer()]),
    (['LandContour'], [CategoricalImputer(), LabelBinarizer()]),
    (['Utilities'], [CategoricalImputer(), LabelBinarizer()]),
    (['LotConfig'], [CategoricalImputer(), LabelBinarizer()]),
    (['LandSlope'], [CategoricalImputer(), LabelBinarizer()]),
    (['Neighborhood'], [CategoricalImputer(), LabelBinarizer()]),
    (['Condition1'], [CategoricalImputer(), LabelBinarizer()]),
    (['Condition2'], [CategoricalImputer(), LabelBinarizer()]),
    (['BldgType'], [CategoricalImputer(), LabelBinarizer()]),
    (['HouseStyle'], [CategoricalImputer(), LabelBinarizer()]),
    (['OverallQual'], [SimpleImputer(), StandardScaler()]),
    (['OverallCond'], [SimpleImputer(), StandardScaler()]),
    (['YearBuilt'], [SimpleImputer(), StandardScaler()]),
    (['YearRemodAdd'], [SimpleImputer(), StandardScaler()]),
    (['RoofStyle'], [CategoricalImputer(), LabelBinarizer()]),
    (['RoofMatl'], [CategoricalImputer(), LabelBinarizer()]),
    (['Exterior1st'], [CategoricalImputer(), LabelBinarizer()]),
    (['Exterior2nd'], [CategoricalImputer(), LabelBinarizer()]),
    (['MasVnrType'], [CategoricalImputer(), LabelBinarizer()]),
    (['MasVnrArea'], [SimpleImputer(), StandardScaler()]),
    (['ExterQual'], [CategoricalImputer(), LabelBinarizer()]),
    (['ExterCond'], [CategoricalImputer(), LabelBinarizer()]),
    (['Foundation'], [CategoricalImputer(), LabelBinarizer()]),
    (['BsmtQual'], [CategoricalImputer(), LabelBinarizer()]),
    (['BsmtCond'], [CategoricalImputer(), LabelBinarizer()]),
    (['BsmtExposure'], [CategoricalImputer(), LabelBinarizer()]),
    (['BsmtFinType1'], [CategoricalImputer(), LabelBinarizer()]),
    (['BsmtFinSF1'], [SimpleImputer(), StandardScaler()]),
    (['BsmtFinType2'], [CategoricalImputer(), LabelBinarizer()]),
    (['BsmtFinSF2'], [SimpleImputer(), StandardScaler()]),
    (['BsmtUnfSF'], [SimpleImputer(), StandardScaler()]),
    (['TotalBsmtSF'], [SimpleImputer(), StandardScaler()]),
    (['Heating'], [CategoricalImputer(), LabelBinarizer()]),
    (['HeatingQC'], [CategoricalImputer(), LabelBinarizer()]),
    (['CentralAir'], [CategoricalImputer(), LabelBinarizer()]),
    (['Electrical'], [CategoricalImputer(), LabelBinarizer()]),
    (['1stFlrSF'], [SimpleImputer(), StandardScaler()]),
    (['2ndFlrSF'], [SimpleImputer(), StandardScaler()]),
    (['LowQualFinSF'], [SimpleImputer(), StandardScaler()]),
    (['GrLivArea'], [SimpleImputer(), StandardScaler()]),
    (['BsmtFullBath'], [SimpleImputer(), StandardScaler()]),
    (['BsmtHalfBath'], [SimpleImputer(), StandardScaler()]),
    (['FullBath'], [SimpleImputer(), StandardScaler()]),
    (['HalfBath'], [SimpleImputer(), StandardScaler()]),
    (['BedroomAbvGr'], [SimpleImputer(), StandardScaler()]),
    (['KitchenAbvGr'], [SimpleImputer(), StandardScaler()]),
    (['KitchenQual'], [CategoricalImputer(), LabelBinarizer()]),
    (['TotRmsAbvGrd'], [SimpleImputer(), StandardScaler()]),
    (['Functional'], [CategoricalImputer(), LabelBinarizer()]),
    (['Fireplaces'], [SimpleImputer(), StandardScaler()]),
    (['FireplaceQu'], [CategoricalImputer(), LabelBinarizer()]),
    (['GarageType'], [CategoricalImputer(), LabelBinarizer()]),
    (['GarageYrBlt'], [SimpleImputer(), StandardScaler()]),
    (['GarageFinish'], [CategoricalImputer(), LabelBinarizer()]),
    (['GarageCars'], [SimpleImputer(), StandardScaler()]),
    (['GarageArea'], [SimpleImputer(), StandardScaler()]),
    (['GarageQual'], [CategoricalImputer(), LabelBinarizer()]),
    (['GarageCond'], [CategoricalImputer(), LabelBinarizer()]),
    (['PavedDrive'], [CategoricalImputer(), LabelBinarizer()]),
    (['WoodDeckSF'], [SimpleImputer(), StandardScaler()]),
    (['OpenPorchSF'], [SimpleImputer(), StandardScaler()]),
    (['EnclosedPorch'], [SimpleImputer(), StandardScaler()]),
    (['3SsnPorch'], [SimpleImputer(), StandardScaler()]),
    (['ScreenPorch'], [SimpleImputer(), StandardScaler()]),
    (['PoolArea'], [SimpleImputer(), StandardScaler()]),
#     (['PoolQC'], [CategoricalImputer(), LabelBinarizer()]),
    (['Fence'], [CategoricalImputer(), LabelBinarizer()]),
    (['MiscFeature'], [CategoricalImputer(), LabelBinarizer()]),
    (['MiscVal'], [SimpleImputer(), StandardScaler()]),
    (['MoSold'], [SimpleImputer(), StandardScaler()]),
    (['YrSold'], [SimpleImputer(), StandardScaler()]),
    (['SaleType'], [CategoricalImputer(), LabelBinarizer()]),
    (['SaleCondition'], [CategoricalImputer(), LabelBinarizer()]),
 ], df_out=True)

In [8]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [9]:
Z_train

Unnamed: 0,MSSubClass,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,LotFrontage,LotArea,Street,Alley,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1023,1.475911,0,0,0,1,0,-1.202174,-0.683950,1,0,...,0,0,0,1,0,0,0,0,1,0
810,-0.871228,0,0,0,1,0,0.333763,-0.054883,1,0,...,0,0,0,1,0,0,0,0,1,0
1384,-0.167086,0,0,0,1,0,-0.456147,-0.152524,1,0,...,0,0,0,1,0,0,0,0,1,0
626,-0.871228,0,0,0,1,0,0.000000,0.144198,1,0,...,0,0,0,1,0,0,0,0,1,0
813,-0.871228,0,0,0,1,0,0.202111,-0.090142,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,-0.871228,0,0,0,1,0,0.333763,-0.129289,1,0,...,0,0,0,1,0,0,0,0,1,0
1130,-0.167086,0,0,0,1,0,-0.236728,-0.266078,1,0,...,0,0,0,1,0,0,0,0,1,0
1294,-0.871228,0,0,0,1,0,-0.456147,-0.232808,1,0,...,0,0,0,1,0,0,0,0,1,0
860,-0.167086,0,0,0,1,0,-0.675567,-0.280725,1,0,...,0,0,0,1,0,0,0,0,1,0


In [10]:
model = LinearRegression()
model.fit(Z_train,y_train)
print(model.score(Z_train,y_train))
print(model.score(Z_test, y_test))

0.944864575834955
-7.702014783684037e+17


In [11]:
# Set up a list of alphas to check.
enet_alphas = np.linspace(0.5, 1.0, 100)

# Set up our l1 ratio
enet_ratio = 0.5

# Instantiate model.
enet_model = ElasticNetCV(alphas=enet_alphas, l1_ratio=enet_ratio, cv=5)

# Fit model using optimal alpha.
enet_model = enet_model.fit(Z_train, y_train)

# Generate predictions.
enet_model_preds = enet_model.predict(Z_test)
enet_model_preds_train = enet_model.predict(Z_train)

# Evaluate model.
print(r2_score(y_test, enet_model_preds))
print(r2_score(y_train, enet_model_preds_train))

0.2081873474635203
0.20653762629904904


In [12]:
model = RandomForestRegressor()
model.fit(Z_train,y_train)
print(model.score(Z_train,y_train))
print(model.score(Z_test, y_test))

0.9801659186457944
0.8825091725123815


In [23]:

# GridSearchCV to find best params for the pipe
model = cb.CatBoostRegressor()
params = {
    'iterations': [100,500],
    'learning_rate': [0.1,0.3,0.7],
    'depth': [4, 10],
    'early_stopping_rounds':[10,20],
    'l2_leaf_reg': [1,3,5]
}
grid = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=1)
grid.fit(Z_train, y_train)
print(grid.best_score_)
print(grid.best_params_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 15.3min finished


0:	learn: 0.3684440	total: 49.2ms	remaining: 24.5s
1:	learn: 0.3469480	total: 51.2ms	remaining: 12.8s
2:	learn: 0.3293081	total: 53.1ms	remaining: 8.79s
3:	learn: 0.3098824	total: 55.1ms	remaining: 6.83s
4:	learn: 0.2961471	total: 57.5ms	remaining: 5.69s
5:	learn: 0.2819061	total: 59.7ms	remaining: 4.92s
6:	learn: 0.2677814	total: 61.8ms	remaining: 4.35s
7:	learn: 0.2558077	total: 64.4ms	remaining: 3.96s
8:	learn: 0.2447986	total: 66.8ms	remaining: 3.65s
9:	learn: 0.2346905	total: 69.2ms	remaining: 3.39s
10:	learn: 0.2257729	total: 71.7ms	remaining: 3.19s
11:	learn: 0.2173261	total: 74.2ms	remaining: 3.02s
12:	learn: 0.2094821	total: 76.7ms	remaining: 2.87s
13:	learn: 0.2015804	total: 79.1ms	remaining: 2.75s
14:	learn: 0.1955268	total: 81.6ms	remaining: 2.64s
15:	learn: 0.1895260	total: 84ms	remaining: 2.54s
16:	learn: 0.1840101	total: 86.4ms	remaining: 2.45s
17:	learn: 0.1798985	total: 88.8ms	remaining: 2.38s
18:	learn: 0.1755258	total: 91.1ms	remaining: 2.31s
19:	learn: 0.1711780	tot

208:	learn: 0.0687270	total: 426ms	remaining: 593ms
209:	learn: 0.0686042	total: 428ms	remaining: 591ms
210:	learn: 0.0683965	total: 430ms	remaining: 589ms
211:	learn: 0.0683156	total: 432ms	remaining: 587ms
212:	learn: 0.0681279	total: 434ms	remaining: 584ms
213:	learn: 0.0679264	total: 436ms	remaining: 582ms
214:	learn: 0.0677210	total: 438ms	remaining: 580ms
215:	learn: 0.0674958	total: 440ms	remaining: 578ms
216:	learn: 0.0673494	total: 442ms	remaining: 576ms
217:	learn: 0.0672322	total: 444ms	remaining: 574ms
218:	learn: 0.0670829	total: 446ms	remaining: 573ms
219:	learn: 0.0669075	total: 448ms	remaining: 570ms
220:	learn: 0.0668792	total: 450ms	remaining: 568ms
221:	learn: 0.0667632	total: 452ms	remaining: 566ms
222:	learn: 0.0667275	total: 453ms	remaining: 563ms
223:	learn: 0.0665680	total: 455ms	remaining: 561ms
224:	learn: 0.0664290	total: 457ms	remaining: 558ms
225:	learn: 0.0662588	total: 459ms	remaining: 556ms
226:	learn: 0.0660731	total: 463ms	remaining: 557ms
227:	learn: 

435:	learn: 0.0447299	total: 807ms	remaining: 118ms
436:	learn: 0.0446693	total: 810ms	remaining: 117ms
437:	learn: 0.0445917	total: 813ms	remaining: 115ms
438:	learn: 0.0444743	total: 816ms	remaining: 113ms
439:	learn: 0.0443987	total: 817ms	remaining: 111ms
440:	learn: 0.0443486	total: 824ms	remaining: 110ms
441:	learn: 0.0442286	total: 826ms	remaining: 108ms
442:	learn: 0.0441115	total: 828ms	remaining: 106ms
443:	learn: 0.0439965	total: 829ms	remaining: 105ms
444:	learn: 0.0439149	total: 831ms	remaining: 103ms
445:	learn: 0.0438636	total: 832ms	remaining: 101ms
446:	learn: 0.0438052	total: 834ms	remaining: 98.8ms
447:	learn: 0.0437356	total: 836ms	remaining: 97ms
448:	learn: 0.0436772	total: 837ms	remaining: 95.1ms
449:	learn: 0.0436311	total: 839ms	remaining: 93.2ms
450:	learn: 0.0435248	total: 840ms	remaining: 91.3ms
451:	learn: 0.0434778	total: 842ms	remaining: 89.4ms
452:	learn: 0.0433909	total: 844ms	remaining: 87.6ms
453:	learn: 0.0433330	total: 847ms	remaining: 85.8ms
454:	l

In [25]:
from sklearn.metrics import mean_squared_error
from math import sqrt
sqrt(mean_squared_error(y_test, grid.predict(Z_test)))

0.12263269389013653

In [31]:
out_df = pd.read_csv('test.csv')

In [32]:
ZZ_test = mapper.transform(out_df)

In [None]:
pipe.predict(X_test)

In [None]:
pipe.predict(out_df)

In [33]:
final_df = pd.DataFrame(zip(list(out_df['Id']),grid.predict(ZZ_test)), columns = ['Id','SalePrice'])

In [35]:
final_df['SalePrice'] = np.exp(final_df['SalePrice'])

In [36]:
final_df

Unnamed: 0,Id,SalePrice
0,1461,125746.510000
1,1462,160051.905570
2,1463,186228.170190
3,1464,204644.524705
4,1465,191848.623887
...,...,...
1454,2915,82463.559680
1455,2916,77301.482107
1456,2917,151664.050130
1457,2918,116090.263318


In [37]:
final_df.to_csv('sub2.csv', index=False)