In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import regression, make_scorer ,mean_squared_error, mean_absolute_error
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2,rfe
import time
import bisect
from custom_preprocessor import custom_preprocessor
from ModifiedLabelEncoder import ModifiedLabelEncoder

In [2]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
pipe = Pipeline(steps=[
    ('imputer',custom_preprocessor()),
    ('label_encoding',ModifiedLabelEncoder()),
#     ('feature_selection', SelectKBest(chi2,k = 30 )),
    ('model', AdaBoostRegressor(base_estimator=RandomForestRegressor(n_estimators=20)))
])

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('SalePrice',1),data.SalePrice,random_state= 42,  test_size = 0.3)

In [5]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('imputer', custom_preprocessor()),
                ('label_encoding', ModifiedLabelEncoder()),
                ('model',
                 AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True,
                                                                        criterion='mse',
                                                                        max_depth=None,
                                                                        max_features='auto',
                                                                        max_leaf_nodes=None,
                                                                        min_impurity_decrease=0.0,
                                                                        min_impurity_split=None,
                                                                        min_samples_leaf=1,
                                                                        min_samples_split=2,
        

In [6]:
pipe.score(X_test, y_test)

0.9086037455049816

In [7]:
mean_absolute_error(pipe.predict(X_test),y_test)

15895.25091324201

In [8]:
y_test.mean(), y_test.std()

(180007.70319634702, 83630.5084217593)

In [9]:
start = time.time()
pipe.predict(X_test.iloc[0].to_frame().T)
end = time.time()
print(str(end - start))
# 0.08819437026977539 with gradient boosting
# 0.13999724388122559 with adaboost
# 0.1629941463470459 with adaboost:base estimator:random forest with 20 estimator

0.279996395111084


In [10]:
y_pred = pipe.predict(X_test.iloc[:20])
pd.DataFrame({'true':data.SalePrice.iloc[:20], 'pred': y_pred}).head()

Unnamed: 0,true,pred
0,208500,137815.0
1,181500,320687.5
2,223500,114095.0
3,140000,158597.5
4,250000,334707.5


In [11]:
import joblib

In [13]:
joblib.dump(pipe,'pipe.joblib')

['pipe.joblib']