In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import regression, make_scorer ,mean_squared_error, mean_absolute_error
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import time


In [59]:
# Read the training table from 'train.csv' (resolved relative to the working
# directory) and preview the first rows. The frame includes the 'SalePrice'
# column that later cells use as the regression target.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [354]:
class ModifiedLabelEncoder(LabelEncoder):
    """Label-encode the categorical (object-dtype) columns of a DataFrame.

    The stock ``LabelEncoder`` encodes a single 1-D array and its
    ``fit``/``transform`` take no ``X, y`` pair; this subclass accepts
    ``(X, y)`` so it can be used as a ``Pipeline`` step.

    Attributes set by ``fit``:
        encoders: dict mapping column name -> fitted ``LabelEncoder``.
    """

    def fit(self, X, y=None):
        """Learn one encoder per object-dtype column of ``X``.

        The label ``'No'`` is always added to each encoder's classes so it can
        be encoded even if it never occurs in the fitted column.

        Parameters:
            X: DataFrame whose object-dtype columns are treated as categorical.
            y: ignored; accepted for Pipeline compatibility.

        Returns:
            self
        """
        self.encoders = dict()
        for column in X.select_dtypes('object').columns:
            seen = LabelEncoder().fit(X[column])
            self.encoders[column] = LabelEncoder().fit(list(seen.classes_) + ['No'])
        return self

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded copy."""
        return self.fit(X, y).transform(X)

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the fitted categorical columns encoded.

        Parameters:
            X: DataFrame to encode; it is not modified.
            y: ignored; accepted for Pipeline compatibility.

        Returns:
            DataFrame with each fitted column replaced by integer codes.

        Raises:
            ValueError: if a column holds a label not seen in ``fit``
                (raised by ``LabelEncoder.transform``).
        """
        res = X.copy()
        # Drive the loop from the columns learned in fit, not from the dtypes
        # of the incoming frame: a frame built from a single row (e.g.
        # ``row.to_frame().T``) is all object dtype, and re-detecting
        # categoricals there would look up encoders that were never fitted.
        for column, encoder in self.encoders.items():
            if column not in res.columns:
                continue
            # Plain column assignment of the 1-D codes gives the column an
            # integer dtype instead of writing a 2-D array through ``.loc``.
            res[column] = encoder.transform(res[column])
        return res

In [355]:
class custom_preprocessor( BaseEstimator, TransformerMixin):
    """Impute missing values and align columns with the training frame.

    Categorical (object-dtype) columns get ``'No'`` for missing values and
    numeric columns get ``0``. Columns seen in ``fit`` but absent at
    ``transform`` time are created with those fill values.

    Attributes set by ``fit``:
        categorical_columns: object-dtype column labels of the fitted frame.
        numeric_columns: numeric column labels of the fitted frame.
        feature_columns: all column labels of the fitted frame, in order.
    """

    def fit(self, X, y = None ):
        """Record which columns of ``X`` are categorical / numeric, and their order.

        Parameters:
            X: training DataFrame.
            y: ignored; accepted for Pipeline compatibility.

        Returns:
            self
        """
        self.categorical_columns = X.select_dtypes('object').columns
        self.numeric_columns = X.select_dtypes('number').columns
        self.feature_columns = list(X.columns)
        return self

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its imputed copy."""
        return self.fit(X).transform(X)

    def transform(self, X, y=None):
        """Return an imputed copy of ``X`` laid out like the fitted frame.

        Parameters:
            X: DataFrame to impute; it is not modified.
            y: ignored; accepted for Pipeline compatibility.

        Returns:
            DataFrame with fitted columns first (in fitted order), followed by
            any columns that were not seen in ``fit``.
        """
        res = X.copy()

        # Decide "missing" by column presence, not by dtype. A column that is
        # present but arrives with a different dtype (e.g. every column is
        # object after ``row.to_frame().T``) must keep its values rather than
        # be overwritten with the fill value.
        for column in self.categorical_columns:
            if column in res.columns:
                res[column] = res[column].fillna('No')
            else:
                res[column] = 'No'
        for column in self.numeric_columns:
            if column in res.columns:
                # Restore a numeric dtype for values that arrived as objects;
                # entries that cannot be parsed are treated as missing.
                res[column] = pd.to_numeric(res[column], errors='coerce').fillna(0)
            else:
                res[column] = 0.0

        # Columns created above are appended at the end; put everything back
        # in the fitted order so positional consumers downstream see the same
        # feature layout as during fit. ``getattr`` keeps instances fitted
        # before ``feature_columns`` existed working unchanged.
        fitted_order = [c for c in getattr(self, 'feature_columns', []) if c in res.columns]
        known = set(fitted_order)
        extras = [c for c in res.columns if c not in known]
        return res[fitted_order + extras]

In [356]:
# End-to-end model: impute -> label-encode categoricals -> boosted random forests.
# random_state pins the boosting procedure so repeated runs give the same
# fitted model; AdaBoost uses it to seed each base estimator it creates, so
# the RandomForestRegressor does not need its own seed here.
pipe = Pipeline(steps=[
    ('imputer',custom_preprocessor()),
    ('label_encoding',ModifiedLabelEncoder()),
    ('model', AdaBoostRegressor(base_estimator=RandomForestRegressor(n_estimators=20),
                                random_state=42))
])

In [357]:
# X_train, X_test, y_train, y_test = train_test_split(cleaning_pipe.fit_transform(data.drop('SalePrice',1)),data.SalePrice, random_state = 42, test_size = 0.3)

In [358]:
# Features are every column except the target. Use the ``columns=`` keyword:
# passing the axis positionally (``drop('SalePrice', 1)``) is no longer
# accepted by current pandas.
X = data.drop(columns='SalePrice')
pipe.fit(X, data.SalePrice)


Pipeline(memory=None,
         steps=[('imputer', custom_preprocessor()),
                ('label_encoding', ModifiedLabelEncoder()),
                ('model',
                 AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True,
                                                                        criterion='mse',
                                                                        max_depth=None,
                                                                        max_features='auto',
                                                                        max_leaf_nodes=None,
                                                                        min_impurity_decrease=0.0,
                                                                        min_impurity_split=None,
                                                                        min_samples_leaf=1,
                                                                        min_samples_split=2,
        

In [359]:
# Score of the final estimator (R^2 for scikit-learn regressors) on X and
# data.SalePrice, i.e. the same rows the pipeline was fitted on. This is a
# training-set score, not an estimate of performance on unseen data.
pipe.score(X, data.SalePrice)

0.9956824867686117

In [360]:
# Mean absolute error on the training rows, with arguments in the
# (y_true, y_pred) order of the function's signature. The metric is
# symmetric, so the value does not depend on the order.
mean_absolute_error(data.SalePrice, pipe.predict(X))

4478.6041780821915

In [12]:
# Latency of predicting a single row. ``X.iloc[[0]]`` (list indexer) yields a
# one-row DataFrame that keeps each column's dtype, and uses the ``X`` defined
# in this notebook (no train/test split is created in the visible cells).
# perf_counter is the monotonic, high-resolution clock meant for intervals.
start = time.perf_counter()
pipe.predict(X.iloc[[0]])
end = time.perf_counter()
print(end - start)
# Previously recorded timings (seconds):
# 0.08819437026977539 with gradient boosting
# 0.13999724388122559 with adaboost
# 0.1629941463470459 with adaboost:base estimator:random forest with 20 estimator

0.18000245094299316


In [386]:
# Put the actual sale prices next to the pipeline's predictions for the same
# rows and show the first few.
y_pred = pipe.predict(X)
comparison_columns = {'true': data.SalePrice, 'pred': y_pred}
pd.DataFrame(comparison_columns).head()

Unnamed: 0,true,pred
0,208500,204300.0
1,181500,175840.0
2,223500,218825.0
3,140000,144000.0
4,250000,253050.0


In [366]:
import joblib

In [378]:
# Persist the fitted pipeline. The notebook never defines ``export_pipe``;
# the fitted object is ``pipe``.
# NOTE(review): loading this file requires custom_preprocessor and
# ModifiedLabelEncoder to be importable/defined in the loading process —
# confirm how the consumer provides them.
joblib.dump(pipe,'pipe.joblib')

['pipe.joblib']