# House Prices solution with Pipeline 
- Pipeline (make_pipeline)
- Column Transformers (make_column_transformer)
- Linear Regression Model
- Basic Transformers (SimpleImputer, RobustScaler, OneHotEncoder)
- Performance Measure - R² score (the default scoring of `cross_val_score` for regressors)

## Import Libraries

In [20]:
import numpy as np 
import pandas as pd 

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import sklearn.model_selection as model_selection

#import xgboost as xgb
#import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')


## Import Data

In [21]:
# Load train.csv (used for fitting) and test.csv (used for the submission predictions)
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

## Check Information of Columns

In [22]:
# Print each column's dtype and non-null count to see which features have missing values
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

## Select Features as per Data Type

In [23]:
# Column names grouped by dtype, so numeric and categorical features can be
# preprocessed differently. The target (SalePrice) and the row identifier (Id)
# are removed before picking numeric columns so they are not used as features.
num_cols = (
    train.drop(columns=['SalePrice', 'Id'])
    .select_dtypes(include="number")
    .columns
)
cat_cols = train.select_dtypes(include="object").columns

## Divide Data into Target and X Variables

In [38]:
# Features: every column except the target and the row identifier
X = train.drop(columns=['SalePrice', 'Id'])
# Target: the sale price to be predicted
y = train['SalePrice']

## Split House Prices Train data into Train and Test Dataset

In [39]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

## Make Pipeline

In [63]:
# Per-dtype preprocessing. Columns not listed in num_cols or cat_cols are
# dropped (make_column_transformer's default remainder='drop').
preprocessor = make_column_transformer(
    # Numeric features: fill missing values with the column median, then
    # scale with RobustScaler.
    (
        make_pipeline(
            SimpleImputer(strategy='median'),
            RobustScaler(),
        ),
        num_cols,
    ),
    # Categorical features: fill missing values with the most frequent
    # category, then one-hot encode. handle_unknown='ignore' keeps transform
    # from failing on categories that were not seen during fit.
    # NOTE: SimpleImputer only reads fill_value when strategy='constant', so
    # a fill_value passed together with 'most_frequent' has no effect and is
    # deliberately not given here. To impute a literal 'missing' category
    # instead, use strategy='constant', fill_value='missing' (this changes
    # the fitted model and its scores).
    (
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(categories='auto', handle_unknown='ignore'),
        ),
        cat_cols,
    ),
)

In [64]:
# Chain the column preprocessing and a LinearRegression model into a single
# estimator, so preprocessing is re-fitted whenever the pipeline is fitted.
pipe = make_pipeline(
    preprocessor,
    LinearRegression(),
)

## Check Train and Test Scores

In [65]:
#Train Accuracy : 0.654 +/- 0.36
#Test Accuracy : 0.282 +/- 0.42

In [66]:
# 7-fold cross-validation of the whole pipeline on the training split.
# No `scoring` argument is passed, so the estimator's own score method is
# used; for LinearRegression that is R^2, even though the labels printed
# below say "Accuracy".
train_scores = cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=7)

# 7-fold cross-validation on the hold-out split.
# NOTE(review): this re-fits the pipeline on folds of X_test instead of
# scoring a model fitted on X_train, so it is not a true hold-out score.
# Consider pipe.fit(X_train, y_train).score(X_test, y_test) instead.
test_scores = cross_val_score(estimator=pipe, X=X_test, y=y_test, cv=7)

# Report mean and standard deviation of the per-fold scores
print(f'Train Accuracy : {train_scores.mean():.3f} +/- {train_scores.std():.2f}')
print(f'Test Accuracy : {test_scores.mean():.3f} +/- {test_scores.std():.2f}')

Train Accuracy : 0.654 +/- 0.36
Test Accuracy : 0.282 +/- 0.42


## Fit Model 

In [53]:
# Fit the preprocessing + LinearRegression pipeline on the training split
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

## Extract only X Variables for Prediction

In [54]:
# Id is not a model feature (it was also dropped when building X), so remove
# it before predicting.
X_submission = test.drop(columns=['Id'])
# Display the remaining feature names
X_submission.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

## Make Prediction and Export CSV

In [55]:
# Predict SalePrice for the submission rows with the fitted pipeline
prediction = pipe.predict(X_submission)

In [56]:
# Build the two-column submission (Id, predicted SalePrice) and write it to
# CSV without the DataFrame index.
submission_df = pd.DataFrame(
    {
        'Id': test['Id'],
        'SalePrice': prediction,
    }
)
submission_df.to_csv('Reg_Model_Pipeline.csv', index=False)

In [48]:
## Scores RMSE 0.18479 when Submitted to Kaggle