# House Prices solution with Pipeline 
- Pipeline (make_pipeline)
- Column Transformers (make_colunm_tranformer)
- Logisitic Model
- Basic Transformers (SimpleImputer, KBinsDiscretizer, OneHotEncoder)
- Performance Measure - Accuracy

## Import Libraries

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import sklearn.model_selection as model_selection

#import xgboost as xgb
#import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')


## Import Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Check Information of Columns

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

## Select Features as per Data Type

In [4]:
num_cols = train.drop("SalePrice", axis=1).select_dtypes("number").columns
cat_cols = train.select_dtypes("object").columns

## Divide Data into Target and X Variables

In [5]:
X = train.select_dtypes("number").drop("SalePrice", axis=1)
y = train.SalePrice

## Split Titanic Train data into Train and Test Dataset

In [6]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2, random_state = 200)

## Make Seperate Pielines for each Data Type

In [7]:
# we instantiate a first Pipeline, that processes our numerical values
numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer()),
        ('scaler', RobustScaler())])

# the same we do for categorical data
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

## Make Pipeline

In [8]:
preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)])

In [9]:
pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LinearRegression())])

## Check Train and Test Scores

In [15]:
#Error - need to resolve
cross_val_score(pipe, X_train, y_train)

array([nan, nan, nan, nan, nan])

In [10]:
#Train_Score
train_scores = cross_val_score(pipe, X_train, y_train, cv = 7)

#Test Score
test_scores = cross_val_score(pipe, X_test, y_test, cv = 7)

#Print Train and Test Score
print(f'Train Accuracy : {train_scores.mean():.3f} +/- {train_scores.std():.2f}')
print(f'Test Accuracy : {test_scores.mean():.3f} +/- {test_scores.std():.2f}')

Train Accuracy : nan +/- nan
Test Accuracy : nan +/- nan


## Fit Model 

In [None]:
pipe.fit(X_train,y_train)

## Extract only X Variables for Prediction

In [None]:
X_submission = test.drop(['Id'], axis = 1)
X_submission.columns

## Make Prediction and Export CSV

In [None]:
prediction = pipe.predict(X_submission)

In [None]:
#Create Submission dataframe 
submission_df = pd.DataFrame({'PassengerId' : test['PassengerId'], 'Survived' : prediction})

submission_df.to_csv('LOG_Model_Pipeline.csv', index = False)

In [None]:
## Scores 0.77033 when Submitted to Kaggle