# Steps

In [None]:
# steps
# 1. split df into num and cat features
# 2. fill NaN values
# 3. impute missing values (num)
# 4. create scaler (num)
# 5. make num_transformer pipe
# 6. impute missing values (cat)
# 7. create encoder (cat)
# 8. make cat_transformer pipe
# 9. make column transformer (combine num/cat transformers)
# 10. add model to final pipe

In [None]:
# TODO: consider training on a log-transformed target, e.g. y_log = np.log(1 + y)

# Libraries


In [1]:
import numpy as np
import pandas as pd
import xgboost
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


# Data Set


In [None]:
# Read both splits, using the "Id" column as the row index.
# Checks still to run on the loaded frames:
#   - shape
#   - dtypes
#   - duplicates
#   - missing-value counts, plus a look at the affected values
train_df = pd.read_csv("data/train.csv", index_col="Id")
test_df = pd.read_csv("data/test.csv", index_col="Id")

In [None]:
# (n_rows, n_columns) of the training frame.
train_df.shape

# Preprocessing Features

In [None]:
# Remove fully duplicated rows. The result is rebound to the name instead of
# using inplace=True, so the frame object produced by read_csv is not mutated.
train_df = train_df.drop_duplicates()

In [None]:
# Pull the target column out first, then keep only the feature columns.
# Note: re-running this cell fails, because 'SalePrice' is gone after the drop.
y_train = train_df.loc[:, 'SalePrice']
train_df = train_df.drop('SalePrice', axis=1)

In [None]:
# Number of feature columns per dtype.
train_df.dtypes.value_counts()

# Numerical Features

In [None]:
# Numeric columns: no encoding required, but they do need scaling.
# NOTE: despite the "Obj" in their names, these two variables hold the
# int64/float64 subsets here; the categorical section rebinds the same names.
X_train_Obj = train_df.select_dtypes(include=('int64', 'float64'))
# The test subset must be selected from test_df, not from the training frame.
X_test_obj = test_df.select_dtypes(include=('int64', 'float64'))

In [None]:
# Numeric branch: median-impute the gaps, then rescale with MinMaxScaler.
pipe_num = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scalier', MinMaxScaler()),
    ]
)
# Ask the pipeline to hand back pandas DataFrames from transform().
pipe_num = pipe_num.set_output(transform='pandas')

# Ordinal encoder

In [None]:
# Rating scales that several features share. Every scale starts with the
# 'missing' placeholder, followed by the real levels.
_SCALE_PO_TO_EX = ('missing', 'Po', 'Fa', 'TA', 'Gd', 'Ex')
_SCALE_FA_TO_EX = ('missing', 'Fa', 'TA', 'Gd', 'Ex')
_SCALE_BSMT_FINISH = ('missing', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ')

# Ordered category levels per ordinal feature. "missing" is treated as the
# neutral level and is listed first everywhere. Each value is its own list
# (list(...) copies the shared tuple), and the key order is kept as-is because
# the encoder's category lists are built from it.
feat_ordinal_dict = {
    "BsmtCond": ['missing', 'Po', 'Fa', 'TA', 'Gd'],
    "BsmtExposure": ['missing', 'No', 'Mn', 'Av', 'Gd'],
    "BsmtFinType1": list(_SCALE_BSMT_FINISH),
    "BsmtFinType2": list(_SCALE_BSMT_FINISH),
    "BsmtQual": list(_SCALE_FA_TO_EX),
    "Electrical": ['missing', 'Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],
    "ExterCond": list(_SCALE_PO_TO_EX),
    "ExterQual": list(_SCALE_FA_TO_EX),
    "Fence": ['missing', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
    "FireplaceQu": list(_SCALE_PO_TO_EX),
    "Functional": ['missing', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    "GarageCond": list(_SCALE_PO_TO_EX),
    "GarageFinish": ['missing', 'Unf', 'RFn', 'Fin'],
    "GarageQual": list(_SCALE_PO_TO_EX),
    "HeatingQC": list(_SCALE_PO_TO_EX),
    "KitchenQual": list(_SCALE_FA_TO_EX),
    "LandContour": ['missing', 'Low', 'Bnk', 'HLS', 'Lvl'],
    "LandSlope": ['missing', 'Sev', 'Mod', 'Gtl'],
    "LotShape": ['missing', 'IR3', 'IR2', 'IR1', 'Reg'],
    "PavedDrive": ['missing', 'N', 'P', 'Y'],
    "PoolQC": ['missing', 'Fa', 'Gd', 'Ex'],
}

# Categorical Features

In [None]:
# Categorical (object-dtype) columns: no scaling required, but they must be
# encoded.

X_train_Obj = train_df.select_dtypes(include='object')
# The test subset must be selected from test_df, not from the training frame.
X_test_obj = test_df.select_dtypes(include='object')

In [None]:
# Object columns with a natural order; these go through the ordinal encoder.
cat_ord = [
    "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "BsmtQual",
    "Electrical",
    "ExterCond", "ExterQual",
    "Fence", "FireplaceQu", "Functional",
    "GarageCond", "GarageFinish", "GarageQual",
    "HeatingQC", "KitchenQual",
    "LandContour", "LandSlope", "LotShape",
    "PavedDrive", "PoolQC",
]

# Object columns with no inherent order; these are one-hot encoded.
cat_ohe = [
    'MSZoning', 'Street', 'Alley', 'Utilities',
    'LotConfig', 'Neighborhood',
    'Condition1', 'Condition2',
    'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation',
    'Heating', 'CentralAir',
    'GarageType', 'MiscFeature',
    'SaleType', 'SaleCondition',
]

In [None]:
# One category list per ordinal column, in the order of cat_ord. The encoder
# pairs category lists with columns by position, so the order is taken from
# cat_ord itself rather than from the dict's insertion order. Indexing with []
# raises KeyError for a column that has no scale defined, where .get() would
# silently put None in the list.
cat = [feat_ordinal_dict[feat] for feat in cat_ord]

In [None]:
# Ordinal branch: fill gaps with a placeholder, then encode each column by the
# position of its value in the matching list of `cat`.
# The placeholder must be spelled exactly like the first entry of every scale
# in feat_ordinal_dict ('missing', lowercase). With 'Missing' the filled value
# was not one of the declared categories, so it was handled as unknown and
# encoded as -1 instead of 0.
pipe_cat_ord = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # Levels that are not listed in a scale are encoded as -1.
    ('encoder', OrdinalEncoder(categories=cat, handle_unknown="use_encoded_value", unknown_value=-1)),
])

In [None]:
# Nominal branch: fill gaps with the 'Missing' placeholder, then one-hot encode.
pipe_cat_ohe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        (
            'encoder',
            OneHotEncoder(
                sparse_output=False,      # dense output
                handle_unknown='ignore',  # unseen levels do not raise at transform time
                drop='if_binary',         # one column is enough for two-level features
            ),
        ),
    ]
)

In [None]:
# Route the ordered columns to the ordinal branch and the unordered ones to
# the one-hot branch.
pipe_cat = ColumnTransformer(
    transformers=[
        ('pipe_ord', pipe_cat_ord, cat_ord),
        ('pipe_ohe', pipe_cat_ohe, cat_ohe),
    ]
)
# Ask for pandas DataFrames from transform().
pipe_cat = pipe_cat.set_output(transform='pandas')

In [None]:
# Display the ordinal-encoding pipeline.
pipe_cat_ord


In [None]:
# Display the one-hot-encoding pipeline.
pipe_cat_ohe

In [None]:
# Display the combined categorical column transformer.
pipe_cat

In [None]:
# Display the numeric pipeline.
pipe_num

# Preprocessor

In [None]:
# Full preprocessing step: object-dtype columns go to the categorical
# transformer, int64/float64 columns go to the numeric pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('pipe_cat', pipe_cat, make_column_selector(dtype_include='object')),
        ('pipe_num', pipe_num, make_column_selector(dtype_include=('int64', 'float64'))),
    ]
)
# Ask for pandas DataFrames from transform().
preprocessor = preprocessor.set_output(transform='pandas')
preprocessor

In [None]:
# Fit the preprocessor on the training features and keep the transformed
# frame. fit() alone returns the fitted transformer, not the data, so
# fit_transform() is what actually yields the preprocessed features.
preprocessed_data = preprocessor.fit_transform(train_df, y_train)

# Modeling

In [None]:
def model_regressor(preprocessor, regression_model):
    """Build an unfitted pipeline: preprocessing followed by a regressor.

    Parameters
    ----------
    preprocessor : transformer
        Feature-preprocessing step. It is cloned, so every pipeline built by
        this function owns an independent, unfitted copy. Without the clone,
        all pipelines would share one object and fitting one pipeline would
        refit the preprocessor used by the others.
    regression_model : estimator
        Final regressor, stored under the step name "model".

    Returns
    -------
    Pipeline
        Pipeline with the steps 'preprocessor' and 'model', configured so its
        transformers output pandas DataFrames.
    """
    pipe = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ("model", regression_model)
    ]).set_output(transform='pandas')
    return pipe

In [None]:
# Random forest model. random_state pins the forest's random sampling so a
# fresh Restart & Run All reproduces the same fitted model and predictions.
Model_RFR = model_regressor(preprocessor, RandomForestRegressor(random_state=42))
Model_RFR.fit(train_df, y_train)


In [None]:
# Gradient boosting model. random_state fixes the estimator's internal
# randomness so repeated runs of the notebook give the same fitted model.
Model_GB = model_regressor(preprocessor, GradientBoostingRegressor(random_state=42))
Model_GB.fit(train_df, y_train)

In [None]:
# Gradient-boosting predictions for the test set; these are the values
# written to the submission file further down.
z_pred = Model_GB.predict(test_df)

# Predictions

In [None]:
# Random-forest predictions for the test set (not used by the later cells).
y_pred = Model_RFR.predict(test_df)

In [None]:
# New frame = test features plus the gradient-boosting predictions in a
# 'SalePrice' column; test_df itself is left unchanged.
test1_df = test_df.assign(SalePrice=z_pred)
test1_df

In [None]:
# Keep only the prediction column and move the index back into a regular
# column, giving the two-column submission table.
output = test1_df['SalePrice'].to_frame().reset_index()
output

In [None]:
# Write the submission table to CSV; index=False leaves out the positional
# row index that reset_index() created.
output.to_csv('Gradient_Boosting_Regressor_Submission.csv', index=False)