# Steps

In [2]:
# steps
# 1. split df into num and cat features
# 2. fill NaN values
# 3. impute missing values (num)
# 4. create scaler (num)
# 5. make num_transformer pipe
# 6. impute missing values (cat)
# 7. create encoder (cat)
# 8. make cat_transformer pipe
# 9. make column transformer (combine num/cat transformers)
# 10. add model to final pipe

In [3]:
# y_log = np.log(1+y)

# Libraries


In [4]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBRegressor


# Data Set


In [5]:
# Shape, 
# dtypes,
# duplicates
# missing values count
# look at the missing values

# Load both CSV files, using the "Id" column as the row index of each frame.
train_df = pd.read_csv("data/train.csv", index_col="Id")
test_df = pd.read_csv("data/test.csv", index_col="Id")

In [6]:
# Show the (rows, columns) of the training frame as loaded.
train_df.shape

(1460, 80)

# Preprocessing Features

In [7]:
# Remove fully duplicated rows; rebinding the name avoids mutating in place.
train_df = train_df.drop_duplicates()

In [8]:
# Separate the target column from the features.
# Not idempotent: after this cell runs, train_df no longer has 'SalePrice',
# so re-running it without reloading the data raises a KeyError.
y_train = train_df.loc[:, 'SalePrice']
train_df = train_df.drop('SalePrice', axis=1)

In [9]:
# Count how many feature columns there are of each dtype.
train_df.dtypes.value_counts()

object     43
int64      33
float64     3
dtype: int64

# Numerical Features

In [10]:
# Numeric columns: no encoding needed, but they do need to be scaled.
# NOTE: despite the "Obj" suffix these frames hold the int64/float64 columns;
# the names are kept unchanged so existing references keep working.
X_train_Obj = train_df.select_dtypes(include=('int64','float64'))
# The test-side subset is taken from test_df (not train_df) so it really is
# the held-out data rather than a second copy of the training columns.
X_test_obj = test_df.select_dtypes(include=('int64','float64'))

In [11]:
# Numeric branch: fill gaps with the column median, then apply MinMaxScaler.
# The step name 'scalier' is reproduced exactly because nested parameter
# names (e.g. 'scalier__feature_range') are derived from it.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scalier', MinMaxScaler()),
]
pipe_num = Pipeline(steps=numeric_steps).set_output(transform='pandas')

# Ordinal encoder

In [12]:
# Ordered category scales for the ordinal features, listed worst -> best.
# Every scale starts with the 'missing' placeholder, so an absent value sits
# at the bottom of the scale (position 0) rather than at a mid-scale value.
# NOTE(review): the placeholder is matched as an exact, case-sensitive string,
# so whatever fills NaNs before encoding must produce exactly 'missing'.
_NA = 'missing'
_PO_TO_EX = [_NA, 'Po', 'Fa', 'TA', 'Gd', 'Ex']
_FA_TO_EX = [_NA, 'Fa', 'TA', 'Gd', 'Ex']
_BSMT_FINISH = [_NA, 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']

# Key order is significant: the list of scales is derived from this dict.
# Each value is its own list copy so no two features share one list object.
feat_ordinal_dict = {
    "BsmtCond": [_NA, 'Po', 'Fa', 'TA', 'Gd'],
    "BsmtExposure": [_NA, 'No', 'Mn', 'Av', 'Gd'],
    "BsmtFinType1": list(_BSMT_FINISH),
    "BsmtFinType2": list(_BSMT_FINISH),
    "BsmtQual": list(_FA_TO_EX),
    "Electrical": [_NA, 'Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],
    "ExterCond": list(_PO_TO_EX),
    "ExterQual": list(_FA_TO_EX),
    "Fence": [_NA, 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
    "FireplaceQu": list(_PO_TO_EX),
    "Functional": [_NA, 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    "GarageCond": list(_PO_TO_EX),
    "GarageFinish": [_NA, 'Unf', 'RFn', 'Fin'],
    "GarageQual": list(_PO_TO_EX),
    "HeatingQC": list(_PO_TO_EX),
    "KitchenQual": list(_FA_TO_EX),
    "LandContour": [_NA, 'Low', 'Bnk', 'HLS', 'Lvl'],
    "LandSlope": [_NA, 'Sev', 'Mod', 'Gtl'],
    "LotShape": [_NA, 'IR3', 'IR2', 'IR1', 'Reg'],
    "PavedDrive": [_NA, 'N', 'P', 'Y'],
    "PoolQC": [_NA, 'Fa', 'Gd', 'Ex'],
}

# Categorical Features

In [13]:
# Categorical (object-dtype) columns: no scaling needed, but they must be encoded.

X_train_Obj = train_df.select_dtypes(include='object')
# The test-side subset is taken from test_df (not train_df) so it really is
# the held-out data rather than a second copy of the training columns.
X_test_obj = test_df.select_dtypes(include='object')

In [14]:
# Object columns that have a natural worst -> best order (ordinal-encoded).
# The order of this list is the column order handed to the ordinal encoder.
cat_ord = [
    "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "BsmtQual",
    "Electrical",
    "ExterCond", "ExterQual",
    "Fence", "FireplaceQu", "Functional",
    "GarageCond", "GarageFinish", "GarageQual",
    "HeatingQC", "KitchenQual",
    "LandContour", "LandSlope", "LotShape",
    "PavedDrive", "PoolQC",
]

# Object columns with no inherent order (one-hot encoded).
cat_ohe = [
    "MSZoning", "Street", "Alley", "Utilities", "LotConfig",
    "Neighborhood", "Condition1", "Condition2",
    "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl",
    "Exterior1st", "Exterior2nd", "MasVnrType",
    "Foundation", "Heating", "CentralAir",
    "GarageType", "MiscFeature",
    "SaleType", "SaleCondition",
]

In [15]:
# One category scale per ordinal column, in exactly the order of `cat_ord`
# (the column order the ordinal encoder receives). Indexing by `cat_ord`
# instead of relying on the dict's insertion order keeps scales and columns
# aligned, and raises KeyError if a column has no scale defined.
cat = [feat_ordinal_dict[feat] for feat in cat_ord]

In [16]:
# Ordinal branch: fill NaNs with a placeholder, then map each category to its
# rank in the matching scale from `cat`.
# The placeholder must be spelled exactly 'missing' (lower case) because that
# is the first entry of every scale; with 'Missing' the imputed values match
# no category and are encoded as the unknown value -1, which makes them
# indistinguishable from genuinely unseen categories.
pipe_cat_ord = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(categories = cat, handle_unknown="use_encoded_value", unknown_value= -1)),
])

In [17]:
# One-hot branch: NaNs are first replaced by the constant 'Missing', which
# then becomes a category of its own. The encoder returns a dense array,
# does not raise on categories it did not see during fit, and drops one
# column for two-category features.
ohe_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)),
]
pipe_cat_ohe = Pipeline(steps=ohe_steps)

In [18]:
# Route each group of object columns to its own encoder branch and return
# the combined result as a pandas DataFrame.
categorical_branches = [
    ('pipe_ord', pipe_cat_ord, cat_ord),
    ('pipe_ohe', pipe_cat_ohe, cat_ohe),
]
pipe_cat = ColumnTransformer(transformers=categorical_branches).set_output(transform='pandas')

In [19]:
# Display the ordinal-encoding pipeline to inspect its steps.
pipe_cat_ord


In [20]:
# Display the one-hot-encoding pipeline to inspect its steps.
pipe_cat_ohe

In [21]:
# Display the combined categorical transformer (ordinal + one-hot branches).
pipe_cat

In [22]:
# Display the numeric pipeline (imputer + scaler).
pipe_num

# Preprocessor

In [23]:
# Top-level preprocessor: object-dtype columns go through the categorical
# transformer, int64/float64 columns through the numeric pipeline.
object_columns = make_column_selector(dtype_include='object')
numeric_columns = make_column_selector(dtype_include=('int64','float64'))
preprocessor = ColumnTransformer(transformers=[
    ('pipe_cat', pipe_cat, object_columns),
    ('pipe_num', pipe_num, numeric_columns),
]).set_output(transform='pandas')
preprocessor

In [24]:
# fit_transform (not fit) so that `preprocessed_data` actually holds the
# transformed training features; fit() alone returns the fitted transformer
# object, not data.
preprocessed_data = preprocessor.fit_transform(train_df, y_train)

# Modeling

In [25]:
def model_regressor(preprocessor, regression_model):
    """Build a Pipeline that preprocesses the features and then fits a regressor.

    Parameters
    ----------
    preprocessor : transformer
        Feature-preprocessing transformer. It is cloned, so every pipeline
        built here owns an independent copy and fitting one pipeline never
        refits the preprocessor used by another.
    regression_model : estimator
        Regressor placed as the final "model" step.

    Returns
    -------
    Pipeline
        Two-step pipeline with steps 'preprocessor' and 'model', configured
        to output pandas objects from its transform steps.
    """
    pipe = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ("model", regression_model)
    ]).set_output(transform='pandas')
    return pipe

In [26]:
# Random forest baseline. A fixed random_state makes the fitted forest (and
# therefore its predictions) reproducible across kernel restarts.
Model_RFR = model_regressor(preprocessor, RandomForestRegressor(random_state=42))
Model_RFR.fit(train_df, y_train)


In [27]:
# Gradient boosting model. A fixed random_state keeps the fit reproducible
# across kernel restarts.
Model_GB = model_regressor(preprocessor, GradientBoostingRegressor(random_state=42))
Model_GB.fit(train_df,y_train)

In [28]:
# Gradient-boosting predictions for the test set.
z_pred = Model_GB.predict(test_df)



In [29]:
# XGBoost model with default hyperparameters, fitted on the full training set.
model_XG = model_regressor(
    preprocessor=preprocessor,
    regression_model=XGBRegressor(),
)
model_XG.fit(train_df, y_train)

In [30]:
# XGBoost predictions for the test set; these are the values written into
# the 'SalePrice' column of the submission frame.
AA_pred = model_XG.predict(test_df)



# Predictions

In [31]:
# Random-forest predictions for the test set (the submission frame is built
# from AA_pred, not from these).
y_pred = Model_RFR.predict(test_df)



In [32]:
# Attach the XGBoost predictions as a new 'SalePrice' column on a copy of the
# test frame (assign returns a new DataFrame; test_df itself is untouched).
test1_df = test_df.assign(SalePrice=AA_pred)
test1_df

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,6,2010,WD,Normal,128602.593750
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Gar2,12500,6,2010,WD,Normal,153421.968750
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,,0,3,2010,WD,Normal,181518.312500
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,6,2010,WD,Normal,187469.000000
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,0,,,,0,1,2010,WD,Normal,180294.265625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,6,2006,WD,Normal,80721.609375
2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2006,WD,Abnorml,81369.117188
2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,9,2006,WD,Abnorml,172874.593750
2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,120361.656250


In [33]:
# Keep only the prediction column, then turn the index into a regular column
# so the result has one identifier column plus 'SalePrice'.
sale_price_only = test1_df.loc[:, ['SalePrice']]
output = sale_price_only.reset_index()
output

Unnamed: 0,Id,SalePrice
0,1461,128602.593750
1,1462,153421.968750
2,1463,181518.312500
3,1464,187469.000000
4,1465,180294.265625
...,...,...
1454,2915,80721.609375
1455,2916,81369.117188
1456,2917,172874.593750
1457,2918,120361.656250


In [35]:
# Write the submission file; index=False leaves the positional row index out.
output.to_csv(path_or_buf='XGboost_Submission.csv', index=False)