# Housing Prices Project (module 70)

## Imports and Settings

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer

from sklearn.datasets import make_regression
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFE, RFECV, SelectFromModel

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score, average_precision_score
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score, mean_squared_log_error

## Data Prep (source)

In [3]:
# Load the comma-separated training file.
data = pd.read_csv("kaggle_data/train.csv")

target = "SalePrice"         # <------------------------< INPUT of intended y

# Drop rows with a missing target; done without `inplace` so the step returns
# a fresh frame instead of mutating the one that was just loaded.
data = data.dropna(subset=[target])

# Separate the target (y) from the features (X).
# `pop` removes the target column from `data`, so `X` is `data` without it.
y = data.pop(target)
X = data

# Split data into train and test portions.
split = 0.2                 # <------------------------< INPUT split portion
seed = 777                  # <------------------------< INPUT of random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=seed)

## Preprocessor (upper reach)

In [4]:
# Split the training columns into numerical and categorical (non-numeric) ones.
# NOTE(review): if train.csv carries a numeric `Id` column like test.csv does,
# it lands in num_features and is fed to the model as a predictor — confirm
# that this is intended.
num_features = X_train.select_dtypes(include="number").columns
cat_columns = X_train.select_dtypes(exclude="number").columns

# Rankings shared by several ordinal columns. Every ranking ends with "NA",
# the placeholder the ordinal branch's imputer uses for missing values.
quality_scale = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
finish_type_scale = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', "NA"]

# Ordinal feature -> ranking of its values. Keeping name and ranking together
# means the category lists cannot drift out of step with the feature list.
ord_categories = {
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    'BsmtQual': quality_scale,
    'BsmtCond': quality_scale,
    'BsmtExposure': ['Gd', 'Av', 'Mn', 'No', "NA"],
    'BsmtFinType1': finish_type_scale,
    'KitchenQual': quality_scale,
    'LandSlope': ["Gtl", "Mod", "Sev", "NA"],
    'BsmtFinType2': finish_type_scale,
    'HeatingQC': quality_scale,
    'GarageFinish': ["Fin", "RFn", "Unf", "NA"],
    'GarageQual': quality_scale,
    'GarageCond': quality_scale,
}

# Fail loudly when an ordinal column is absent from the categorical columns.
# The previous positional lookup (get_indexer + iloc) would have received -1
# for such a column and silently selected the last categorical column instead.
missing_ord = [name for name in ord_categories if name not in cat_columns]
if missing_ord:
    raise KeyError(f"Ordinal features not found among the categorical columns: {missing_ord}")

# Ordinal branch: feature names and, in the same order, their rankings
ord_features = list(ord_categories)
ord_cats = [list(ord_categories[name]) for name in ord_features]

# Non-ordinal (plain categorical) branch: every remaining categorical column
cat_features = [name for name in cat_columns if name not in ord_categories]

In [5]:
# Numerical branch: mean-impute missing values, then min-max scale
num_branch = make_pipeline(
    SimpleImputer(strategy="mean"),
    MinMaxScaler(),
)

# Ordinal branch: fill gaps with the "NA" label, encode by the given rankings,
# then min-max scale the resulting codes
ord_imputer = SimpleImputer(strategy="constant", fill_value="NA")
ord_imputer.set_output(transform="pandas")
ord_encoder = OrdinalEncoder(categories=ord_cats)
ord_encoder.set_output(transform="pandas")
ord_branch = make_pipeline(ord_imputer, ord_encoder, MinMaxScaler())

# Categorical branch: fill gaps with the "NA" label, then one-hot encode
# (categories unseen during fit are ignored at transform time)
cat_imputer = SimpleImputer(strategy="constant", fill_value="NA")
cat_imputer.set_output(transform="pandas")
cat_branch = make_pipeline(
    cat_imputer,
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# Route each group of columns through its own branch
branches = [
    (num_branch, num_features),
    (ord_branch, ord_features),
    (cat_branch, cat_features),
]
preprocessor = make_column_transformer(*branches)

# Fit the preprocessor on the training data
preprocessor.fit(X_train)

## Estimators (downstream)

In [6]:
# Inputs for cross validation
scoring = "neg_mean_squared_log_error"    #<-----------------------< INPUT
cv_folds = 5                              #<-----------------------< INPUT
grid = {}                                 # empty grid: no hyper-parameters are varied

# Available estimators, keyed by the short label used in the score table.
# A name -> estimator mapping replaces the previous pair of parallel lists,
# whose lengths differed (zip silently dropped the unnamed RandomForest).
model_registry = {
    "DTR": DecisionTreeRegressor(random_state=seed),
    "LinReg": LinearRegression(),
    "SGDR": SGDRegressor(random_state=seed),
    "GBR": GradientBoostingRegressor(criterion="friedman_mse", random_state=seed),
    "RFR": RandomForestRegressor(criterion="friedman_mse", random_state=seed),
    "KNR": KNeighborsRegressor(),
}

# Available feature-selection steps, keyed the same way.
method_registry = {
    "Thresh": VarianceThreshold(threshold=0),
    "K-Best": SelectKBest(score_func=f_regression, k=20),
    "RFE": RFE(estimator=RandomForestRegressor(random_state=seed), n_features_to_select=50, step=0.05),
    "select-f": SelectFromModel(DecisionTreeRegressor(random_state=seed), threshold=None),
}

# Loop lists: choose which registry entries to run.
# An unknown label raises KeyError here instead of being silently skipped.
models = ["GBR"]             #<-----------------------< INPUT (keys of model_registry)
model_ops = [model_registry[name] for name in models]

methods = ["Thresh"]         #<-----------------------< INPUT (keys of method_registry)
method_ops = [method_registry[name] for name in methods]

In [7]:
# Fit one pipeline per (model, feature-selection method) pair and collect the
# held-out score of each in a table: rows = models, columns = methods.
score_records = {}   # method label -> {model label -> test score}
for model_name, estimator in zip(models, model_ops):
    for method_name, selector in zip(methods, method_ops):

        pipeline = make_pipeline(preprocessor, selector, estimator)
        # `scoring` is now actually handed to the search; it was declared as an
        # input but never used, so cross-validation fell back to the default.
        model = GridSearchCV(pipeline, param_grid=grid, cv=cv_folds, scoring=scoring)
        model.fit(X_train, y_train)

        # Report the refitted pipeline's own default score on the test split
        # (R² for regressors), i.e. the same number the table showed before.
        test_score = model.best_estimator_.score(X_test, y_test)
        score_records.setdefault(method_name, {})[model_name] = round(test_score, 4)

# Build the frame once instead of enlarging it cell by cell.
model_scores_df = pd.DataFrame(score_records)

# NOTE: `model` still refers to the last fitted search after the loop;
# the prediction cells further down rely on it.
model_scores_df

Unnamed: 0,Thresh
GBR,0.9496


In [8]:
# Display the last GridSearchCV fitted in the loop above; this is the object
# used for the Kaggle predictions below.
model

# Kaggle Challenge (mouth)

In [9]:
# Load the comma-separated Kaggle test file
test = pd.read_csv("kaggle_data/test.csv")

In [10]:
# Predict with the fitted model and store the result as a new column
predicted_prices = model.predict(test)
test["SalePrice"] = predicted_prices

In [11]:
# Show the first two rows, which now include the predicted SalePrice column.
test.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,124367.462271
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,159939.832789


In [12]:
# Keep only the two columns needed for the Kaggle upload
upload = test.loc[:, ["Id", "SalePrice"]]
upload.head(2)

Unnamed: 0,Id,SalePrice
0,1461,124367.462271
1,1462,159939.832789


In [13]:
# Write the submission file without the DataFrame index
submission_path = "kaggle_results/submission_5.csv"
upload.to_csv(submission_path, index=False)