<a href="https://colab.research.google.com/github/LizardBlizzard/PayloadsAllTheThings/blob/master/house_prices_linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from rich import print  # not mandatory
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import FunctionTransformer, Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [2]:
# load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Explore test data

In [3]:
print(f'Train missing values:')
train_missing = train_df[
    train_df.columns[(train_df.isnull().any())]
].isnull().sum() * 100 / train_df.shape[0]
print(train_missing.sort_values(ascending=False))

In [4]:
print(f'Test missing values:')
test_missing = test_df[
    test_df.columns[(test_df.isnull().any())]
].isnull().sum() * 100 / test_df.shape[0]
print(test_missing.sort_values(ascending=False))

In [5]:
print('Columns with missing values in test not missing in train:')
print(test_missing[test_missing.index.difference(train_missing.index)].sort_values(ascending=False))

In [6]:
print('Columns with missing values in train not missing in test:')
print(train_missing[train_missing.index.difference(test_missing.index)].sort_values(ascending=False))

# Linear Regression models

### Define static column type mapping

In [7]:
CATEGORICAL_COLUMNS = [
    "MSZoning","Street","Alley","LotShape","LandContour","Utilities","LotConfig",
    "Neighborhood","Condition1","Condition2","BldgType","HouseStyle","RoofStyle",
    "RoofMatl","Exterior1st","Exterior2nd","MasVnrType","Foundation","Heating",
    "CentralAir","Electrical","GarageType","PavedDrive","MiscFeature","SaleType",
    "SaleCondition"
]

ORDINAL_COLUMNS = [
    "MSSubClass","LandSlope","OverallQual","OverallCond","ExterQual","ExterCond",
    "BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","HeatingQC",
    "KitchenQual","Functional","FireplaceQu","GarageFinish","GarageQual","GarageCond",
    "PoolQC","Fence"
]

NUMERICAL_COLUMNS = [
    "LotFrontage","LotArea","YearBuilt","YearRemodAdd","MasVnrArea","BsmtFinSF1",
    "BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","1stFlrSF","2ndFlrSF","LowQualFinSF",
    "GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr",
    "KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageYrBlt","GarageCars","GarageArea",
    "WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch","PoolArea",
    "MiscVal","MoSold","YrSold"
]

### Split train dataset to train and test for evaluation

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=['SalePrice']),
    train_df['SalePrice'],
    test_size=0.33,
    random_state=42
)

### Simple linear regression on numerical values

In [54]:
numerical_pipeline = make_pipeline(SimpleImputer(), StandardScaler())

transformer = make_column_transformer(
    (numerical_pipeline, NUMERICAL_COLUMNS),
    remainder='drop'
)

pipeline = make_pipeline(transformer, LinearRegression())
pipeline.fit(X=X_train, y=y_train)
predictions = pipeline.predict(X=X_test)

print(f'Mean absolute error: {mean_absolute_error(y_pred=predictions, y_true=y_test)}')
print(f'Mean squared error: {mean_squared_error(y_pred=predictions, y_true=y_test)}')
print("Coefficient of determination: %.2f" % r2_score(y_test, predictions))
# print("Coefficients: \n", pipeline['linearregression'].coef_)

In [56]:
scores = cross_val_score(pipeline, X=X_train, y=y_train)

print(scores)
print(f"{scores.mean():.2f} accuracy with a standard deviation of {scores.std():.2f}")

A model based on numerical values only doesn't prove very stable or accurate.

#### Regression on categorical and ordinal features

In [57]:
categorical_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown='infrequent_if_exist'))
ordinal_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
numerical_pipeline = make_pipeline(SimpleImputer(), StandardScaler())

transformer = make_column_transformer(
    (categorical_pipeline, CATEGORICAL_COLUMNS),
    (ordinal_pipeline, ORDINAL_COLUMNS),
    # (numerical_pipeline, NUMERICAL_COLUMNS),
    remainder='drop'
)

pipeline = make_pipeline(transformer, LinearRegression())

scores = cross_val_score(pipeline, X=X_train, y=y_train)

print(scores)
print(f"{scores.mean():.2f} accuracy with a standard deviation of {scores.std():.2f}")

Suprisingly better with very little effort.

## Transform ordinal

In [100]:
CATEGORICAL_COLUMNS = [
    "MSSubClass","MSZoning","Street","Alley","LotShape","LandContour",
    "Utilities","LotConfig","Neighborhood","Condition1","Condition2","BldgType",
    "HouseStyle","RoofStyle","RoofMatl","Exterior1st","Exterior2nd",
    "MasVnrType","Foundation","Heating","CentralAir","Electrical","GarageType",
    "PavedDrive","MiscFeature","SaleType","SaleCondition"
]

ORDINAL_COLUMNS = [
    "LandSlope","OverallQual","OverallCond","ExterQual","ExterCond","BsmtQual",
    "BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","HeatingQC",
    "KitchenQual","Functional","FireplaceQu","GarageFinish","GarageQual",
    "GarageCond","PoolQC","Fence"
]

NUMERICAL_COLUMNS = [
    "LotFrontage","LotArea","YearBuilt","YearRemodAdd","MasVnrArea",
    "BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","1stFlrSF","2ndFlrSF",
    "LowQualFinSF","GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath",
    "HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces",
    "GarageYrBlt","GarageCars","GarageArea","WoodDeckSF","OpenPorchSF",
    "EnclosedPorch","3SsnPorch","ScreenPorch","PoolArea","MiscVal","MoSold",
    "YrSold"
]

categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="infrequent_if_exist"),
)
numerical_pipeline = make_pipeline(SimpleImputer(), StandardScaler())

ordinal_transformers: list[tuple[Pipeline, list[str]]] = []
ordinal_transformers.append(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(
                categories=[["Po", "Fa", "TA", "Gd", "Ex"] for _ in range(4)],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        ["ExterQual", "ExterCond", "HeatingQC", "KitchenQual"],
    )
)

ordinal_transformers.append(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(
                categories=[["NA", "Po", "Fa", "TA", "Gd", "Ex"] for _ in range(5)],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        ["BsmtQual", "BsmtCond", "FireplaceQu", "GarageQual", "GarageCond"],
    )
)

ordinal_transformers.append(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(
                categories=[
                    ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"] for _ in range(2)
                ],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        ["BsmtFinType1", "BsmtFinType2"],
    )
)

ordinal_transformers.append(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(
                categories=[["NA", "No", "Mn", "Av", "Gd"]],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        ["BsmtExposure"],
    )
)

ordinal_transformers.append(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(
                categories=[
                    ["Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"]
                ],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        ["Functional"],
    )
)

ordinal_transformers.append(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(
                categories=[["NA", "Unf", "RFn", "Fin"]],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        ["GarageFinish"],
    )
)

ordinal_transformers.append(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(
                categories=[["NA", "Fa", "TA", "Gd", "Ex"]],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        ["PoolQC"],
    )
)

ordinal_transformers.append(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(
                categories=[["Sev", "Mod", "Gtl"]],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        ["LandSlope"],
    )
)

ordinal_transformers.append(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(
                categories=[["NA", "MnWw", "GdWo", "MnPrv", "GdPrv"]],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        ["Fence"],
    )
)

ordinal_transformers.append(("passthrough", ["OverallQual", "OverallCond"]))

transformer = make_column_transformer(
    *ordinal_transformers,
    (categorical_pipeline, CATEGORICAL_COLUMNS),
    # (numerical_pipeline, NUMERICAL_COLUMNS),
    # remainder="passthrough",
)

In [97]:
import itertools
cols = list(itertools.chain.from_iterable([t[1] for t in ordinal_transformers]))
unhandled_ordinal_columns = list(set(ORDINAL_COLUMNS).difference(set(cols)))
print(f'Unhandled ordinal columns:\n{unhandled_ordinal_columns}')

In [101]:
transformer.fit_transform(X=X_train, y=y_train)

<978x204 sparse matrix of type '<class 'numpy.float64'>'
	with 44069 stored elements in Compressed Sparse Row format>

In [102]:
pipeline = make_pipeline(transformer, LinearRegression())

scores = cross_val_score(pipeline, X=X_train, y=y_train)

print(scores)
print(f"{scores.mean():.2f} accuracy with a standard deviation of {scores.std():.2f}")