In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
# Read the housing dataset from a CSV path relative to the working directory
data = pd.read_csv("HousePricePrediction.csv")

# Report (rows, columns), then preview the first five records
print(data.shape)
data.head(n=5)

(2919, 13)


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0


In [3]:
# Labelled rows (SalePrice present) become the training set
train_df = data.loc[data["SalePrice"].notna()].copy()

# Unlabelled rows (SalePrice missing) are kept aside for later prediction
test_df = data.loc[data["SalePrice"].isna()].copy()

print("Train rows:", train_df.shape[0])
print("Test rows :", test_df.shape[0])

Train rows: 1460
Test rows : 1459


In [4]:
# Features exclude the target (SalePrice) and the row identifier (Id),
# which carries no predictive signal
X = train_df.drop(columns=["SalePrice", "Id"])
y = train_df["SalePrice"]

# Partition the feature columns into two complementary groups so that every
# column is assigned to exactly one of them. Selecting "number" (instead of
# only int64/float64) and treating everything else as categorical (instead of
# only object) avoids silently losing columns with other dtypes: the
# ColumnTransformer built from these lists does not set `remainder`, so any
# column missing from both lists would be dropped from the model.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

print("Numeric features:", numeric_cols)
print("Categorical features:", categorical_cols[:5], "...")

Numeric features: ['MSSubClass', 'LotArea', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF2', 'TotalBsmtSF']
Categorical features: ['MSZoning', 'LotConfig', 'BldgType', 'Exterior1st'] ...


In [5]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible across runs
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Numeric columns: fill gaps with the column median, then scale to unit
# variance (with_mean=False, so the values are not centred)
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own transformer (numeric first,
# categorical second)
preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [7]:
# Chain the preprocessing stage and a linear regression into one estimator
model = Pipeline([
    ("preprocess", preprocess),
    ("regressor", LinearRegression()),
])

# Fit on the training split only; the validation split is not used here
model.fit(X_train, y_train)

In [11]:
# Predict on the held-out validation split
y_pred = model.predict(X_valid)

# R² and MAE come directly from sklearn.metrics; RMSE is derived as the
# square root of the mean squared error
r2 = metrics.r2_score(y_valid, y_pred)
mae = metrics.mean_absolute_error(y_valid, y_pred)
mse = metrics.mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)

print("R²   :", round(r2, 3))
print("MAE  :", round(mae, 2))
print("RMSE :", round(rmse, 2))

R²   : 0.62
MAE  : 34122.82
RMSE : 54022.75


In [12]:
# Retrain the same pipeline object on every labelled row. Note that this
# overwrites the fit obtained on the training split alone.
model.fit(X, y)

# Only build a submission when there are rows lacking SalePrice
if len(test_df) > 0:
    X_test = test_df.drop(columns=["SalePrice", "Id"])
    preds = model.predict(X_test)

    # Pair each Id with its predicted price
    submission = pd.DataFrame(
        {"Id": test_df["Id"].values, "SalePrice_Pred": preds}
    )

    # Write the predictions without the DataFrame index, then preview them
    submission.to_csv("house_price_predictions_linear.csv", index=False)
    print(submission.head())

     Id  SalePrice_Pred
0  1460   130011.415082
1  1461   173369.527101
2  1462   218763.332367
3  1463   220566.822011
4  1464   169047.388373


In [13]:
# Recover the one-hot column names from the fitted encoder inside the pipeline
ohe = model["preprocess"].named_transformers_["cat"]["onehot"]
cat_names = ohe.get_feature_names_out(categorical_cols)

# Numeric names first, then the one-hot names: the same order in which the
# ColumnTransformer lists its "num" and "cat" transformers
feature_names = np.concatenate([numeric_cols, cat_names])

# One coefficient per transformed feature, sorted from largest to smallest.
# Numeric inputs were scaled to unit variance while one-hot inputs are 0/1
# indicators, so magnitudes are not directly comparable across the two groups.
coefs = model["regressor"].coef_.ravel()
coef_df = pd.DataFrame({"feature": feature_names, "coef": coefs})
coef_df = coef_df.sort_values(by="coef", ascending=False)

print("Top features pushing price up:")
print(coef_df.head(10))

print("\nTop features pushing price down:")
print(coef_df.tail(10))

Top features pushing price up:
                feature          coef
17        BldgType_1Fam  83202.121951
32    Exterior1st_Stone  57889.966574
23  Exterior1st_AsphShn  40173.792207
6           TotalBsmtSF  37954.852869
0            MSSubClass  36347.767609
3             YearBuilt  22674.410641
25  Exterior1st_BrkFace  18868.789988
19      BldgType_Duplex  17069.362197
4          YearRemodAdd  15005.292520
9           MSZoning_RH  11261.347247

Top features pushing price down:
                feature          coef
30  Exterior1st_MetalSd -11382.126012
34  Exterior1st_VinylSd -13014.778231
26   Exterior1st_CBlock -15394.974304
21      BldgType_TwnhsE -20786.314072
31  Exterior1st_Plywood -22676.351433
7      MSZoning_C (all) -24084.440887
28  Exterior1st_HdBoard -25458.176879
24  Exterior1st_BrkComm -27675.905820
18      BldgType_2fmCon -37365.847256
20       BldgType_Twnhs -42119.322819
