In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from category_encoders.target_encoder import TargetEncoder

import warnings

# Suppress every UserWarning-category warning for the rest of the session.
# NOTE(review): this is a blanket filter (no message/module restriction), so it
# also hides UserWarnings unrelated to the modelling code — consider narrowing it.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Read the training CSV and discard the "Id" column in one chained step.
train_data = pd.read_csv("train.csv").drop(columns="Id")

In [3]:
# Print a heading followed by the per-column count of null entries.
print("Missing Values:", train_data.isna().sum(), sep="\n")

Missing Values:
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64


In [4]:
# Print a heading (preceded by a blank line) and the first five rows.
print("\nDataset Preview:", train_data.head(5), sep="\n")


Dataset Preview:
   MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0          60       RL         65.0     8450   Pave   NaN      Reg   
1          20       RL         80.0     9600   Pave   NaN      Reg   
2          60       RL         68.0    11250   Pave   NaN      IR1   
3          70       RL         60.0     9550   Pave   NaN      IR1   
4          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities LotConfig  ... PoolArea PoolQC Fence MiscFeature  \
0         Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
1         Lvl    AllPub       FR2  ...        0    NaN   NaN         NaN   
2         Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
3         Lvl    AllPub    Corner  ...        0    NaN   NaN         NaN   
4         Lvl    AllPub       FR2  ...        0    NaN   NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  
0       0      2    2008        WD       

In [5]:
# Feature columns, grouped by how they are treated downstream.
# The order inside each list is kept as-is: the lists are concatenated to
# select (and therefore order) the columns of the modelling frame.

# Categorical columns.
categorical_features = [
    "LotShape", "Street", "ExterCond",
    "OverallCond", "OverallQual",
    "Condition1", "Condition2",
    "Functional", "MSZoning", "FireplaceQu",
]

# Column(s) handled by the target encoder.
target_encoded_features = ["Neighborhood"]

# Numeric columns.
numerical_features = [
    "BedroomAbvGr", "BsmtFinSF1", "BsmtFullBath",
    "EnclosedPorch", "GarageArea", "LotArea",
    "1stFlrSF", "2ndFlrSF", "TotalBsmtSF",
]

In [6]:
# Assemble the modelling frame from the three feature groups, in that order.
selected_columns = [
    *categorical_features,
    *target_encoded_features,
    *numerical_features,
]
mixed_features_df = train_data.loc[:, selected_columns]

# The regression target is the natural log of the sale price.
target = train_data["SalePrice"].pipe(np.log)

In [7]:
# Hold out 20% of the rows; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 0}
X_train, X_test, y_train, y_test = train_test_split(
    mixed_features_df, target, **split_options
)

In [8]:
# Build (but do not fit yet) the column-wise preprocessor:
#   numerical_features      -> StandardScaler
#   categorical_features    -> OneHotEncoder, configured with handle_unknown="ignore"
#   target_encoded_features -> TargetEncoder, configured with handle_unknown="value"
# remainder="passthrough" leaves any column not listed above untouched.
numeric_scaler = StandardScaler()
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
neighborhood_encoder = TargetEncoder(handle_unknown="value")

preprocessor = make_column_transformer(
    (numeric_scaler, numerical_features),
    (one_hot_encoder, categorical_features),
    (neighborhood_encoder, target_encoded_features),
    remainder="passthrough",
)

In [9]:
# Candidate regressors keyed by display name, flattened into the
# (name, estimator) pairs that the comparison loop iterates over, plus the
# "best so far" trackers that loop updates.
best_model, best_score = None, -float("inf")
models = list(
    {
        "Linear Regression": LinearRegression(),
        "Ridge Regression": Ridge(alpha=3.562),
        "Lasso Regression": Lasso(alpha=0.000672),
        "ElasticNet Regression": ElasticNet(alpha=0.002807, l1_ratio=0.1),
    }.items()
)

In [18]:
# Compare every candidate with 10-fold cross-validation on the training split
# and remember the one with the highest mean score.
#
# The running best is (re)initialised in this cell so that it produces the same
# answer however many times, and in whatever order, it is executed. Without
# this, a best_score left in the kernel by an earlier run can prevent any
# candidate from being selected on a later run.
best_model = None
best_score = float("-inf")

for name, model in models:
    # Preprocessing lives inside the pipeline, so it is re-fitted on each
    # training fold rather than on the full training split.
    pipe = make_pipeline(preprocessor, model)
    # error_score="raise" makes a failed fit raise instead of being scored.
    scores = cross_val_score(pipe, X_train, y_train, cv=10, error_score="raise")
    mean_score = np.mean(scores)
    print(f"{name} Scores:", scores)
    print(f"{name} Mean Score:", mean_score)

    # Strict ">" keeps the earliest candidate when two means tie.
    if float(mean_score) > best_score:
        best_score = float(mean_score)
        best_model = model

print(f"\nBest Model: {best_model.__class__.__name__} with Mean Score: {best_score}")

Linear Regression Scores: [0.9167107  0.89893936 0.7872592  0.88105245 0.88733867 0.85202228
 0.8803758  0.927493   0.89955813 0.90938007]
Linear Regression Mean Score: 0.8840129656120099
Ridge Regression Scores: [0.91568227 0.89859086 0.79240329 0.88003308 0.88815803 0.85852922
 0.87608051 0.91722993 0.90157594 0.91017817]
Ridge Regression Mean Score: 0.8838461294908881
Lasso Regression Scores: [0.91888529 0.89925682 0.79537391 0.87226263 0.88746252 0.85693878
 0.87215593 0.91154178 0.90717692 0.90532385]
Lasso Regression Mean Score: 0.8826378434012362
ElasticNet Regression Scores: [0.91559657 0.89898349 0.7946676  0.87733793 0.88640488 0.85805726
 0.87446389 0.91322742 0.90299924 0.90644796]
ElasticNet Regression Mean Score: 0.8828186250865662

Best Model: LinearRegression with Mean Score: 0.8840129656120099


In [11]:
# Fit the selected model, together with the preprocessor, on the training split.
best_model_pipe = make_pipeline(preprocessor, best_model)
best_model_pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split, which is otherwise never
# used, to get a check that is independent of the cross-validation folds.
# For a regressor pipeline .score() is R^2; here it is on the log-price target.
holdout_r2 = best_model_pipe.score(X_test, y_test)
print(f"Holdout R^2 (log SalePrice): {holdout_r2:.4f}")

# Keep the fitted pipeline as the cell's displayed result.
best_model_pipe

In [12]:
# Example house to price: one value for every column the pipeline was trained
# on, listed in the same order as the training frame's columns.
user_input_features = {
    # categorical columns
    "LotShape": "Reg", "Street": "Pave", "ExterCond": "TA",
    "OverallCond": 6, "OverallQual": 6,
    "Condition1": "Norm", "Condition2": "Norm",
    "Functional": "Mod", "MSZoning": "RH", "FireplaceQu": "TA",
    # target-encoded column
    "Neighborhood": "Blueste",
    # numeric columns
    "BedroomAbvGr": 2, "BsmtFinSF1": 468, "BsmtFullBath": 1,
    "EnclosedPorch": 0, "GarageArea": 730, "LotArea": 11622,
    "1stFlrSF": 896, "2ndFlrSF": 0, "TotalBsmtSF": 882,
}

# Interactive alternative (disabled): prompt for each column instead of using
# the hard-coded example above. input() returns str, so numeric fields would
# need casting before they are passed to the model.
# for feature in mixed_features_df.columns:
#     user_input_features[feature] = input(f"Enter value for {feature}: ")

In [13]:
# Echo the example input, one "name: value" line per feature, under a heading.
print("\nUser Input:")
for feature in user_input_features:
    value = user_input_features[feature]
    print(f"{feature}: {value}")


User Input:
LotShape: Reg
Street: Pave
ExterCond: TA
OverallCond: 6
OverallQual: 6
Condition1: Norm
Condition2: Norm
Functional: Mod
MSZoning: RH
FireplaceQu: TA
Neighborhood: Blueste
BedroomAbvGr: 2
BsmtFinSF1: 468
BsmtFullBath: 1
EnclosedPorch: 0
GarageArea: 730
LotArea: 11622
1stFlrSF: 896
2ndFlrSF: 0
TotalBsmtSF: 882


In [14]:
# Turn the input dict into a single-row frame: each scalar is wrapped in a
# one-element list so it becomes a column, and the row gets index 0.
user_input_df = pd.DataFrame(
    {column: [entry] for column, entry in user_input_features.items()},
    index=range(1),
)

In [15]:
# Display the one-row input frame to confirm its columns and values before predicting.
user_input_df

Unnamed: 0,LotShape,Street,ExterCond,OverallCond,OverallQual,Condition1,Condition2,Functional,MSZoning,FireplaceQu,Neighborhood,BedroomAbvGr,BsmtFinSF1,BsmtFullBath,EnclosedPorch,GarageArea,LotArea,1stFlrSF,2ndFlrSF,TotalBsmtSF
0,Reg,Pave,TA,6,6,Norm,Norm,Mod,RH,TA,Blueste,2,468,1,0,730,11622,896,0,882


In [16]:
# Make predictions using the best model.
# The pipeline was fitted against np.log(SalePrice), so its output is a log price.
predicted_log_price = best_model_pipe.predict(user_input_df)

# Convert the predicted log price back to the original scale
# (np.exp is the inverse of the np.log applied to the target).
predicted_price = np.exp(predicted_log_price)

In [17]:
# Index 0 picks the prediction for the single input row; the value is formatted
# with thousands separators and two decimal places.
print(f"\nPredicted House Price: ${predicted_price[0]:,.2f}")


Predicted House Price: $126,980.04
