In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list the Kaggle input directory, printing the full path of each file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# -----------------------
# 0. Package load
# -----------------------
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
# Install a blanket "ignore" filter so no warning of any category is shown in later cell output.
# NOTE(review): this would also hide library deprecation and solver-convergence warnings raised
# while fitting the models below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# -----------------------
# 1. Load Data and Display Histograms
# -----------------------
# Read the competition's train and test CSVs from the read-only Kaggle input directory.
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Keep the test Ids aside: the submission file needs them, the models do not.
test_ids = test["Id"]

# "Id" is only a row label, so remove it from both frames before modelling.
train = train.drop(columns="Id")
test = test.drop(columns="Id")

def _show_histogram(values, title, xlabel, color):
    """Draw a 40-bin histogram of ``values`` on a new 8x5 figure and display it.

    Parameters
    ----------
    values : array-like
        Data to bin.
    title, xlabel : str
        Figure title and x-axis label.
    color : str
        Fill colour of the bars (edges are always black).
    """
    plt.figure(figsize=(8, 5))
    plt.hist(values, bins=40, color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()


# Distribution of the raw, untransformed target.
_show_histogram(train["SalePrice"], "Original SalePrice Distribution", "SalePrice", 'lightgreen')

# Replace the target with log1p(SalePrice) (intended to reduce the right-tail skew) and plot it again.
# From this line on, train["SalePrice"] holds log-scale values.
train["SalePrice"] = np.log1p(train["SalePrice"])
_show_histogram(train["SalePrice"], "Log-Transformed SalePrice Distribution", "log(SalePrice)", 'skyblue')



In [None]:
# -----------------------
# 2. Preprocessing Setup
# -----------------------
# Pull the target off the training frame; everything that remains is a predictor.
y = train["SalePrice"]
train_features = train.drop(columns="SalePrice")

# Stack the training predictors on top of the test rows so that column typing is
# decided once, over every row the models will ever see. Row order is preserved:
# training rows first, then test rows.
all_data = pd.concat([train_features, test], axis=0)

# Partition the columns by dtype: 64-bit ints/floats are numeric, object columns are categorical.
numeric_cols = list(all_data.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(all_data.select_dtypes(include=["object"]).columns)

# Numeric features: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Dense one-hot encoder that drops the first level of every feature (to avoid collinearity)
# and ignores categories not seen during fit.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4 removed the old
# spelling, so try the current name first and fall back only on releases that predate it.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", drop="first", sparse=False)

# Categorical features: fill missing values with the most frequent level, then one-hot encode.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_encoder)
])

# Route each column group through its own transformer; the outputs are concatenated
# in the order listed here (numeric block first, categorical block second).
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])



In [None]:
# -----------------------
# 3. Model Pipelines
# -----------------------
# Each pipeline receives its own unfitted copy of `preprocessor`. Pipeline.fit fits its
# steps in place, so sharing a single instance would let fitting one model silently
# overwrite the preprocessing state the other model relies on at predict time.

# Model 1: Lasso Regression with Variable Selection after Polynomial (interaction) expansion.
# The PolynomialFeatures step is exposed as "poly" so its feature names can be retrieved later.
model1_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),  # includes interaction and squared terms
    ("lasso", LassoCV(cv=5, random_state=1234, max_iter=10000))
])

# Model 2: PCA for dimension reduction after polynomial expansion, followed by Linear Regression.
model2_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("pca", PCA(n_components=100, random_state=1234)),
    ("linreg", LinearRegression())
])


In [None]:
# -----------------------
# 4. Split Data for Evaluation
# -----------------------
# The first len(train_features) rows of all_data are the training rows; the test rows follow.
# Hold out 20% of the training rows for validation, with a fixed seed for reproducibility.
X_all = all_data.iloc[:len(train_features)]
X_train, X_val, y_train, y_val = train_test_split(
    X_all,
    y,
    test_size=0.2,
    random_state=1234,
)


In [None]:
# -----------------------
# 5. Fit and Evaluate Model 1 (Lasso with Polynomial Features)
# -----------------------
# Train Model 1 on the training split only.
model1_pipeline.fit(X_train, y_train)

# Score the held-out split. The target was log1p-transformed, so both the truth and the
# predictions are mapped back with np.expm1 first: this RMSE is in original price units.
y_pred_val_m1 = model1_pipeline.predict(X_val)
val_mse_m1 = mean_squared_error(np.expm1(y_val), np.expm1(y_pred_val_m1))
rmse_model1 = np.sqrt(val_mse_m1)
print(f"Model 1 (Lasso) RMSE on validation set: {rmse_model1:.4f}")

# --- Assumption Checks for Model 1 ---
# In-sample residuals, left on the log scale the model was fitted on.
y_pred_train_m1 = model1_pipeline.predict(X_train)
residuals = y_train - y_pred_train_m1

# Histogram of the residuals.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(residuals, bins=40, color='violet', edgecolor='black')
ax.set_title("Model 1 Residuals Histogram")
ax.set_xlabel("Residual")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

# Normal Q-Q plot of the same residuals; probplot draws onto the Axes it is given,
# and the title it sets by default is replaced afterwards.
fig, ax = plt.subplots(figsize=(8, 5))
stats.probplot(residuals, dist="norm", plot=ax)
ax.set_title("Model 1 Q-Q Plot of Residuals")
plt.show()

# 5-fold cross-validated RMSE of Model 1 over all training rows. The scorer returns negated
# MSE per fold, so flip the sign, take the root per fold, then average the folds.
# y is the log1p-transformed target, so this figure is on the log scale and is not directly
# comparable to the validation RMSE printed earlier, which is in price units.
cv_mse_per_fold = -cross_val_score(model1_pipeline, X_all, y, scoring="neg_mean_squared_error", cv=5)
cv_rmse_model1 = np.mean(np.sqrt(cv_mse_per_fold))
# The cross-validation refits the whole pipeline five times; report the result rather than discard it.
print(f"Model 1 Cross-Validated RMSE (log scale): {cv_rmse_model1:.4f}")

# --- Model 1 Coefficients (Non-zero) ---
# Feature names as they leave the preprocessor, then as they leave the polynomial expansion;
# the latter line up index-for-index with the Lasso coefficient vector.
preproc_feature_names = model1_pipeline.named_steps["preprocessor"].get_feature_names_out()
poly_feature_names = model1_pipeline.named_steps["poly"].get_feature_names_out(preproc_feature_names)

lasso_model = model1_pipeline.named_steps["lasso"]
nonzero_indices = np.flatnonzero(lasso_model.coef_)
print("\n--- Nonzero Coefficients from Model 1 (Lasso) ---")
for idx in nonzero_indices:
    print(f"{poly_feature_names[idx]}: {lasso_model.coef_[idx]:.4f}")
# The intercept is a single model-level value: print it once, after the loop,
# instead of repeating it under every coefficient.
print(f"Intercept: {lasso_model.intercept_:.4f}")


In [None]:
# -----------------------
# 6. Fit and Evaluate Model 2 (PCA + Linear Regression)
# -----------------------
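# NOTE: X_all contains the X_val rows, so fitting on it and then scoring on X_val (next lines)
# would give an optimistic validation RMSE; fit on (X_train, y_train) if this cell is re-enabled.
#model2_pipeline.fit(X_all, y)  # fit on all training data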
#y_pred_val_m2 = model2_pipeline.predict(X_val)
#rmse_model2 = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(y_pred_val_m2)))
#print(f"\nModel 2 (PCA + Linear Regression) RMSE on validation set: {rmse_model2:.4f}")

# Print PCA summary details: number of components and some regression coefficients on PCA components.
#pca_model = model2_pipeline.named_steps["pca"]
#print(f"Model 2 - Number of PCA Components retained: {pca_model.n_components_}")
#linreg_model = model2_pipeline.named_steps["linreg"]
#print("Model 2 - Linear Regression Coefficients on PCA components:")
#print(linreg_model.coef_)

In [None]:
# -----------------------
# 7. Final Predictions and Kaggle Submission Files
# -----------------------
# Prepare the test set: the rows of all_data that follow the training rows.
X_test_final = all_data.iloc[train_features.shape[0]:, :]

# Predict with Model 1 as fitted earlier (on the training split), map the log-scale
# predictions back to prices with np.expm1, and pair them with the saved test Ids.
model1_test_preds = model1_pipeline.predict(X_test_final)
submission_model1 = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": np.expm1(model1_test_preds)
})

# Write the submission and report the file name that was actually written.
submission_model1.to_csv("submission.csv", index=False)
print("\nKaggle submission file 'submission.csv' created for Model 1.")


In [None]:
# Model 2 predictions on the test set and export the submission file.
#model2_test_preds = model2_pipeline.predict(X_test_final)
#submission_model2 = pd.DataFrame({
#    "Id": test_ids,
#    "SalePrice": np.expm1(model2_test_preds)
#})
#submission_model2.to_csv("submission.csv", index=False)
#print("Kaggle submission file 'submission_model2.csv' created for Model 2.")