In [None]:
# Print the full data dictionary so the meaning of every column can be looked up.
with open('data_description.txt', 'r') as description_file:
    content = description_file.read()
print(content)

In [None]:
# Load test.csv and preview its first rows to see the available columns and typical values.
import pandas as pd
test_data = pd.read_csv("test.csv")
test_data.head()

In [None]:
# Load train.csv and preview its first rows.
train_data = pd.read_csv("train.csv")
train_data.head()

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train_data.shape


The training set (`train_data`) has 1460 rows and 81 columns, including the `Id` column and the `SalePrice` target.

In [None]:
# Summarise every column: dtype and non-null count.
train_data.info()

The datatypes for the columns are as follows:
float64(3): 3 columns containing decimal numbers

int64(35): 35 columns containing whole numbers

object(43): 43 columns containing text/categorical data


In [None]:
# Count missing values per column (the displayed Series may be truncated for a frame this wide).
train_data.isnull().sum()

In [None]:
# List only the columns that actually contain missing values; the full
# per-column count from the previous command does not show every column.
missing_columns = train_data.isna().sum()
missing_column_name = missing_columns.loc[missing_columns.gt(0)]

print(f"Column with missing values: {missing_column_name}")


In [None]:
# For every column with at least one missing value, print its dtype and a
# sample of up to ten distinct values to help pick an imputation strategy.
has_missing = train_data.isnull().any()
missing_cols = train_data.columns[has_missing]

for col in missing_cols:
    column = train_data[col]
    print(f"\nColumn: {col}")
    print(f"Data type: {column.dtype}")
    print(f"Unique values sample: {column.unique()[:10]}")


In [None]:
# ---- Categorical columns filled with the literal label "None" ----
none_cols = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual",
    "GarageCond", "PoolQC", "Fence", "MiscFeature",
    "MasVnrType"
]

# A per-column fill mapping handles all listed columns in one call;
# columns that are absent from the frame are skipped.
train_data = train_data.fillna(
    {name: "None" for name in none_cols if name in train_data.columns}
)


# ---- Numeric columns filled with zero ----
zero_cols = [
    "GarageYrBlt", "MasVnrArea",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath",
    "GarageArea", "GarageCars"
]

train_data = train_data.fillna(
    {name: 0 for name in zero_cols if name in train_data.columns}
)


# ---- LotFrontage: neighbourhood median first, overall median as fallback ----
def _fill_with_group_median(values):
    """Fill the gaps of one neighbourhood's values with that neighbourhood's median."""
    return values.fillna(values.median())


train_data["LotFrontage"] = (
    train_data.groupby("Neighborhood")["LotFrontage"]
    .transform(_fill_with_group_median)
)

overall_lot_median = train_data["LotFrontage"].median()
train_data["LotFrontage"] = train_data["LotFrontage"].fillna(overall_lot_median)


# ---- Electrical: most frequent category ----
if "Electrical" in train_data.columns:
    most_common_electrical = train_data["Electrical"].mode()[0]
    train_data["Electrical"] = train_data["Electrical"].fillna(most_common_electrical)


# ---- Final safety net: numeric median, then categorical mode ----
numeric_medians = train_data.median(numeric_only=True)
train_data = train_data.fillna(numeric_medians)

for col in train_data.select_dtypes(include="object").columns:
    most_common_value = train_data[col].mode()[0]
    train_data[col] = train_data[col].fillna(most_common_value)


# Confirm that nothing is left to impute
print("Remaining missing values:", train_data.isnull().sum().sum())

In [None]:
# Derive aggregate size/age features and simple presence flags from the raw
# columns. Each lambda receives the frame as it stands when that column is
# computed; all inputs here are pre-existing columns.
train_data = train_data.assign(
    # Total square footage: basement plus both above-ground floors
    TotalSF=lambda d: d["TotalBsmtSF"] + d["1stFlrSF"] + d["2ndFlrSF"],
    # Total bathrooms, with half baths weighted 0.5
    TotalBath=lambda d: (
        d["FullBath"]
        + 0.5 * d["HalfBath"]
        + d["BsmtFullBath"]
        + 0.5 * d["BsmtHalfBath"]
    ),
    # Total porch area, including the wood deck
    TotalPorchSF=lambda d: (
        d["OpenPorchSF"]
        + d["EnclosedPorch"]
        + d["3SsnPorch"]
        + d["ScreenPorch"]
        + d["WoodDeckSF"]
    ),
    # Years between construction and sale
    HouseAge=lambda d: d["YrSold"] - d["YearBuilt"],
    # Years between the recorded remodel year and sale
    RemodelAge=lambda d: d["YrSold"] - d["YearRemodAdd"],
    # 1 when the remodel year differs from the build year
    Remodeled=lambda d: d["YearRemodAdd"].ne(d["YearBuilt"]).astype(int),
    # 1/0 presence flags derived from the corresponding area columns
    HasGarage=lambda d: d["GarageArea"].gt(0).astype(int),
    HasBasement=lambda d: d["TotalBsmtSF"].gt(0).astype(int),
    HasPool=lambda d: d["PoolArea"].gt(0).astype(int),
)

print("Feature engineering complete.")

In [None]:
# Interaction feature: overall quality rating multiplied by total square footage.
train_data["OverallQual_TotalSF"] = train_data["OverallQual"].mul(train_data["TotalSF"])

In [None]:
from scipy.stats import skew
import numpy as np

# Target: log1p of SalePrice (predictions are mapped back with expm1 later on).
y = np.log1p(train_data["SalePrice"])

# Features: everything except the target and the Id column.
X = train_data.drop(["SalePrice", "Id"], axis=1)

# Identify numeric columns
numeric_feats = X.select_dtypes(include=["int64", "float64"]).columns

# Compute the skewness of every numeric feature
skewness = X[numeric_feats].apply(skew)

# log1p returns -inf at -1 and NaN below it. Year-difference features such as
# HouseAge / RemodelAge can be negative, so only columns whose minimum is >= 0
# are eligible for the transform.
non_negative = X[numeric_feats].min() >= 0
skewed_features = skewness[(skewness > 0.75) & non_negative].index

print("Number of skewed features:", len(skewed_features))

# Apply log1p transformation to the selected right-skewed features
X[skewed_features] = np.log1p(X[skewed_features])

print("Skewness correction complete.")

In [None]:
# One-hot encode the categorical columns so the feature matrix can be fed to the models.
X = pd.get_dummies(X)
print("Final feature count:", len(X.columns))

In [None]:
import warnings
warnings.filterwarnings("ignore") 
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Shared 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)


def rmse_cv(model):
    """Return the mean cross-validated RMSE of ``model``.

    The model is scored on the notebook-level ``X`` / ``y`` using the
    notebook-level splitter ``cv`` (all three are looked up when the function
    is called). Each fold's negative MSE is negated and square-rooted, and the
    per-fold RMSE values are averaged.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X,
        y,
        scoring="neg_mean_squared_error",
        cv=cv,
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

# Cross-validated RMSE (on the log1p target) for three regularised linear models.

# Ridge
ridge = Ridge(alpha=10)
print("Ridge RMSE:", rmse_cv(ridge))

# Lasso
# max_iter matches the final model fitted later (max_iter=20000). Warnings are
# silenced above, so a solver that stops at the default iteration limit would
# otherwise go unnoticed and the CV score would describe a different model.
lasso = Lasso(alpha=0.0005, random_state=42, max_iter=20000)
print("Lasso RMSE:", rmse_cv(lasso))

# ElasticNet (same coordinate-descent solver, so the same iteration budget)
elastic = ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42, max_iter=20000)
print("ElasticNet RMSE:", rmse_cv(elastic))

In [None]:
pip install lightgbm

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Baseline LightGBM regressor.
baseline_params = dict(
    objective='regression',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=1000,
    random_state=42,
)
lgb_model = lgb.LGBMRegressor(**baseline_params)

rmse_scores = []

# Manual CV loop: each fold's validation split is used both for early stopping
# (patience of 100 rounds) and for the reported RMSE.
for train_idx, val_idx in cv.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(100, verbose=False)],
    )

    preds = lgb_model.predict(X_val)
    rmse_scores.append(np.sqrt(mean_squared_error(y_val, preds)))

print("LightGBM RMSE:", np.mean(rmse_scores))

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Tuned LightGBM: slower learning rate, smaller trees, row/column subsampling
# and light L1/L2 regularisation.
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=5000,
    learning_rate=0.01,
    num_leaves=20,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.8,
    # Row subsampling is only performed when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)

rmse_scores = []

# NOTE(review): the validation fold drives early stopping and is also the fold
# being scored, so the reported RMSE is likely somewhat optimistic.
for train_idx, val_idx in cv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    lgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(200, verbose=False)]
    )

    preds = lgb_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    rmse_scores.append(rmse)

print("Tuned LightGBM RMSE:", np.mean(rmse_scores))

In [None]:
import warnings
warnings.filterwarnings("ignore") 
from sklearn.linear_model import Lasso
import numpy as np

# Candidate regularisation strengths for the Lasso.
alphas = [0.0001, 0.0003, 0.0005, 0.0007, 0.001, 0.002]

# Score every alpha with the same iteration budget as the final model
# (max_iter=20000). Warnings are suppressed in this cell, so a solver that
# stopped at the default limit would otherwise be compared unnoticed.
for alpha in alphas:
    lasso = Lasso(alpha=alpha, random_state=42, max_iter=20000)
    score = rmse_cv(lasso)
    print(f"alpha={alpha} -> RMSE: {score}")

In [None]:
import warnings
warnings.filterwarnings("ignore") 
from sklearn.linear_model import Lasso
import numpy as np
import pandas as pd

# Final model: Lasso with alpha=0.0005.
# max_iter=20000 keeps this fit consistent with the refit performed before
# prediction, which uses the same iteration limit.
final_model = Lasso(alpha=0.0005, random_state=42, max_iter=20000)

# Fit on full training data
final_model.fit(X, y)

print("Final model trained.")

In [None]:
import warnings
warnings.filterwarnings("ignore") 
import numpy as np
import pandas as pd

# ---------- Helper: recreate engineered features on test set ----------
def add_engineered_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    Adds ``TotalSF``, ``TotalBath``, ``TotalPorchSF``, ``HouseAge``,
    ``RemodelAge``, ``Remodeled``, ``HasGarage``, ``HasBasement``, ``HasPool``
    and the ``OverallQual_TotalSF`` interaction.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the raw base columns listed in ``required`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left unmodified.

    Raises
    ------
    KeyError
        If any required base column is missing (all missing names are listed).
    """
    required = [
        "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
        "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath",
        "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "WoodDeckSF",
        "YrSold", "YearBuilt", "YearRemodAdd",
        "GarageArea", "PoolArea", "OverallQual",
    ]
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"add_engineered_features: missing required columns {missing}")

    # Work on a copy so the function has no side effect on the caller's frame.
    df = df.copy()

    df["TotalSF"] = df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"]
    # Half baths are weighted 0.5
    df["TotalBath"] = df["FullBath"] + 0.5 * df["HalfBath"] + df["BsmtFullBath"] + 0.5 * df["BsmtHalfBath"]
    df["TotalPorchSF"] = (df["OpenPorchSF"] + df["EnclosedPorch"] + df["3SsnPorch"] +
                          df["ScreenPorch"] + df["WoodDeckSF"])
    df["HouseAge"] = df["YrSold"] - df["YearBuilt"]
    df["RemodelAge"] = df["YrSold"] - df["YearRemodAdd"]
    df["Remodeled"] = (df["YearRemodAdd"] != df["YearBuilt"]).astype(int)
    df["HasGarage"] = (df["GarageArea"] > 0).astype(int)
    df["HasBasement"] = (df["TotalBsmtSF"] > 0).astype(int)
    df["HasPool"] = (df["PoolArea"] > 0).astype(int)
    # interaction used in training
    df["OverallQual_TotalSF"] = df["OverallQual"] * df["TotalSF"]
    return df

# ---------- 1) Impute the test set with the same rules as the training set ----------
# Work on a copy: `test_data` keeps its raw values, so re-running this cell
# starts from the same input and cannot apply the log transform twice.
test_features = test_data.copy()

# Categorical features filled with the label "None", as in training
none_cols = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual",
    "GarageCond", "PoolQC", "Fence", "MiscFeature",
    "MasVnrType"
]
test_features = test_features.fillna(
    {name: "None" for name in none_cols if name in test_features.columns}
)

# Numeric features filled with zero, as in training
zero_cols = [
    "GarageYrBlt", "MasVnrArea",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath",
    "GarageArea", "GarageCars"
]
test_features = test_features.fillna(
    {name: 0 for name in zero_cols if name in test_features.columns}
)

# LotFrontage: per-neighbourhood medians taken from the training data
if "LotFrontage" in test_features.columns:
    if "Neighborhood" in train_data.columns and "Neighborhood" in test_features.columns:
        lot_med_by_nb = train_data.groupby("Neighborhood")["LotFrontage"].median()
        # Vectorised lookup; a neighbourhood that never occurs in train maps to
        # NaN here instead of raising KeyError.
        test_features["LotFrontage"] = test_features["LotFrontage"].fillna(
            test_features["Neighborhood"].map(lot_med_by_nb)
        )
        # Fallback for neighbourhoods unseen in train: overall training median
        # (mirrors the second LotFrontage fill applied to the training set).
        test_features["LotFrontage"] = test_features["LotFrontage"].fillna(
            train_data["LotFrontage"].median()
        )
    else:
        # fallback: fill with the global median of the test set
        test_features["LotFrontage"] = test_features["LotFrontage"].fillna(
            test_features["LotFrontage"].median()
        )

# Electrical: most frequent training category
if "Electrical" in test_features.columns:
    test_features["Electrical"] = test_features["Electrical"].fillna(
        train_data["Electrical"].mode()[0]
    )

# Final safety fill. Training medians come first so imputation statistics are
# learned from train only; test medians are a last resort for anything still missing.
test_features = test_features.fillna(train_data.median(numeric_only=True))
test_features = test_features.fillna(test_features.median(numeric_only=True))
for col in test_features.select_dtypes(include="object").columns:
    mode_source = train_data[col] if col in train_data.columns else test_features[col]
    modes = mode_source.mode()
    if not modes.empty:  # an all-missing column has no mode to fill with
        test_features[col] = test_features[col].fillna(modes[0])

# ---------- 2) Add engineered features to test ----------
test_features = add_engineered_features(test_features)

# ---------- 3) Skew correction: reuse the skewed_features list computed for X ----------
try:
    sf = skewed_features  # variable from the training skew-correction step
except NameError:
    # Not available in this kernel: recompute candidates from the numeric
    # columns of train_data.
    numeric_feats = train_data.select_dtypes(include=[np.number]).columns
    from scipy.stats import skew
    sk = train_data[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_features = sk[sk > 0.75].index.tolist()
    sf = skewed_features

# Only transform columns that actually exist in the test frame
sf_existing = [c for c in sf if c in test_features.columns]
if sf_existing:
    # Whole-column assignment (as used for the training matrix) replaces the
    # integer columns with float ones instead of writing floats into int storage.
    test_features[sf_existing] = np.log1p(test_features[sf_existing])

# ---------- 4) One-hot encode test and align with training X ----------
X_test = test_features.drop("Id", axis=1).copy()

X_test = pd.get_dummies(X_test)

# Align columns: reindex to the training feature set X.columns. Dummy columns
# missing from test are filled with 0; test-only columns are dropped.
if 'X' not in globals():
    raise RuntimeError("Training feature matrix `X` not found. Make sure X (processed train features) exists before aligning test.")

X_test = X_test.reindex(columns=X.columns, fill_value=0)

print("Test data columns aligned to training columns. Shape:", X_test.shape)

In [None]:
from sklearn.linear_model import Lasso

# Refit the final Lasso on the full training matrix with a raised iteration limit.
final_model = Lasso(alpha=0.0005, random_state=42, max_iter=20000)
final_model.fit(X, y)

In [None]:
import numpy as np

# The model was trained on log1p(SalePrice), so predictions are on that scale;
# expm1 is the inverse of log1p and maps them back to prices.
log_preds = final_model.predict(X_test)
final_preds = np.expm1(log_preds)

In [None]:
import pandas as pd

# Build the two-column submission (Id, SalePrice) and write it without the index.
submission = test_data[["Id"]].assign(SalePrice=final_preds)

submission.to_csv("submission.csv", index=False)
print("submission.csv created successfully.")

In [None]:
# Sanity-check the submission: dimensions, missing values, and the predicted price range.
print(submission.shape)
print(submission.isnull().sum())
sale_prices = submission["SalePrice"]
print(sale_prices.min(), sale_prices.max())