In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor

# =============================================================================
# 1. Data Loading and Initial Setup
# =============================================================================

# Load the raw competition CSVs (paths are relative to the working directory).
try:
    train_df_raw = pd.read_csv('../data/raw/train.csv')
    test_df_raw = pd.read_csv('../data/raw/test.csv')
except FileNotFoundError as err:
    # Re-raise with a pointer to the expected location instead of calling the
    # interactive `exit()` helper: that helper ends the process with a success
    # status and hides the original traceback (which names the missing file).
    raise FileNotFoundError(
        "Make sure train.csv and test.csv are in 'house-prices-advanced/data/raw/'"
    ) from err


print("Initial Train shape: ", train_df_raw.shape)
print("Initial Test shape: ", test_df_raw.shape)

# Ids are set aside (test ids are needed for the submission file).
train_ID, test_ID = train_df_raw['Id'], test_df_raw['Id']

# The untransformed SalePrice is kept around; the models are fit on log1p of it.
train_sale_price = train_df_raw['SalePrice']
y_log = np.log1p(train_sale_price)

# Feature-only views of each split: no Id, and no target on the train side.
train_df = train_df_raw.drop(columns=['Id', 'SalePrice'])
test_df = test_df_raw.drop(columns='Id')

# Stack train on top of test with a fresh 0..n-1 index so every later
# transformation is applied to both splits at once.
all_data = pd.concat([train_df, test_df], ignore_index=True)

print("Combined data shape: ", all_data.shape)


# =============================================================================
# 2. Data Cleaning and Feature Engineering
# =============================================================================

# --- Filling Missing Values ---
print("\nProcessing missing values...")
# Constant fills, keyed by the replacement value: 'None' for the categorical
# columns and 0 for the numerical ones.
constant_fills = {
    'None': ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
             'BsmtFinType2', 'MasVnrType'),
    0: ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
        'MasVnrArea'),
}
for fill_value, columns in constant_fills.items():
    for col in columns:
        all_data[col] = all_data[col].fillna(fill_value)

# LotFrontage: gaps take the median LotFrontage of the row's Neighborhood.
frontage_by_neighborhood = all_data.groupby('Neighborhood')['LotFrontage']
all_data['LotFrontage'] = frontage_by_neighborhood.transform(
    lambda frontage: frontage.fillna(frontage.median())
)

# Remaining categorical columns: gaps take the column's most frequent value.
mode_filled_columns = ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st',
                       'Exterior2nd', 'SaleType', 'Functional', 'Utilities')
for col in mode_filled_columns:
    most_frequent = all_data[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_frequent)

# Sanity check: report whether any NaN survived the fills above.
missing_cells = all_data.isnull().sum().sum()
if missing_cells != 0:
    print("Warning: There are still missing values.")
else:
    print("All missing values handled.")

# --- Feature Engineering ---
print("\nPerforming feature engineering...")
# Cast these numeric codes to strings so they are handled as categories
# (and one-hot encoded below) rather than as magnitudes.
for col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    all_data[col] = all_data[col].astype(str)

# Combined features: total floor area, bathroom count (half baths weigh 0.5)
# and total porch area.
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalBath'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath']
                         + all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])
all_data['TotalPorchSF'] = (all_data['OpenPorchSF'] + all_data['EnclosedPorch']
                            + all_data['3SsnPorch'] + all_data['ScreenPorch'])

# Log-transform skewed numerical features.
# Columns are selected by "is a numeric dtype" rather than "is not object":
# the negative test would also pick up string/category/bool columns, for
# which skew() is meaningless or fails.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
feature_skew = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = feature_skew[abs(feature_skew) > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
print(f"Applied log transformation to {len(skewed_feats)} skewed features.")

# --- One-Hot Encoding ---
all_data = pd.get_dummies(all_data)
print("Shape after one-hot encoding: ", all_data.shape)


# =============================================================================
# 3. Outlier Removal and Final Data Split
# =============================================================================
print("\nRemoving outliers and finalizing datasets...")
# The first len(train_ID) rows of the combined frame are the training rows;
# everything after them is the test set.
n_train_rows = len(train_ID)
X = all_data.iloc[:n_train_rows]
X_test = all_data.iloc[n_train_rows:]

# Outliers are located on the untouched raw frame (original GrLivArea and
# SalePrice): very large living area sold for a low price.
oversized = train_df_raw['GrLivArea'] > 4000
underpriced = train_df_raw['SalePrice'] < 300000
outlier_indices = train_df_raw.index[oversized & underpriced]

# Drop the same row labels from the features and from the log target.
X = X.drop(index=outlier_indices)
y = y_log.drop(index=outlier_indices)

print(f"Removed {len(outlier_indices)} outliers.")
print("Final shape of X:", X.shape)
print("Final shape of y:", y.shape)


# =============================================================================
# 4. Model Training and Prediction
# =============================================================================
# Model definitions (hyper-parameters from our best submission).
ridge = Ridge(alpha=15)
lasso = Lasso(alpha=0.0004, max_iter=5000)
elasticnet = ElasticNet(alpha=0.0005, l1_ratio=0.9)

# NOTE(review): both `seed` and `random_state` are supplied to XGBoost;
# confirm which one the installed xgboost version actually honours.
xgb_params = dict(
    learning_rate=0.05,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.005,
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    random_state=42,
)
xgb = XGBRegressor(**xgb_params)

# Fit every model on the full, cleaned training set (linear models first,
# the boosted trees last).
print("\nTraining models on the full, cleaned dataset... (this may take a few minutes)")
for estimator in (ridge, lasso, elasticnet, xgb):
    estimator.fit(X, y)
print("All models trained.")

# Predict on the test set; expm1 undoes the log1p applied to the target.
print("\nMaking and blending predictions...")
ridge_preds, lasso_preds, elasticnet_preds, xgb_preds = (
    np.expm1(estimator.predict(X_test))
    for estimator in (ridge, lasso, elasticnet, xgb)
)

# These are the weights that gave us the 36th place score.
# The terms are added strictly left to right, in this order.
weighted_terms = [
    0.35 * lasso_preds,
    0.10 * elasticnet_preds,
    0.10 * ridge_preds,
    0.45 * xgb_preds,
]
blended_preds = weighted_terms[0]
for term in weighted_terms[1:]:
    blended_preds = blended_preds + term

# Build the submission table and write it to the submissions folder.
submission_columns = {'Id': test_ID, 'SalePrice': blended_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv('../submissions/submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully in 'house-prices-advanced/submissions/'!")
print(submission.head())

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
import lightgbm as lgb # Import LightGBM

# =============================================================================
# 1. Data Loading and Processing (The same as before)
# =============================================================================

print("--- Starting Data Processing ---")
# Load the raw competition CSVs (paths are relative to the working directory).
train_df_raw = pd.read_csv('../data/raw/train.csv')
test_df_raw = pd.read_csv('../data/raw/test.csv')

# Prep: set the ids aside, take log1p of the target, and stack the train and
# test features so every transformation below is applied to both splits.
train_ID = train_df_raw['Id']
test_ID = test_df_raw['Id']
train_sale_price = train_df_raw['SalePrice']
y_log = np.log1p(train_sale_price)
train_df = train_df_raw.drop(['Id', 'SalePrice'], axis=1)
test_df = test_df_raw.drop('Id', axis=1)
all_data = pd.concat((train_df, test_df)).reset_index(drop=True)

# Missing values: 'None' for these categorical columns, 0 for these numeric
# ones, neighborhood median for LotFrontage, column mode for the rest.
none_fill_cols = ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                  'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                  'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                  'BsmtFinType2', 'MasVnrType')
zero_fill_cols = ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
                  'BsmtHalfBath', 'MasVnrArea')
mode_fill_cols = ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st',
                  'Exterior2nd', 'SaleType', 'Functional', 'Utilities')
for col in none_fill_cols:
    all_data[col] = all_data[col].fillna('None')
for col in zero_fill_cols:
    all_data[col] = all_data[col].fillna(0)
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
for col in mode_fill_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Feature engineering: cast numeric codes to strings so they are one-hot
# encoded below, then add the combined area / bathroom / porch features.
for col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    all_data[col] = all_data[col].astype(str)
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalBath'] = all_data['FullBath'] + 0.5 * all_data['HalfBath'] + all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath']
all_data['TotalPorchSF'] = all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch']

# log1p every numeric column whose absolute skew exceeds 0.75.
# Columns are selected by "is a numeric dtype" rather than "is not object":
# the negative test would also pick up string/category/bool columns, for
# which skew() is meaningless or fails.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
feature_skew = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = feature_skew[abs(feature_skew) > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-Hot Encoding
all_data = pd.get_dummies(all_data)

# Split back into train/test (the first len(train_ID) rows are the training
# rows) and drop the outliers identified on the raw GrLivArea / SalePrice.
X = all_data[:len(train_ID)]
X_test = all_data[len(train_ID):]
outlier_indices = train_df_raw[(train_df_raw['GrLivArea'] > 4000) & (train_df_raw['SalePrice'] < 300000)].index
X = X.drop(outlier_indices)
y = y_log.drop(outlier_indices)
print("--- Data Processing Finished ---")


# =============================================================================
# 2. Model Training and Prediction with LightGBM
# =============================================================================

# Hyper-parameters of the two boosted-tree models, kept apart from the
# estimator registry so the registry itself stays compact.
xgb_params = dict(
    learning_rate=0.05, n_estimators=3460,
    max_depth=3, min_child_weight=0, gamma=0, subsample=0.7,
    colsample_bytree=0.7, reg_alpha=0.005, nthread=-1,
    scale_pos_weight=1, seed=27, random_state=42,
)
lgbm_params = dict(
    objective='regression', num_leaves=5,
    learning_rate=0.05, n_estimators=720,
    max_bin=55, bagging_fraction=0.8,
    bagging_freq=5, feature_fraction=0.2319,
    feature_fraction_seed=9, bagging_seed=9,
    min_data_in_leaf=6, min_sum_hessian_in_leaf=11,
    random_state=42,
)

# All models, now including LightGBM; the keys are reused for the blend.
models = {
    'Ridge': Ridge(alpha=15),
    'Lasso': Lasso(alpha=0.0004, max_iter=5000),
    'ElasticNet': ElasticNet(alpha=0.0005, l1_ratio=0.9),
    'XGBoost': XGBRegressor(**xgb_params),
    'LightGBM': lgb.LGBMRegressor(**lgbm_params),
}

# Fit each model on the cleaned training data and keep its test-set
# predictions on the original price scale (expm1 reverses the log1p target).
predictions = {}
print("\n--- Training All Models ---")
for label, estimator in models.items():
    print(f"Training {label}...")
    estimator.fit(X, y)
    log_scale_preds = estimator.predict(X_test)
    predictions[label] = np.expm1(log_scale_preds)
print("--- All Models Trained ---")


# =============================================================================
# 3. Blending Predictions and Creating Submission
# =============================================================================

# Blend weights: most of the weight goes to XGBoost and LightGBM.
# The contributions are accumulated in this exact order.
blend_weights = {
    'Ridge': 0.10,
    'Lasso': 0.20,
    'ElasticNet': 0.10,
    'XGBoost': 0.30,
    'LightGBM': 0.30,
}
blended_preds = None
for model_name, weight in blend_weights.items():
    contribution = weight * predictions[model_name]
    blended_preds = contribution if blended_preds is None else blended_preds + contribution

# Write the submission file (test ids next to the blended prices).
submission_columns = {'Id': test_ID, 'SalePrice': blended_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv('../submissions/submission_lgbm.csv', index=False)

print("\nSubmission file 'submission_lgbm.csv' created successfully!")
print(submission.head())


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
import lightgbm as lgb

# =============================================================================
# 1. Data Processing (The same as before)
# =============================================================================

print("--- Starting Data Processing ---")
# Load the raw competition CSVs (paths are relative to the working directory).
train_df_raw = pd.read_csv('../data/raw/train.csv')
test_df_raw = pd.read_csv('../data/raw/test.csv')

# Prep: set the ids aside, take log1p of the target, and stack the train and
# test features so every transformation below is applied to both splits.
train_ID = train_df_raw['Id']
test_ID = test_df_raw['Id']
train_sale_price = train_df_raw['SalePrice']
y_log = np.log1p(train_sale_price)
train_df = train_df_raw.drop(['Id', 'SalePrice'], axis=1)
test_df = test_df_raw.drop('Id', axis=1)
all_data = pd.concat((train_df, test_df)).reset_index(drop=True)

# Missing values: 'None' for these categorical columns, 0 for these numeric
# ones, neighborhood median for LotFrontage, column mode for the rest.
none_fill_cols = ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                  'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                  'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                  'BsmtFinType2', 'MasVnrType')
zero_fill_cols = ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
                  'BsmtHalfBath', 'MasVnrArea')
mode_fill_cols = ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st',
                  'Exterior2nd', 'SaleType', 'Functional', 'Utilities')
for col in none_fill_cols:
    all_data[col] = all_data[col].fillna('None')
for col in zero_fill_cols:
    all_data[col] = all_data[col].fillna(0)
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
for col in mode_fill_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Feature engineering: cast numeric codes to strings so they are one-hot
# encoded below, then add the combined area / bathroom / porch features.
for col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    all_data[col] = all_data[col].astype(str)
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalBath'] = all_data['FullBath'] + 0.5 * all_data['HalfBath'] + all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath']
all_data['TotalPorchSF'] = all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch']

# log1p every numeric column whose absolute skew exceeds 0.75.
# Columns are selected by "is a numeric dtype" rather than "is not object":
# the negative test would also pick up string/category/bool columns, for
# which skew() is meaningless or fails.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
feature_skew = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = feature_skew[abs(feature_skew) > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-Hot Encoding
all_data = pd.get_dummies(all_data)

# Split back into train/test (the first len(train_ID) rows are the training
# rows) and drop the outliers identified on the raw GrLivArea / SalePrice.
X = all_data[:len(train_ID)]
X_test = all_data[len(train_ID):]
outlier_indices = train_df_raw[(train_df_raw['GrLivArea'] > 4000) & (train_df_raw['SalePrice'] < 300000)].index
X = X.drop(outlier_indices)
y = y_log.drop(outlier_indices)
print("--- Data Processing Finished ---")


# =============================================================================
# 2. Model Training and Prediction
# =============================================================================

# Estimator registry: three regularised linear models plus two gradient
# boosting models. The keys are the names the blend looks predictions up by.
linear_models = {
    'Ridge': Ridge(alpha=15),
    'Lasso': Lasso(alpha=0.0004, max_iter=5000),
    'ElasticNet': ElasticNet(alpha=0.0005, l1_ratio=0.9),
}
boosting_models = {
    'XGBoost': XGBRegressor(
        learning_rate=0.05,
        n_estimators=3460,
        max_depth=3,
        min_child_weight=0,
        gamma=0,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=0.005,
        nthread=-1,
        scale_pos_weight=1,
        seed=27,
        random_state=42,
    ),
    'LightGBM': lgb.LGBMRegressor(
        objective='regression',
        num_leaves=5,
        learning_rate=0.05,
        n_estimators=720,
        max_bin=55,
        bagging_fraction=0.8,
        bagging_freq=5,
        feature_fraction=0.2319,
        feature_fraction_seed=9,
        bagging_seed=9,
        min_data_in_leaf=6,
        min_sum_hessian_in_leaf=11,
        random_state=42,
    ),
}
# Linear models come first, boosters last; this is also the training order.
models = {**linear_models, **boosting_models}

# Train each model and store its test predictions on the price scale
# (expm1 reverses the log1p applied to the target).
predictions = {}
print("\n--- Training All Models ---")
for model_label, fitted in models.items():
    print(f"Training {model_label}...")
    fitted.fit(X, y)
    predictions[model_label] = np.expm1(fitted.predict(X_test))
print("--- All Models Trained ---")


# =============================================================================
# 3. Blending Predictions (Ensemble of Ensembles) and Creating Submission
# =============================================================================

# Stage 1: equal-weight average of the two boosting models.
xgb_half = 0.5 * predictions['XGBoost']
lgbm_half = 0.5 * predictions['LightGBM']
boosting_preds = xgb_half + lgbm_half

# Stage 2: blend the boosting average with the linear models. The boosting
# part carries most of the weight; terms are added in the order listed.
stage_two_terms = (
    0.70 * boosting_preds,
    0.15 * predictions['Lasso'],
    0.10 * predictions['Ridge'],
    0.05 * predictions['ElasticNet'],
)
final_blended_preds = stage_two_terms[0]
for term in stage_two_terms[1:]:
    final_blended_preds = final_blended_preds + term


# Write the submission file (test ids next to the blended prices).
submission_columns = {'Id': test_ID, 'SalePrice': final_blended_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv('../submissions/submission_ensemble_of_ensembles.csv', index=False)

print("\nSubmission file 'submission_ensemble_of_ensembles.csv' created successfully!")
print(submission.head())

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import lightgbm as lgb

# =============================================================================
# 1. Data Processing (with updated Feature Engineering)
# =============================================================================

print("--- Starting Data Processing ---")
# Load the raw competition CSVs (paths are relative to the working directory).
train_df_raw = pd.read_csv('../data/raw/train.csv')
test_df_raw = pd.read_csv('../data/raw/test.csv')

# Prep: set the ids aside, take log1p of the target, and stack the train and
# test features so every transformation below is applied to both splits.
train_ID = train_df_raw['Id']
test_ID = test_df_raw['Id']
train_sale_price = train_df_raw['SalePrice']
y_log = np.log1p(train_sale_price)
train_df = train_df_raw.drop(['Id', 'SalePrice'], axis=1)
test_df = test_df_raw.drop('Id', axis=1)
all_data = pd.concat((train_df, test_df)).reset_index(drop=True)

# Missing values (same as before): 'None' for these categorical columns, 0
# for these numeric ones, neighborhood median for LotFrontage, column mode
# for the rest.
none_fill_cols = ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                  'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                  'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                  'BsmtFinType2', 'MasVnrType')
zero_fill_cols = ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
                  'BsmtHalfBath', 'MasVnrArea')
mode_fill_cols = ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st',
                  'Exterior2nd', 'SaleType', 'Functional', 'Utilities')
for col in none_fill_cols:
    all_data[col] = all_data[col].fillna('None')
for col in zero_fill_cols:
    all_data[col] = all_data[col].fillna(0)
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
for col in mode_fill_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# --- NEW: Label encoding of selected features ---
print("\nPerforming feature engineering with ordinal encoding...")
# Columns replaced by integer codes instead of being one-hot encoded.
# NOTE(review): no explicit ranking is supplied to LabelEncoder, so the codes
# are not guaranteed to follow the quality order of grades such as Ex/Gd/TA/
# Fa/Po, and several listed columns (e.g. Street, Alley, CentralAir) are not
# ordinal at all -- confirm this is intended.
ordinal_features = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
                    'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LotShape',
                    'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
                    'YrSold', 'MoSold']

for col in ordinal_features:
    lbl = LabelEncoder()
    lbl.fit(list(all_data[col].values))
    all_data[col] = lbl.transform(list(all_data[col].values))

# --- Feature Engineering (same as before, but on remaining features) ---
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalBath'] = all_data['FullBath'] + 0.5 * all_data['HalfBath'] + all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath']
all_data['TotalPorchSF'] = all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch']

# Log-transform skewed numerical features (this includes the label-encoded
# columns above, which are numeric at this point).
# Columns are selected by "is a numeric dtype" rather than "is not object":
# the negative test would also pick up string/category/bool columns, for
# which skew() is meaningless or fails.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
feature_skew = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = feature_skew[abs(feature_skew) > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-Hot Encoding for the *remaining* categorical features
all_data = pd.get_dummies(all_data)

# Split back into train/test (the first len(train_ID) rows are the training
# rows) and drop the outliers identified on the raw GrLivArea / SalePrice.
X = all_data[:len(train_ID)]
X_test = all_data[len(train_ID):]
outlier_indices = train_df_raw[(train_df_raw['GrLivArea'] > 4000) & (train_df_raw['SalePrice'] < 300000)].index
X = X.drop(outlier_indices)
y = y_log.drop(outlier_indices)
print("--- Data Processing Finished ---")


# =============================================================================
# 2. Model Training and Prediction (same as our best attempt)
# =============================================================================
# Same model line-up and hyper-parameters as our best attempt.
ridge_model = Ridge(alpha=15)
lasso_model = Lasso(alpha=0.0004, max_iter=5000)
elasticnet_model = ElasticNet(alpha=0.0005, l1_ratio=0.9)
xgboost_model = XGBRegressor(
    learning_rate=0.05, n_estimators=3460,
    max_depth=3, min_child_weight=0, gamma=0, subsample=0.7,
    colsample_bytree=0.7, reg_alpha=0.005, nthread=-1,
    scale_pos_weight=1, seed=27, random_state=42,
)
lightgbm_model = lgb.LGBMRegressor(
    objective='regression', num_leaves=5,
    learning_rate=0.05, n_estimators=720,
    max_bin=55, bagging_fraction=0.8,
    bagging_freq=5, feature_fraction=0.2319,
    feature_fraction_seed=9, bagging_seed=9,
    min_data_in_leaf=6, min_sum_hessian_in_leaf=11,
    random_state=42,
)

# Registry keyed by the names the blend uses; insertion order is the
# training order.
models = {
    'Ridge': ridge_model,
    'Lasso': lasso_model,
    'ElasticNet': elasticnet_model,
    'XGBoost': xgboost_model,
    'LightGBM': lightgbm_model,
}

# Fit each model, then store its test predictions on the price scale
# (expm1 reverses the log1p applied to the target).
predictions = {}
print("\n--- Training All Models ---")
for key in models:
    print(f"Training {key}...")
    current = models[key]
    current.fit(X, y)
    predictions[key] = np.expm1(current.predict(X_test))
print("--- All Models Trained ---")


# =============================================================================
# 3. Blending and Submission (same as our best attempt)
# =============================================================================
# Weighted blend (same weights as our best attempt). Pairs are listed in the
# order in which their contributions are added up.
weighted_models = [
    ('Ridge', 0.10),
    ('Lasso', 0.20),
    ('ElasticNet', 0.10),
    ('XGBoost', 0.30),
    ('LightGBM', 0.30),
]
first_name, first_weight = weighted_models[0]
blended_preds = first_weight * predictions[first_name]
for model_name, weight in weighted_models[1:]:
    blended_preds = blended_preds + weight * predictions[model_name]

# Write the submission file (test ids next to the blended prices).
submission_columns = {'Id': test_ID, 'SalePrice': blended_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv('../submissions/submission_label_encoding.csv', index=False)

print("\nSubmission file 'submission_label_encoding.csv' created successfully!")
print(submission.head())


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from scipy.optimize import minimize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
import lightgbm as lgb

# =============================================================================
# 1. Data Processing (Our Best Pipeline)
# =============================================================================
print("--- Starting Data Processing ---")
# Load the raw competition CSVs (paths are relative to the working directory).
train_df_raw = pd.read_csv('../data/raw/train.csv')
test_df_raw = pd.read_csv('../data/raw/test.csv')

# Prep: set the ids aside, take log1p of the target, and stack the train and
# test features so every transformation below is applied to both splits.
train_ID = train_df_raw['Id']
test_ID = test_df_raw['Id']
train_sale_price = train_df_raw['SalePrice']
y_log = np.log1p(train_sale_price)
train_df = train_df_raw.drop(['Id', 'SalePrice'], axis=1)
test_df = test_df_raw.drop('Id', axis=1)
all_data = pd.concat((train_df, test_df)).reset_index(drop=True)

# Missing values: 'None' for these categorical columns, 0 for these numeric
# ones, neighborhood median for LotFrontage, column mode for the rest.
none_fill_cols = ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                  'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                  'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                  'BsmtFinType2', 'MasVnrType')
zero_fill_cols = ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
                  'BsmtHalfBath', 'MasVnrArea')
mode_fill_cols = ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st',
                  'Exterior2nd', 'SaleType', 'Functional', 'Utilities')
for col in none_fill_cols:
    all_data[col] = all_data[col].fillna('None')
for col in zero_fill_cols:
    all_data[col] = all_data[col].fillna(0)
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
for col in mode_fill_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Label encoding: these columns are replaced by integer codes instead of
# being one-hot encoded.
# NOTE(review): no explicit ranking is supplied to LabelEncoder, so the codes
# are not guaranteed to follow the quality order of grades such as Ex/Gd/TA/
# Fa/Po, and several listed columns (e.g. Street, Alley, CentralAir) are not
# ordinal at all -- confirm this is intended.
ordinal_features = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
                    'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LotShape',
                    'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
                    'YrSold', 'MoSold']
for col in ordinal_features:
    lbl = LabelEncoder()
    lbl.fit(list(all_data[col].values))
    all_data[col] = lbl.transform(list(all_data[col].values))

# Other feature engineering: combined area / bathroom / porch features.
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalBath'] = all_data['FullBath'] + 0.5 * all_data['HalfBath'] + all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath']
all_data['TotalPorchSF'] = all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch']

# log1p every numeric column whose absolute skew exceeds 0.75 (this includes
# the label-encoded columns above, which are numeric at this point).
# Columns are selected by "is a numeric dtype" rather than "is not object":
# the negative test would also pick up string/category/bool columns, for
# which skew() is meaningless or fails.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
feature_skew = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = feature_skew[abs(feature_skew) > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-Hot Encoding for remaining categoricals
all_data = pd.get_dummies(all_data)

# Split back into train/test (the first len(train_ID) rows are the training
# rows) and drop the outliers identified on the raw GrLivArea / SalePrice.
X = all_data[:len(train_ID)]
X_test = all_data[len(train_ID):]
outlier_indices = train_df_raw[(train_df_raw['GrLivArea'] > 4000) & (train_df_raw['SalePrice'] < 300000)].index
X = X.drop(outlier_indices)
y = y_log.drop(outlier_indices)
print("--- Data Processing Finished ---")


# =============================================================================
# 2. OOF Predictions and Weight Optimization
# =============================================================================

# Models taking part in the optimised blend. Insertion order matters: it
# fixes the column order of the OOF matrices and of the fitted weights.
lgbm_params = dict(
    objective='regression', num_leaves=5,
    learning_rate=0.05, n_estimators=720,
    max_bin=55, bagging_fraction=0.8,
    bagging_freq=5, feature_fraction=0.2319,
    feature_fraction_seed=9, bagging_seed=9,
    min_data_in_leaf=6, min_sum_hessian_in_leaf=11,
    random_state=42,
)
xgb_params = dict(
    learning_rate=0.05, n_estimators=3460,
    max_depth=3, min_child_weight=0, gamma=0, subsample=0.7,
    colsample_bytree=0.7, reg_alpha=0.005, nthread=-1,
    scale_pos_weight=1, seed=27, random_state=42,
)
models = {
    'LightGBM': lgb.LGBMRegressor(**lgbm_params),
    'XGBoost': XGBRegressor(**xgb_params),
    'Lasso': Lasso(alpha=0.0004, max_iter=5000),
}

# Generate out-of-fold (OOF) predictions.
# oof_train[:, i] holds model i's prediction for each training row, made by
# the fit that did not see that row; oof_test[:, i] is model i's test
# prediction averaged over the folds. Both stay on the log1p scale.
print("\n--- Generating OOF Predictions (this will take time) ---")
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
oof_train = np.zeros((X.shape[0], len(models)))
oof_test = np.zeros((X_test.shape[0], len(models)))
# Convert to explicit float64 matrices. A frame that mixes boolean dummy
# columns with numeric columns can come out of `.values` as an object-dtype
# array, which the estimators would then have to coerce on their own.
X_np = X.to_numpy(dtype=np.float64)
X_test_np = X_test.to_numpy(dtype=np.float64)
y_np = y.values

for i, (name, model) in enumerate(models.items()):
    print(f"Training and predicting with {name}...")
    # One column of test predictions per fold, averaged after the last fold.
    test_preds_for_fold = np.zeros((X_test.shape[0], kfolds.n_splits))
    for j, (train_idx, val_idx) in enumerate(kfolds.split(X_np)):
        model.fit(X_np[train_idx], y_np[train_idx])
        oof_train[val_idx, i] = model.predict(X_np[val_idx])
        test_preds_for_fold[:, j] = model.predict(X_test_np)
    oof_test[:, i] = test_preds_for_fold.mean(axis=1)

# Find optimal weights for the blend
print("\n--- Finding Optimal Blend Weights ---")
def rmse_func(weights, predictions, true_values):
    """Return the RMSE of the weighted blend of *predictions*.

    weights: one weight per column of *predictions*.
    predictions: 2-D array with one column of predictions per model.
    true_values: the targets the blended prediction is scored against.
    """
    final_prediction = np.dot(predictions, weights)
    return np.sqrt(mean_squared_error(true_values, final_prediction))

# Start from an equal-weight blend. The starting weight is derived from the
# number of models so the initial guess always satisfies the sum-to-one
# constraint, whatever the size of `models`.
n_models = len(models)
initial_weights = [1.0 / n_models] * n_models
bounds = [(0, 1)] * n_models                                    # each weight in [0, 1]
constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}   # weights sum to 1

res = minimize(rmse_func, initial_weights, args=(oof_train, y),
               method='SLSQP', bounds=bounds, constraints=constraints)
if not res.success:
    # Do not silently trust a failed run; the last iterate is still used.
    print(f"Warning: weight optimization did not converge: {res.message}")
optimal_weights = res.x

print("Optimal weights found:")
for name, weight in zip(models.keys(), optimal_weights):
    print(f"{name}: {weight:.4f}")


# =============================================================================
# 3. Final Submission
# =============================================================================
print("\n--- Creating Final Submission ---")
# Blend the fold-averaged test predictions with the fitted weights (still on
# the log1p scale), then map back to prices with expm1.
final_log_preds = oof_test.dot(optimal_weights)
final_preds = np.expm1(final_log_preds)

# Write the submission file (test ids next to the blended prices).
submission_columns = {'Id': test_ID, 'SalePrice': final_preds}
submission_final = pd.DataFrame(submission_columns)
submission_final.to_csv('../submissions/submission_optimized_weights.csv', index=False)

print("\nOptimized submission 'submission_optimized_weights.csv' created successfully!")
print(submission_final.head())


In [8]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
import lightgbm as lgb

# =============================================================================
# 1. Data Processing (Our Best Pipeline) - NO CHANGES HERE
# =============================================================================
print("--- Starting Data Processing ---")
# Load the raw train/test CSVs (paths are relative to the working directory).
train_df_raw = pd.read_csv('../data/raw/train.csv')
test_df_raw = pd.read_csv('../data/raw/test.csv')
# Keep the Id columns aside; test_ID is reused when the submission is built.
train_ID = train_df_raw['Id']
test_ID = test_df_raw['Id']
# The regression target is log1p(SalePrice); predictions are mapped back with expm1.
train_sale_price = train_df_raw['SalePrice']
y_log = np.log1p(train_sale_price)
# Drop the id/target columns and stack train on top of test so every
# imputation/encoding step below is applied to both sets at once.
train_df = train_df_raw.drop(['Id', 'SalePrice'], axis=1)
test_df = test_df_raw.drop('Id', axis=1)
all_data = pd.concat((train_df, test_df)).reset_index(drop=True)
# Columns whose missing values are replaced by the literal category 'None'.
for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType'):
    all_data[col] = all_data[col].fillna('None')
# Columns whose missing values are replaced by 0.
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'):
    all_data[col] = all_data[col].fillna(0)
# Missing LotFrontage is filled with the median LotFrontage of the same Neighborhood.
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
# Remaining categoricals: fill with the most frequent value of the column.
for col in ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'Functional', 'Utilities'):
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
# Columns replaced in place by integer codes from LabelEncoder.
# NOTE(review): LabelEncoder numbers classes in sorted order of their values,
# so the codes of quality-style columns (e.g. 'Ex', 'Fa', 'Gd') presumably do
# not follow the quality ranking - confirm this is intended.
ordinal_features = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LotShape','PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
for col in ordinal_features:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[col].values)) 
    all_data[col] = lbl.transform(list(all_data[col].values))
# Aggregate features: total floor area, bathroom count (half baths weigh 0.5)
# and total porch area.
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalBath'] = all_data['FullBath'] + 0.5 * all_data['HalfBath'] + all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath']
all_data['TotalPorchSF'] = all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch']
# log1p-transform every non-object column whose absolute skewness exceeds 0.75.
# This runs after label encoding, so the encoded columns are candidates too.
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[abs(skewed_feats) > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
# One-hot encode the remaining categorical columns.
all_data = pd.get_dummies(all_data)
# Split the stacked frame back apart: the first len(train_ID) rows are the training rows.
X = all_data[:len(train_ID)]
X_test = all_data[len(train_ID):]
# Drop training rows with GrLivArea > 4000 and SalePrice < 300000 from both
# the features and the target (row labels line up because all_data was re-indexed).
outlier_indices = train_df_raw[(train_df_raw['GrLivArea'] > 4000) & (train_df_raw['SalePrice'] < 300000)].index
X = X.drop(outlier_indices)
y = y_log.drop(outlier_indices)
print("--- Data Processing Finished ---")

# =============================================================================
# 2. Pseudo-Labeling Step
# =============================================================================
print("\n--- Step 1: Initial Training and Pseudo-Label Generation ---")
# Hyper-parameters of the two boosted models, kept apart for readability.
xgb_params = dict(
    learning_rate=0.05, n_estimators=3460, max_depth=3, min_child_weight=0,
    gamma=0, subsample=0.7, colsample_bytree=0.7, reg_alpha=0.005,
    nthread=-1, scale_pos_weight=1, seed=27, random_state=42,
)
lgbm_params = dict(
    objective='regression', num_leaves=5, learning_rate=0.05,
    n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5,
    feature_fraction=0.2319, feature_fraction_seed=9, bagging_seed=9,
    min_data_in_leaf=6, min_sum_hessian_in_leaf=11, random_state=42,
)

# Define models
models = {
    'Ridge': Ridge(alpha=15),
    'Lasso': Lasso(alpha=0.0004, max_iter=5000),
    'ElasticNet': ElasticNet(alpha=0.0005, l1_ratio=0.9),
    'XGBoost': XGBRegressor(**xgb_params),
    'LightGBM': lgb.LGBMRegressor(**lgbm_params),
}

# Fit every model on the labelled data and keep its test-set predictions.
# These stay on the log1p scale of the target (no expm1 here).
initial_predictions = {}
for model_name, estimator in models.items():
    print(f"Initial training for {model_name}...")
    estimator.fit(X, y)
    initial_predictions[model_name] = estimator.predict(X_test)

# Pseudo-labels are the fixed-weight blend of those predictions.
ridge_part = 0.10 * initial_predictions['Ridge']
lasso_part = 0.20 * initial_predictions['Lasso']
enet_part = 0.10 * initial_predictions['ElasticNet']
xgb_part = 0.30 * initial_predictions['XGBoost']
lgbm_part = 0.30 * initial_predictions['LightGBM']
pseudo_labels_log = ridge_part + lasso_part + enet_part + xgb_part + lgbm_part

# --- Build the enlarged training set: labelled rows followed by test rows ---
print("\n--- Step 2: Creating combined dataset with pseudo-labels ---")
stacked_features = pd.concat([X, X_test])
X_combined = stacked_features.reset_index(drop=True)
# Targets in the same row order: true log-prices, then the pseudo-labels.
y_combined = np.concatenate([y, pseudo_labels_log])

print("Shape of combined training data:", X_combined.shape)
print("Shape of combined labels:", y_combined.shape)


# =============================================================================
# 3. Final Training and Submission
# =============================================================================
from sklearn.base import clone

print("\n--- Step 3: Re-training models on the combined dataset ---")
final_predictions = {}
for name, model in models.items():
    print(f"Final training for {name}...")
    # Actually re-initialize: clone() builds an unfitted estimator with the
    # same hyper-parameters, so nothing from the first fit is carried over and
    # the estimators stored in `models` keep their initial fit.
    fresh_model = clone(model)
    fresh_model.fit(X_combined, y_combined)
    # Predictions are on the log1p scale; expm1 maps them back to prices.
    final_predictions[name] = np.expm1(fresh_model.predict(X_test))

# Fixed-weight blend of the per-model price predictions
print("\n--- Blending final predictions ---")
ridge_share = 0.10 * final_predictions['Ridge']
lasso_share = 0.20 * final_predictions['Lasso']
enet_share = 0.10 * final_predictions['ElasticNet']
xgb_share = 0.30 * final_predictions['XGBoost']
lgbm_share = 0.30 * final_predictions['LightGBM']
final_blended_preds = ridge_share + lasso_share + enet_share + xgb_share + lgbm_share

# Write the submission: one row per test Id with its blended SalePrice
submission_columns = {'Id': test_ID, 'SalePrice': final_blended_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv(
    '../submissions/submission_pseudo_labeling.csv',
    index=False,
)

print("\nSubmission file with Pseudo-Labeling 'submission_pseudo_labeling.csv' created successfully!")
print(submission.head())

--- Starting Data Processing ---
--- Data Processing Finished ---

--- Step 1: Initial Training and Pseudo-Label Generation ---
Initial training for Ridge...
Initial training for Lasso...
Initial training for ElasticNet...
Initial training for XGBoost...
Initial training for LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1552
[LightGBM] [Info] Number of data points in the train set: 1458, number of used features: 185
[LightGBM] [Info] Start training from score 12.024015

--- Step 2: Creating combined dataset with pseudo-labels ---
Shape of combined training data: (2917, 227)
Shape of combined labels: (2917,)

--- Step 3: Re-training models on the combined dataset ---
Final training for Ridge...
Final training for Lasso...
Final training for ElasticNet...
Final training 

In [9]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
import lightgbm as lgb

# =============================================================================
# 1. Data Processing (with Interaction Features)
# =============================================================================

print("--- Starting Data Processing ---")
# Read both raw files, set Ids and the log1p target aside, then stack the
# feature frames (train first, test second) under a fresh 0..n-1 index.
train_df_raw = pd.read_csv('../data/raw/train.csv')
test_df_raw = pd.read_csv('../data/raw/test.csv')
train_ID, test_ID = train_df_raw['Id'], test_df_raw['Id']
train_sale_price = train_df_raw['SalePrice']
y_log = np.log1p(train_sale_price)
train_df = train_df_raw.drop(columns=['Id', 'SalePrice'])
test_df = test_df_raw.drop(columns='Id')
all_data = pd.concat((train_df, test_df)).reset_index(drop=True)

# Missing Values
# 1) categoricals where a gap becomes the literal category 'None'
none_fill_columns = (
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType',
)
for col in none_fill_columns:
    all_data[col] = all_data[col].fillna('None')

# 2) numeric columns where a gap becomes 0
zero_fill_columns = (
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea',
)
for col in zero_fill_columns:
    all_data[col] = all_data[col].fillna(0)

# 3) LotFrontage: median of the rows sharing the same Neighborhood
frontage_by_neighborhood = all_data.groupby('Neighborhood')['LotFrontage']
all_data['LotFrontage'] = frontage_by_neighborhood.transform(
    lambda group: group.fillna(group.median())
)

# 4) remaining categoricals: most frequent value of the column
mode_fill_columns = (
    'MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd',
    'SaleType', 'Functional', 'Utilities',
)
for col in mode_fill_columns:
    most_frequent = all_data[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_frequent)

# --- Feature Engineering ---
print("\nPerforming feature engineering with interactions...")
# Integer-encode the columns listed below (as in our best model)
ordinal_features = [
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir',
    'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
]
for ordinal_col in ordinal_features:
    encoder = LabelEncoder()
    encoder.fit(list(all_data[ordinal_col].values))
    encoded_values = encoder.transform(list(all_data[ordinal_col].values))
    all_data[ordinal_col] = encoded_values

# Aggregate features: total floor area, weighted bathroom count, total porch area
all_data['TotalSF'] = (
    all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
)
all_data['TotalBath'] = (
    all_data['FullBath']
    + 0.5 * all_data['HalfBath']
    + all_data['BsmtFullBath']
    + 0.5 * all_data['BsmtHalfBath']
)
all_data['TotalPorchSF'] = (
    all_data['OpenPorchSF']
    + all_data['EnclosedPorch']
    + all_data['3SsnPorch']
    + all_data['ScreenPorch']
)

# --- NEW: Interaction Features ---
# Products of OverallQual with size and age features.
# OverallQual and YearBuilt are not in ordinal_features, so their raw values are used.
overall_quality = all_data['OverallQual']
all_data['OverallQual_x_TotalSF'] = overall_quality * all_data['TotalSF']
all_data['GrLivArea_x_OverallQual'] = all_data['GrLivArea'] * overall_quality
all_data['YearBuilt_x_OverallQual'] = all_data['YearBuilt'] * overall_quality


# Log-transform skewed numerical features.
# Numeric columns are selected explicitly: "dtype != object" would also let
# through any non-object, non-numeric dtype (e.g. a dedicated string dtype),
# on which skew() cannot be computed.
numeric_feats = all_data.select_dtypes(include='number').columns
skewness = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
# Only columns with |skewness| > 0.75 are transformed.
skewed_feats = skewness[skewness.abs() > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-Hot Encoding for remaining categoricals
all_data = pd.get_dummies(all_data)

# Outlier Removal and Data Split
# The first len(train_ID) rows of the stacked frame are the training rows.
n_train = len(train_ID)
X = all_data[:n_train]
X_test = all_data[n_train:]
# Training rows with GrLivArea > 4000 and SalePrice < 300000 are removed from
# both the features and the target (labels line up: all_data was re-indexed 0..n-1).
outlier_mask = (train_df_raw['GrLivArea'] > 4000) & (train_df_raw['SalePrice'] < 300000)
outlier_indices = train_df_raw[outlier_mask].index
X = X.drop(outlier_indices)
y = y_log.drop(outlier_indices)
print("--- Data Processing Finished ---")

# =============================================================================
# 2. Model Training and Prediction (using our best blend)
# =============================================================================
# Lasso and ElasticNet run with max_iter=10000: with the previous limits
# (5000 and the library default) the coordinate-descent solver reported a
# convergence warning on this feature set, i.e. it stopped before reaching
# its tolerance.
models = {
    'Ridge': Ridge(alpha=15),
    'Lasso': Lasso(alpha=0.0004, max_iter=10000),
    'ElasticNet': ElasticNet(alpha=0.0005, l1_ratio=0.9, max_iter=10000),
    'XGBoost': XGBRegressor(learning_rate=0.05, n_estimators=3460, max_depth=3, min_child_weight=0, gamma=0, subsample=0.7, colsample_bytree=0.7, reg_alpha=0.005, nthread=-1, scale_pos_weight=1, seed=27, random_state=42),
    'LightGBM': lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319, feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6, min_sum_hessian_in_leaf=11, random_state=42)
}
# Per-model test predictions, mapped back from the log1p target scale to prices.
predictions = {}
print("\n--- Training All Models ---")
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X, y)
    predictions[name] = np.expm1(model.predict(X_test))
print("--- All Models Trained ---")

# =============================================================================
# 3. Blending and Submission (using our best weights)
# =============================================================================
# Fixed-weight blend of the per-model price predictions.
ridge_share = 0.10 * predictions['Ridge']
lasso_share = 0.20 * predictions['Lasso']
enet_share = 0.10 * predictions['ElasticNet']
xgb_share = 0.30 * predictions['XGBoost']
lgbm_share = 0.30 * predictions['LightGBM']
blended_preds = ridge_share + lasso_share + enet_share + xgb_share + lgbm_share

submission_columns = {'Id': test_ID, 'SalePrice': blended_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv(
    '../submissions/submission_interactions.csv',
    index=False,
)
print("\nSubmission file 'submission_interactions.csv' created successfully!")
print(submission.head())


--- Starting Data Processing ---

Performing feature engineering with interactions...


  model = cd_fast.enet_coordinate_descent(


Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1717
[LightGBM] [Info] Number of data points in the train set: 1458, number of used features: 188
[LightGBM] [Info] Start training from score 12.024015


In [10]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
import lightgbm as lgb
import warnings

# Suppress only scikit-learn's convergence warnings for this final run.
# A blanket filterwarnings('ignore') would also hide unrelated warnings
# (deprecations, numerical issues) that may point at real problems.
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings('ignore', category=ConvergenceWarning)

# =============================================================================
# 1. Data Processing (Our absolute best pipeline with interactions)
# =============================================================================
print("--- Starting Data Processing ---")
# Load the raw train/test CSVs (paths are relative to the working directory).
train_df_raw = pd.read_csv('../data/raw/train.csv')
test_df_raw = pd.read_csv('../data/raw/test.csv')
# Keep the Id columns aside; test_ID is reused when the submission is built.
train_ID = train_df_raw['Id']
test_ID = test_df_raw['Id']
# The regression target is log1p(SalePrice); predictions are mapped back with expm1.
train_sale_price = train_df_raw['SalePrice']
y_log = np.log1p(train_sale_price)
# Drop the id/target columns and stack train on top of test so every
# imputation/encoding step below is applied to both sets at once.
train_df = train_df_raw.drop(['Id', 'SalePrice'], axis=1)
test_df = test_df_raw.drop('Id', axis=1)
all_data = pd.concat((train_df, test_df)).reset_index(drop=True)
# Columns whose missing values are replaced by the literal category 'None'.
for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType'):
    all_data[col] = all_data[col].fillna('None')
# Columns whose missing values are replaced by 0.
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'):
    all_data[col] = all_data[col].fillna(0)
# Missing LotFrontage is filled with the median LotFrontage of the same Neighborhood.
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
# Remaining categoricals: fill with the most frequent value of the column.
for col in ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'Functional', 'Utilities'):
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
# Columns replaced in place by integer codes from LabelEncoder.
# NOTE(review): LabelEncoder numbers classes in sorted order of their values,
# so the codes of quality-style columns presumably do not follow the quality
# ranking - confirm this is intended.
ordinal_features = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LotShape','PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
for col in ordinal_features:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[col].values)) 
    all_data[col] = lbl.transform(list(all_data[col].values))
# Aggregate features: total floor area, bathroom count (half baths weigh 0.5)
# and total porch area.
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalBath'] = all_data['FullBath'] + 0.5 * all_data['HalfBath'] + all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath']
all_data['TotalPorchSF'] = all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch']
# Interaction features: products of OverallQual with TotalSF, GrLivArea and YearBuilt.
all_data['OverallQual_x_TotalSF'] = all_data['OverallQual'] * all_data['TotalSF']
all_data['GrLivArea_x_OverallQual'] = all_data['GrLivArea'] * all_data['OverallQual']
all_data['YearBuilt_x_OverallQual'] = all_data['YearBuilt'] * all_data['OverallQual']
# log1p-transform every non-object column whose absolute skewness exceeds 0.75.
# This runs after the encoding and feature steps above, so those columns are candidates too.
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[abs(skewed_feats) > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
# One-hot encode the remaining categorical columns.
all_data = pd.get_dummies(all_data)
# Split the stacked frame back apart: the first len(train_ID) rows are the training rows.
X = all_data[:len(train_ID)]
X_test = all_data[len(train_ID):]
# Drop training rows with GrLivArea > 4000 and SalePrice < 300000 from both
# the features and the target (row labels line up because all_data was re-indexed).
outlier_indices = train_df_raw[(train_df_raw['GrLivArea'] > 4000) & (train_df_raw['SalePrice'] < 300000)].index
X = X.drop(outlier_indices)
y = y_log.drop(outlier_indices)
print("--- Data Processing Finished ---")

# =============================================================================
# 2. Pseudo-Labeling Step
# =============================================================================
print("\n--- Step 1: Initial Training and Pseudo-Label Generation ---")
# Hyper-parameters of the two boosted models, kept apart for readability.
xgb_params = dict(
    learning_rate=0.05, n_estimators=3460, max_depth=3, min_child_weight=0,
    gamma=0, subsample=0.7, colsample_bytree=0.7, reg_alpha=0.005,
    nthread=-1, scale_pos_weight=1, seed=27, random_state=42,
)
lgbm_params = dict(
    objective='regression', num_leaves=5, learning_rate=0.05,
    n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5,
    feature_fraction=0.2319, feature_fraction_seed=9, bagging_seed=9,
    min_data_in_leaf=6, min_sum_hessian_in_leaf=11, random_state=42,
)
models = {
    'Ridge': Ridge(alpha=15),
    'Lasso': Lasso(alpha=0.0004, max_iter=10000),
    'ElasticNet': ElasticNet(alpha=0.0005, l1_ratio=0.9, max_iter=10000),
    'XGBoost': XGBRegressor(**xgb_params),
    'LightGBM': lgb.LGBMRegressor(**lgbm_params),
}

# Fit each model on the labelled rows; keep its test predictions on the
# log1p target scale (no expm1 yet).
initial_predictions = {}
for model_name, estimator in models.items():
    print(f"Initial training for {model_name}...")
    estimator.fit(X, y)
    initial_predictions[model_name] = estimator.predict(X_test)

# Pseudo-labels: fixed-weight blend of the initial predictions.
pseudo_labels_log = (
    0.10 * initial_predictions['Ridge']
    + 0.20 * initial_predictions['Lasso']
    + 0.10 * initial_predictions['ElasticNet']
    + 0.30 * initial_predictions['XGBoost']
    + 0.30 * initial_predictions['LightGBM']
)

print("\n--- Step 2: Creating combined dataset with pseudo-labels ---")
# Labelled rows first, test rows (with their pseudo-labels) second.
stacked_features = pd.concat([X, X_test])
X_combined = stacked_features.reset_index(drop=True)
y_combined = np.concatenate([y, pseudo_labels_log])

# =============================================================================
# 3. Final Training and Submission
# =============================================================================
print("\n--- Step 3: Re-training models on the combined dataset ---")
# Refit every estimator on labelled + pseudo-labelled rows, then convert its
# test predictions from the log1p scale back to prices.
final_predictions = {}
for model_name, estimator in models.items():
    print(f"Final training for {model_name}...")
    estimator.fit(X_combined, y_combined)
    log_scale_preds = estimator.predict(X_test)
    final_predictions[model_name] = np.expm1(log_scale_preds)

print("\n--- Blending final predictions ---")
final_blended_preds = (
    0.10 * final_predictions['Ridge']
    + 0.20 * final_predictions['Lasso']
    + 0.10 * final_predictions['ElasticNet']
    + 0.30 * final_predictions['XGBoost']
    + 0.30 * final_predictions['LightGBM']
)

submission_columns = {'Id': test_ID, 'SalePrice': final_blended_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv(
    '../submissions/submission_final_push.csv',
    index=False,
)
print("\nFinal submission 'submission_final_push.csv' created successfully!")
print(submission.head())

--- Starting Data Processing ---
