<a href="https://colab.research.google.com/github/JayYongjaeKim/MoLab/blob/main/house_submit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Calls kagglehub.login() before any data is downloaded; presumably this
# supplies the Kaggle credentials needed for private data sources — confirm
# against the kagglehub documentation.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'aiffel-ds-3-house-prices' competition data through kagglehub and
# keep the returned value (presumably the local download directory — confirm
# against the kagglehub documentation).
aiffel_ds_3_house_prices_path = kagglehub.competition_download('aiffel-ds-3-house-prices')

# Progress message only; it does not verify that any files were downloaded.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively print the full path of every file found under /kaggle/input.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os

import pandas as pd

# Locate the competition files.
# On Kaggle the data is mounted under /kaggle/input. Elsewhere (e.g. Colab)
# that directory does not exist, so fall back to the directory returned by
# kagglehub.competition_download() in the data-source cell, if that cell was
# run. Assumes the downloaded directory holds train.csv / test.csv at its top
# level — TODO confirm.
KAGGLE_INPUT_DIR = '/kaggle/input/aiffel-ds-3-house-prices'
if os.path.isdir(KAGGLE_INPUT_DIR):
    data_dir = KAGGLE_INPUT_DIR
else:
    data_dir = globals().get('aiffel_ds_3_house_prices_path', KAGGLE_INPUT_DIR)

train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')

# Load the raw training and test tables.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import optuna

# Preprocessing pipeline.
# Reads the `train` / `test` DataFrames loaded earlier and produces:
#   X_scaled, X_test_scaled - standardised feature arrays
#   y                       - log1p-transformed SalePrice
#   kf                      - the 10-fold splitter used for cross-validation

# Handle missing values.
# The columns to drop are chosen from the training data only and the same set
# is removed from test. Choosing them independently per frame lets the two
# feature sets diverge, and the concat below would then re-introduce the
# missing columns as all-NaN for one of the frames.
threshold = 0.5
sparse_cols = train.columns[train.isnull().mean() > threshold]
train = train.drop(columns=sparse_cols)
test = test.drop(columns=sparse_cols, errors='ignore')

# Impute numeric columns with the training median. The same statistic is
# applied to test so both frames are transformed identically.
for col in train.select_dtypes(include='number').columns:
    fill_value = train[col].median()
    train[col] = train[col].fillna(fill_value)
    if col in test.columns:
        test[col] = test[col].fillna(fill_value)

# Impute categorical columns with the training mode. Using the training value
# also avoids an IndexError from mode()[0] when a test column is entirely NaN.
for col in train.select_dtypes(include='object').columns:
    fill_value = train[col].mode()[0]
    train[col] = train[col].fillna(fill_value)
    if col in test.columns:
        test[col] = test[col].fillna(fill_value)

# Apply log transformation to SalePrice (inverted with expm1 at prediction time).
train['SalePrice'] = np.log1p(train['SalePrice'])

# Feature engineering
# Total floor area: basement + first floor + second floor.
train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']
test['TotalSF'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

# Years between construction and sale.
train['Age'] = train['YrSold'] - train['YearBuilt']
test['Age'] = test['YrSold'] - test['YearBuilt']

# Years between the last remodel and sale.
train['RemodAge'] = train['YrSold'] - train['YearRemodAdd']
test['RemodAge'] = test['YrSold'] - test['YearRemodAdd']

# Encode categorical variables.
# Train and test are stacked so each category receives the same integer code
# in both frames.
combined = pd.concat([train.drop(columns=['SalePrice']), test], axis=0)

# 'Id' is a row identifier, not a property of the house; keep it out of the
# model features. It remains available in `test` for the submission file.
combined = combined.drop(columns=['Id'], errors='ignore')

for col in combined.select_dtypes(include='object').columns:
    combined[col] = combined[col].astype('category').cat.codes

# Split the stacked frame back into its train / test parts by position.
n_train = len(train)
train_processed = combined.iloc[:n_train, :].copy()
test_processed = combined.iloc[n_train:, :].copy()

X = train_processed
# Positional 0..n-1 index so fold indices from KFold select the intended rows.
y = train['SalePrice'].reset_index(drop=True)
X_test = test_processed

# Scaling: fit on the training features only, then apply to test.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Define objective function for Optuna
def objective(trial, model_name):
    """Return the mean cross-validated RMSE for one Optuna trial.

    Samples hyperparameters for the requested model, then fits and scores it
    on every fold of the module-level ``kf`` splitter using the module-level
    ``X_scaled`` / ``y``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model_name: One of ``'xgb'``, ``'lgb'`` or ``'cb'``.

    Returns:
        Mean RMSE over the folds, on the scale of ``y``.

    Raises:
        ValueError: If ``model_name`` is not a supported key.
    """
    if model_name == 'xgb':
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
            'subsample': trial.suggest_float('subsample', 0.6, 0.9),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        }
        model = xgb.XGBRegressor(**params)

    elif model_name == 'lgb':
        # NOTE(review): LightGBM's docs say row subsampling only takes effect
        # when the bagging frequency is non-zero; confirm whether 'subsample'
        # actually influences this model as configured.
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
            'num_leaves': trial.suggest_int('num_leaves', 31, 255),
            'subsample': trial.suggest_float('subsample', 0.6, 0.9),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        }
        model = lgb.LGBMRegressor(**params)

    elif model_name == 'cb':
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'depth': trial.suggest_int('depth', 3, 10),
            'iterations': trial.suggest_int('iterations', 500, 1500),
            'random_strength': trial.suggest_float('random_strength', 0.5, 2.0),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 1.0),
        }
        model = cb.CatBoostRegressor(**params, verbose=0)

    else:
        # Fail with a clear message instead of an UnboundLocalError on `model`.
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'xgb', 'lgb' or 'cb'."
        )

    # KFold yields positional indices, so index a plain array rather than
    # relying on the label-based lookup of a pandas Series.
    targets = np.asarray(y)

    scores = []
    for train_idx, val_idx in kf.split(X_scaled):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = targets[train_idx], targets[val_idx]
        model.fit(X_train, y_train)
        y_val_pred = model.predict(X_val)
        # RMSE as sqrt(MSE): the `squared=False` argument is deprecated and
        # later removed in newer scikit-learn releases.
        scores.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))

    return np.mean(scores)

# Optimization for each model
def optimize_model(model_name, n_trials=50):
    """Tune one model with Optuna and return its best hyperparameters.

    Args:
        model_name: Model key forwarded to ``objective``.
        n_trials: Number of Optuna trials to run. Defaults to 50, the value
            previously hard-coded here.

    Returns:
        The ``best_params`` dict of the finished study.
    """
    # The objective returns an RMSE, so lower is better.
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, model_name), n_trials=n_trials)
    print(f"Best parameters for {model_name}: {study.best_params}")
    print(f"Best RMSE for {model_name}: {study.best_value}")
    return study.best_params

# Run optimization
# Tune the three boosters one after another: XGBoost, LightGBM, CatBoost.
xgb_best_params, lgb_best_params, cb_best_params = [
    optimize_model(name) for name in ('xgb', 'lgb', 'cb')
]

# Build the final estimators from the tuned hyperparameters.
xgb_model = xgb.XGBRegressor(**xgb_best_params)
lgb_model = lgb.LGBMRegressor(**lgb_best_params)
cb_model = cb.CatBoostRegressor(**cb_best_params, verbose=0)

# Fit each estimator on the full scaled training set.
for estimator in (xgb_model, lgb_model, cb_model):
    estimator.fit(X_scaled, y)

# Predict on the scaled test set and undo the log1p applied to the target.
xgb_preds, lgb_preds, cb_preds = [
    np.expm1(estimator.predict(X_test_scaled))
    for estimator in (xgb_model, lgb_model, cb_model)
]

# Blend the three models: 40% CatBoost, 30% XGBoost, 30% LightGBM.
cb_share = 0.4 * cb_preds
xgb_share = 0.3 * xgb_preds
lgb_share = 0.3 * lgb_preds
final_preds = cb_share + xgb_share + lgb_share

# Assemble the submission table and write it to disk without the index.
submission_columns = {'Id': test['Id'], 'SalePrice': final_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission_optimized_tuned.csv', index=False)


In [None]:
# Bare expression as the cell's last statement, so the notebook renders the
# `submission` DataFrame as the cell output.
submission