#### **Feature Engineering**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the cleaned train/test tables from disk.
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')

# Pull the target and the submission IDs out of the frames so that only
# predictor columns remain; the training 'Id' column is not kept.
y_train = train.pop('SalePrice')
test_ids = test.pop('Id')
train = train.drop(columns='Id')

In [4]:
# Build the same derived features on both frames in one pass so that the
# train and test definitions cannot drift apart.
for frame in (train, test):
    # Total area of the house: basement + first floor + second floor
    frame['TotalArea'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']

    # Age of the house when sold
    frame['HouseAge'] = frame['YrSold'] - frame['YearBuilt']

    # Interaction term: overall quality * above-ground living area
    frame['QualLivArea'] = frame['OverallQual'] * frame['GrLivArea']

# The source columns (TotalBsmtSF, 1stFlrSF, 2ndFlrSF, YrSold, YearBuilt) are
# left in place; dropping them is optional.

In [5]:
# Numerical features treated as skewed (customize based on your EDA)
skewed_features = ['GrLivArea', 'TotalBsmtSF', '1stFlrSF']

# Apply log1p (log(1 + x)) to the whole column block of each frame at once.
# NOTE: this cell overwrites its inputs, so running it twice applies the
# transform twice.
train[skewed_features] = np.log1p(train[skewed_features])
test[skewed_features] = np.log1p(test[skewed_features])

# The target gets the same transform; predictions are mapped back with np.expm1().
y_train = np.log1p(y_train)

In [6]:
# Ordinal mappings: each listed label is replaced by its integer code
# (customize based on your data).
ordinal_mappings = {
    'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0},
    'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
}

for col, mapping in ordinal_mappings.items():
    # Series.map() yields NaN for missing values and for labels that are not
    # keys of the mapping. Encode those as 0 in BOTH frames: previously only
    # test was filled, so the same "missing" case was NaN in train but 0 in
    # test, giving the model inputs at prediction time it never saw in training.
    train[col] = train[col].map(mapping).fillna(0)
    test[col] = test[col].map(mapping).fillna(0)

In [7]:
# Separate categorical and numerical columns by dtype, based on the training
# frame: object-dtype columns are one-hot encoded below, every other column is
# recorded in numerical_cols (used later for scaling).
categorical_cols = train.select_dtypes(include=['object']).columns.tolist()
numerical_cols = train.select_dtypes(exclude=['object']).columns.tolist()

# One-hot encode categorical features (the column list comes from train and is
# applied to both frames)
train = pd.get_dummies(train, columns=categorical_cols)
test = pd.get_dummies(test, columns=categorical_cols)

# Align train and test to ensure same columns: join='left' keeps train's
# columns, so dummy columns absent from test are added filled with 0 and
# columns that exist only in test are dropped.
train, test = train.align(test, join='left', axis=1, fill_value=0)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both frames so no test information leaks into the fit.
scaler = StandardScaler().fit(train[numerical_cols])
train[numerical_cols] = scaler.transform(train[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

In [8]:
# Persist the engineered feature matrices for the modeling section.
# Both frames are laid out with train's column order so the two files line up.
# Note: only the features are written; y_train is not saved by this cell.
feature_columns = train.columns
train_processed = pd.DataFrame(train, columns=feature_columns)
test_processed = pd.DataFrame(test, columns=feature_columns)

for frame, csv_path in ((train_processed, 'train_processed.csv'),
                        (test_processed, 'test_processed.csv')):
    frame.to_csv(csv_path, index=False)

#### **Model Creation**

In [12]:
import pandas as pd

# Reload the feature matrices written at the end of feature engineering
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')

# The processed files contain features only (SalePrice was dropped before they
# were saved), so both frames are used as-is. The log-transformed target
# y_train is not reloaded here; it must still be in memory from the
# feature-engineering cells.
X_train, X_test = train, test

In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the training rows as a validation set (fixed seed for a
# reproducible split).
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Hyperparameters for the baseline XGBoost regressor
xgb_params = {
    'n_estimators': 1000,         # upper bound on boosting rounds
    'learning_rate': 0.05,        # shrinkage applied to each round
    'max_depth': 5,               # maximum depth of a tree
    'subsample': 0.8,             # fraction of rows sampled per tree
    'colsample_bytree': 0.8,      # fraction of features sampled per tree
    'random_state': 42,
    'early_stopping_rounds': 10,  # stop if no improvement for 10 rounds
}
model = XGBRegressor(**xgb_params)





In [14]:
# Fit on the training split; the held-out split is scored as the evaluation
# set and its RMSE is printed every 10 rounds.
validation_sets = [(X_val_split, y_val_split)]
model.fit(X_train_split, y_train_split, eval_set=validation_sets, verbose=10)

[0]	validation_0-rmse:0.41660
[10]	validation_0-rmse:0.29454
[20]	validation_0-rmse:0.22426
[30]	validation_0-rmse:0.18600
[40]	validation_0-rmse:0.16501
[50]	validation_0-rmse:0.15227
[60]	validation_0-rmse:0.14501
[70]	validation_0-rmse:0.14169
[80]	validation_0-rmse:0.13911
[90]	validation_0-rmse:0.13752
[100]	validation_0-rmse:0.13663
[110]	validation_0-rmse:0.13550
[120]	validation_0-rmse:0.13470
[130]	validation_0-rmse:0.13376
[140]	validation_0-rmse:0.13333
[150]	validation_0-rmse:0.13259
[160]	validation_0-rmse:0.13235
[170]	validation_0-rmse:0.13225
[176]	validation_0-rmse:0.13246


In [15]:
# Score the held-out split. The target was log1p-transformed, so this RMSE is
# measured in log space rather than in currency units.
val_preds = model.predict(X_val_split)
val_mse = mean_squared_error(y_val_split, val_preds)
val_rmse = np.sqrt(val_mse)
print(f'Validation RMSE: {val_rmse}')

Validation RMSE: 0.13223487170726797


In [18]:
# Predict on the test features and undo the log1p applied to the target,
# so test_preds is back on the original SalePrice scale.
test_preds = np.expm1(model.predict(X_test))

In [19]:
# Build the submission table: the saved test IDs first, then the predicted
# SalePrice for each of them.
submission = pd.DataFrame({'Id': test_ids})
submission['SalePrice'] = test_preds

# Write it out without the index column
submission.to_csv('submission.csv', index=False)

In [20]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter: 3 values for each of 5 parameters,
# i.e. 243 combinations, each evaluated with 5-fold cross-validation.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

# Exhaustive search scored by negative MSE (higher is better)
grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
)

# Search over the full training set
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')

# Keep the estimator refit with the best parameters and predict with it.
# These predictions are still on the log1p scale of the target; apply
# np.expm1() before using them as prices.
best_model = grid_search.best_estimator_
test_preds = best_model.predict(X_test)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=0.8; total time=   1.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=0.8; total time=   1.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=0.8; total time=   1.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=0.8; total time=   1.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=0.9; total time=   1.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=0.9; total time=   1.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=0.9; total time=   1.1s
[CV] END

In [21]:
# Predict with the tuned estimator selected by the grid search. The baseline
# `model` was used here before, which made this second submission a copy of
# the first and left the grid-search result unused.
test_preds = best_model.predict(X_test)
# Undo the log1p applied to the target so predictions are on the SalePrice scale
test_preds = np.expm1(test_preds)

In [22]:
# Build the second submission table: the saved test IDs first, then the
# predicted SalePrice for each of them.
submission = pd.DataFrame({'Id': test_ids})
submission['SalePrice'] = test_preds

# Write it to a separate file, without the index column
submission.to_csv('submission01.csv', index=False)