# Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

# Load CSVs from uploaded path

In [2]:
# Read the training and test tables in one pass over their paths.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

Train shape: (1460, 81), Test shape: (1459, 80)


# Save IDs and Target Variable

In [3]:
# `pop` returns the column and removes it from the frame in place, so after
# this cell `train` and `test` hold only predictor columns.
train_ID = train.pop('Id')
test_ID = test.pop('Id')
# Target variable; only the training table carries SalePrice.
y = train.pop('SalePrice')

# Combine Train and Test for preprocessing

In [4]:
# Stack train on top of test (train rows first) with a fresh 0..n-1 index so
# the two can be preprocessed together and split apart again by row position.
all_data = pd.concat([train, test], axis=0, ignore_index=True)
print("Combined data shape:", all_data.shape)

Combined data shape: (2919, 79)


# Handle Missing Values

In [5]:
# Impute missing values on the combined frame:
#   - object (categorical) columns -> most frequent value (mode)
#   - every other column           -> median
#
# The filled column is assigned back to `all_data[col]`. Calling
# `all_data[col].fillna(..., inplace=True)` instead is chained assignment:
# it operates on an intermediate Series, which pandas deprecates and which
# does not update `all_data` when copy-on-write is active. Because warnings
# are silenced at the top of the notebook, that failure would go unnoticed.
for col in all_data.columns:
    column = all_data[col]
    if not column.isna().any():
        # Nothing to impute; skip the mode/median computation.
        continue
    if column.dtype == 'object':
        fill_value = column.mode()[0]
    else:
        fill_value = column.median()
    all_data[col] = column.fillna(fill_value)

# Label Encoding for categorical columns

In [6]:
# Every object-typed column is replaced by integer codes. The single encoder
# is re-fitted for each column, so after this cell `le` holds the fit of the
# last categorical column only.
cat_cols = all_data.select_dtypes(include='object').columns
le = LabelEncoder()
all_data = all_data.assign(
    **{col: le.fit_transform(all_data[col]) for col in cat_cols}
)

# Feature Engineering

In [7]:
# Derived features, appended as new columns in the order listed below.
full_baths = all_data['FullBath'] + all_data['BsmtFullBath']
half_baths = (0.5 * all_data['HalfBath']) + (0.5 * all_data['BsmtHalfBath'])

all_data = all_data.assign(
    # Basement + first floor + second floor area.
    TotalSF=all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF'],
    # Years between construction / last remodel and the sale.
    Age=all_data['YrSold'] - all_data['YearBuilt'],
    RemodAge=all_data['YrSold'] - all_data['YearRemodAdd'],
    # Half baths (above ground and basement) count as 0.5 each.
    TotalBath=all_data['FullBath'] + (0.5 * all_data['HalfBath'])
    + all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath']),
    # 0/1 indicators for a non-zero garage / pool area.
    HasGarage=all_data['GarageArea'].gt(0).astype(int),
    HasPool=all_data['PoolArea'].gt(0).astype(int),
)
del full_baths, half_baths

# Split Data Back into X and X_test

In [8]:
# The combined frame was built with the training rows first, so the first
# `n_train` rows are the training features and the remainder the test features.
n_train = train.shape[0]
X, X_test = all_data.iloc[:n_train, :], all_data.iloc[n_train:, :]

# Log-transform Target for better model performance

In [9]:
# Model log(1 + SalePrice) instead of the raw price; predictions made on this
# scale are mapped back with np.expm1 in the training cell.
y_log = np.log1p(y)

# Train XGBoost Model

In [10]:
# Hyperparameters collected in one place so they are easy to inspect and tune.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X, y_log)

# The model was fitted on the log1p-transformed target, so its predictions
# are on that scale; expm1 converts them back to prices.
xgb_log_preds = xgb.predict(X_test)
xgb_preds = np.expm1(xgb_log_preds)

# Evaluate with Cross-Validation

In [11]:
# 5-fold cross-validation against the log-transformed target. The scorer
# reports RMSE negated, so the mean is negated again before printing.
scores = cross_val_score(
    xgb,
    X,
    y_log,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
cv_rmse = -scores.mean()
print(f"XGBoost CV RMSE: {cv_rmse:.4f}")

XGBoost CV RMSE: 0.1270


# Submission CSV

In [13]:
# Two-column submission table: the saved test Ids next to the predicted
# prices, written to disk without the row index.
submission = test_ID.to_frame(name='Id').assign(SalePrice=xgb_preds)
submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' is ready!")

Submission file 'submission.csv' is ready!


# Preview the First Few Rows of submission.csv

In [14]:
# Read the written file back and display its first rows as a sanity check.
submission_preview = pd.read_csv("submission.csv")
submission_preview.head()

Unnamed: 0,Id,SalePrice
0,1461,123671.164
1,1462,167755.16
2,1463,180264.02
3,1464,195979.05
4,1465,185703.19
