In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [2]:
df = pd.read_csv('AmesHousing.csv')
df.columns = df.columns.str.replace(' ', '')

In [3]:
#This is used to find the percentage of missing values----------------
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum() / df.isnull().count() * 100).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data = missing_data[missing_data['Total'] > 0]

print(missing_data)

              Total    Percent
PoolQC         2917  99.556314
MiscFeature    2824  96.382253
Alley          2732  93.242321
Fence          2358  80.477816
MasVnrType     1775  60.580205
FireplaceQu    1422  48.532423
LotFrontage     490  16.723549
GarageQual      159   5.426621
GarageYrBlt     159   5.426621
GarageCond      159   5.426621
GarageFinish    159   5.426621
GarageType      157   5.358362
BsmtExposure     83   2.832765
BsmtFinType2     81   2.764505
BsmtQual         80   2.730375
BsmtCond         80   2.730375
BsmtFinType1     80   2.730375
MasVnrArea       23   0.784983
BsmtFullBath      2   0.068259
BsmtHalfBath      2   0.068259
TotalBsmtSF       1   0.034130
BsmtFinSF1        1   0.034130
BsmtFinSF2        1   0.034130
GarageArea        1   0.034130
GarageCars        1   0.034130
BsmtUnfSF         1   0.034130
Electrical        1   0.034130


In [4]:
'''Now, here it's possible that some of the columns have no value, because there's nothing htere genuinly, like 0 means no garage.
So we also have to look out for that'''

# 1. The "None" Class (Categorical features where NaN means "Not Present")
none_cols = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 
    'MasVnrType'
]

# Fill these with the string "None"
for col in none_cols:
    df[col] = df[col].fillna('None')

# 2. The "Zero" Class (Numerical features where NaN means 0)
zero_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars', 
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 
    'MasVnrArea'
]

# Fill these with the number 0
for col in zero_cols:
    df[col] = df[col].fillna(0)


df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median())
)

# df.mode() returns a dataframe, so we take [0] to get the first value
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])

# --- VERIFICATION ---
# Check if anything is still missing
print("Remaining Missing Values:", df.isnull().sum().sum())

Remaining Missing Values: 3


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [5]:
'''Now we check for skewness of sales price, it's skewed 1.7 hence we use log transform'''
df['SalePrice'] = np.log1p(df['SalePrice'])

'''Also in the dataset we can see that some recrods are like
 Example: Kitchen Qual has values: Ex (Excellent), Gd (Good), TA (Typical), Fa (Fair), Po (Poor). 
 Hence we can't use normal encoding, we must manually map them to numbers'''

quality_map = {
    'Ex': 5,  # Excellent
    'Gd': 4,  # Good
    'TA': 3,  # Typical/Average
    'Fa': 2,  # Fair
    'Po': 1,  # Poor
    'None': 0 # Not Present (We filled this in Phase 1)
}

# B. List the columns that use this specific scale
# (Checked against dataset documentation)
ordinal_cols = [
    'Exter Qual',   # Exterior material quality
    'Exter Cond',   # Exterior material condition
    'Bsmt Qual',    # Height of the basement
    'Bsmt Cond',    # General condition of the basement
    'Heating QC',   # Heating quality and condition
    'Kitchen Qual', # Kitchen quality
    'Fireplace Qu', # Fireplace quality
    'Garage Qual',  # Garage quality
    'Garage Cond',  # Garage condition
    'Pool QC'       # Pool quality
]

for col in ordinal_cols:
    if col in df.columns:
        df[col] = df[col].map(quality_map)
        # IMPORTANT: If a value wasn't in our dictionary (like a typo), 
        # map turns it into NaN. We fill those rare cases with 3 (Average).
        df[col] = df[col].fillna(3)

In [6]:
'''Now we deal with the remaining columns
WE just apply one hot encoder to this simple AFF'''

nominal_cols = df.select_dtypes(include=['object']).columns.tolist()
df_encoded = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# 1. Prepare Data
# df_encoded is our fully numeric dataframe from Phase 3
X = df_encoded.drop('SalePrice', axis=1)
y = df_encoded['SalePrice'] # Already Log-Transformed in Phase 2

# 2. Split (Standard 80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Initialize Model
# n_estimators=500: More trees = more stable
# n_jobs=-1: Use all CPU cores (faster)
model = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)

print("Training Model... (This might take 10-20 seconds)")
model.fit(X_train, y_train)

# 4. Evaluate
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Calculate RMSE (Root Mean Squared Error)
# Since y is already log-scale, this IS the "Log RMSE"
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Calculate R2 just for reference
r2_test = r2_score(y_test, y_pred_test)

print("\n--- RESULTS ---")
print(f"Training RMSE: {rmse_train:.4f} (Lower is better)")
print(f"Test RMSE:     {rmse_test:.4f} (Target: < 0.14)")
print(f"Test R2 Score: {r2_test:.4f}")

# 5. Interpretation (Reverse the Logs to see real dollars)
# We take a sample prediction to see the dollar error
sample_idx = 0
real_price = np.expm1(y_test.iloc[sample_idx])
pred_price = np.expm1(y_pred_test[sample_idx])
error = real_price - pred_price

print("\n--- REALITY CHECK (First House in Test Set) ---")
print(f"Actual Price:    ${real_price:,.2f}")
print(f"Predicted Price: ${pred_price:,.2f}")
print(f"Error:           ${error:,.2f}")

Training Model... (This might take 10-20 seconds)

--- RESULTS ---
Training RMSE: 0.0532 (Lower is better)
Test RMSE:     0.1216 (Target: < 0.14)
Test R2 Score: 0.9201

--- REALITY CHECK (First House in Test Set) ---
Actual Price:    $161,000.00
Predicted Price: $166,318.27
Error:           $-5,318.27


In [9]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# 1. Initialize the Model
# We use a small learning rate (0.05) and many trees (1000) for precision.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror', # We are doing regression, not classification
    n_estimators=1000,            # Max number of trees
    learning_rate=0.05,           # "Step size" - smaller is more accurate but slower
    max_depth=5,                  # How deep each tree can grow (prevent overfitting)
    early_stopping_rounds=50,     # Stop if validation score stops improving
    random_state=42,
    n_jobs=-1
)

print("Training XGBoost...")

# 2. Fit with "Early Stopping"
# This requires passing the Test set (eval_set) so it knows when to stop.
# verbose=False keeps the output clean (otherwise it prints 1000 lines).
xg_reg.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# 3. Predict & Evaluate
y_pred_xgb = xg_reg.predict(X_test)
y_train_xgb = xg_reg.predict(X_train)

# Calculate RMSE
rmse_train_xgb = np.sqrt(mean_squared_error(y_train, y_train_xgb))
rmse_test_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

print("\n--- XGBoost RESULTS ---")
print(f"Training RMSE: {rmse_train_xgb:.4f}")
print(f"Test RMSE:     {rmse_test_xgb:.4f}")
print(f"Best Iteration: {xg_reg.best_iteration} (It stopped early!)")

# 4. Compare with Random Forest (Previous Best)
# rf_rmse_test was roughly 0.1216
improvement = 0.1216 - rmse_test_xgb
print(f"\nImprovement over Random Forest: {improvement:.4f}")

Training XGBoost...

--- XGBoost RESULTS ---
Training RMSE: 0.0296
Test RMSE:     0.1012
Best Iteration: 684 (It stopped early!)

Improvement over Random Forest: 0.0204
