In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
 
sns.set_style('whitegrid')

In [2]:
# Install the Kaggle library
%pip install -q kaggle

print("\nKaggle API configured successfully.")

Note: you may need to restart the kernel to use updated packages.

Kaggle API configured successfully.


In [3]:
!kaggle datasets list

ref                                                                  title                                                     size  lastUpdated                 downloadCount  voteCount  usabilityRating  
-------------------------------------------------------------------  --------------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
saidaminsaidaxmadov/chocolate-sales                                  Chocolate Sales                                         468320  2026-01-04 14:23:35.490000              0         89                1  
neurocipher/heartdisease                                             Heart Disease                                             3491  2025-12-11 15:29:14.327000           2114        372                1  
rockyt07/social-media-user-analysis                                  Social Media User Analysis                           247842357  2026-01-14 02:28:41.970000              0      

In [4]:
# Download the dataset from the Kaggle competition
!kaggle competitions download -c house-prices-advanced-regression-techniques

# Unzip the downloaded files
!unzip -o house-prices-advanced-regression-techniques.zip

print("\nDataset downloaded and unzipped.")

house-prices-advanced-regression-techniques.zip: Skipping, found more recently modified local copy (use --force to force download)

Dataset downloaded and unzipped.


'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [5]:
# Now, load the data from the unzipped CSV files
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Set the 'Id' column as the index for consistency
train_df.set_index('Id', inplace=True)
test_df.set_index('Id', inplace=True)

print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

Training data shape: (1460, 80)
Testing data shape: (1459, 79)


In [6]:
# Separate target
y = train_df['SalePrice']
X = train_df.drop('SalePrice', axis=1)

# Separate numerical and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Fill missing values
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
X[cat_cols] = X[cat_cols].fillna('None')


In [7]:
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le


In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [9]:
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42
)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_val)

print("MAE:", mean_absolute_error(y_val, y_pred))
print("RMSE:", mean_squared_error(y_val, y_pred) ** 0.5)
print("R2:", r2_score(y_val, y_pred))


MAE: 16765.31640625
RMSE: 26086.137928026066
R2: 0.9112832546234131


In [10]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0]
}

grid = GridSearchCV(
    estimator=xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42
    ),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}


In [11]:
X['TotalArea'] = X['GrLivArea'] + X['TotalBsmtSF']
X['Bathrooms'] = X['FullBath'] + 0.5 * X['HalfBath']


In [12]:
lr = LinearRegression()
xgb_best = grid.best_estimator_

lr.fit(X_train, y_train)
xgb_best.fit(X_train, y_train)

lr_pred = lr.predict(X_val)
xgb_pred = xgb_best.predict(X_val)

ensemble_pred = (lr_pred + xgb_pred) / 2

print("Ensemble R2:", r2_score(y_val, ensemble_pred))

Ensemble R2: 0.8945275342345452


In [13]:

# Calculate Mean Absolute Percentage Error (MAPE)
def calculate_accuracy(y_true, y_pred):
    # Avoid division by zero
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    accuracy = 100 - mape
    return accuracy

# Calculate accuracy using your 'y_val' (actual prices) and 'ensemble_pred' (predicted prices)
model_accuracy = calculate_accuracy(y_val, ensemble_pred)

print(f"Model Accuracy based on Error Percentage: {model_accuracy:.2f}%")
print(f"R¬≤ Score (Goodness of Fit): {r2_score(y_val, ensemble_pred):.4f}")

Model Accuracy based on Error Percentage: 89.62%
R¬≤ Score (Goodness of Fit): 0.8945


# Conclusion

In this project, we successfully built a machine learning model to predict house prices. Here is a summary of our workflow:

1.  **Data Loading & Preprocessing**:
    - Loaded the House Prices dataset directly from Kaggle.
    - Handled missing data by filling numerical missing values with the **median** and categorical ones with a placeholder.
    - Converted categorical text data into numbers using **Label Encoding**.

2.  **Feature Engineering**:
    - Created new insightful features to help the model learn better:
        - `TotalArea`: Combined living area and basement area.
        - `Bathrooms`: Aggregated full and half bathrooms.

3.  **Model Development**:
    - **XGBoost Regressor**: Implemented a gradient boosting model as our primary predictor.
    - **Hyperparameter Tuning**: Used `GridSearchCV` to optimize critical parameters like `learning_rate`, `max_depth`, and `n_estimators`.
    - **Ensemble Approach**: Combined the predictions of **Linear Regression** and the tuned **XGBoost** model to balance the results and improve generalization.

4.  **Results**:
    - The model achieved a high **R¬≤ Score** (approx. **0.89 - 0.91**), indicating that it can explain about 90% of the relevant variance in house prices.