In [None]:
# Cell 1. installs and imports
# run this first cell in Colab
!pip install -q kagglehub

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# set_theme is the current name of the seaborn styling entry point (sns.set is a legacy alias)
sns.set_theme(style='whitegrid')


In [None]:
# Folder inside the KaggleHub cache that is expected to hold the dataset csv.
# Nothing in this notebook creates it, so check up front and fail with an
# actionable message instead of a bare os.listdir error on a fresh runtime.
KAGGLEHUB_PATH = "/root/.cache/kagglehub/datasets/harishkumardatalab/housing-price-prediction/versions/1"

if not os.path.isdir(KAGGLEHUB_PATH):
    raise FileNotFoundError(
        "dataset folder not found: " + KAGGLEHUB_PATH + ". "
        "download the dataset in this runtime first (for example with "
        "kagglehub.dataset_download('harishkumardatalab/housing-price-prediction')) "
        "and set KAGGLEHUB_PATH to the folder it returns"
    )

# os.listdir returns entries in arbitrary order; sort so that the csv picked
# below is the same on every run
files = sorted(os.listdir(KAGGLEHUB_PATH))
print("files in dataset folder:", files)

# case-insensitive match so that e.g. "Housing.CSV" is detected as well
csv_files = [f for f in files if f.lower().endswith(".csv")]
if not csv_files:
    raise Exception("no csv found in the dataset folder. list files above and set path correctly")

csv_path = os.path.join(KAGGLEHUB_PATH, csv_files[0])
print("using csv:", csv_path)

# load the raw dataset; the last expression shows its shape and column names
house = pd.read_csv(csv_path)
house.shape, house.columns.tolist()

In [None]:
import pandas as pd
import os

# this is the path KaggleHub gave you
path = "/root/.cache/kagglehub/datasets/harishkumardatalab/housing-price-prediction/versions/1"

# list files to see what's inside (sorted, because os.listdir order is arbitrary
# and the first csv found is the one that gets loaded)
files = sorted(os.listdir(path))
print("Files inside dataset folder:", files)

# auto-detect the csv file; compare case-insensitively so "DATA.CSV" is found too
csv_files = [f for f in files if f.lower().endswith(".csv")]

if len(csv_files) == 0:
    raise Exception("No CSV file found in dataset folder.")

csv_path = os.path.join(path, csv_files[0])
print("Using file:", csv_path)

# load dataset
house = pd.read_csv(csv_path)

# preview the first rows
house.head()


In [None]:
# Dataset overview: dimensions, the columns with the most missing values, dtypes.
missing_counts = house.isnull().sum().sort_values(ascending=False)
top_missing = missing_counts.head(30)  # only the 30 most incomplete columns

print("shape:", house.shape)
print("\nmissing per column:")
print(top_missing)

print("\ndtypes:")
print(house.dtypes)


In [None]:
# target detection and quick fixes

# drop obviously useless identifier columns FIRST: they are numeric with a large
# variance, so the fallback heuristic below could otherwise select one of them as
# the target and then lose it when the column is dropped
drop_cols = [c for c in ['Id', 'id', 'ID', 'index'] if c in house.columns]
if drop_cols:
    # rebind instead of inplace=True so re-running the cell stays predictable
    house = house.drop(columns=drop_cols)
    print("dropped:", drop_cols)

# common target names, add more if needed; the first one present in the data wins
possible_targets = ['price', 'Price', 'SalePrice', 'sale_price', 'Sale_Price', 'house_price']
target = next((t for t in possible_targets if t in house.columns), None)

# fallback: no known name matched, so guess the numeric column with the highest variance
if target is None:
    numeric_cols = house.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) == 0:
        raise Exception("no numeric columns found to act as target")
    target = house[numeric_cols].var().sort_values(ascending=False).index[0]
    print("no obvious price column found, guessing target:", target)

print("target column set to:", target)


In [None]:
# Exploratory visuals (small, quick): feature correlations and target distribution.

# heatmap of pairwise correlations between the numeric columns
num = house.select_dtypes(include=[np.number]).copy()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(num.corr(), cmap='coolwarm', center=0, ax=ax)
ax.set_title("numeric feature correlation")
plt.show()

# histogram (with KDE overlay) of the non-missing target values
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(house[target].dropna(), kde=True, ax=ax)
ax.set_title("target distribution")
plt.show()


In [None]:
# Preprocessing plan:
#   1. split columns into numeric and categorical
#   2. impute missing values
#   3. log-transform the target when it is strongly skewed
#   4. one-hot encode the categorical columns

# numeric predictors are all numeric columns except the target itself
numeric_features = [
    col
    for col in house.select_dtypes(include=[np.number]).columns.tolist()
    if col != target
]
# string-like / categorical columns are the ones to be one-hot encoded
categorical_features = house.select_dtypes(include=['object', 'category']).columns.tolist()

for label, cols in (("numeric features:", numeric_features),
                    ("categorical features:", categorical_features)):
    print(label, len(cols))


In [None]:
# build preprocessing pipelines
# NOTE: these two sklearn pipelines are defined but not applied anywhere in this
# cell; the actual preprocessing is the pandas code below.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# pd.get_dummies is a plain function, not an estimator with fit/transform, so it
# cannot be a Pipeline step; one-hot encoding is done with get_dummies further down
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# work on a copy so the raw frame stays available for re-runs
house_prep = house.copy()

# impute missing numeric values with the column median before get_dummies changes
# the structure. Assign the result back: `df[col].fillna(..., inplace=True)` acts
# on a temporary selection and leaves the frame unchanged under pandas Copy-on-Write
for col in numeric_features:
    if house_prep[col].isnull().any():
        house_prep[col] = house_prep[col].fillna(house_prep[col].median())

# fill categorical missing with an explicit 'missing' level
for col in categorical_features:
    house_prep[col] = house_prep[col].fillna('missing')

# one-hot encode categoricals (drop_first avoids one redundant column per feature)
if categorical_features:
    house_prep = pd.get_dummies(house_prep, columns=categorical_features, drop_first=True)

print("shape after one-hot:", house_prep.shape)


In [None]:
# final X, y, optional log transform of the target if it is skewed
X = house_prep.drop(columns=[target])
y = house_prep[target].copy()

# examine skew
skewness = y.skew()
print("target skewness:", skewness)

# log1p is only defined for values greater than -1, so transform a right-skewed
# target only when it is non-negative; otherwise the transform would silently
# inject NaN / -inf into y
is_skewed = skewness > 1.0
apply_log = bool(is_skewed and y.min() >= 0)
if apply_log:
    y = np.log1p(y)
    print("applied log1p transform to target")
elif is_skewed:
    print("target is skewed but contains negative values, skipping log1p transform")

# train test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("train shape", X_train.shape, "test shape", X_test.shape)


In [None]:
# Baseline model: plain linear regression fitted on the training split,
# then used to predict the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
preds_lr = lr.predict(X_test)

# if we transformed target, invert for metrics display
def invert_target(arr, log_applied=None):
    """Map values from the modelling scale back to the original target scale.

    Parameters
    ----------
    arr : array-like
        Targets or predictions on the scale the models were trained on.
    log_applied : bool, optional
        Whether the target was transformed with ``np.log1p``. When omitted,
        the notebook-level ``apply_log`` flag is used, which keeps existing
        one-argument calls working unchanged.

    Returns
    -------
    ``np.expm1(arr)`` when the log transform was applied, otherwise ``arr``
    itself, unchanged.
    """
    if log_applied is None:
        # fall back to the flag set when the target was prepared
        log_applied = apply_log
    return np.expm1(arr) if log_applied else arr

# Bring the test targets and the baseline predictions back to the original scale.
y_test_inv = invert_target(y_test)
preds_lr_inv = invert_target(preds_lr)

# Error metrics of the linear baseline on the held-out split.
lr_mae = mean_absolute_error(y_test_inv, preds_lr_inv)
lr_rmse = np.sqrt(mean_squared_error(y_test_inv, preds_lr_inv))
lr_r2 = r2_score(y_test_inv, preds_lr_inv)

print("Linear Regression metrics:")
print("MAE:", lr_mae)
print("RMSE:", lr_rmse)
print("R2:", lr_r2)

In [None]:
# Gradient Boosting, tuned with a small cross-validated grid search.
gbr = GradientBoostingRegressor(random_state=42)

# 2 x 2 x 2 = 8 candidate settings
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)

# 4-fold CV, candidates ranked by (negated) mean absolute error, all cores used
grid = GridSearchCV(
    gbr,
    param_grid,
    cv=4,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

print("best params:", grid.best_params_)
best_model = grid.best_estimator_

# predict on the test split and map back to the original target scale
preds_gbr = best_model.predict(X_test)
preds_gbr_inv = invert_target(preds_gbr)

gbr_mae = mean_absolute_error(y_test_inv, preds_gbr_inv)
gbr_rmse = np.sqrt(mean_squared_error(y_test_inv, preds_gbr_inv))
gbr_r2 = r2_score(y_test_inv, preds_gbr_inv)

print("\nGradient Boosting metrics on test set:")
print("MAE:", gbr_mae)
print("RMSE:", gbr_rmse)
print("R2:", gbr_r2)

In [None]:
# Random Forest with fixed settings (no tuning), for comparison.
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(X_train, y_train)

# predictions on the test split, mapped back to the original target scale
preds_rf = rf.predict(X_test)
preds_rf_inv = invert_target(preds_rf)

rf_mae = mean_absolute_error(y_test_inv, preds_rf_inv)
rf_rmse = np.sqrt(mean_squared_error(y_test_inv, preds_rf_inv))
rf_r2 = r2_score(y_test_inv, preds_rf_inv)

print("Random Forest metrics:")
print("MAE:", rf_mae)
print("RMSE:", rf_rmse)
print("R2:", rf_r2)


In [None]:
# Actual vs predicted prices for the tuned Gradient Boosting model.
fig, ax = plt.subplots(figsize=(7, 6))
ax.scatter(y_test_inv, preds_gbr_inv, alpha=0.6)

# dashed diagonal = perfect prediction, spanning the observed range of actual values
price_min, price_max = y_test_inv.min(), y_test_inv.max()
ax.plot([price_min, price_max], [price_min, price_max], linestyle='--')

ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted - Gradient Boosting")
plt.show()


In [None]:
# Bar chart of the 20 largest feature importances of the tuned model.
if not hasattr(best_model, 'feature_importances_'):
    print("model has no feature_importances_ attribute")
else:
    # pair each design-matrix column with its importance, keep the 20 largest
    fi_df = (
        pd.DataFrame({
            'feature': X.columns,
            'importance': best_model.feature_importances_,
        })
        .sort_values('importance', ascending=False)
        .head(20)
    )
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(data=fi_df, x='importance', y='feature', ax=ax)
    ax.set_title("Top 20 Feature Importances")
    plt.show()


In [None]:
# Persist the tuned model together with the metadata needed to reuse it:
# the design-matrix column order and whether the target was log-transformed.
MODEL_OUT = "gbr_model.joblib"

artifact = dict(
    model=best_model,
    columns=X.columns.tolist(),
    apply_log_target=apply_log,
)
joblib.dump(artifact, MODEL_OUT)
print("saved model artifact to", MODEL_OUT)


In [None]:
# Workflow summary, printed so it can be pasted into the notebook write-up or README.
note_lines = [
    "Task 6 completed, notes:",
    "- loaded KaggleHub csv directly from path",
    "- handled missing values with medians for numeric and 'missing' for categoricals",
    "- one-hot encoded categorical variables",
    "- optional log1p transform applied to target if skewed",
    "- evaluated LinearRegression, GradientBoosting and RandomForest",
    "- used GridSearchCV to tune GBR hyperparameters",
    "- saved best model to gbr_model.joblib",
]
# the text starts and ends with a newline
notes = "\n" + "\n".join(note_lines) + "\n"
print(notes)
