## Importing Dependencies

In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from scipy import stats
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBRegressor

## Loading our CSV

In [None]:
# Absolute location of the preprocessed CSV (02-preprocessed data folder).
preprocessed_csv_path = "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data/02-preprocessed/preprocessed.csv"

# Read the file into a DataFrame and preview its first rows.
final_housing_df = pd.read_csv(preprocessed_csv_path)
final_housing_df.head()

## Train and Test Split (validation is done later via cross-validation)

In [None]:
# Separate the predictors from the target column.
target_name = "median_house_value"
X = final_housing_df.drop(columns=target_name)
y = final_housing_df[target_name]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

In [None]:
# Row counts of each split, in the order X_train, X_test, y_train, y_test.
tuple(len(split) for split in (X_train, X_test, y_train, y_test))

## Preprocessing to change the scale of the data

In [None]:
# Preview the first five training rows before scaling.
X_train.head(n=5)

In [None]:
# The first eight columns are the ones that get standardised.
# NOTE(review): this is a positional selection — confirm it matches the
# column order of the preprocessed CSV.
numeric_col = X_train.columns[:8]

# Learn the scaling statistics from the training split only, so the same
# training-derived transform is applied to every split.
scaler = StandardScaler().fit(X_train[numeric_col])


def preprocessor(X):
    """Return a copy of ``X`` with the ``numeric_col`` columns standardised.

    Uses the module-level ``scaler`` that was fitted on the training split.
    The input frame is not modified; columns outside ``numeric_col`` are
    passed through unchanged.
    """
    scaled = X.copy()
    scaled[numeric_col] = scaler.transform(X[numeric_col])
    return scaled


X_train_pre = preprocessor(X_train)
X_test_pre = preprocessor(X_test)

In [None]:
# X_train_pre is already a DataFrame (preprocessor returns a copy of its
# DataFrame input), so it can be displayed directly without re-wrapping.
X_train_pre

In [None]:
# Histogram of every column after scaling. X_train_pre is already a
# DataFrame, so no re-wrapping is needed; the trailing semicolon keeps the
# array-of-Axes repr out of the cell output so only the figure is shown.
X_train_pre.hist();

In [None]:
# (rows, columns) of the scaled train and test frames, in that order.
tuple(frame.shape for frame in (X_train_pre, X_test_pre))

## Saving the train and Test dataframes in the 03-features data folder

In [None]:
# Destination directory for the feature files
folder_path = "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data/03-features"

# Create the directory (and any missing parents); no error if it already exists
os.makedirs(folder_path, exist_ok=True)

# Full output paths for the two splits
train_file, test_file = (
    os.path.join(folder_path, file_name)
    for file_name in ("train_preprocessed.csv", "test_preprocessed.csv")
)

# Write each scaled feature frame without its index column
for frame, destination in ((X_train_pre, train_file), (X_test_pre, test_file)):
    frame.to_csv(destination, index=False)

# Confirmation message
print("Train and test sets saved successfully!")

## LinearRegression

In [None]:
# Fit a linear regression on the scaled training features
lm = LinearRegression().fit(X_train_pre, y_train)

# In-sample predictions, used only to measure the training error
y_pred_train = lm.predict(X_train_pre)

# RMSE = square root of the mean squared error
train_mse = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(train_mse)
print("Train RMSE:", rmse_train)

## k-fold for Linear Regression

#### Step 1 Define RMSE Scorer

In [None]:
# Wrap RMSE as a scorer. With greater_is_better=False scikit-learn negates
# the metric, so scores produced by this scorer are negative RMSE values.
rmse_scorer = make_scorer(
    root_mean_squared_error,
    greater_is_better=False,
)

#### Step 2 Create K-Fold splitter

In [None]:
# Five shuffled folds; the fixed seed keeps the fold assignment repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

#### Step 3 Evaluate model with CV

In [None]:
# Evaluate a fresh linear model with the shuffled K-fold splitter.
lm = LinearRegression()
cv_scores = cross_val_score(lm, X_train_pre, y_train, cv=kf, scoring=rmse_scorer)

# rmse_scorer was built with greater_is_better=False, so cross_val_score
# returns negated RMSE values. Flip the sign so the numbers printed under
# the "RMSE" labels are real, positive errors.
lm_rmse_scores = -cv_scores

print("RMSE for each fold:", lm_rmse_scores)
print("Mean RMSE:", lm_rmse_scores.mean())
print("Std RMSE:", lm_rmse_scores.std())

## K-Nearest Neighbor

In [None]:
# Fit a 15-nearest-neighbours regressor on the scaled training features
knn = KNeighborsRegressor(n_neighbors=15).fit(X_train_pre, y_train)

# In-sample predictions, used only to measure the training error
y_pred_train = knn.predict(X_train_pre)

# RMSE = square root of the mean squared error
train_mse = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(train_mse)
print("Train RMSE:", rmse_train)

#### Running cross validation

In [None]:
# 10-fold cross-validation scored with the custom (negated) RMSE scorer
knn_scores = cross_val_score(
    knn,
    X_train_pre,
    y_train,
    scoring=rmse_scorer,
    cv=10,
)

# The scorer reports negative RMSE; negate to recover positive errors
knn_rmse_scores = np.negative(knn_scores)

print("KNN Cross-Validation Mean RMSE:", knn_rmse_scores.mean())
print("KNN Cross-Validation Std:", knn_rmse_scores.std())

#### Run a GridSearchCV to automatically find the best number of neighbors and weighting scheme

In [None]:
# 1. Candidate hyper-parameters: neighbourhood size and vote weighting
knn_search_space = {
    "n_neighbors": [2, 5, 10, 25, 35],
    "weights": ["uniform", "distance"],
}
param_grid = [knn_search_space]

# 2. Exhaustive 5-fold search scored with the negated-RMSE scorer
grid_search = GridSearchCV(
    KNeighborsRegressor(),
    param_grid,
    scoring=rmse_scorer,
    cv=5,
    return_train_score=True,
)

# 3. Run the search (this will take a moment)
grid_search.fit(X_train_pre, y_train)

# 4. Report the winner; best_score_ is negative RMSE, so flip its sign
best_knn_rmse = -grid_search.best_score_
print("Best Params:", grid_search.best_params_)
print("Best RMSE:", best_knn_rmse)

## Random Forest Regressor

In [None]:
# Initialize and fit model.
# random_state is fixed so the forest's bootstrap samples — and therefore the
# train RMSE and the overfitting gap derived from it — are reproducible on
# every Restart & Run All.
rfr = RandomForestRegressor(max_depth=6, random_state=42)
rfr.fit(X_train_pre, y_train)

# Predict on training set
y_pred_train = rfr.predict(X_train_pre)

# Compute RMSE manually (square root of the mean squared error)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE:", rmse_train)

#### Running cross validation

In [None]:
# 1. Estimate out-of-sample performance with 10-fold cross-validation
rfr_scores = cross_val_score(
    rfr,
    X_train_pre,
    y_train,
    cv=10,
    scoring="neg_root_mean_squared_error",
)

# Scores are negative RMSE; negate to get positive errors
rfr_rmse_scores = np.negative(rfr_scores)
rfr_cv_mean = rfr_rmse_scores.mean()

print("Random Forest Cross-Val Mean RMSE:", rfr_cv_mean)
print("Random Forest Cross-Val Std:", rfr_rmse_scores.std())
# Difference between cross-validated error and the training error
print("Gap (Overfitting):", rfr_cv_mean - rmse_train)

#### Run a GridSearchCV to automatically find the best max depth

In [None]:
# 1. Search space: several depth limits plus None (unlimited depth);
#    the number of trees is held constant for now
forest_search_space = {
    "max_depth": [5, 10, 15, 20, 30, None],
    "n_estimators": [100],
}
param_grid = [forest_search_space]

# 2. 5-fold grid search over a seeded forest; n_jobs=-1 uses all CPU cores
forest_grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

# 3. Run the search
forest_grid_search.fit(X_train_pre, y_train)

# 4. Report the winner; best_score_ is negative RMSE, so flip its sign
best_forest_depth = forest_grid_search.best_params_["max_depth"]
best_forest_rmse = -forest_grid_search.best_score_
print("Best Max Depth:", best_forest_depth)
print("Best Cross-Val RMSE:", best_forest_rmse)

## Gradient Boosting Regressor

In [None]:
# Initialize and fit model.
# random_state is fixed so the fitted ensemble, its train RMSE and the
# overfitting gap computed from it are reproducible across runs.
gbr = GradientBoostingRegressor(n_estimators=30, random_state=42)
gbr.fit(X_train_pre, y_train)

# Predict on training set
y_pred_train = gbr.predict(X_train_pre)

# Compute RMSE manually (square root of the mean squared error)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE:", rmse_train)

#### Run with cross validation

In [None]:
# 1. Estimate out-of-sample performance with 10-fold cross-validation
gbr_scores = cross_val_score(
    gbr,
    X_train_pre,
    y_train,
    scoring=rmse_scorer,
    cv=10,
)

# The scorer reports negative RMSE; negate to get positive errors
gbr_rmse_scores = np.negative(gbr_scores)
gbr_cv_mean = gbr_rmse_scores.mean()

print("Gradient Boosting Cross-Val Mean RMSE:", gbr_cv_mean)
print("Gradient Boosting Cross-Val Std:", gbr_rmse_scores.std())
# Difference between cross-validated error and the training error
print("Gap (Overfitting):", gbr_cv_mean - rmse_train)

#### Grid Search

In [None]:
# 1. Search space
# Compares 'Slow & Steady' (0.01 + 300) against 'Fast & Aggressive' (0.3 + 30)
# and the combinations in between; tree depth is pinned to a shallow 3.
boosting_search_space = {
    "n_estimators": [30, 100, 300, 500],
    "learning_rate": [0.01, 0.1, 0.3],
    "max_depth": [3],
}
param_grid = [boosting_search_space]

# 2. 5-fold grid search over a seeded booster, using all CPU cores
gb_grid_search = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1,
)

# 3. Run the search
print("Running Grid Search... (this may take a minute)")
gb_grid_search.fit(X_train_pre, y_train)

# 4. Report the winner; best_score_ is negative RMSE, so flip its sign
best_gb_rmse = -gb_grid_search.best_score_
print("\n--- Results ---")
print("Best Params:", gb_grid_search.best_params_)
print("Best Cross-Val RMSE:", best_gb_rmse)