## 1. IMPORTING ALL DEPENDENCIES

In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from scipy import stats
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBRegressor

## 2. LOAD AND PREPARE DATA

In [None]:
# Absolute location of the raw California housing CSV on the author's machine.
raw_csv_path = (
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data/01-raw/housing.csv"
)

# Load the raw data and show its (rows, columns) shape as the cell output.
housing_df = pd.read_csv(raw_csv_path)
housing_df.shape

In [None]:
# Display the column labels of the raw frame.
housing_df.columns

In [None]:
# Display pandas' default summary statistics for the raw frame.
housing_df.describe()

In [None]:
# Plot missingno's matrix view of the raw frame to see where values are missing.
msno.matrix(housing_df)

## 3. INITIAL EXPLORATION

In [None]:
# Display the dtype pandas assigned to each column.
housing_df.dtypes

In [None]:
# Per-column count of missing values (isna is the same check as isnull).
housing_df.isna().sum()

#### Dealing with missing data

In [None]:
# Discard every row that has at least one missing value (the dropna defaults,
# spelled out explicitly) and rebind housing_df to the cleaned frame.
housing_df = housing_df.dropna(axis=0, how="any")

In [None]:
# Re-plot the missingno matrix after dropna() to confirm no gaps remain.
msno.matrix(housing_df)

## 4. Preprocessing

In [None]:
# Count how many rows fall into each ocean_proximity category.
housing_df["ocean_proximity"].value_counts()

#### Shuffling the data

In [None]:
# Shuffle all rows: sampling the full fraction without replacement is a
# permutation of the frame. The fixed seed keeps the order reproducible.
housing_df_shuffled = housing_df.sample(frac=1, random_state=1)
housing_df_shuffled

#### Convert the categorical variable into one-hot indicator columns

In [None]:
# Preview only: build one indicator column per ocean_proximity category and
# show the first rows. The result is not stored.
pd.get_dummies(housing_df_shuffled["ocean_proximity"]).head()

#### drop ocean proximity column

In [None]:
# Preview only: the frame without the ocean_proximity column (first rows).
# drop() returns a new frame, so housing_df_shuffled itself is unchanged.
housing_df_shuffled.drop(columns="ocean_proximity").head()

#### Apply the two above to our DataFrame

In [None]:
# Everything except the categorical column ...
non_categorical_part = housing_df_shuffled.drop(columns="ocean_proximity")
# ... plus one indicator column per ocean_proximity category.
proximity_indicators = pd.get_dummies(housing_df_shuffled["ocean_proximity"])

# Stitch both pieces together side by side (aligned on the shared row index).
final_housing_df = pd.concat([non_categorical_part, proximity_indicators], axis=1)

In [None]:
# Display the column labels after adding the ocean_proximity indicator columns.
final_housing_df.columns

In [None]:
# Display the combined frame as the cell output.
final_housing_df

In [None]:
# Number of rows in the combined frame.
final_housing_df.shape[0]

In [None]:
# Pairwise correlations between all columns of the combined frame.
corr_matrix = final_housing_df.corr()

# Draw the annotated heatmap on an explicitly created 12x8 inch Axes.
_, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    cbar=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Matrix Heatmap", fontsize=16)
plt.show()

## Dealing with Multicollinearity

#### Step 1 — Compute correlation matrix

In [None]:
# Recompute the correlation matrix and keep only magnitudes, so strong negative
# and strong positive correlations are treated alike in the threshold check.
# NOTE: this rebinds corr_matrix, replacing the signed matrix from the heatmap cell.
corr_matrix = final_housing_df.corr().abs()  # absolute values for threshold comparison

#### Step 2 — Identify highly correlated pairs

In [None]:
# Correlation magnitude above which two features count as highly correlated.
threshold = 0.8

# Keep only the cells strictly above the diagonal, so every pair is listed
# once and self-correlations are ignored. All other cells become NaN.
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)

# NaN never compares greater than the threshold, so masked cells drop out.
exceeds_threshold = upper.gt(threshold)

# (column, row) label pairs, walked column by column in the matrix order.
high_corr_pairs = [
    (column, row)
    for column in upper.columns
    for row in upper.index[exceeds_threshold[column].to_numpy()]
]

high_corr_pairs

#### Step 3 — Decide which features to drop

In [None]:
# Replace the raw count columns with per-household / per-room ratios to reduce
# multicollinearity. The new columns are appended in the order listed, then the
# count columns they were derived from are removed.
final_housing_df = final_housing_df.assign(
    rooms_per_household=lambda df: df["total_rooms"] / df["households"],
    # Cap the ratio at 1.0 (100% bedrooms).
    bedrooms_per_room=lambda df: (df["total_bedrooms"] / df["total_rooms"]).clip(upper=1.0),
    population_per_household=lambda df: df["population"] / df["households"],
).drop(columns=["total_rooms", "total_bedrooms", "population"])

#### Step 4 — Verify correlation reduction

In [None]:
# Correlations recomputed on the frame with the ratio features in place.
corr_matrix_new = final_housing_df.corr()

# Annotated heatmap on an explicitly created 10x8 inch Axes.
_, reduced_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix_new, annot=True, fmt=".2f", cmap="coolwarm", ax=reduced_ax)
reduced_ax.set_title("Correlation Matrix After Reducing Multicollinearity")
plt.show()

#### Restructure the columns, the dependent variable median_house_value(y) should be the last column

In [None]:
# Display the current column labels before reordering them.
final_housing_df.columns

In [None]:
# Predictor columns, in the order they should appear in the output.
feature_columns = [
    "longitude",
    "latitude",
    "housing_median_age",
    "households",
    "median_income",
    "rooms_per_household",
    "bedrooms_per_room",
    "population_per_household",
    "<1H OCEAN",
    "INLAND",
    "ISLAND",
    "NEAR BAY",
    "NEAR OCEAN",
]
# The dependent variable (y) goes last.
target_column = "median_house_value"

# Select by label list: raises KeyError if any expected column is absent.
final_housing_df = final_housing_df[[*feature_columns, target_column]]

#### Saving this current dataframe to preprocessed folder

In [None]:
# Destination folder and file name for the preprocessed dataset.
folder_path = (
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data/02-preprocessed"
)
file_name = "preprocessed.csv"
full_path = os.path.join(folder_path, file_name)

# Create the folder (and any missing parents). exist_ok=True makes this a no-op
# when the folder is already there, which avoids the check-then-create race of
# testing os.path.exists() first.
os.makedirs(folder_path, exist_ok=True)

# Write the frame as CSV. index=False leaves the (shuffled) row index out of the file.
final_housing_df.to_csv(full_path, index=False)