## Importing Dependencies

In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from scipy import stats
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBRegressor

## Loading our CSV

In [None]:
# Load the preprocessed housing table from the 02-preprocessed data folder
# and preview its first rows.
preprocessed_csv = "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data/02-preprocessed/preprocessed.csv"
final_housing_df = pd.read_csv(preprocessed_csv)
final_housing_df.head()

## Train, Test and Validation Split

In [None]:
# Separate the regression target from the feature columns.
target_column = "median_house_value"
X = final_housing_df.drop(columns=target_column)
y = final_housing_df[target_column]

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

In [None]:
# Sanity check: row counts of the feature/target parts of each split.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

## Preprocessing to change the scale of the data

In [None]:
# Preview the first five training rows before any scaling is applied.
X_train.head(n=5)

In [None]:
# Columns to standardize, selected by position.
# NOTE(review): assumes the first eight columns are exactly the numeric
# features -- confirm against the preprocessed CSV layout.
numeric_col = X_train.columns[:8]

# Fit the scaler on the training split only, so that no test-set statistics
# enter the scaling parameters.
scaler = StandardScaler().fit(X_train[numeric_col])


def preprocessor(X):
    """Return a copy of ``X`` with the ``numeric_col`` columns standardized.

    The columns listed in the module-level ``numeric_col`` are replaced by
    the output of the already-fitted module-level ``scaler``. Every other
    column is carried over as-is, and the frame passed in is left untouched.
    """
    scaled_values = scaler.transform(X[numeric_col])
    result = X.copy()
    result[numeric_col] = scaled_values
    return result


# Apply the same fitted scaling to both splits.
X_train_pre = preprocessor(X_train)
X_test_pre = preprocessor(X_test)

In [None]:
# Display the scaled training features (preprocessor already returns a
# DataFrame, so no extra wrapping is required).
X_train_pre

In [None]:
# Plot one histogram per column of the scaled training features.
X_train_pre.hist()

In [None]:
# Confirm that scaling kept the row/column counts of both splits.
tuple(frame.shape for frame in (X_train_pre, X_test_pre))

## Saving the train and test DataFrames in the 03-features data folder

In [None]:
# Define folder
# Destination directory for the scaled feature tables; created on demand.
folder_path = "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data/03-features"
os.makedirs(folder_path, exist_ok=True)

# Output file for each split.
train_file = os.path.join(folder_path, "train_preprocessed.csv")
test_file = os.path.join(folder_path, "test_preprocessed.csv")

# Write each scaled feature frame without its index column.
# NOTE(review): only the feature frames are written here; y_train / y_test
# are not saved by this cell.
for frame, destination in ((X_train_pre, train_file), (X_test_pre, test_file)):
    frame.to_csv(destination, index=False)

# Confirmation message for interactive runs.
print("Train and test sets saved successfully!")

## LinearRegression

In [None]:
# Initialize and fit model
# Baseline: ordinary least squares fitted on the scaled training features.
lm = LinearRegression().fit(X_train_pre, y_train)

# In-sample predictions; the error below is therefore a training error,
# not an estimate of generalization performance.
y_pred_train = lm.predict(X_train_pre)

# RMSE is the square root of the mean squared error.
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)
print("Train RMSE:", rmse_train)

## k-fold for Linear Regression

#### Step 1 Define RMSE Scorer

In [None]:
# Wrap RMSE as a scikit-learn scorer. Because RMSE is a loss,
# greater_is_better=False makes the scorer return the *negated* RMSE so that
# "higher is better" holds for model-selection utilities.
rmse_scorer = make_scorer(
    score_func=root_mean_squared_error,
    greater_is_better=False,
)

#### Step 2 Create K-Fold splitter

In [None]:
# Five shuffled folds; the fixed seed keeps the fold assignment repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

#### Step 3 Evaluate model with CV

In [None]:
# Fresh, unfitted estimator: cross_val_score clones and fits it once per fold.
lm = LinearRegression()

# rmse_scorer was created with greater_is_better=False, so every value in
# cv_scores is the NEGATED RMSE of a fold. cv_scores itself is kept in that
# raw scorer convention; the sign is flipped only for reporting so the
# printed numbers are real (non-negative) RMSE values.
# NOTE(review): X_train_pre was scaled with statistics fitted on the whole
# training split, so each validation fold has influenced the scaler. Put the
# scaler inside a Pipeline if strictly leak-free CV estimates are needed.
cv_scores = cross_val_score(lm, X_train_pre, y_train, cv=kf, scoring=rmse_scorer)
rmse_per_fold = -cv_scores

print("RMSE for each fold:", rmse_per_fold)
print("Mean RMSE:", rmse_per_fold.mean())
print("Std RMSE:", rmse_per_fold.std())