In [None]:
import numpy as np

from sklearn.model_selection import train_test_split

from utils.data_cleaning import load_and_clean
from utils.models import fit_tune_predict_visualize
from utils._config import *

## Load the Data

In [None]:
# Build the working dataset with the project's `load_and_clean` helper.
# The cells below treat `df` as a pandas DataFrame containing a 'price' column.
# NOTE(review): `verbose=True` presumably makes the helper report its cleaning
# steps -- confirm in utils.data_cleaning.
df = load_and_clean(verbose=True)

In [None]:
# 'price' is the regression target; every remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for testing; the seed comes from the shared config
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Show the full feature-matrix and target shapes (before splitting).
(X.shape, y.shape)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Search space: only whether an intercept term is fitted.
# NOTE(review): the 'model_selection__' prefix follows scikit-learn's
# '<step>__<param>' convention, so the helper presumably wraps the estimator
# in a pipeline step named 'model_selection' -- confirm in utils.models.
param_grid = {
    'model_selection__fit_intercept': [True, False],
}

# Ordinary least-squares baseline; n_jobs=-1 lets scikit-learn use all cores.
model = LinearRegression(n_jobs=-1)

# Only a grid is supplied for this model (no `rscv_param_dist`).
fit_tune_predict_visualize(
    model_name="Linear Regression", model=model, df=df,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    gscv_param_grid=param_grid,
)

## K Nearest Neighbors Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Wider candidate pool handed to the helper as `rscv_param_dist`
# (presumably sampled by a randomized search -- confirm in utils.models).
param_distributions = {
    'model_selection__n_neighbors': np.arange(1, 31),  # every k from 1 through 30
    'model_selection__weights': ['uniform', 'distance'],  # equal vs. distance-based weighting
    'model_selection__p': [1, 2]  # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

# Coarser candidate set handed to the helper as `gscv_param_grid`
# (presumably searched exhaustively -- confirm in utils.models).
param_grid = {
    'model_selection__n_neighbors': [3, 7, 11, 15],  # a few odd values of k
    'model_selection__weights': ['uniform', 'distance'],  # equal vs. distance-based weighting
    'model_selection__p': [1, 2]  # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

# k-nearest-neighbours regressor; n_jobs=-1 lets scikit-learn use all cores.
model = KNeighborsRegressor(n_jobs=-1)

fit_tune_predict_visualize(
    model_name="K Nearest Neighbors", model=model, df=df,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    gscv_param_grid=param_grid,
    rscv_param_dist=param_distributions,
)

## Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Wider candidate pool handed to the helper as `rscv_param_dist`
# (presumably sampled by a randomized search -- confirm in utils.models).
param_distributions = {
    'model_selection__max_depth': [None, *range(5, 31, 5)],  # no limit, or 5, 10, ..., 30
    'model_selection__min_samples_split': np.arange(2, 11),  # 2 through 10 samples to split a node
    'model_selection__min_samples_leaf': np.arange(1, 11),  # 1 through 10 samples per leaf
    'model_selection__max_features': [None, 'sqrt', 'log2'],  # features considered per split
}

# Coarser candidate set handed to the helper as `gscv_param_grid`
# (presumably searched exhaustively -- confirm in utils.models).
param_grid = {
    'model_selection__max_depth': [None, 3, 5, 10],  # tree depth cap (None = no limit)
    'model_selection__min_samples_split': [2, 5, 10],  # samples required to split a node
    'model_selection__min_samples_leaf': [1, 2, 4],  # samples required in a leaf
    'model_selection__max_features': [None, 'sqrt', 'log2'],  # features considered per split
}

# Single regression tree, seeded from the shared config for reproducibility.
model = DecisionTreeRegressor(random_state=RANDOM_STATE)

fit_tune_predict_visualize(
    model_name="Decision Tree", model=model, df=df,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    gscv_param_grid=param_grid,
    rscv_param_dist=param_distributions,
)

## Elastic Net Regression

In [None]:
from sklearn.linear_model import ElasticNet

# Wider candidate pool handed to the helper as `rscv_param_dist`
# (presumably sampled by a randomized search -- confirm in utils.models).
param_distributions = {
    'model_selection__alpha': np.logspace(-4, 1, 20),  # 20 log-spaced strengths from 1e-4 to 10
    'model_selection__l1_ratio': np.linspace(0.1, 1.0, 10),  # L1 share of the penalty, 0.1 to 1.0
}

# Coarser candidate set handed to the helper as `gscv_param_grid`
# (presumably searched exhaustively -- confirm in utils.models).
param_grid = {
    'model_selection__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],  # penalty strength
    'model_selection__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],  # L1 share of the penalty
}

# Linear model with a combined L1/L2 penalty, seeded from the shared config.
model = ElasticNet(random_state=RANDOM_STATE)

fit_tune_predict_visualize(
    model_name="Elastic Net", model=model, df=df,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    gscv_param_grid=param_grid,
    rscv_param_dist=param_distributions,
)