In [1]:
from homeharvest import scrape_property
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    minmax_scale,
)
from numpy import ravel
import statsmodels.api as sm
from sklearn.feature_selection import VarianceThreshold
from sklearn.neighbors import KNeighborsClassifier 

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Record the kernel's working directory at the moment this cell runs
# (presumably the notebook's own folder — confirm). The data-loading cell
# changes back into this directory before moving elsewhere.
notebook_dir = os.getcwd()


In [3]:
# Let pandas show up to 500 columns before it starts truncating wide frames.
pd.set_option('display.max_columns', 500)

# Return to the recorded notebook directory first, then step up one level.
# Starting from the recorded directory keeps this cell re-runnable: it does
# not climb one level further on every execution.
os.chdir(notebook_dir)
os.chdir(os.pardir)

# Read both cleaned Austin CSV files, relative to the new working directory.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/Austin_For_Sale_CLEANED.csv",
        "data/Austin_Sold_CLEANED.csv",
    )
)

In [None]:
# Map each distinct value of the `season` column to an integer code and
# store the codes on df2 as `season_listed_num`.
season_values = ravel(df2['season'])
encoder = LabelEncoder()
season_quant = encoder.fit_transform(season_values)
df2['season_listed_num'] = season_quant

In [None]:
# Predictor columns taken from df2, including the integer season code.
feature_columns = [
    'beds', 'full_baths', 'half_baths', 'sqft', 'lot_sqft', 'hoa_fee',
    'parking_garage', 'list_price', 'pricepersqft', 'bedcostratio',
    'bathcostratio', 'bathbedcastratio', 'amenitiescastratio',
    'season_listed_num',
]
X = df2[feature_columns]

# Target: the days-on-market quartile, converted to integer class codes.
# NOTE: this rebinds `encoder`, so an encoder fitted earlier under the same
# name can no longer be used for inverse_transform.
y = df2['days_on_market_quartile']
encoder = LabelEncoder()
y = encoder.fit_transform(ravel(y))

# Split the data into training and testing sets.
# random_state is pinned so that Restart Kernel -> Run All reproduces the same
# split; without it every downstream score changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [None]:
# Regression pipeline: standardise the features, then a k-nearest-neighbours
# regressor.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor()),
])

# Candidate hyper-parameters for the 'knn' step (keys are `<step>__<param>`).
param_grid = dict(
    knn__n_neighbors=[1, *range(5, 51, 5), 100],
    knn__weights=['uniform', 'distance'],
    knn__metric=['euclidean', 'manhattan'],
)

# Exhaustive 3-fold search, ranked by negated mean squared error
# (scores are <= 0; closer to zero is better).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)

# Run the search on the training split; the fitted search object is the cell output.
grid_search.fit(X_train, y_train)

In [None]:
# Fitted pipeline for the best-scoring parameter combination.
best_model = grid_search.best_estimator_

# Evaluate it on the held-out split: predictions, then their mean squared error.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Report the winning parameters and the test error, one per line.
print(
    f"Best parameters: {grid_search.best_params_}",
    f"Mean Squared Error: {mse}",
    sep="\n",
)

In [None]:
# Columns to cross-plot against one another; the last entry doubles as the
# colour grouping for the points.
features = [
    'list_price', 'pricepersqft',
    'beds', 'full_baths', 'half_baths',
    'bedcostratio', 'bathcostratio', 'bathbedcastratio', 'amenitiescastratio',
    'hashoa', "season_listed_num",
    'days_on_market_quartile',
]
sns.pairplot(data=df2[features], hue="days_on_market_quartile")

In [None]:
# Extract the cross-validation results of the regressor grid search and
# convert them to a DataFrame for easier plotting.
results = grid_search.cv_results_
results_df = pd.DataFrame(results)

# Pivot to a heatmap layout: one row per n_neighbors, one column per
# (metric, weights) pair. Each cell then holds exactly one grid point.
# Pivoting on weights alone would let pivot_table's default aggfunc ('mean')
# silently average the euclidean and manhattan runs together, so the plotted
# value would match no configuration that was actually fitted.
pivot_table = results_df.pivot_table(
    index='param_knn__n_neighbors',
    columns=['param_knn__metric', 'param_knn__weights'],
    values='mean_test_score',
)

# Plot the heatmap. The search was scored with 'neg_mean_squared_error', so
# values are <= 0 and closer to zero is better; the colour bar says so.
plt.figure(figsize=(10, 6))
sns.heatmap(
    pivot_table,
    annot=True,
    cmap='viridis',
    cbar_kws={'label': 'mean CV score (negative MSE)'},
)
plt.title('Grid Search Mean Test Scores')
plt.xlabel('Metric / Weights')
plt.ylabel('Number of Neighbors')
plt.show()

In [None]:
# Scatter of held-out targets against the model's predictions, with a dashed
# y = x reference line spanning the observed target range.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))

lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'k--', lw=4)

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Values')
plt.show()

In [None]:
# Re-draw the train/test split from X and y for the classifier section.
# random_state is pinned so the split, and every score computed from it,
# is identical on each Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [None]:

# Classification pipeline: scale -> drop low-variance features -> k-NN classifier.
# The StandardScaler here is only a placeholder; the grid below substitutes
# each candidate scaler for the whole 'scaler' step.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', VarianceThreshold()),
    ('classifier', KNeighborsClassifier()),
])

# Search space for GridSearchCV (keys are `<step>` or `<step>__<param>`).
# classifier__leaf_size is deliberately not searched: it only tunes the
# construction/query cost of the BallTree/KDTree index, not which neighbours
# are returned, so including it multiplied the number of fits by four without
# changing any score.
parameters = {
    'scaler': [
        StandardScaler(),
        MinMaxScaler(),
        Normalizer(),
        MaxAbsScaler(),
        PowerTransformer(),
        QuantileTransformer(),
        RobustScaler(),
    ],
    'selector__threshold': [0, 0.001, 0.01],
    'classifier__n_neighbors': [1, 3, 5, 7, 10, 30, 50],
    # Minkowski power: 1 = Manhattan distance, 2 = Euclidean distance.
    'classifier__p': [1, 2],
}

In [None]:
# Exhaustive 2-fold search over `parameters`; no `scoring` is given, so the
# pipeline's own default score is used.
grid = GridSearchCV(estimator=pipe, param_grid=parameters, cv=2)
grid.fit(X_train, y_train)

# Report the score on the data the search was fitted on and on the held-out split.
for split_name, split_X, split_y in (
    ('Training', X_train, y_train),
    ('Test', X_test, y_test),
):
    print(f'{split_name} set score: {grid.score(split_X, split_y)}')

In [None]:
# Winning hyper-parameter combination, and the fitted pipeline built from it.
best_params = grid.best_params_
best_pipe = grid.best_estimator_

# Show both, parameters first.
print(best_params, best_pipe, sep='\n')

In [None]:
# Tabulate the classifier search's cross-validation record: each key of
# cv_results_ becomes a column. Then list the available column names.
result_df = pd.DataFrame(grid.cv_results_)
print(result_df.columns)

In [None]:
# Mean CV score against n_neighbors: one line per scaler, one panel per
# value of the classifier's `p`.
line_plot_spec = dict(
    x='param_classifier__n_neighbors',
    y='mean_test_score',
    hue='param_scaler',
    col='param_classifier__p',
    kind='line',
)
sns.relplot(data=result_df, **line_plot_spec)
plt.show()

In [None]:
# Take the fitted best pipeline from the classifier search (`grid`).
# NOTE: this rebinds `best_model`, replacing whatever it referred to before.
best_model = grid.best_estimator_

# Predict on the test set (rebinds `y_pred` with the classifier's predictions)
y_pred = best_model.predict(X_test)

In [None]:
# Held-out targets against predictions, plus a dashed diagonal marking
# perfect agreement.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, edgecolors=(0, 0, 0))

diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'k--', lw=4)

plt.title('Actual vs Predicted Values')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()

In [None]:
# Element-wise agreement between predictions and true labels, expressed as
# the percentage of positions that match.
matches = np.equal(y_pred, y_test)
n_total = len(y_pred)
percentage_same = matches.sum() / n_total * 100

print(f"The arrays are {percentage_same:.2f}% the same.")

In [None]:
# Columns to cross-plot; points are coloured by the first entry,
# `list_price_quartile`.
features = [
    'list_price_quartile', 'pricepersqft',
    'beds', 'full_baths', 'half_baths',
    'bedcostratio', 'bathcostratio', 'bathbedcastratio', 'amenitiescastratio',
    'hashoa', "season_listed_num",
    'days_on_market_quartile',
]
sns.pairplot(data=df2[features], hue="list_price_quartile")
