In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset stored under /content/drive/MyDrive can be read by the load cell.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA

In [None]:
# Location of the checkpoint CSV on the mounted Drive, named once so the path
# is easy to find and change.
CHECKPOINT_CSV = '/content/drive/MyDrive/BrainStation Capstone Data/checkpoint5.csv'

# Load the dataset that the modelling cells below work from.
df = pd.read_csv(CHECKPOINT_CSV)

In [None]:
# Remove the 'Unnamed: 0' column that came in with the CSV (presumably an index
# written out when the checkpoint was saved -- confirm against the export step).
# errors='ignore' keeps this cell safe to re-run: `df` is reassigned here, so a
# second execution would otherwise raise KeyError because the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Sanity check: (rows, columns) of the frame after the 'Unnamed: 0' column was dropped.
df.shape

(4529592, 35)

In [None]:
# --- KNN baseline on a 20% random subsample ---
# Work on a fifth of the rows; the fixed seed makes the subsample repeatable.
df_sample = df.sample(frac=0.2, random_state=42)

# Target is homes_sold; every remaining column is used as a predictor.
y = df_sample['homes_sold']
X = df_sample.drop(columns='homes_sold')

# Hold out 20% of the subsample for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the predictors. The scaler statistics come from the training
# rows only and are then reused unchanged for the test rows.
# NOTE: X_train / X_test are overwritten with their scaled versions here.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 100-nearest-neighbour regressor and predict the held-out rows.
knn_regressor = KNeighborsRegressor(n_neighbors=100).fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)

# Report test-set error and goodness of fit.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 68.37760001263169
R-squared: 0.8350600550735247


In [None]:
# Re-score the KNN predictions after rounding them to whole numbers
# (homes_sold is presumably a count -- confirm), to see what rounding costs.
# NOTE: mse and r2 are reassigned here, replacing the unrounded scores.
y_pred_rounded = np.round(y_pred)
mse = mean_squared_error(y_test, y_pred_rounded)
r2 = r2_score(y_test, y_pred_rounded)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 68.44313085145164
R-squared: 0.8349019820650547


In [None]:
# --- Linear regression on the full dataset, all predictors ---
# Target is homes_sold; predictors are every other column of df.
y = df['homes_sold']
X = df.drop(columns='homes_sold')

# 80% train / 20% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the linear model on the scaled predictors and predict the test rows.
lr_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)

# Score the held-out predictions.
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print(f'Linear Regression - MSE: {mse_lr}, R-squared: {r2_lr}')

Linear Regression - MSE: 96.38731356139502, R-squared: 0.765715423767509


In [None]:
# List the column names of df (the target homes_sold plus all candidate predictors).
df.columns

Index(['median_sale_price', 'median_list_price', 'median_ppsf',
       'median_list_ppsf', 'homes_sold', 'inventory', 'months_of_supply',
       'median_dom', 'year', 'month', 'season', 'sale_to_list_ratio',
       'price_growth', 'buyer_utility', 'pending_sales_ratio',
       'sales_success_rate', 'inventory_turnover', 'adjusted_months_supply',
       'political_stance_encoded', 'supply_to_list_ratio',
       'property_type_All Residential',
       'property_type_Single Family Residential', 'state_avg_sale_price',
       'metro_region_inventory_change', 'inventory_to_pending_ratio',
       'rolling_median_sale_price', 'rolling_median_list_price',
       'price_momentum', 'supply_pressure', 'demand_pressure',
       'price_elasticity', 'us_region_East North Central',
       'us_region_Mid-Atlantic', 'us_region_Pacific',
       'us_region_South Atlantic'],
      dtype='object')

In [None]:
# Rank the predictors by the magnitude of their fitted linear-regression weight.
# Feature names are taken from df with the target removed, in column order.
feature_names = df.drop(columns='homes_sold').columns
coefficients = lr_model.coef_

# NOTE: the 'Coefficient' column stores absolute values, so the sign of each
# weight is not shown in this table.
coeff_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': np.abs(coefficients),
})
coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)

# Display the absolute value of each coefficient, largest first
print(coeff_df)

                                    Feature  Coefficient
4                                 inventory    17.459474
5                          months_of_supply     5.407114
16                   adjusted_months_supply     4.139095
27                          supply_pressure     2.086777
15                       inventory_turnover     1.819811
19            property_type_All Residential     1.643711
18                     supply_to_list_ratio     1.559014
20  property_type_Single Family Residential     1.404602
22            metro_region_inventory_change     1.347591
23               inventory_to_pending_ratio     1.271722
14                       sales_success_rate     1.144753
0                         median_sale_price     1.033690
13                      pending_sales_ratio     0.671117
1                         median_list_price     0.661901
28                          demand_pressure     0.577537
21                     state_avg_sale_price     0.544368
31                   us_region_

In [None]:
# Repeat the modelling with fewer inputs: take the names of the first ten rows
# of coeff_df, i.e. the ten features with the largest absolute coefficients
# given that coeff_df is sorted in descending order.
top10Features = coeff_df['Feature'].iloc[:10].to_numpy()


In [None]:
# Display the selected feature names.
top10Features

array(['inventory', 'months_of_supply', 'adjusted_months_supply',
       'supply_pressure', 'inventory_turnover',
       'property_type_All Residential', 'supply_to_list_ratio',
       'property_type_Single Family Residential',
       'metro_region_inventory_change', 'inventory_to_pending_ratio'],
      dtype=object)

In [None]:
# --- Linear regression restricted to the features in top10Features ---
y = df['homes_sold']
X = df[top10Features]

# 80% train / 20% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the reduced linear model and predict the test rows.
lr_model2 = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr_model2.predict(X_test_scaled)

# Score the held-out predictions.
# NOTE: y_pred_lr, mse_lr and r2_lr are reassigned here, replacing whatever
# values those names held before this cell ran.
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print(f'Linear Regression - MSE: {mse_lr}, R-squared: {r2_lr}')

Linear Regression - MSE: 99.4170813961846, R-squared: 0.7583510949255771


In [None]:
# --- PCA on the top10Features columns, followed by linear regression ---
y_pca = df['homes_sold']
X_pca = df[top10Features]

# 80% train / 20% test, fixed seed.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, y_pca, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training rows only.
scaler_pca = StandardScaler().fit(X_train_pca)
X_train_pca_scaled = scaler_pca.transform(X_train_pca)
X_test_pca_scaled = scaler_pca.transform(X_test_pca)

# A fractional n_components asks PCA to keep the fewest components whose
# cumulative explained variance reaches that fraction -- here 90%.
# The projection is learned on the training rows and reused for the test rows.
pca = PCA(n_components=0.9)
X_train_pca_reduced = pca.fit_transform(X_train_pca_scaled)
X_test_pca_reduced = pca.transform(X_test_pca_scaled)

# Fit a linear model on the principal components and predict the test rows.
lr_model_pca = LinearRegression().fit(X_train_pca_reduced, y_train_pca)
y_pred_pca = lr_model_pca.predict(X_test_pca_reduced)

# Score the held-out predictions.
mse_pca = mean_squared_error(y_test_pca, y_pred_pca)
r2_pca = r2_score(y_test_pca, y_pred_pca)
print(f'PCA + Linear Regression - MSE: {mse_pca}, R-squared: {r2_pca}')


PCA + Linear Regression - MSE: 115.69286417946645, R-squared: 0.7187902364335057


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed the Python, NumPy and backend random number generators so that weight
# initialisation and batch shuffling are repeatable from run to run.
keras.utils.set_random_seed(42)

# Feed-forward regressor: two hidden ReLU layers and one linear output unit.
# The input width is declared with an explicit keras.Input layer rather than an
# `input_shape` argument on the first Dense layer, which Keras warns against
# for Sequential models.
# NOTE(review): X_train_scaled / y_train / X_test_scaled / y_test are whatever
# an earlier cell left in the kernel. With top-to-bottom execution that is the
# split built from top10Features, not the all-feature or PCA-reduced data --
# confirm this is the intended input.
model = keras.Sequential([
    keras.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),  # Output layer for regression
])

# Optimise mean squared error with Adam; also track mean absolute error.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train for 20 epochs in mini-batches of 100, holding out 20% of the training
# rows for validation.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=20,
    batch_size=100,
    validation_split=0.2,
)

# Evaluate the model on the test set (loss is MSE, as compiled above).
loss, mae = model.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Predict the held-out rows and score them with the same sklearn metrics used
# for the other models so the results are directly comparable.
y_pred_nn = model.predict(X_test_scaled)
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f'Neural Network - MSE: {mse_nn}, R-squared: {r2_nn}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 2ms/step - loss: 13.2251 - mae: 0.8562 - val_loss: 0.2262 - val_mae: 0.2967
Epoch 2/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 2ms/step - loss: 0.0853 - mae: 0.1642 - val_loss: 0.0489 - val_mae: 0.1195
Epoch 3/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 2ms/step - loss: 0.0667 - mae: 0.1408 - val_loss: 0.0345 - val_mae: 0.1097
Epoch 4/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 2ms/step - loss: 0.0518 - mae: 0.1237 - val_loss: 0.1647 - val_mae: 0.2868
Epoch 5/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 2ms/step - loss: 0.0475 - mae: 0.1149 - val_loss: 0.0229 - val_mae: 0.0898
Epoch 6/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 2ms/step - loss: 0.0408 - mae: 0.1075 - val_loss: 0.0187 - val_mae: 0.0861
Epoch 7/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━