In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by
# path from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA

In [3]:
# Load the checkpoint-5 CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/BrainStation Capstone Data/checkpoint5.csv')

In [4]:
# Remove the 'Unnamed: 0' column (presumably a row index written out when the
# CSV was saved -- confirm). errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(4529592, 35)

In [6]:
# KNN baseline, fitted on a seeded 20% random subsample of the full frame.
df_sample = df.sample(frac=0.2, random_state=42)

# Target is the median sale price; every remaining column is a predictor.
y = df_sample['median_sale_price']
X = df_sample.drop(columns='median_sale_price')

# Hold out 20% of the subsample for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the predictors; the scaling statistics are learned from the
# training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 100-neighbour KNN regressor and predict the held-out rows.
knn_regressor = KNeighborsRegressor(n_neighbors=100).fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)

# Report test-set error and goodness of fit.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 6447105609.757742
R-squared: 0.8364868401751612


In [7]:
# Baseline: linear regression on the full dataset, using every column except
# the target as a predictor.
# NOTE(review): some predictors (e.g. rolling_median_sale_price, median_ppsf,
# sale_to_list_ratio, price_growth) look like they may be derived from the
# target -- confirm they do not leak median_sale_price before trusting R^2.
X = df.drop(columns='median_sale_price')  # Predictor variables
y = df['median_sale_price']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Scale features: fit the scaler on the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Predict median sale price for the test set
y_pred_lr = lr_model.predict(X_test_scaled)

# Evaluate the model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f'Linear Regression - MSE: {mse_lr}, R-squared: {r2_lr}')

Linear Regression - MSE: 5983453454.910358, R-squared: 0.8477071927816605


In [8]:
# List every column name in the frame (the target plus all candidate predictors).
df.columns

Index(['median_sale_price', 'median_list_price', 'median_ppsf',
       'median_list_ppsf', 'homes_sold', 'inventory', 'months_of_supply',
       'median_dom', 'year', 'month', 'season', 'sale_to_list_ratio',
       'price_growth', 'buyer_utility', 'pending_sales_ratio',
       'sales_success_rate', 'inventory_turnover', 'adjusted_months_supply',
       'political_stance_encoded', 'supply_to_list_ratio',
       'property_type_All Residential',
       'property_type_Single Family Residential', 'state_avg_sale_price',
       'metro_region_inventory_change', 'inventory_to_pending_ratio',
       'rolling_median_sale_price', 'rolling_median_list_price',
       'price_momentum', 'supply_pressure', 'demand_pressure',
       'price_elasticity', 'us_region_East North Central',
       'us_region_Mid-Atlantic', 'us_region_Pacific',
       'us_region_South Atlantic'],
      dtype='object')

In [12]:
# Pull the fitted weights out of the full linear model (one per predictor).
coefficients = lr_model.coef_

# Pair each predictor name with the magnitude of its weight and rank them,
# largest first. Only the absolute value is stored, so the sign (direction of
# the effect) is not visible in this table.
coeff_df = pd.DataFrame(
    {
        'Feature': df.columns.drop('median_sale_price'),
        'Coefficient': np.abs(coefficients),
    }
).sort_values('Coefficient', ascending=False)

# Display the absolute value of each coefficient
print(coeff_df)

                                    Feature    Coefficient
0                         median_list_price  129771.792293
10                       sale_to_list_ratio   58836.077717
24                rolling_median_sale_price   57459.279706
25                rolling_median_list_price   42276.999903
1                               median_ppsf   38027.563095
21                     state_avg_sale_price    8887.494092
20  property_type_Single Family Residential    7684.595938
26                           price_momentum    6636.752198
3                                homes_sold    6601.754369
16                   adjusted_months_supply    6129.850824
5                          months_of_supply    6034.040020
19            property_type_All Residential    4989.257755
4                                 inventory    4087.486311
12                            buyer_utility    3711.318729
13                      pending_sales_ratio    3552.555016
11                             price_growth    3038.5536

In [13]:
# Repeat the modelling above with a smaller set of inputs.
# coeff_df is already sorted by descending coefficient magnitude, so its first
# ten rows are the ten highest-ranked features of the linear regression model.
top10Features = coeff_df['Feature'].head(10).values


In [14]:
# Inspect the selected feature names.
top10Features

array(['median_list_price', 'sale_to_list_ratio',
       'rolling_median_sale_price', 'rolling_median_list_price',
       'median_ppsf', 'state_avg_sale_price',
       'property_type_Single Family Residential', 'price_momentum',
       'homes_sold', 'adjusted_months_supply'], dtype=object)

In [15]:
# Now proceed to build a linear model with only the top-10 features.
# Note: this cell reuses (and therefore overwrites) X, y, the train/test
# splits, the scaled arrays and the y_pred_lr / mse_lr / r2_lr results of the
# full-feature linear model.
X = df[top10Features]
y = df['median_sale_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Scale features: fit the scaler on the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the linear regression model
lr_model2 = LinearRegression()
lr_model2.fit(X_train_scaled, y_train)

# Predict median sale price for the test set
y_pred_lr = lr_model2.predict(X_test_scaled)

# Evaluate the model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f'Linear Regression - MSE: {mse_lr}, R-squared: {r2_lr}')

Linear Regression - MSE: 6067498918.158852, R-squared: 0.8455680402623784


In [16]:
# PCA + linear regression on the top-10 features.

# Fraction of total variance the retained principal components must explain.
# The value is 0.9 (90%); it is named here so the number and its description
# cannot drift apart.
PCA_EXPLAINED_VARIANCE = 0.9

# Select features for PCA
X_pca = df[top10Features]
y_pca = df['median_sale_price']

# Split data into training and testing sets
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y_pca, test_size=0.2, random_state=42)

# Scale features (PCA is variance-based, so inputs are standardized first);
# the scaler is fit on the training split only.
scaler_pca = StandardScaler()
X_train_pca_scaled = scaler_pca.fit_transform(X_train_pca)
X_test_pca_scaled = scaler_pca.transform(X_test_pca)

# Apply PCA: a float in (0, 1) keeps just enough components to explain that
# share of the variance (here 90%).
pca = PCA(n_components=PCA_EXPLAINED_VARIANCE)
X_train_pca_reduced = pca.fit_transform(X_train_pca_scaled)
X_test_pca_reduced = pca.transform(X_test_pca_scaled)

# Initialize and train a linear regression model on the reduced data
lr_model_pca = LinearRegression()
lr_model_pca.fit(X_train_pca_reduced, y_train_pca)

# Predict using the test set
y_pred_pca = lr_model_pca.predict(X_test_pca_reduced)

# Evaluate the model
mse_pca = mean_squared_error(y_test_pca, y_pred_pca)
r2_pca = r2_score(y_test_pca, y_pred_pca)

print(f'PCA + Linear Regression - MSE: {mse_pca}, R-squared: {r2_pca}')


PCA + Linear Regression - MSE: 10076534167.609993, R-squared: 0.743528768631521


In [17]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Feed-forward regression network trained on whatever X_train_scaled / y_train
# currently hold.
# NOTE(review): in notebook order these arrays come from the top-10-feature
# linear-regression cell, not from the full feature set -- confirm intended.

# Seed Python, NumPy and the Keras backend so weight initialization and batch
# shuffling are repeatable, matching the random_state=42 used elsewhere.
keras.utils.set_random_seed(42)

# Define the neural network model. The input width is declared with an
# explicit keras.Input layer; passing input_shape= to the first Dense layer
# makes Keras emit a UserWarning for Sequential models.
model = keras.Sequential([
    keras.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])

# Compile the model: MSE is the training loss, MAE is tracked for readability
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model; the last 20% of the training rows are held out by Keras
# as a validation set for per-epoch monitoring.
history = model.fit(X_train_scaled, y_train, epochs=20, batch_size=100, validation_split=0.2)

# Evaluate the model on the test set
loss, mae = model.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

# Make predictions on the test set
y_pred_nn = model.predict(X_test_scaled)

# Evaluate the model with the same metrics used for the other models
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f'Neural Network - MSE: {mse_nn}, R-squared: {r2_nn}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 2ms/step - loss: 21837334528.0000 - mae: 85323.5781 - val_loss: 2645536512.0000 - val_mae: 26457.2773
Epoch 2/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2ms/step - loss: 1968100224.0000 - mae: 21592.3984 - val_loss: 523016192.0000 - val_mae: 10137.8975
Epoch 3/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2ms/step - loss: 376962080.0000 - mae: 8614.9639 - val_loss: 137542080.0000 - val_mae: 5481.6704
Epoch 4/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2ms/step - loss: 108800248.0000 - mae: 4950.7192 - val_loss: 53772876.0000 - val_mae: 3665.4087
Epoch 5/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2ms/step - loss: 46817944.0000 - mae: 3476.6111 - val_loss: 30183306.0000 - val_mae: 3038.8127
Epoch 6/20
[1m28990/28990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2ms/step - loss: 2

In [19]:
# Print the network's layer-by-layer summary.
model.summary()