In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Read the CSV file
df = pd.read_csv('realtor-data.zip.csv')

#seed
seed = 481
np.random.seed(seed)

# Sample 10% of the rows since dataset is big
df_sampled = df.sample(frac=0.1, random_state=1)

In [2]:
# Clead data function by Katherine Wei
def clean_data(df):
    """
    cleans the input dataFrame, written by Katherine Wei

    Parameters:
    df (pd.DataFrame): The input DataFrame to clean.

    Returns:
    pd.DataFrame: The cleaned DataFrame.
    """
    df_cleaned = df.copy()
    df_cleaned['status'] = df_cleaned['status'].apply(lambda x: 1 if x == 'for_sale' else 0)
    df_cleaned = df_cleaned[df_cleaned['status'] == 1]

    total_observations = len(df_cleaned)
    print("\nTotal number of observations (rows):", total_observations)
    print("\nMax Missing values in one column:", max(df_cleaned.isnull().sum()))

    df_cleaned.drop(columns=['prev_sold_date'], inplace=True)
    df_cleaned.dropna(inplace=True)
    df_cleaned.drop(columns=['street'], inplace=True)

    total_observations_after = len(df_cleaned)
    print("\nTotal number of observations (rows) After dropping rows with missing values:", total_observations_after)

    print(df_cleaned.head())
    return df_cleaned


In [3]:
# Define the OLS objective function
def ols_func(beta, X, y):
    """
    Objective function for OLS.
    """
    return np.sum((y - X @ beta) ** 2)

# Define the function to estimate OLS parameters
def estimate_ols(y: np.array, X: np.array) -> np.array:
    """
    estimates the OLS parameters
    
    Parameters:
    y (np.array): y
    X (np.array): x_{i1}, x_{i2}, x_{i3}
    
    Returns:
    np.array: estimated coefficients beta_0, beta_1, beta_2, beta_3
    """
    # Adding a column of ones to X to account for the intercept
    X_inters = np.hstack([np.ones((X.shape[0], 1)), X])
    
    # Initial guess for beta (including the intercept term)
    initial_beta = np.zeros(X_inters.shape[1])
    
    # Minimize the OLS objective function
    result = minimize(ols_func, initial_beta, args=(X_inters, y.flatten()), method='Nelder-Mead')
    
    return result.x

# Define the function to predict the price
def predict_price(features, beta):
    """
    Predicts the price given the input features and estimated parameters.

    Parameters:
    features (np.array): The array of input features (excluding the intercept).
    beta (np.array): The estimated OLS parameters including the intercept.

    Returns:
    float: The predicted price.
    """
    # Create an array for the input features including the intercept term
    X_new = np.hstack([np.ones((features.shape[0], 1)), features])
    
    # Compute the predicted price
    price_pred = X_new @ beta
    
    return price_pred

# Implement cross-validation
def cross_validate_ols(X, y, k=5):
    """
    Performs k-fold cross-validation on the OLS model.

    Parameters:
    X (np.array): The feature matrix.
    y (np.array): The target variable.
    k (int): The number of folds for cross-validation.

    Returns:
    tuple: Mean and standard deviation of the mean squared error across folds.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=1)
    mse_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Estimate OLS parameters
        beta_estimates = estimate_ols(y_train, X_train)

        # Predict on test set
        y_pred = predict_price(X_test, beta_estimates)

        # Calculate MSE and store the result
        mse = mean_squared_error(y_test, y_pred)
        mse_scores.append(mse)

    return np.mean(mse_scores), np.std(mse_scores)

In [4]:
# Load the cleaned dataset
df_cleaned = clean_data(df_sampled)


Total number of observations (rows): 139194

Max Missing values in one column: 71133

Total number of observations (rows) After dropping rows with missing values: 74994
         brokered_by  status      price  bed  bath  acre_lot         city  \
1080824     107399.0       1   589637.0  4.0   4.0      0.23         Katy   
584856       84122.0       1   449900.0  3.0   3.0      2.17      Jemison   
1181089      49467.0       1   445000.0  4.0   2.0      0.17       Tooele   
1265285      81189.0       1  1300000.0  3.0   3.0      0.21      Arcadia   
113111       22611.0       1   849000.0  2.0   3.0      0.14  Dobbs Ferry   

              state  zip_code  house_size  
1080824       Texas   77493.0      3300.0  
584856      Alabama   35085.0      3600.0  
1181089        Utah   84074.0      2226.0  
1265285  California   91006.0      2053.0  
113111     New York   10522.0      1920.0  


In [5]:
# Select features and target variable
features = ['bed', 'bath', 'acre_lot', 'house_size', 'zip_code']
X = df_cleaned[features].values
y = df_cleaned['price'].values

# Estimate OLS parameters
beta_estimates = estimate_ols(y, X)
print("Estimated OLS Parameters:")
print(beta_estimates)

Estimated OLS Parameters:
[-41.60032559  50.14800703 -59.97534684  11.72553145  54.92685884
   7.74930654]


In [6]:
# Example input features
bed = 3
bath = 2
acre_lot = 0.5
house_size = 2000
zip_code = 90210

# Prepare the features for prediction
features_example = np.array([[bed, bath, acre_lot, house_size, zip_code]])

# Predict the price
predicted_price = predict_price(features_example, beta_estimates)

# Extract the single value from the array
predicted_price_value = predicted_price[0]

# Print the predicted price
print(f"Predicted Price: ${predicted_price_value:.2f}")

Predicted Price: $808913.42


In [7]:
# Perform cross-validation
mean_mse, std_mse = cross_validate_ols(X, y, k=5)
print(f"Mean MSE: {mean_mse}")
print(f"Standard Deviation of MSE: {std_mse}")


Mean MSE: 1631457792557.4714
Standard Deviation of MSE: 356830900232.5339
