In [None]:
import pandas as pd
import numpy as np

# Data Cleaning

In [None]:
# Load the listing export for each of the four regions
AlexHouses = pd.read_csv(
    "https://raw.githubusercontent.com/Johnle3/MLHousingPrices/main/AlexandriaHomes.csv"
)
ArlingHouses = pd.read_csv(
    "https://raw.githubusercontent.com/Johnle3/MLHousingPrices/main/ArlingtonHomes.csv"
)
SpringHouses = pd.read_csv(
    "https://raw.githubusercontent.com/Johnle3/MLHousingPrices/main/SpringfieldHomes.csv"
)
BurkeHouses = pd.read_csv(
    "https://raw.githubusercontent.com/Johnle3/MLHousingPrices/main/BurkeHomes.csv"
)

# Keep only the rows whose CITY matches the region the file is named after
AlexHouses = AlexHouses.loc[AlexHouses['CITY'].eq('Alexandria')]
ArlingHouses = ArlingHouses.loc[ArlingHouses['CITY'].eq('Arlington')]
SpringHouses = SpringHouses.loc[SpringHouses['CITY'].eq('Springfield')]
BurkeHouses = BurkeHouses.loc[BurkeHouses['CITY'].eq('Burke')]

# Stack the regional frames into one dataset. Each frame keeps its own row
# labels, so index labels can repeat in the combined frame.
frames = [AlexHouses, ArlingHouses, SpringHouses, BurkeHouses]
houses1 = pd.concat(frames)

In [None]:
# Drop the listing-metadata columns that are not carried forward
houses = houses1.drop(
    columns=[
        'SALE TYPE', 'SOLD DATE', 'ADDRESS', 'HOA/MONTH', 'FAVORITE',
        'NEXT OPEN HOUSE START TIME', 'NEXT OPEN HOUSE END TIME', 'INTERESTED',
        'URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)',
        'SOURCE', 'MLS#', 'STATUS', 'LOCATION', 'DAYS ON MARKET',
    ]
)

# Exclude vacant land, condos/co-ops and small multi-family listings.
# Rows with a missing property type are kept, as with a plain `!=` comparison.
houses = houses[
    ~houses['PROPERTY TYPE'].isin(
        ["Vacant Land", "Condo/Co-op", "Multi-Family (2-4 Unit)"]
    )
]


# EDA

In [None]:
# Missing values per column -- these must be handled before modelling
houses.isnull().sum()

In [None]:
# Create categorical buckets for "Year Built"

# Compare on a numeric view of the column. Missing values (and labels left by
# an earlier run of this cell) coerce to NaN, match no range, and are left
# untouched, so the cell can be re-run safely.
built_year = pd.to_numeric(houses['YEAR BUILT'], errors='coerce')

# (label, membership mask) pairs; the ranges are mutually exclusive.
year_buckets = [
    ('Pre-1900s', (built_year >= 1700) & (built_year < 1900)),
    ('1900-1950', (built_year >= 1900) & (built_year < 1950)),
    ('1950-2000', (built_year >= 1950) & (built_year < 2000)),
    ('2000-2023', (built_year >= 2000) & (built_year <= 2023)),
]

# Build the result in an object array so text labels can sit next to values
# that stay as they were (missing years, or years outside 1700-2023).
bucketed_year = houses['YEAR BUILT'].to_numpy(dtype=object, copy=True)
for bucket_label, in_bucket in year_buckets:
    bucketed_year[in_bucket.to_numpy()] = bucket_label

# Address the column by name rather than by position, and replace it in one
# step instead of writing strings into a numeric column cell by cell.
houses = houses.assign(**{'YEAR BUILT': bucketed_year})

In [None]:
# Discard listings that have no Year Built value -- 5 houses
houses = houses.dropna(subset=["YEAR BUILT"])

In [None]:
#houses.to_csv("/Users/shannontran/Desktop/Launch/houses.csv")

In [None]:
# Number of listings per property type (a categorical column, so it needs dummy variables)
houses['PROPERTY TYPE'].value_counts() # --> dummy vars

In [None]:
# Listing(s) with the highest price
houses.loc[houses['PRICE'].eq(houses['PRICE'].max())]

In [None]:
# Listing(s) with the lowest price
houses.loc[houses['PRICE'].eq(houses['PRICE'].min())]

In [None]:
# Distribution of property types
import matplotlib.pyplot as plt

# Take labels and heights from the same value_counts() result so every bar is
# paired with its own category. value_counts() orders by frequency while
# unique() orders by first appearance, so mixing the two can mislabel bars.
# This also handles any number of property types.
prop_type_counts = houses['PROPERTY TYPE'].value_counts()
prop_types = prop_type_counts.tolist()

fig, ax = plt.subplots()
ax.set_title('Listings by property type')
ax.set_ylabel('Number of listings')
ax.bar(prop_type_counts.index, height = prop_types)

In [None]:
# Distribution of cities
# Labels and heights both come from one value_counts() result. Pairing
# unique() (appearance order) with value_counts() (frequency order) can attach
# a count to the wrong city, and hard-coding four entries breaks if the set of
# cities changes.
city_counts = houses['CITY'].value_counts()
city_types = city_counts.tolist()

fig, ax = plt.subplots()
ax.set_title('Listings by city')
ax.set_ylabel('Number of listings')
ax.bar(city_counts.index, height = city_types)

# Machine Learning: Regression

In [None]:
# Target: sale price. Features: two location/type categoricals, four numeric
# size measures, and the bucketed build year.
y = houses['PRICE']
X = houses.loc[
    :,
    ['PROPERTY TYPE', 'CITY', 'BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT'],
]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split (and every result derived from it) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=18,
)
X_columns = X_train.columns

In [None]:
# Encode categoricals
from sklearn.compose import make_column_transformer  # applies a transformer to selected columns
from sklearn.preprocessing import OneHotEncoder  # expands a categorical into dummy columns

# Select the categorical columns by name rather than by position (0, 1, 6) so
# the encoding does not silently shift if the feature list is reordered.
# handle_unknown='ignore' encodes a category that appears only in the test
# split as all zeros instead of raising during transform().
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['PROPERTY TYPE', 'CITY', 'YEAR BUILT']),
    remainder='passthrough',
)

# Fit on the training split only, then apply the same encoding to the test
# split. Both become all-numeric matrices: dummy columns first, then the
# passthrough numeric columns.
X_train = column_trans.fit_transform(X_train)
X_test = column_trans.transform(X_test)

In [None]:
# Fill missing feature values (e.g. Lot Size) from the 5 nearest neighbours
from sklearn.impute import KNNImputer

# Learn the neighbours from the training split only, then apply the same
# fitted imputer to both splits.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(X_train)

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [None]:
# Standardise every column, using statistics learned from the training split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Both splits now hold standardised values

In [None]:
# T-tests for significance
import statsmodels.api as sm

# Prepend an intercept column, fit ordinary least squares on the training
# split, and print the summary table with its per-coefficient t-tests.
X_train_1 = sm.add_constant(X_train)
model = sm.OLS(endog=y_train, exog=X_train_1).fit()
print(model.summary())

In [None]:
# drop insignificant variables
# The positions refer to columns of the encoded/scaled training matrix
# (presumably chosen from the full-model summary -- re-check them whenever the
# encoding changes).
X_train_drop = pd.DataFrame(X_train).drop(columns = [0, 1, 2, 6, 10, 13])

# Plain-array copy of the reduced matrix. This must be built from
# X_train_drop; the name `df` is never defined in this notebook.
X_train_drop_array = np.asarray(X_train_drop)

In [None]:
# Regression output for reduced model -- Adj-R2 increased slightly
# add_constant() prepends the intercept column to the reduced feature frame.
X_train_drop = sm.add_constant(X_train_drop)

# Fit on the matrix that actually carries the constant, so the reduced model
# has an intercept like the full model it is compared against. It is passed
# as a plain array because y_train keeps the row labels from the original
# frame while X_train_drop was rebuilt with fresh 0..n-1 labels.
model = sm.OLS(y_train, np.asarray(X_train_drop)).fit()
print(model.summary())

## Regression function

In [None]:
from sklearn import linear_model
from sklearn import svm

In [None]:
# Function that performs regression ML and outputs regression equation and R2

def ml_function(model):
    """Fit one linear regressor on the reduced training data and print its fit.

    Parameters
    ----------
    model : str
        Which estimator to fit: "lasso", "ridge", "elastic_net" or
        "linear_regression".

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.

    Notes
    -----
    Reads the notebook globals ``X_train_drop`` and ``y_train``. Prints the
    coefficients, the intercept and R2, and returns nothing. The R2 is scored
    on the same training data the model was fitted on, so it is not a
    held-out estimate.
    """
    # Lazy factories: only the requested estimator is constructed, and an
    # unsupported name is rejected before any fitting is attempted.
    estimators = {
        "lasso": lambda: linear_model.Lasso(alpha = 0.2, max_iter = 10000),
        "ridge": lambda: linear_model.Ridge(alpha = 0.5, max_iter = 10000),
        "elastic_net": lambda: linear_model.ElasticNet(alpha = 0.5, l1_ratio = 0.5, max_iter = 10000),
        "linear_regression": lambda: linear_model.LinearRegression(),
    }
    if model not in estimators:
        raise ValueError(
            "Unknown model {!r}; expected one of {}".format(model, sorted(estimators))
        )
    clf = estimators[model]()

    # Fit on bare values. After the statsmodels constant is added the frame
    # can carry a mix of str and int column labels, which recent scikit-learn
    # versions refuse to accept as feature names.
    features = np.asarray(X_train_drop)

    # One fit is enough; the fitted estimator is reused for scoring.
    clf.fit(features, y_train)
    print(model + " " + "Coefficients:   " + str(clf.coef_) + "\n")
    print(model + " " + "Intercept:   " + str(clf.intercept_) + "\n")
    print(model + " " + "R2: " + str(clf.score(features, y_train)))
    

In [None]:
# Fit the lasso variant and print its coefficients, intercept and R2
ml_function("lasso")

# Predicting single house price using test group

In [None]:
# Remove the same column positions from the test matrix that were removed
# from the training matrix, so both have matching features.
X_test_drop = pd.DataFrame(X_test).drop(columns=[0, 1, 2, 6, 10, 13])

In [None]:
# Fit lasso on the reduced training matrix and predict every test listing
clf = linear_model.Lasso(alpha=0.2, max_iter=10000).fit(X_train_drop_array, y_train)
y_predict = clf.predict(X_test_drop)

# Predicted price of the fifth test listing
y_predict[4]

In [None]:
# Actual price of the same (fifth, by position) test listing
y_test.to_numpy()[4]

## Does R2 change when alpha changes? No

In [None]:
# R2 at different alphas
from sklearn.metrics import r2_score  # needed below; not imported anywhere else in the notebook

# Derive each alpha from an integer step. Repeatedly adding 0.05 to a float
# accumulates rounding error, which distorts the printed alphas and makes the
# `< 1` stopping point unreliable.
for alpha_step in range(1, 20):
    moving_alpha = round(alpha_step * 0.05, 2)  # 0.05, 0.10, ..., 0.95
    # alpha scales the L1 penalty: larger values shrink coefficients harder
    clf_lasso = linear_model.Lasso(alpha = moving_alpha, max_iter = 100000)
    clf_lasso.fit(X_train, y_train)
    y_predict = clf_lasso.predict(X_test)
    # Score the predictions against the held-out test targets
    score = r2_score(y_test, y_predict)
    print("alpha value: " + str(moving_alpha) + " / r2 score: " + str(score))