## Import data


In [None]:
import datetime as dt
import os
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump,load
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
if not os.path.exists('usa_housing.csv'):
    !wget "https://drive.usercontent.google.com/download?id=1fqh567slCa7vPhHGIQ1g7szImts_71mx&export=download&authuser=0&confirm=t&uuid=c615d267-33d6-4054-a976-c35f60a4eb69&at=APZUnTUTVXjjKAd6tRAj7Whb8rX6:1700030483444" -O 'usa_housing.csv'

df = pd.read_csv('usa_housing.csv')

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line.
df.info()

# we have A LOT of null values (compare the non-null counts with the row count)

## Clean up lines and remove useless columns

I cleaned up this data by removing all rows with null values and also removing the following columns:

  zip_code, status, city



In [None]:
columns_to_remove = ['status', 'zip_code', 'city']  # useless columns

# Drop rows containing NaN values first, then the useless columns, and finally
# renumber the index so it is contiguous again.
df = (
    df.dropna()
      .drop(columns=columns_to_remove)
      .reset_index(drop=True)
)

In [None]:
# DataFrame.info() prints its own report and returns None; no print() wrapper needed.
df.info()

## Convert the date to year only and keep only years from 2010 onward

In [None]:
# Keep only the first four characters of the date string and treat them as the year.
# NOTE(review): assumes the values start with a 4-digit year (e.g. "2015-03-21") - confirm.
df['prev_sold_date'] = pd.to_numeric(df['prev_sold_date'].astype(str).str[:4])

# Keep listings whose previous sale happened in 2010 or later, then renumber the index.
df = df[df['prev_sold_date'] >= 2010]
df = df.reset_index(drop=True)

# Count the remaining listings per state (Counter is imported in the import cell).
word_counts = Counter(df['state'])

word_counts_dict = dict(word_counts)

print(word_counts_dict)
print(len(word_counts_dict))

## Remove prev_sold_date column

The year column is removed because all remaining data is from 2010 onward, which is after the 2008 crash, so it's safe to assume the prices are reasonable.

In [None]:
# The year has served its purpose as a filter; remove it from the feature set.
df = df.drop(columns=['prev_sold_date'])

In [None]:
# DataFrame.info() prints its own report and returns None; no print() wrapper needed.
df.info()

## Remove least occurring states

Since Puerto Rico and the Virgin Islands have only 20 and 6 listings respectively, we remove them.

In [None]:
states_to_delete = ['Puerto Rico', 'Virgin Islands']  # have only 20 and 6 listings

# Keep every row whose state is not in the removal list, then renumber the index.
df = df.loc[~df['state'].isin(states_to_delete)].reset_index(drop=True)

# Recount listings per state after the removal.
word_counts = Counter(df['state'])
word_counts_dict = dict(word_counts)

print(word_counts_dict, len(word_counts_dict), sep='\n')

## Method 1: Label Encode states and remove states column

We assign an encoded value to each state, create a mapping dictionary, and remove the `state` column.

In [None]:
# Keep a copy that still has the 'state' column; Method 2 groups on it later.
df2 = df.copy()

# Fit the encoder once and store the integer code of each row's state.
label_encoder = LabelEncoder()
df['state_encoded'] = label_encoder.fit_transform(df['state'])

# State name -> code lookup. LabelEncoder codes are the positions of each label in
# `classes_`, so the mapping can be read from the fitted encoder directly instead of
# fitting a second time and zipping over every row of the frame.
mapping = {name: code for code, name in enumerate(label_encoder.classes_)} #mapping dict

# The text column is no longer needed once the numeric code exists.
df = df.drop(columns=['state'])

print(mapping)

## Method 2: separate dataset into subsets

We separate df2 into subsets based on states

In [None]:
grouped = df2.groupby('state')

df_dict = {state: group for state, group in grouped} #dictionary of group data

for name,item in df_dict.items():
  item.drop(['state'],inplace=True,axis=1)
  print(name)
  makeModel(item)

# Housing Model

In [None]:
def makeModel(dataframe):
  X = dataframe.drop("price", axis=1)
  y = dataframe["price"]

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  model = LinearRegression()
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  mse = mean_squared_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)
  print(f"Mean Squared Error: {mse}")
  print(f"R^2 Score: {r2}")

  return model


## If you want to save the model, use the dump command

In [None]:
# `model` is only assigned by the "Predict" cell further down, so on a fresh
# top-to-bottom run there is nothing to save yet. Guard the call instead of failing
# with a NameError; re-run this cell once a model has been trained.
if 'model' in globals():
  dump(model, 'kc_housingmodel.pkl')
else:
  print("No trained model yet - run the Predict cell first, then re-run this cell to save it.")

# Predict

In [None]:
# test data - USES METHOD 2 by default

USEMETHOD1 = 0

bed   =   float(input("Enter how many beds: "))
bath  =   float(input("Enter how many bathrooms: "))
acres =   float(input("Enter how many acres the land is: "))
size  =   float(input("Enter how the size of the house in square feet: "))

print("Choose one of the following states:")
for state_name in word_counts_dict.keys():
    print(f"{state_name}")

# Both `mapping` and `df_dict` are keyed by state name, so ask for the name (the old
# prompt asked for a "number") and keep asking until it matches a listed state instead
# of failing later with a KeyError.
state = input("Enter the name of your chosen state: ").strip()
while state not in word_counts_dict:
    state = input(f"'{state}' is not in the list above. Enter the name of your chosen state: ").strip()

data = {
    'bed': [bed],
    'bath': [bath],
    'acre_lot': [acres],
    'house_size': [size],
}

if USEMETHOD1:
  # Keep `state` as the name (the plotting cell indexes df_dict with it) and store the
  # numeric code in its own variable.
  state_code = mapping[state]
  model = makeModel(df)
  data['state_encoded'] = [state_code]
  print("Method 1")
else:
  model = makeModel(df_dict[state])
  print("Method 2")

input_df = pd.DataFrame(data)

# Present the features in the column order the model was trained on, when the fitted
# estimator exposes it (scikit-learn records it as `feature_names_in_` when fitted on a
# DataFrame); otherwise the frame is passed through unchanged.
feature_order = getattr(model, 'feature_names_in_', None)
if feature_order is not None:
  input_df = input_df[list(feature_order)]

prediction = list(model.predict(input_df))

# Print the predictions
print("Predicted Price:")
print(f"${prediction[0]:.2f}")

In [None]:
# Scatter plot of lot size against price for the selected entry of df_dict,
# with the predicted point drawn on top.
temp = df_dict[state]

# Pull out each column as its own series.
tempBeds, tempBaths, tempAcres, tempSize, tempPrice = (
    temp[column] for column in ('bed', 'bath', 'acre_lot', 'house_size', 'price')
)

# Collected listings in blue, the user's input and its predicted price in red.
plt.scatter(tempAcres, tempPrice, label='Collected Data', marker='o', c='blue')
plt.scatter(acres, prediction, s=100, label='Input Point', marker='o', c='red')

plt.title('Acre Lot vs. Price')
plt.xlabel('Acre Lot')
plt.ylabel('Price')

plt.grid(True)
plt.legend()
plt.show()
