In [3]:
# Import the modules
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [15]:
# Load the Kaggle "USA Real Estate" listings from the local Resources folder.
# Dataset page: https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset
realtor_csv_path = Path("Resources/realtor-data.zip.csv")
realtor_df = pd.read_csv(realtor_csv_path)

# Preview the first five rows of the raw data
realtor_df.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0


In [16]:
# Summarize the raw data: column names, non-null counts and dtypes
# (used below to see how many values are missing per column).
realtor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401066 entries, 0 to 1401065
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   status          1401066 non-null  object 
 1   bed             1184538 non-null  float64
 2   bath            1206853 non-null  float64
 3   acre_lot        1043599 non-null  float64
 4   city            1400875 non-null  object 
 5   state           1401066 non-null  object 
 6   zip_code        1400587 non-null  float64
 7   house_size      950954 non-null   float64
 8   prev_sold_date  714773 non-null   object 
 9   price           1400958 non-null  float64
dtypes: float64(6), object(4)
memory usage: 106.9+ MB


In [17]:
# Count listings per state so we can pick a state with enough records to model.
state_counts_raw = realtor_df['state'].value_counts()
state_counts_raw

state
New York          653061
New Jersey        256551
Massachusetts     177170
Connecticut        98816
New Hampshire      51394
Vermont            48230
Maine              36650
Rhode Island       29610
Puerto Rico        24679
Pennsylvania       20060
Virgin Islands      2573
Delaware            2135
Georgia               50
Virginia              31
South Carolina        25
Tennessee             20
West Virginia          5
Wyoming                3
Louisiana              3
Name: count, dtype: int64

In [18]:
# Drop the 'prev_sold_date' column because we do not need it for our model.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
realtor_df = realtor_df.drop(columns=['prev_sold_date'], errors='ignore')

In [19]:
# Keep only complete listings: remove every row that has a NaN in any column.
df = realtor_df.dropna(axis=0, how="any")

In [20]:
# Confirm that every remaining column is fully populated after dropna().
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 680464 entries, 0 to 1401065
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   status      680464 non-null  object 
 1   bed         680464 non-null  float64
 2   bath        680464 non-null  float64
 3   acre_lot    680464 non-null  float64
 4   city        680464 non-null  object 
 5   state       680464 non-null  object 
 6   zip_code    680464 non-null  float64
 7   house_size  680464 non-null  float64
 8   price       680464 non-null  float64
dtypes: float64(6), object(3)
memory usage: 51.9+ MB


In [21]:
# Re-count listings per state now that rows with NaN values are gone.
state_counts_clean = df['state'].value_counts()
state_counts_clean

state
New York          298457
Massachusetts     104882
New Jersey         76123
Connecticut        73505
Rhode Island       24632
New Hampshire      24454
Vermont            23305
Maine              23010
Puerto Rico        15390
Pennsylvania       14649
Delaware            1707
Virgin Islands       342
West Virginia          5
Wyoming                3
Name: count, dtype: int64

In [22]:
# Restrict the analysis to Massachusetts listings.
is_massachusetts = df['state'] == 'Massachusetts'
MA_df = df.loc[is_massachusetts]

# Preview the first five Massachusetts rows
MA_df.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
24231,for_sale,2.0,1.0,0.34,Agawam,Massachusetts,1001.0,676.0,180000.0
24236,for_sale,3.0,1.0,0.46,Agawam,Massachusetts,1001.0,1196.0,239900.0
24237,for_sale,3.0,3.0,0.45,Agawam,Massachusetts,1001.0,2314.0,525000.0
24238,for_sale,3.0,2.0,0.36,Agawam,Massachusetts,1001.0,1276.0,289900.0
24241,for_sale,4.0,2.0,0.11,Agawam,Massachusetts,1001.0,1732.0,275000.0


In [23]:
# Sanity check: number of Massachusetts rows
a = MA_df.shape[0]
print(a)

104882


In [27]:
# Persist the Massachusetts subset so the whole team can work from the same file.
# index=False leaves the original row labels out of the CSV.
MA_df.to_csv(path_or_buf='Resources/MA_data.csv', index=False)

Regression: Because we are predicting a continuous value (the house price) from the other features, the 'price' column is the natural target variable.
Therefore, the "price" column is our target.

In [28]:
# Import required libraries
import numpy as np
from sklearn.linear_model import LinearRegression

In [29]:
# Reload the Massachusetts subset from the CSV written earlier.
# Note: this rebinds `df`, which previously held the NaN-free data for all states.
file_path = Path("Resources/MA_data.csv")
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df.head(5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,2.0,1.0,0.34,Agawam,Massachusetts,1001.0,676.0,180000.0
1,for_sale,3.0,1.0,0.46,Agawam,Massachusetts,1001.0,1196.0,239900.0
2,for_sale,3.0,3.0,0.45,Agawam,Massachusetts,1001.0,2314.0,525000.0
3,for_sale,3.0,2.0,0.36,Agawam,Massachusetts,1001.0,1276.0,289900.0
4,for_sale,4.0,2.0,0.11,Agawam,Massachusetts,1001.0,1732.0,275000.0


In [40]:
# Sanity check: the reloaded frame should have the same row count as MA_df.
a = df.shape[0]
print(a)

104882


In [41]:
# Scatter plot of listing price (y) against house size (x).
scatter_options = {
    "x": "house_size",
    "y": "price",
    "title": "Expected Home Price Based on House Size",
}
df_plot = df.hvplot.scatter(**scatter_options)
df_plot

In [30]:
# Independent variable: house_size as a single-column 2-D array
# (one row per listing, one column).
house_size_values = df["house_size"].to_numpy()
X = house_size_values.reshape(-1, 1)

# Show the first five rows
X[:5]

array([[ 676.],
       [1196.],
       [2314.],
       [1276.],
       [1732.]])

In [31]:
# Confirm X is two-dimensional: (number of listings, 1 feature).
X.shape

(104882, 1)

In [32]:
# Dependent variable: the listing price, kept as a pandas Series.
y = df.loc[:, "price"]

In [33]:
# Create a scikit-learn LinearRegression estimator with its default settings.
model = LinearRegression()

In [34]:
# Fit the model on the full Massachusetts data:
# X is the single house_size feature, y is the listing price.
model.fit(X, y)

In [35]:
# Show the fitted coefficient array (one entry, for house_size)
slope = model.coef_
print(f"Model's slope: {slope}")

Model's slope: [472.49347747]


In [36]:
# Show the fitted intercept (predicted price at house_size = 0)
intercept = model.intercept_
print(f"Model's y-intercept: {intercept}")

Model's y-intercept: -171996.38798598084


In [37]:
# Write out the best-fit line as y = intercept + slope * X
line_intercept = model.intercept_
line_slope = model.coef_[0]
print(f"Model's formula: y = {line_intercept} + {line_slope}X")

Model's formula: y = -171996.38798598084 + 472.4934774701013X


In [38]:
# House size (in square feet) at which to evaluate the fitted line. It is
# defined once so the printed formula, the computation and the final message
# cannot drift apart.
house_size_sqft = 2000

# Display the formula to predict the house price for that house size
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {house_size_sqft}")

# Predict the house price by evaluating the fitted line at that house size
y_2000 = model.intercept_ + model.coef_[0] * house_size_sqft

# Display the prediction
print(f"Predicted the house price for a house size of {house_size_sqft} sqft: ${y_2000:.2f}")

Model's formula: y = -171996.38798598084 + 472.4934774701013 * 2000
Predicted the house price for a house size of 2000 sqft: $772990.57


In [39]:
# Make predictions using the X set.
# These are in-sample predictions: X is the same data the model was fitted on.
predicted_y_values = model.predict(X)

In [40]:
# Build a new frame (the original `df` is left untouched) that carries the
# model's predicted price next to the actual price.
df_price_predicted = df.assign(price_predicted=predicted_y_values)

# Preview the first five rows
df_price_predicted.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price,price_predicted
0,for_sale,2.0,1.0,0.34,Agawam,Massachusetts,1001.0,676.0,180000.0,147409.202784
1,for_sale,3.0,1.0,0.46,Agawam,Massachusetts,1001.0,1196.0,239900.0,393105.811068
2,for_sale,3.0,3.0,0.45,Agawam,Massachusetts,1001.0,2314.0,525000.0,921353.51888
3,for_sale,3.0,2.0,0.36,Agawam,Massachusetts,1001.0,1276.0,289900.0,430905.289266
4,for_sale,4.0,2.0,0.11,Agawam,Massachusetts,1001.0,1732.0,275000.0,646362.314992


In [43]:
# Draw the best-fit line: predicted price as a function of house size.
line_options = {
    "x": "house_size",
    "y": "price_predicted",
    "color": "purple",
}
best_fit_line = df_price_predicted.hvplot.line(**line_options)
best_fit_line

In [48]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [49]:
# Evaluate the fitted line on the same data it was trained on.
# `score` comes from the estimator itself and `r2` from sklearn.metrics;
# both are computed from the same X / y here.
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)

# Error magnitude: mean squared error and its square root.
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Spread of the actual prices, for comparison with the RMSE.
std = np.std(y)

# Report every metric, one per line.
print(
    f"The score is {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The score is 0.4131049946454848.
The r2 is 0.4131049946454848.
The mean squared error is 1329855249948.8508.
The root mean squared error is 1153193.50065323.
The standard deviation is 1505296.248486024.


In [None]:
# For this linear regression, R² ≈ 0.41: house size alone explains about 41% of the variance in price.