In [1]:
# Import the modules
from pathlib import Path

import hvplot.pandas
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the realtor listings from the CSV file in the Resources folder
realtor_csv_path = Path("Resources") / "realtor-data.zip.csv"
realtor_df = pd.read_csv(realtor_csv_path)

# Review the first rows of the DataFrame
realtor_df.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0


In [3]:
# Report how many rows the full dataset contains
a = realtor_df.shape[0]
print(a)

1401066


In [4]:
# Summarize each column's dtype and non-null count to see where values are missing
realtor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401066 entries, 0 to 1401065
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   status          1401066 non-null  object 
 1   bed             1184538 non-null  float64
 2   bath            1206853 non-null  float64
 3   acre_lot        1043599 non-null  float64
 4   city            1400875 non-null  object 
 5   state           1401066 non-null  object 
 6   zip_code        1400587 non-null  float64
 7   house_size      950954 non-null   float64
 8   prev_sold_date  714773 non-null   object 
 9   price           1400958 non-null  float64
dtypes: float64(6), object(4)
memory usage: 106.9+ MB


In [5]:
# Keep only the listings located in Boston, Massachusetts.
# .copy() makes the result an independent DataFrame rather than a slice of the
# full dataset, so any later in-place change to it does not raise pandas'
# SettingWithCopyWarning.
is_massachusetts = realtor_df['state'] == "Massachusetts"
is_boston = realtor_df['city'] == "Boston"
realtor_df = realtor_df[is_massachusetts & is_boston].copy()

In [6]:
# Drop columns that are redundant or not needed, then remove rows with missing values.
# drop() returns a new DataFrame here instead of using inplace=True: realtor_df is a
# filtered subset of the full dataset, and mutating such a subset in place can trigger
# pandas' SettingWithCopyWarning. `columns=` already selects the axis, so axis=1 is omitted.
realtor_df = (
    realtor_df
    .drop(columns=['prev_sold_date', 'state', 'status'])
    .dropna()
)

In [7]:
# Preview the first rows of the filtered and cleaned listings
realtor_df.head()

Unnamed: 0,bed,bath,acre_lot,city,zip_code,house_size,price
100371,1.0,1.0,0.02,Boston,2135.0,688.0,650000.0
100372,2.0,1.0,0.03,Boston,2130.0,1250.0,739000.0
100374,9.0,3.0,0.09,Boston,2131.0,3204.0,1200000.0
100378,3.0,3.0,0.06,Boston,2130.0,2623.0,949000.0
100811,4.0,7.0,1.0,Boston,2115.0,4754.0,9750000.0


In [8]:
# Report how many rows remain after filtering and dropping missing values
a = realtor_df.shape[0]
print(a)

13414


In [10]:
# Display the first rows again as a reference for the columns used in the plot below
realtor_df.head()

Unnamed: 0,bed,bath,acre_lot,city,zip_code,house_size,price
100371,1.0,1.0,0.02,Boston,2135.0,688.0,650000.0
100372,2.0,1.0,0.03,Boston,2130.0,1250.0,739000.0
100374,9.0,3.0,0.09,Boston,2131.0,3204.0,1200000.0
100378,3.0,3.0,0.06,Boston,2130.0,2623.0,949000.0
100811,4.0,7.0,1.0,Boston,2115.0,4754.0,9750000.0


In [11]:
# Scatter plot of listing price against house size to inspect their relationship
df_plot = realtor_df.hvplot(
    kind="scatter",
    x="house_size",
    y="price",
    title="Expected Home Price Based on House Size",
)
df_plot

In [12]:
# Independent variable: house size, reshaped into a single-column 2-D array
# (one row per listing, one feature column)
house_sizes = realtor_df["house_size"].to_numpy()
X = house_sizes.reshape(-1, 1)

# Show the first five rows as a sample
X[:5]

array([[ 688.],
       [1250.],
       [3204.],
       [2623.],
       [4754.]])

In [13]:
# Confirm X is two-dimensional: (number of listings, 1 feature column)
X.shape

(13414, 1)

In [14]:

# Dependent variable (target): the listing price
y = realtor_df["price"]

In [15]:
# Build a scikit-learn linear regression and train it on house size (X) vs. price (y).
# fit() returns the fitted estimator, so creation and training are chained.
model = LinearRegression().fit(X, y)

# Report the fitted parameters: slope, y-intercept and the resulting best fit line formula
print(f"Model's slope: {model.coef_}")
print(f"Model's y-intercept: {model.intercept_}")
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")



Model's slope: [488.56937115]
Model's y-intercept: 398343.4741550498
Model's formula: y = 398343.4741550498 + 488.5693711463745X


In [16]:
# House size (in sqft) to predict a price for; change this single value to try another size
house_size_sqft = 2000

# Display the formula used to predict the house price for the chosen house size
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {house_size_sqft}")

# Calculate the prediction from the fitted intercept and slope
# (the variable keeps its original name, y_2000, so any later use still works)
y_2000 = model.intercept_ + model.coef_[0] * house_size_sqft

# Display the prediction
print(f"Predicted the house price for a house size of {house_size_sqft} sqft: ${y_2000:.2f}")

Model's formula: y = 398343.4741550498 + 488.5693711463745 * 2000
Predicted the house price for a house size of 2000 sqft: $1375482.22


In [17]:
# Make predictions for every listing using the same X set the model was fitted on
predicted_y_values = model.predict(X)

In [18]:
# Build a new DataFrame from the original data with an extra column holding
# the predicted price values (the original realtor_df is left unchanged)
df_realtor_predicted = realtor_df.assign(price_predicted=predicted_y_values)

# Display sample data
df_realtor_predicted.head()

Unnamed: 0,bed,bath,acre_lot,city,zip_code,house_size,price,price_predicted
100371,1.0,1.0,0.02,Boston,2135.0,688.0,650000.0,734479.2
100372,2.0,1.0,0.03,Boston,2130.0,1250.0,739000.0,1009055.0
100374,9.0,3.0,0.09,Boston,2131.0,3204.0,1200000.0,1963720.0
100378,3.0,3.0,0.06,Boston,2130.0,2623.0,949000.0,1679861.0
100811,4.0,7.0,1.0,Boston,2115.0,4754.0,9750000.0,2721002.0


In [19]:
# Line plot of the model's predicted price against house size (the best fit line)
best_fit_line = df_realtor_predicted.hvplot(
    kind="line",
    x="house_size",
    y="price_predicted",
    color="purple",
)
best_fit_line

In [20]:
# Superpose the original data and the best fit line:
# the * operator combines the scatter plot and the line plot into a single figure
df_plot * best_fit_line

In [21]:
# Evaluate the fit on the same data the model was trained on.
# score and r2 are both the coefficient of determination, so they print the same value.
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)

# Error metrics: mean squared error and its square root (same units as price)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Spread of the actual prices, as a baseline to compare the RMSE against
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.4558341067042494.
The r2 is 0.4558341067042494.
The mean squared error is 1572620423873.0698.
The root mean squared error is 1254041.635621828.
The standard deviation is 1699989.7264195483.
