In [1]:
# Import the modules
import pandas as pd
import hvplot.pandas
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load the listings file "Resources/realtor-data.zip.csv" into a pandas DataFrame
csv_path = Path("Resources/realtor-data.zip.csv")
realtor_df = pd.read_csv(csv_path)

# Preview the first five rows
realtor_df.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0


In [3]:
# Report how many rows the freshly loaded DataFrame contains
row_count = realtor_df.shape[0]
print(row_count)

1401066


In [4]:
# Display DataFrame summary information: column names, non-null counts, dtypes
# and memory usage. Comparing the non-null counts with the total number of
# entries shows which columns contain missing values.
realtor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401066 entries, 0 to 1401065
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   status          1401066 non-null  object 
 1   bed             1184538 non-null  float64
 2   bath            1206853 non-null  float64
 3   acre_lot        1043599 non-null  float64
 4   city            1400875 non-null  object 
 5   state           1401066 non-null  object 
 6   zip_code        1400587 non-null  float64
 7   house_size      950954 non-null   float64
 8   prev_sold_date  714773 non-null   object 
 9   price           1400958 non-null  float64
dtypes: float64(6), object(4)
memory usage: 106.9+ MB


In [5]:
# Derive price per square foot as price divided by house_size.
# Rows where either operand is missing get NaN in the new column.
realtor_df["price_sq_foot"] = realtor_df["price"].div(realtor_df["house_size"])

In [6]:
# Preview the first five rows, now including the price_sq_foot column
realtor_df.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price,price_sq_foot
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0,114.130435
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0,52.390308
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0,89.572193
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0,80.555556
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0,


In [7]:
# Drop the columns that the rest of the analysis does not use.
# The result is rebound instead of using `inplace=True`, so the step reads as an
# explicit transformation, and `columns=` already selects the axis, so the
# redundant `axis=1` argument is omitted.
realtor_df = realtor_df.drop(columns=["prev_sold_date", "state", "status"])

In [8]:
# Remove every row that has a missing value in any remaining column
realtor_df = realtor_df.dropna(axis="index", how="any")

In [9]:
# Preview the first five rows after dropping columns and incomplete rows
realtor_df.head(n=5)

Unnamed: 0,bed,bath,acre_lot,city,zip_code,house_size,price,price_sq_foot
0,3.0,2.0,0.12,Adjuntas,601.0,920.0,105000.0,114.130435
1,4.0,2.0,0.08,Adjuntas,601.0,1527.0,80000.0,52.390308
2,2.0,1.0,0.15,Juana Diaz,795.0,748.0,67000.0,89.572193
3,4.0,2.0,0.1,Ponce,731.0,1800.0,145000.0,80.555556
5,4.0,3.0,0.46,San Sebastian,612.0,2520.0,179000.0,71.031746


In [10]:
# Report how many rows remain after removing rows with missing values
row_count = realtor_df.shape[0]
print(row_count)

680464


In [11]:
# Keep only listings whose price per square foot lies between 100 and 150,
# both bounds included.
# NOTE(review): because price_sq_foot = price / house_size, this filter bounds
# price to 100-150 times house_size for every remaining row, so a later fit of
# price against house_size on these rows is tightly constrained by construction.
MIN_PRICE_SQ_FOOT = 100.0
MAX_PRICE_SQ_FOOT = 150.0
in_price_band = realtor_df["price_sq_foot"].between(MIN_PRICE_SQ_FOOT, MAX_PRICE_SQ_FOOT)
realtor_df = realtor_df[in_price_band]

In [12]:
# Report how many rows remain after the price-per-square-foot filter
row_count = realtor_df.shape[0]
print(row_count)

91054


In [13]:
# Preview the first five rows of the filtered DataFrame
realtor_df.head(n=5)

Unnamed: 0,bed,bath,acre_lot,city,zip_code,house_size,price,price_sq_foot
0,3.0,2.0,0.12,Adjuntas,601.0,920.0,105000.0,114.130435
11,3.0,2.0,0.08,Juana Diaz,795.0,1045.0,150000.0,143.54067
15,3.0,2.0,0.08,Yauco,698.0,1100.0,120000.0,109.090909
18,3.0,2.0,3.88,San Sebastian,685.0,4000.0,575000.0,143.75
19,6.0,3.0,0.25,Anasco,610.0,1230.0,140000.0,113.821138


In [16]:
# Scatter plot of listing price (y) against house size (x)
scatter_options = {
    "x": "house_size",
    "y": "price",
    "title": "Expected Home Price Based on House Size",
}
df_plot = realtor_df.hvplot.scatter(**scatter_options)
df_plot

In [17]:
# Use "house_size" as the independent variable X. Selecting it with a list of
# one column yields a two-dimensional (n_rows, 1) array, the layout passed to
# the regression model below.
X = realtor_df[["house_size"]].to_numpy()

# Show the first five entries
X[:5]

array([[ 920.],
       [1045.],
       [1100.],
       [4000.],
       [1230.]])

In [18]:
# Confirm X has one row per listing and a single feature column
np.shape(X)

(91054, 1)

In [19]:
# The target variable y is the listing price column
y = realtor_df.loc[:, "price"]

In [20]:
# Initialize a scikit-learn linear regression model with its default settings.
# It is fitted on X (house_size) and y (price) in the next cell.
model = LinearRegression()

In [21]:
# Train the model on every filtered row: house_size as feature, price as target
model.fit(X=X, y=y)

In [22]:
# Display the slope. `model.coef_` is an array with one entry per feature, so
# the value is printed inside square brackets.
print(f"Model's slope: {model.coef_}")

Model's slope: [121.97557224]


In [23]:
# Display the y-intercept: the predicted price when house_size is 0
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 7788.330856125045


In [24]:
# Display the model's best fit line formula, y = intercept + slope * X.
# `model.coef_[0]` picks the single slope value out of the coefficient array.
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 7788.330856125045 + 121.9755722403633X


In [25]:
# Predict a price for every row of X (the same rows the model was fitted on)
predicted_y_values = model.predict(X=X)

In [26]:
# Build a new DataFrame holding the filtered data plus a "price_predicted"
# column. `assign` returns a new frame, so realtor_df itself is left unchanged.
df_realtor_predicted = realtor_df.assign(price_predicted=predicted_y_values)

# Preview the first five rows
df_realtor_predicted.head(n=5)

Unnamed: 0,bed,bath,acre_lot,city,zip_code,house_size,price,price_sq_foot,price_predicted
0,3.0,2.0,0.12,Adjuntas,601.0,920.0,105000.0,114.130435,120005.857317
11,3.0,2.0,0.08,Juana Diaz,795.0,1045.0,150000.0,143.54067,135252.803847
15,3.0,2.0,0.08,Yauco,698.0,1100.0,120000.0,109.090909,141961.460321
18,3.0,2.0,3.88,San Sebastian,685.0,4000.0,575000.0,143.75,495690.619818
19,6.0,3.0,0.25,Anasco,610.0,1230.0,140000.0,113.821138,157818.284712


In [27]:
# Line plot of the predicted price against house size (the fitted line)
line_options = {
    "x": "house_size",
    "y": "price_predicted",
    "color": "purple",
}
best_fit_line = df_realtor_predicted.hvplot.line(**line_options)
best_fit_line

In [28]:
# Superpose the original data and the best fit line: the `*` operator combines
# the scatter plot (df_plot) and the line plot (best_fit_line) into one figure.
df_plot * best_fit_line

In [29]:
# Evaluate the fitted line: score, r2, mse, rmse, plus the spread of the target.
# NOTE(review): every metric is computed on the same X and y that were used to
# fit the model, so these are in-sample figures; no held-out data is involved.
std = np.std(y)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
r2 = r2_score(y, predicted_y_values)
# `score` and `r2` are computed from the same data; the printed output shows
# them to be equal.
score = model.score(X, y)

# Report the metrics
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.9604620934831707.
The r2 is 0.9604620934831707.
The mean squared error is 1541970082.9353237.
The root mean squared error is 39267.926898874146.
The standard deviation is 197483.64692489404.
