In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Read the data

In [2]:
# Load the cleaned listings CSV (the path is relative to the notebook's working directory)
listings_csv = Path("clean_data_outlier1.csv")
real_state_data = pd.read_csv(listings_csv)

# Preview the first rows to confirm the load
real_state_data.head()

Unnamed: 0.1,Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,0,318990176,"649 Hargraves Avenue NE, Royal City, WA 99357",Hargraves Avenue NE,Royal City,WA,46.906307,-119.622284,99357.0,399900.0,4.0,3.0,229.0,1742.0,17193.132,sqft,SINGLE_FAMILY
1,1,204997510,"15822 S Clear View Loop, Kennewick, WA 99338",S Clear View Loop,Kennewick,WA,46.19008,-119.31846,99338.0,799000.0,4.0,3.0,306.0,2611.0,26893.944,sqft,SINGLE_FAMILY
2,2,2064111934,"3602 3602/3600 W 15th Ave, Kennewick, WA 99338",3602/3600 W 15th Ave,Kennewick,WA,46.19497,-119.19278,99338.0,389000.0,4.0,2.0,167.0,2328.0,15681.6,sqft,MULTI_FAMILY
3,3,331355531,"6087 W 34th Ave, Kennewick, WA 99338",W 34th Ave,Kennewick,WA,46.178677,-119.20451,99338.0,525000.0,4.0,3.0,220.0,2380.0,9583.0,sqft,SINGLE_FAMILY
4,4,85939771,"1506 S Olson St, Kennewick, WA 99338",S Olson St,Kennewick,WA,46.194546,-119.17232,99338.0,350000.0,3.0,2.0,228.0,1529.0,12632.4,sqft,SINGLE_FAMILY


In [3]:
# Drop the leftover "Unnamed: 0" column (it appears to be a row counter saved
# into the CSV; its values mirror the default index in the preview above).
# `errors="ignore"` keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising a KeyError.
real_state_data = real_state_data.drop(columns="Unnamed: 0", errors="ignore")

# Display
real_state_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,318990176,"649 Hargraves Avenue NE, Royal City, WA 99357",Hargraves Avenue NE,Royal City,WA,46.906307,-119.622284,99357.0,399900.0,4.0,3.0,229.0,1742.0,17193.132,sqft,SINGLE_FAMILY
1,204997510,"15822 S Clear View Loop, Kennewick, WA 99338",S Clear View Loop,Kennewick,WA,46.19008,-119.31846,99338.0,799000.0,4.0,3.0,306.0,2611.0,26893.944,sqft,SINGLE_FAMILY
2,2064111934,"3602 3602/3600 W 15th Ave, Kennewick, WA 99338",3602/3600 W 15th Ave,Kennewick,WA,46.19497,-119.19278,99338.0,389000.0,4.0,2.0,167.0,2328.0,15681.6,sqft,MULTI_FAMILY
3,331355531,"6087 W 34th Ave, Kennewick, WA 99338",W 34th Ave,Kennewick,WA,46.178677,-119.20451,99338.0,525000.0,4.0,3.0,220.0,2380.0,9583.0,sqft,SINGLE_FAMILY
4,85939771,"1506 S Olson St, Kennewick, WA 99338",S Olson St,Kennewick,WA,46.194546,-119.17232,99338.0,350000.0,3.0,2.0,228.0,1529.0,12632.4,sqft,SINGLE_FAMILY


In [4]:
# Keep only the listings whose state is Arizona
is_arizona = real_state_data["state"].eq("AZ")
arizona_data = real_state_data.loc[is_arizona]
arizona_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
69924,8352055,"854 Citrus St, Bullhead City, AZ 86442",Citrus St,Bullhead City,AZ,35.11426,-114.618385,86442.0,54900.0,2.0,1.0,79.0,688.0,5000.0,sqft,MANUFACTURED
69925,104049502,"411 Riverfront Dr LOT 5, Bullhead City, AZ 86442",Riverfront Dr LOT 5,Bullhead City,AZ,35.1137,-114.63446,86442.0,2200000.0,6.0,8.0,425.0,5173.0,6534.0,sqft,SINGLE_FAMILY
69926,64954116,"1825 E Shore Villas Dr UNIT 33, Bullhead City,...",E Shore Villas Dr UNIT 33,Bullhead City,AZ,35.1227,-114.58975,86442.0,379900.0,3.0,3.0,230.0,1649.0,3640.0,sqft,SINGLE_FAMILY
69927,71677227,"790 Stahlman Dr, Bullhead City, AZ 86442",Stahlman Dr,Bullhead City,AZ,35.1181,-114.62062,86442.0,549900.0,3.0,4.0,169.0,3250.0,10225.0,sqft,SINGLE_FAMILY
69928,8351548,"905 Citrus St, Bullhead City, AZ 86442",Citrus St,Bullhead City,AZ,35.114693,-114.61625,86442.0,169900.0,3.0,2.0,146.0,1157.0,5000.0,sqft,MANUFACTURED


In [5]:
# Promote property_id from a regular column to the row index
arizona_data = arizona_data.set_index(keys="property_id")

In [6]:
# Standardize the numeric feature columns with scikit-learn's `StandardScaler`
numeric_features = [
    'latitude', 'longitude', 'price', 'bedroom_number',
    'bathroom_number', 'price_per_unit', 'living_space', 'land_space',
]
feature_scaler = StandardScaler()
arizona_data_scaled = feature_scaler.fit_transform(arizona_data[numeric_features])

In [7]:
# Wrap the scaled array in a DataFrame: columns carry the feature names and the
# row index is taken straight from the unscaled Arizona data (property_id)
scaled_column_names = [
    'latitude', 'longitude', 'price', 'bedroom_number',
    'bathroom_number', 'price_per_unit', 'living_space', 'land_space',
]
df_arizona_scaled = pd.DataFrame(
    arizona_data_scaled,
    index=arizona_data.index,
    columns=scaled_column_names,
)

# Display sample data
df_arizona_scaled.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8352055,2.212841,-2.821027,-1.146725,-1.207285,-1.498529,-1.736493,-1.529693,-0.557056
104049502,2.212145,-2.838025,3.981389,2.878758,5.887922,1.409227,4.005264,-0.288013
64954116,2.223324,-2.790747,-0.369775,-0.185774,0.611886,-0.36365,-0.343719,-0.795581
71677227,2.217611,-2.82339,0.036631,-0.185774,1.667093,-0.918242,1.632082,0.359337
8351548,2.213379,-2.818769,-0.871804,-0.185774,-0.443322,-1.127351,-0.950898,-0.557056


## Initialize the K-Means model

In [8]:
# Candidate cluster counts to evaluate: k = 1 through 10
# (the upper bound passed to range, 11, is excluded)
k = [*range(1, 11)]

In [9]:
# Create an empty list to store the inertia values
inertia = []

# Fit one KMeans model per candidate k on the scaled features and record its
# inertia for the elbow curve.
# `n_init=10` is passed explicitly: it matches the value the implicit default
# resolved to when this notebook was run, silences the FutureWarning about the
# default changing, and keeps the inertia values comparable across
# scikit-learn versions.
for i in k:
    k_model = KMeans(n_clusters=i, n_init=10, random_state=1)
    k_model.fit(df_arizona_scaled)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [10]:
# Pair each candidate k with its inertia so the elbow curve can be plotted
elbow_data = dict(k=k, inertia=inertia)

# Tabulate the elbow data and preview the first rows
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,147936.0
1,2,116497.830969
2,3,100003.464654
3,4,86882.284368
4,5,78156.381506


In [11]:
# Draw inertia against k as a line chart; the bend ("elbow") in the curve is
# used to pick the number of clusters visually.
elbow_plot_options = dict(x="k", y="inertia", title="Elbow Curve", xticks=k)
elbow_plot = df_elbow.hvplot.line(**elbow_plot_options)
elbow_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [12]:
# Initialize the K-Means model using the best value for k.
# `random_state` is fixed so the cluster assignments (and their integer labels)
# are reproducible on Restart & Run All; `n_init` is explicit to avoid the
# FutureWarning and version-dependent defaults.
model = KMeans(n_clusters=4, n_init=10, random_state=1)

In [13]:
# Train the K-Means model on the standardized Arizona features
model.fit(X=df_arizona_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


In [14]:
# Assign every Arizona property to a cluster using the scaled features
clusters = model.predict(X=df_arizona_scaled)

# Show the resulting array of cluster labels
print(clusters)

[3 1 3 ... 3 0 0]


In [15]:
# Work on an independent copy so the scaled features stay untouched
df_arizona_predictions = df_arizona_scaled.copy(deep=True)

In [16]:
# Attach the predicted cluster label of each property as a "cluster" column
df_arizona_predictions = df_arizona_predictions.assign(cluster=clusters)

# Preview the labelled data
df_arizona_predictions.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,cluster
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8352055,2.212841,-2.821027,-1.146725,-1.207285,-1.498529,-1.736493,-1.529693,-0.557056,3
104049502,2.212145,-2.838025,3.981389,2.878758,5.887922,1.409227,4.005264,-0.288013,1
64954116,2.223324,-2.790747,-0.369775,-0.185774,0.611886,-0.36365,-0.343719,-0.795581,3
71677227,2.217611,-2.82339,0.036631,-0.185774,1.667093,-0.918242,1.632082,0.359337,0
8351548,2.213379,-2.818769,-0.871804,-0.185774,-0.443322,-1.127351,-0.950898,-0.557056,3


In [17]:
# Scatter plot of price vs. living space, colored by K-Means cluster.
# The plotted values come from the StandardScaler output (z-scores), not raw
# dollars / square feet, so the axis labels say so explicitly.
arizona_predictions_plot = df_arizona_predictions.hvplot.scatter(
    x="price",
    y="living_space",
    by="cluster",
    hover_cols=["property_id"],
    xlabel="Price (standardized)",
    ylabel="Living Space (standardized)",
    title="Arizona",
)
arizona_predictions_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [18]:
# Scatter plot of price vs. number of bedrooms, colored by K-Means cluster.
# The plotted values come from the StandardScaler output (z-scores), not raw
# dollars / bedroom counts, so the axis labels say so explicitly.
arizona_predictions_plot = df_arizona_predictions.hvplot.scatter(
    x="price",
    y="bedroom_number",
    by="cluster",
    hover_cols=["property_id"],
    xlabel="Price (standardized)",
    ylabel="Bedroom Number (standardized)",
    title="Arizona",
)
arizona_predictions_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


## Optimize Clusters with Principal Component Analysis

In [19]:
# Configure a PCA model that keeps the first three principal components
n_principal_components = 3
pca = PCA(n_components=n_principal_components)

In [20]:
# Fit PCA on the scaled features and project them onto the three
# principal components.
arizona_pca = pca.fit_transform(X=df_arizona_scaled)

# Preview the first five rows of the transformed array.
arizona_pca[:5, :]

array([[-2.74982807,  2.69993048, -2.72426556],
       [ 8.16169975,  1.57458416, -2.57307549],
       [-0.20195494,  2.58619454, -2.51840585],
       [ 1.68256145,  1.89004159, -3.11164794],
       [-1.33769638,  2.43318566, -2.89977274]])

In [21]:
# Retrieve the explained variance to determine how much information
# can be attributed to each principal component.
# In the saved output of this cell the three ratios sum to roughly 0.746,
# i.e. the three components retain about 75% of the variance of the
# scaled features.
pca.explained_variance_ratio_

array([0.38893503, 0.2115282 , 0.1455428 ])

In [22]:
# Wrap the PCA output in a DataFrame: one column per principal component,
# indexed by the property_id of the corresponding Arizona listing
pc_labels = ["PC1", "PC2", "PC3"]
df_arizona_pca = pd.DataFrame(
    arizona_pca,
    index=arizona_data.index,
    columns=pc_labels,
)

# Display sample data
df_arizona_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8352055,-2.749828,2.69993,-2.724266
104049502,8.1617,1.574584,-2.573075
64954116,-0.201955,2.586195,-2.518406
71677227,1.682561,1.890042,-3.111648
8351548,-1.337696,2.433186,-2.899773


In [23]:
# Candidate cluster counts for the PCA data: k = 1 through 10
# (the upper bound passed to range, 11, is excluded)
k_2 = [*range(1, 11)]

In [24]:
# Create an empty list to store the inertia values
inertia_2 = []

# Fit one KMeans model per candidate k on the PCA-reduced data and record its
# inertia for the elbow curve.
# `n_init=10` is passed explicitly: it matches the value the implicit default
# resolved to when this notebook was run, silences the FutureWarning about the
# default changing, and keeps the inertia values comparable across
# scikit-learn versions.
for i in k_2:
    k_model = KMeans(n_clusters=i, n_init=10, random_state=0)
    k_model.fit(df_arizona_pca)
    inertia_2.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [25]:
# Pair each candidate k with the inertia measured on the PCA data
elbow_pca = dict(k=k_2, inertia=inertia_2)

# Tabulate the elbow data and preview the first rows
df_elbow_pca = pd.DataFrame(data=elbow_pca)
df_elbow_pca.head()

Unnamed: 0,k,inertia
0,1,110361.147394
1,2,78989.2477
2,3,62598.851636
3,4,49953.34359
4,5,43202.147033


In [26]:
# Draw inertia against k for the PCA data as a line chart; the bend ("elbow")
# in the curve is used to pick the number of clusters visually.
elbow_pca_plot_options = dict(
    x="k",
    y="inertia",
    title="Elbow Curve using PCA data",
    xticks=k_2,
)
elbow_pca_plot = df_elbow_pca.hvplot.line(**elbow_pca_plot_options)
elbow_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [27]:
# Initialize the K-Means model using the best value for k.
# `random_state` is fixed so the cluster assignments (and their integer labels)
# are reproducible on Restart & Run All; `n_init` is explicit to avoid the
# FutureWarning and version-dependent defaults.
# NOTE(review): this rebinds `model`, replacing the estimator that was fitted
# on the scaled features earlier in the notebook.
model = KMeans(n_clusters=4, n_init=10, random_state=1)

In [28]:
# Train the K-Means model on the three principal components
model.fit(X=df_arizona_pca)

  super()._check_params_vs_input(X, default_n_init=10)


In [29]:
# Assign every property to a cluster using the PCA data.
# Despite its name, `k_3` holds the per-property cluster labels, not a k value.
k_3 = model.predict(X=df_arizona_pca)
# Display the resulting array of cluster labels.
k_3

array([1, 3, 1, ..., 1, 0, 0], dtype=int32)

In [30]:
# Build a new DataFrame from the PCA data with the predicted cluster labels
# attached as a "predicted_clusters" column (the PCA frame itself is not modified)
df_arizona_predictions_pca = df_arizona_pca.assign(predicted_clusters=k_3)

# Preview the labelled data
df_arizona_predictions_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3,predicted_clusters
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8352055,-2.749828,2.69993,-2.724266,1
104049502,8.1617,1.574584,-2.573075,3
64954116,-0.201955,2.586195,-2.518406,1
71677227,1.682561,1.890042,-3.111648,0
8351548,-1.337696,2.433186,-2.899773,1


In [31]:
# Scatter plot of the first two principal components, colored by the cluster
# predicted from the PCA data. A title is set so the figure stands on its own,
# matching the other scatter plots in this notebook.
arizona_predictions_pca_plot = df_arizona_predictions_pca.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="predicted_clusters",
    hover_cols=["property_id"],
    title="Arizona clusters (PCA components)",
)
arizona_predictions_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [32]:
# Inspect which original features drive each principal component.
# `PCA` is already imported in the first cell, so it is not re-imported here.

# Perform PCA on the scaled dataset
pca = PCA(n_components=3)
principal_components = pca.fit_transform(df_arizona_scaled)

# Get the loadings (one row per principal component, one column per feature)
loadings = pca.components_

# Label the loading columns with the feature names taken from the data PCA was
# fitted on, instead of a separately maintained hard-coded list that could
# drift out of sync with the scaled DataFrame.
loadings_df = pd.DataFrame(loadings, columns=df_arizona_scaled.columns)

# Display the three features with the largest absolute loading for each
# principal component (the sign of the loading is discarded by `.abs()`)
for component_number, (_, component_loadings) in enumerate(loadings_df.iterrows(), start=1):
    print(f"Principal Component {component_number}:")
    features = component_loadings.abs().sort_values(ascending=False).head(3)
    print(features)
    print()

Principal Component 1:
living_space       0.505002
price              0.493664
bathroom_number    0.435294
Name: 0, dtype: float64

Principal Component 2:
latitude          0.602014
longitude         0.509980
price_per_unit    0.441787
Name: 1, dtype: float64

Principal Component 3:
price_per_unit    0.563132
longitude         0.556198
bedroom_number    0.376380
Name: 2, dtype: float64

