In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Read the data

In [2]:
# Load the raw real-estate listings into a Pandas DataFrame.
# The path is relative, so it is resolved against the current working directory.
raw_data_path = Path("raw_data_copy.csv")
real_state_data = pd.read_csv(raw_data_path)

# Preview the first rows of the DataFrame
real_state_data.head()

Unnamed: 0.1,Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,1,249518113,"117 3rd St, Wrangell, AK 99929",3rd St,Wrangell,AK,56.474518,-132.386,99929.0,589500.0,3.0,3.0,237.0,2478.0,7492.0,sqft,SINGLE_FAMILY
1,4,249518139,"335 Cassiar St, Wrangell, AK 99929",Cassiar St,Wrangell,AK,56.475697,-132.38905,99929.0,405000.0,5.0,3.0,194.0,2080.0,10436.0,sqft,SINGLE_FAMILY
2,7,2069574197,"532.5 Front St, Wrangell, AK 99929",5 Front St,Wrangell,AK,56.468544,-132.37965,99929.0,130000.0,3.0,1.0,104.0,1250.0,3483.0,sqft,SINGLE_FAMILY
3,44,243013558,"501 Noseeum St, Petersburg, AK 99833",Noseeum St,Petersburg,AK,56.80682,-132.9618,99833.0,165000.0,1.0,1.0,235.0,700.0,15367.968,sqft,SINGLE_FAMILY
4,48,243013641,"905 Odin St, Petersburg, AK 99833",Odin St,Petersburg,AK,56.80623,-132.97113,99833.0,375000.0,3.0,3.0,226.0,1652.0,8000.0,sqft,SINGLE_FAMILY


In [3]:
# Delete the leftover 'Unnamed: 0' column that was read in from the CSV.
# `errors="ignore"` keeps this cell safe to re-run: on a second execution the
# column is already gone and a plain drop would raise a KeyError.
real_state_data = real_state_data.drop(columns="Unnamed: 0", errors="ignore")

# Display
real_state_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,249518113,"117 3rd St, Wrangell, AK 99929",3rd St,Wrangell,AK,56.474518,-132.386,99929.0,589500.0,3.0,3.0,237.0,2478.0,7492.0,sqft,SINGLE_FAMILY
1,249518139,"335 Cassiar St, Wrangell, AK 99929",Cassiar St,Wrangell,AK,56.475697,-132.38905,99929.0,405000.0,5.0,3.0,194.0,2080.0,10436.0,sqft,SINGLE_FAMILY
2,2069574197,"532.5 Front St, Wrangell, AK 99929",5 Front St,Wrangell,AK,56.468544,-132.37965,99929.0,130000.0,3.0,1.0,104.0,1250.0,3483.0,sqft,SINGLE_FAMILY
3,243013558,"501 Noseeum St, Petersburg, AK 99833",Noseeum St,Petersburg,AK,56.80682,-132.9618,99833.0,165000.0,1.0,1.0,235.0,700.0,15367.968,sqft,SINGLE_FAMILY
4,243013641,"905 Odin St, Petersburg, AK 99833",Odin St,Petersburg,AK,56.80623,-132.97113,99833.0,375000.0,3.0,3.0,226.0,1652.0,8000.0,sqft,SINGLE_FAMILY


In [4]:
# Keep only the listings whose `state` column is "CA"
is_california = real_state_data["state"].eq("CA")
california_data = real_state_data.loc[is_california]
california_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
35441,124738796,"11647 Henness Rd, Truckee, CA 96161",Henness Rd,Truckee,CA,39.34353,-120.16769,96161.0,2350000.0,4.0,3.0,896.0,2622.0,26136.0,sqft,SINGLE_FAMILY
35442,19456203,"12337 Saint Bernard Dr, Truckee, CA 96161",Saint Bernard Dr,Truckee,CA,39.353893,-120.21115,96161.0,995000.0,3.0,3.0,577.0,1722.0,14374.8,sqft,SINGLE_FAMILY
35443,205278378,"12721 Granite Dr, Truckee, CA 96161",Granite Dr,Truckee,CA,39.339485,-120.190155,96161.0,2800000.0,4.0,6.0,821.0,3410.0,28749.6,sqft,SINGLE_FAMILY
35444,114451084,"10615 Sara Bear Ln, Truckee, CA 96161",Sara Bear Ln,Truckee,CA,39.32298,-120.174446,96161.0,1275000.0,3.0,3.0,747.0,1706.0,5227.2,sqft,SINGLE_FAMILY
35445,19434659,"11553 E Ridge Rd, Truckee, CA 96161",E Ridge Rd,Truckee,CA,39.33503,-120.15787,96161.0,949900.0,5.0,3.0,443.0,2142.0,10018.8,sqft,SINGLE_FAMILY


In [5]:
# Plot your data to see what's in your DataFrame
# NOTE: disabled leftover — `texas_df` is not defined in any cell shown in this notebook.
#texas_df.hvplot.line(width=800, height=400,rot=90)

In [6]:
# Set index to property_id.
# The guard makes the cell idempotent: once `property_id` is the index it is no
# longer a column, so calling set_index again would raise a KeyError.
if california_data.index.name != 'property_id':
    california_data = california_data.set_index('property_id')

In [7]:
# Standardize the numeric features with scikit-learn's `StandardScaler()`
# so that every column is on a comparable scale before clustering.
features_to_scale = [
    'latitude', 'longitude', 'price', 'bedroom_number',
    'bathroom_number', 'price_per_unit', 'living_space', 'land_space',
]
scaler = StandardScaler()
california_data_scaled = scaler.fit_transform(california_data[features_to_scale])

In [8]:
# Wrap the scaled array in a DataFrame, labelling the columns with the feature
# names and reusing the property ids of the original data as the row index
# (named "property_id").
df_california_scaled = pd.DataFrame(
    california_data_scaled,
    columns=['latitude', 'longitude', 'price', 'bedroom_number',
             'bathroom_number', 'price_per_unit', 'living_space', 'land_space'],
    index=pd.Index(california_data.index, name="property_id"),
)

# Display sample data
df_california_scaled.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
124738796,1.727267,-0.461691,0.384471,0.027731,0.021666,0.006858,-0.0038,-0.011087
19456203,1.732053,-0.483288,-0.078316,-0.039461,0.021666,-0.010855,-0.008511,-0.011132
205278378,1.725398,-0.472855,0.538164,0.027731,0.345487,0.002694,0.000326,-0.011077
114451084,1.717775,-0.465049,0.017315,-0.039461,0.021666,-0.001415,-0.008595,-0.011168
19434659,1.723341,-0.456812,-0.093719,0.094922,0.021666,-0.018296,-0.006312,-0.011149


## Initialize the K-Means model

In [9]:
# Candidate numbers of clusters to evaluate: 1 through 10
# (the end of the range, 11, is exclusive)
k = [*range(1, 11)]

In [10]:
# Create an empty list to store the inertia value obtained for each candidate k
inertia = []

# For every candidate k:
# 1. Create a KMeans model using the loop counter for n_clusters
# 2. Fit the model to the scaled data
# 3. Append the model.inertia_ to the inertia list
#
# `n_init=10` is passed explicitly: it is the value scikit-learn was already
# applying by default here (see the `default_n_init=10` warning this cell used
# to emit), so the results are unchanged while the FutureWarning disappears and
# the behavior no longer depends on the library's default.
for i in k:
    k_model = KMeans(n_clusters=i, n_init=10, random_state=1)
    k_model.fit(df_california_scaled)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
# Pair each candidate k with the inertia measured for it
elbow_data = dict(k=k, inertia=inertia)

# Turn the pairs into a two-column DataFrame used to draw the Elbow curve
df_elbow = pd.DataFrame.from_dict(elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,529512.0
1,2,419269.128815
2,3,314621.702817
3,4,250708.131231
4,5,197196.048848


In [12]:
# Draw inertia against k as a line chart; the "elbow" of the curve is used to
# pick the number of clusters visually.
elbow_plot_options = dict(x="k", y="inertia", title="Elbow Curve", xticks=k)
elbow_plot = df_elbow.hvplot.line(**elbow_plot_options)
elbow_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [13]:
# Initialize the K-Means model using the best value for k.
# `random_state` makes the cluster assignment reproducible across runs, and
# `n_init=10` pins the number of initializations explicitly instead of relying
# on the library default (which triggers a FutureWarning).
model = KMeans(n_clusters=6, n_init=10, random_state=1)

In [14]:
# Train the K-Means model on the standardized California features
model.fit(X=df_california_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


In [15]:
# Assign each property to a cluster using the scaled data
clusters = model.predict(X=df_california_scaled)

# Print the resulting array of cluster labels.
print(clusters)

[1 1 1 ... 0 0 0]


In [16]:
# Work on an independent copy so the scaled DataFrame itself stays unchanged
df_california_predictions = df_california_scaled.copy(deep=True)

In [17]:
# Attach the predicted cluster label of every property as a `cluster` column
df_california_predictions = df_california_predictions.assign(cluster=clusters)

# Display sample data
df_california_predictions.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,cluster
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
124738796,1.727267,-0.461691,0.384471,0.027731,0.021666,0.006858,-0.0038,-0.011087,1
19456203,1.732053,-0.483288,-0.078316,-0.039461,0.021666,-0.010855,-0.008511,-0.011132,1
205278378,1.725398,-0.472855,0.538164,0.027731,0.345487,0.002694,0.000326,-0.011077,1
114451084,1.717775,-0.465049,0.017315,-0.039461,0.021666,-0.001415,-0.008595,-0.011168,1
19434659,1.723341,-0.456812,-0.093719,0.094922,0.021666,-0.018296,-0.006312,-0.011149,1


In [18]:
# Create a scatter plot using hvPlot: scaled price against scaled land space,
# one color per predicted cluster. A title is set so the figure stands on its
# own, consistent with the Elbow curve plots in this notebook.
california_predictions_plot = df_california_predictions.hvplot.scatter(
    x="price",
    y="land_space",
    by="cluster",
    hover_cols = "property_id",
    title="California Property Clusters (scaled features)"
)
california_predictions_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


## Optimize Clusters with Principal Component Analysis

In [19]:
# Instantiate a PCA model that keeps three principal components (`n_components=3`).
pca = PCA(n_components=3)

In [20]:
# Fit the PCA model on the scaled data and project it onto the
# three principal components in a single `fit_transform` call.
california_pca = pca.fit_transform(X=df_california_scaled)

# View the first five rows of the resulting array.
california_pca[:5, :]

array([[-1.34270816,  0.73804825,  0.2710116 ],
       [-1.42496222,  0.65726245, -0.03544821],
       [-1.23413526,  0.96114242,  0.3591247 ],
       [-1.39481408,  0.6568352 ,  0.02877765],
       [-1.36445767,  0.72988856, -0.05950056]])

In [21]:
# Retrieve the explained variance ratio to determine how much information
# (share of the scaled data's total variance) can be attributed to each of the
# three principal components.
pca.explained_variance_ratio_

array([0.24184406, 0.22809646, 0.13267315])

In [22]:
# Wrap the three principal components in a DataFrame, reusing the property ids
# of the original California data as the row index (named "property_id").
df_california_pca = pd.DataFrame(
    california_pca,
    columns=["PC1", "PC2", "PC3"],
    index=pd.Index(california_data.index, name="property_id"),
)

# Display sample data
df_california_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
124738796,-1.342708,0.738048,0.271012
19456203,-1.424962,0.657262,-0.035448
205278378,-1.234135,0.961142,0.359125
114451084,-1.394814,0.656835,0.028778
19434659,-1.364458,0.729889,-0.059501


In [23]:
# Candidate numbers of clusters to evaluate on the PCA data: 1 through 10
# (the end of the range, 11, is exclusive)
k_2 = [*range(1, 11)]

In [24]:
# Create an empty list to store the inertia value obtained for each candidate k
inertia_2 = []

# For every candidate k:
# 1. Create a KMeans model using the loop counter for n_clusters
# 2. Fit the model to the PCA data
# 3. Append the model.inertia_ to the inertia list
#
# `n_init=10` is passed explicitly: it is the value scikit-learn was already
# applying by default here (see the `default_n_init=10` warning this cell used
# to emit), so the results are unchanged while the FutureWarning disappears and
# the behavior no longer depends on the library's default.
for i in k_2:
    k_model = KMeans(n_clusters=i, n_init=10, random_state=0)
    k_model.fit(df_california_pca)
    inertia_2.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [25]:
# Pair each candidate k with the inertia measured on the PCA data
elbow_pca = dict(k=k_2, inertia=inertia_2)

# Turn the pairs into a two-column DataFrame used to draw the Elbow curve
df_elbow_pca = pd.DataFrame.from_dict(elbow_pca)
df_elbow_pca.head()

Unnamed: 0,k,inertia
0,1,319091.168619
1,2,208865.818461
2,3,105937.170735
3,4,68955.087162
4,5,51577.834492


In [26]:
# Draw inertia against k for the PCA data as a line chart; the "elbow" of the
# curve is used to pick the number of clusters visually.
elbow_pca_plot_options = dict(
    x="k",
    y="inertia",
    title="Elbow Curve using PCA data",
    xticks=k_2,
)
elbow_pca_plot = df_elbow_pca.hvplot.line(**elbow_pca_plot_options)
elbow_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [28]:
# Initialize the K-Means model using the best value for k on the PCA data.
# `random_state` makes the cluster assignment reproducible across runs, and
# `n_init=10` pins the number of initializations explicitly instead of relying
# on the library default (which triggers a FutureWarning).
# Note: this rebinds `model`, replacing the 6-cluster model fitted on the scaled data.
model = KMeans(n_clusters=3, n_init=10, random_state=0)

In [29]:
# Train the K-Means model on the three principal components
model.fit(X=df_california_pca)

  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# Assign each property to a cluster using the PCA data
# (despite its name, `k_3` holds the predicted cluster labels, not a k value)
k_3 = model.predict(X=df_california_pca)

# Show the resulting array of cluster labels.
k_3

array([1, 1, 1, ..., 0, 0, 0], dtype=int32)

In [31]:
# Build a new DataFrame from the PCA data with the predicted cluster labels
# attached as a `predicted_clusters` column (`assign` returns a copy, so the
# PCA DataFrame itself is left untouched).
df_california_predictions_pca = df_california_pca.assign(predicted_clusters=k_3)

# Display sample data
df_california_predictions_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3,predicted_clusters
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
124738796,-1.342708,0.738048,0.271012,1
19456203,-1.424962,0.657262,-0.035448,1
205278378,-1.234135,0.961142,0.359125,1
114451084,-1.394814,0.656835,0.028778,1
19434659,-1.364458,0.729889,-0.059501,1


In [32]:
# Create a scatter plot using hvPlot: first against second principal component,
# one color per predicted cluster. A title is set so the figure stands on its
# own, consistent with the Elbow curve plots in this notebook.
california_predictions_pca_plot = df_california_predictions_pca.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="predicted_clusters",
    hover_cols="property_id",
    title="California Property Clusters using PCA data"
)
california_predictions_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [33]:
# Inspect which original features drive each principal component.
# (`PCA` is already imported in the notebook's first cell, so it is not re-imported here.)

# Fit a three-component PCA on the scaled California data
pca = PCA(n_components=3)
principal_components = pca.fit_transform(df_california_scaled)

# Loadings: one row per principal component, one column per original feature
loadings = pca.components_

# Label the loading columns with the feature names taken from the scaled
# DataFrame itself, so the labels cannot drift from the data PCA was fitted on
loadings_df = pd.DataFrame(loadings, columns=df_california_scaled.columns)

# Display the three features with the largest absolute loading for each component
for component_number, (_, component_loadings) in enumerate(loadings_df.iterrows(), start=1):
    print(f"Principal Component {component_number}:")
    top_features = component_loadings.abs().sort_values(ascending=False).head(3)  # Top 3 features
    print(top_features)
    print()

Principal Component 1:
latitude           0.638615
longitude          0.632966
bathroom_number    0.306862
Name: 0, dtype: float64

Principal Component 2:
bedroom_number     0.632896
bathroom_number    0.632252
longitude          0.311817
Name: 1, dtype: float64

Principal Component 3:
living_space      0.718155
price             0.664586
price_per_unit    0.168451
Name: 2, dtype: float64

