In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Read the data

In [2]:
# Load the raw CSV (path is relative to the notebook's working directory)
raw_csv_path = Path("raw_data_copy.csv")
real_state_data = pd.read_csv(raw_csv_path)

# Preview the first rows of the loaded DataFrame
real_state_data.head()

Unnamed: 0.1,Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,1,249518113,"117 3rd St, Wrangell, AK 99929",3rd St,Wrangell,AK,56.474518,-132.386,99929.0,589500.0,3.0,3.0,237.0,2478.0,7492.0,sqft,SINGLE_FAMILY
1,4,249518139,"335 Cassiar St, Wrangell, AK 99929",Cassiar St,Wrangell,AK,56.475697,-132.38905,99929.0,405000.0,5.0,3.0,194.0,2080.0,10436.0,sqft,SINGLE_FAMILY
2,7,2069574197,"532.5 Front St, Wrangell, AK 99929",5 Front St,Wrangell,AK,56.468544,-132.37965,99929.0,130000.0,3.0,1.0,104.0,1250.0,3483.0,sqft,SINGLE_FAMILY
3,44,243013558,"501 Noseeum St, Petersburg, AK 99833",Noseeum St,Petersburg,AK,56.80682,-132.9618,99833.0,165000.0,1.0,1.0,235.0,700.0,15367.968,sqft,SINGLE_FAMILY
4,48,243013641,"905 Odin St, Petersburg, AK 99833",Odin St,Petersburg,AK,56.80623,-132.97113,99833.0,375000.0,3.0,3.0,226.0,1652.0,8000.0,sqft,SINGLE_FAMILY


In [3]:
# Drop the leftover "Unnamed: 0" column
# (presumably a stale index written out with the CSV — confirm against the data source).
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
real_state_data = real_state_data.drop(columns="Unnamed: 0", errors="ignore")

# Display
real_state_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,249518113,"117 3rd St, Wrangell, AK 99929",3rd St,Wrangell,AK,56.474518,-132.386,99929.0,589500.0,3.0,3.0,237.0,2478.0,7492.0,sqft,SINGLE_FAMILY
1,249518139,"335 Cassiar St, Wrangell, AK 99929",Cassiar St,Wrangell,AK,56.475697,-132.38905,99929.0,405000.0,5.0,3.0,194.0,2080.0,10436.0,sqft,SINGLE_FAMILY
2,2069574197,"532.5 Front St, Wrangell, AK 99929",5 Front St,Wrangell,AK,56.468544,-132.37965,99929.0,130000.0,3.0,1.0,104.0,1250.0,3483.0,sqft,SINGLE_FAMILY
3,243013558,"501 Noseeum St, Petersburg, AK 99833",Noseeum St,Petersburg,AK,56.80682,-132.9618,99833.0,165000.0,1.0,1.0,235.0,700.0,15367.968,sqft,SINGLE_FAMILY
4,243013641,"905 Odin St, Petersburg, AK 99833",Odin St,Petersburg,AK,56.80623,-132.97113,99833.0,375000.0,3.0,3.0,226.0,1652.0,8000.0,sqft,SINGLE_FAMILY


In [4]:
# Keep only the rows whose state is Arizona ("AZ")
is_arizona = real_state_data["state"].eq("AZ")
arizona_data = real_state_data.loc[is_arizona]
arizona_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
114402,8352055,"854 Citrus St, Bullhead City, AZ 86442",Citrus St,Bullhead City,AZ,35.11426,-114.618385,86442.0,54900.0,2.0,1.0,79.0,688.0,5000.0,sqft,MANUFACTURED
114407,104049502,"411 Riverfront Dr LOT 5, Bullhead City, AZ 86442",Riverfront Dr LOT 5,Bullhead City,AZ,35.1137,-114.63446,86442.0,2200000.0,6.0,8.0,425.0,5173.0,6534.0,sqft,SINGLE_FAMILY
114409,64954116,"1825 E Shore Villas Dr UNIT 33, Bullhead City,...",E Shore Villas Dr UNIT 33,Bullhead City,AZ,35.1227,-114.58975,86442.0,379900.0,3.0,3.0,230.0,1649.0,3640.0,sqft,SINGLE_FAMILY
114410,71677227,"790 Stahlman Dr, Bullhead City, AZ 86442",Stahlman Dr,Bullhead City,AZ,35.1181,-114.62062,86442.0,549900.0,3.0,4.0,169.0,3250.0,10225.0,sqft,SINGLE_FAMILY
114414,52956862,"361 Riverfront Dr, Bullhead City, AZ 86442",Riverfront Dr,Bullhead City,AZ,35.112694,-114.635254,86442.0,6949000.0,7.0,9.0,976.0,7114.0,60112.8,sqft,SINGLE_FAMILY


In [5]:
# Use property_id as the index.
# The guard keeps this cell re-runnable: once property_id has been moved into
# the index it is no longer a column, and calling set_index again would raise KeyError.
if arizona_data.index.name != 'property_id':
    arizona_data = arizona_data.set_index('property_id')

In [6]:
# Standardize the numeric features with scikit-learn's `StandardScaler()`
feature_columns = [
    'latitude',
    'longitude',
    'price',
    'bedroom_number',
    'bathroom_number',
    'price_per_unit',
    'living_space',
    'land_space',
]
scaler = StandardScaler()
arizona_data_scaled = scaler.fit_transform(arizona_data[feature_columns])

In [7]:
# Wrap the scaled array in a DataFrame, reusing the property_id index of the
# original Arizona data so every scaled row stays tied to its property
df_arizona_scaled = pd.DataFrame(
    arizona_data_scaled,
    index=arizona_data.index,
    columns=[
        'latitude',
        'longitude',
        'price',
        'bedroom_number',
        'bathroom_number',
        'price_per_unit',
        'living_space',
        'land_space',
    ],
)

# Display sample data
df_arizona_scaled.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8352055,2.070204,-2.795702,-0.615302,-1.037567,-0.529719,-0.088034,-1.156802,-0.01621
104049502,2.069546,-2.812376,1.592207,2.260046,1.87769,0.039863,2.557587,-0.016162
64954116,2.080133,-2.765999,-0.280846,-0.213164,0.158112,-0.032218,-0.36092,-0.016253
71677227,2.074722,-2.79802,-0.1059,-0.213164,0.502028,-0.054766,0.964996,-0.016046
52956862,2.068362,-2.813199,6.479373,3.084449,2.221606,0.243539,4.165085,-0.014473


## Initialize the K-Means model

In [8]:
# Candidate cluster counts to evaluate: 1 through 10 (the range's upper bound, 11, is exclusive)
k = [n_clusters for n_clusters in range(1, 11)]

In [9]:
# Inertia (within-cluster sum of squares) for each candidate k
inertia = []

# For every candidate k:
# 1. Create a KMeans model with n_clusters set to that k
# 2. Fit the model to the scaled data
# 3. Record the fitted model's inertia_
#
# n_init is pinned to 10, the default in effect when this notebook was run
# (see the `default_n_init=10` warnings in the recorded output). Setting it
# explicitly silences that warning and keeps the curve from changing if a
# newer scikit-learn uses a different default.
for n_clusters in k:
    k_model = KMeans(n_clusters=n_clusters, random_state=1, n_init=10)
    k_model.fit(df_arizona_scaled)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [10]:
# Pair each candidate k with its inertia so the Elbow curve can be plotted
elbow_data = dict(k=k, inertia=inertia)

# Build the DataFrame that backs the Elbow curve and preview it
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,172984.0
1,2,149218.134242
2,3,130461.795705
3,4,112490.609183
4,5,95304.741711


In [11]:
# Line chart of inertia against k; the "elbow" of the curve is used to
# visually pick a value for k.
elbow_plot_options = dict(x="k", y="inertia", title="Elbow Curve", xticks=k)
elbow_plot = df_elbow.hvplot.line(**elbow_plot_options)
elbow_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [12]:
# Initialize the K-Means model using the chosen value for k
# (k=7 — presumably read off the elbow curve above; confirm).
# random_state makes the cluster labels reproducible across kernel restarts
# (same seed as the elbow loop), and n_init=10 pins the number of restarts so
# results do not depend on the installed scikit-learn version's default.
# NOTE: seeding may relabel clusters relative to previously saved outputs.
model = KMeans(n_clusters=7, random_state=1, n_init=10)

In [13]:
# Train the K-Means model on the standardized Arizona features
model.fit(X=df_arizona_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


In [14]:
# Assign each Arizona property to a cluster using the scaled data
clusters = model.predict(X=df_arizona_scaled)

# Print the resulting array of cluster labels
print(clusters)

[0 1 0 ... 0 0 1]


In [15]:
# Work on an independent copy so the scaled DataFrame itself is left unmodified
df_arizona_predictions = df_arizona_scaled.copy(deep=True)

In [16]:
# Attach the predicted cluster label of every property as a "cluster" column
df_arizona_predictions = df_arizona_predictions.assign(cluster=clusters)

# Display sample data
df_arizona_predictions.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,cluster
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8352055,2.070204,-2.795702,-0.615302,-1.037567,-0.529719,-0.088034,-1.156802,-0.01621,0
104049502,2.069546,-2.812376,1.592207,2.260046,1.87769,0.039863,2.557587,-0.016162,1
64954116,2.080133,-2.765999,-0.280846,-0.213164,0.158112,-0.032218,-0.36092,-0.016253,0
71677227,2.074722,-2.79802,-0.1059,-0.213164,0.502028,-0.054766,0.964996,-0.016046,0
52956862,2.068362,-2.813199,6.479373,3.084449,2.221606,0.243539,4.165085,-0.014473,5


In [17]:
# Scatter plot of the clustered properties on two of the scaled features,
# one colour per cluster, using hvPlot.
# The plotted values are StandardScaler outputs rather than raw values, so the
# title says so explicitly; property_id (the index) is shown on hover.
arizona_predictions_plot = df_arizona_predictions.hvplot.scatter(
    x="price",
    y="land_space",
    by="cluster",
    hover_cols="property_id",
    title="Arizona Property Clusters (standardized price vs. land space)"
)
arizona_predictions_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


## Optimize Clusters with Principal Component Analysis

In [19]:
# Build a PCA model that keeps three principal components (`n_components=3`)
n_principal_components = 3
pca = PCA(n_components=n_principal_components)

In [20]:
# Fit the PCA model on the scaled data and project it onto the
# three principal components in a single `fit_transform` call.
arizona_pca = pca.fit_transform(X=df_arizona_scaled)

# View the first five rows of the resulting array (one column per component).
arizona_pca[:5, :]

array([[-1.79522818,  3.42331891, -0.15572644],
       [ 4.05996197,  3.44687601, -0.18851994],
       [-0.49157244,  3.39776066, -0.1584374 ],
       [ 0.52480313,  3.43035262, -0.18830949],
       [ 8.07730413,  3.80138708,  0.30505227]])

In [21]:
# Retrieve the explained variance ratio to see what fraction of the total
# variance is attributed to each of the three principal components.
# In the recorded run the three ratios sum to roughly 0.61.
pca.explained_variance_ratio_

array([0.29363217, 0.18992401, 0.12536787])

In [22]:
# Wrap the PCA output in a DataFrame, reusing the property_id index of the
# original Arizona data so every projected row stays tied to its property
df_arizona_pca = pd.DataFrame(
    arizona_pca,
    index=arizona_data.index,
    columns=["PC1", "PC2", "PC3"],
)

# Display sample data
df_arizona_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8352055,-1.795228,3.423319,-0.155726
104049502,4.059962,3.446876,-0.18852
64954116,-0.491572,3.397761,-0.158437
71677227,0.524803,3.430353,-0.188309
52956862,8.077304,3.801387,0.305052


In [23]:
# Candidate cluster counts for the PCA data: 1 through 10 (the range's upper bound, 11, is exclusive)
k_2 = [n_clusters for n_clusters in range(1, 11)]

In [24]:
# Inertia (within-cluster sum of squares) for each candidate k on the PCA data
inertia_2 = []

# For every candidate k:
# 1. Create a KMeans model with n_clusters set to that k
# 2. Fit the model to the PCA data
# 3. Record the fitted model's inertia_
#
# n_init is pinned to 10, the default in effect when this notebook was run
# (see the `default_n_init=10` warnings in the recorded output). Setting it
# explicitly silences that warning and keeps the curve from changing if a
# newer scikit-learn uses a different default.
for n_clusters in k_2:
    k_model = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
    k_model.fit(df_arizona_pca)
    inertia_2.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [25]:
# Pair each candidate k with its inertia on the PCA data
elbow_pca = dict(k=k_2, inertia=inertia_2)
# Build the DataFrame that backs the PCA Elbow curve and preview it
df_elbow_pca = pd.DataFrame(data=elbow_pca)
df_elbow_pca.head()

Unnamed: 0,k,inertia
0,1,105334.117672
1,2,82512.27015
2,3,63917.605431
3,4,46201.106532
4,5,35457.256346


In [26]:
# Line chart of inertia against k for the PCA data; the "elbow" of the curve
# is used to visually pick a value for k.
elbow_pca_plot_options = dict(
    x="k",
    y="inertia",
    title="Elbow Curve using PCA data",
    xticks=k_2,
)
elbow_pca_plot = df_elbow_pca.hvplot.line(**elbow_pca_plot_options)
elbow_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [27]:
# Initialize the K-Means model using the chosen value for k
# (k=4 — presumably read off the PCA elbow curve above; confirm).
# random_state makes the cluster labels reproducible across kernel restarts
# (same seed as the PCA elbow loop), and n_init=10 pins the number of restarts
# so results do not depend on the installed scikit-learn version's default.
# NOTE: seeding may relabel clusters relative to previously saved outputs.
model = KMeans(n_clusters=4, random_state=0, n_init=10)

In [29]:
# Train the K-Means model on the three principal components
model.fit(X=df_arizona_pca)

  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# Assign each property to a cluster using the PCA data
k_3 = model.predict(X=df_arizona_pca)
# Display the resulting array of cluster labels.
k_3

array([2, 1, 2, ..., 2, 2, 2], dtype=int32)

In [31]:
# Build a new DataFrame from the PCA data with the predicted cluster labels
# attached as a "predicted_clusters" column (the PCA DataFrame itself is untouched)
df_arizona_predictions_pca = df_arizona_pca.assign(predicted_clusters=k_3)

# Display sample data
df_arizona_predictions_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3,predicted_clusters
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8352055,-1.795228,3.423319,-0.155726,2
104049502,4.059962,3.446876,-0.18852,1
64954116,-0.491572,3.397761,-0.158437,2
71677227,0.524803,3.430353,-0.188309,2
52956862,8.077304,3.801387,0.305052,1


In [32]:
# Scatter plot of the clustered properties on the first two principal
# components, one colour per predicted cluster, using hvPlot.
# A title is set so the figure stands on its own, matching the elbow plots;
# property_id (the index) is shown on hover.
arizona_predictions_pca_plot = df_arizona_predictions_pca.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="predicted_clusters",
    hover_cols="property_id",
    title="Arizona Property Clusters using PCA data (PC1 vs. PC2)"
)
arizona_predictions_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [33]:
# Inspect which original features drive each principal component.
# PCA is already imported in the notebook's import cell, so it is not
# re-imported here.

# Fit a 3-component PCA on the scaled Arizona features
pca = PCA(n_components=3)
principal_components = pca.fit_transform(df_arizona_scaled)

# Get the loadings: one row per principal component, one value per input feature
loadings = pca.components_

# Label the loading columns with the feature names the PCA was fitted on,
# instead of repeating a hard-coded list of column names
loadings_df = pd.DataFrame(loadings, columns=df_arizona_scaled.columns)

# Print the three features with the largest absolute loading for each component
for component_number, (_, component_loadings) in enumerate(loadings_df.iterrows(), start=1):
    print(f"Principal Component {component_number}:")
    top_features = component_loadings.abs().sort_values(ascending=False).head(3)  # Top 3 features
    print(top_features)
    print()

Principal Component 1:
living_space      0.607793
price             0.514591
bedroom_number    0.487082
Name: 0, dtype: float64

Principal Component 2:
latitude     0.706909
longitude    0.698618
price        0.082365
Name: 1, dtype: float64

Principal Component 3:
price_per_unit    0.993402
price             0.074530
bedroom_number    0.063507
Name: 2, dtype: float64

