In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Read the data

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
real_state_data = pd.read_csv("clean_data_outlier1.csv")

# Review the DataFrame
real_state_data.head()

Unnamed: 0.1,Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,0,318990176,"649 Hargraves Avenue NE, Royal City, WA 99357",Hargraves Avenue NE,Royal City,WA,46.906307,-119.622284,99357.0,399900.0,4.0,3.0,229.0,1742.0,17193.132,sqft,SINGLE_FAMILY
1,1,204997510,"15822 S Clear View Loop, Kennewick, WA 99338",S Clear View Loop,Kennewick,WA,46.19008,-119.31846,99338.0,799000.0,4.0,3.0,306.0,2611.0,26893.944,sqft,SINGLE_FAMILY
2,2,2064111934,"3602 3602/3600 W 15th Ave, Kennewick, WA 99338",3602/3600 W 15th Ave,Kennewick,WA,46.19497,-119.19278,99338.0,389000.0,4.0,2.0,167.0,2328.0,15681.6,sqft,MULTI_FAMILY
3,3,331355531,"6087 W 34th Ave, Kennewick, WA 99338",W 34th Ave,Kennewick,WA,46.178677,-119.20451,99338.0,525000.0,4.0,3.0,220.0,2380.0,9583.0,sqft,SINGLE_FAMILY
4,4,85939771,"1506 S Olson St, Kennewick, WA 99338",S Olson St,Kennewick,WA,46.194546,-119.17232,99338.0,350000.0,3.0,2.0,228.0,1529.0,12632.4,sqft,SINGLE_FAMILY


In [3]:
# Delete the unnamed column
real_state_data = real_state_data.drop('Unnamed: 0', axis=1)

# Display
real_state_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,318990176,"649 Hargraves Avenue NE, Royal City, WA 99357",Hargraves Avenue NE,Royal City,WA,46.906307,-119.622284,99357.0,399900.0,4.0,3.0,229.0,1742.0,17193.132,sqft,SINGLE_FAMILY
1,204997510,"15822 S Clear View Loop, Kennewick, WA 99338",S Clear View Loop,Kennewick,WA,46.19008,-119.31846,99338.0,799000.0,4.0,3.0,306.0,2611.0,26893.944,sqft,SINGLE_FAMILY
2,2064111934,"3602 3602/3600 W 15th Ave, Kennewick, WA 99338",3602/3600 W 15th Ave,Kennewick,WA,46.19497,-119.19278,99338.0,389000.0,4.0,2.0,167.0,2328.0,15681.6,sqft,MULTI_FAMILY
3,331355531,"6087 W 34th Ave, Kennewick, WA 99338",W 34th Ave,Kennewick,WA,46.178677,-119.20451,99338.0,525000.0,4.0,3.0,220.0,2380.0,9583.0,sqft,SINGLE_FAMILY
4,85939771,"1506 S Olson St, Kennewick, WA 99338",S Olson St,Kennewick,WA,46.194546,-119.17232,99338.0,350000.0,3.0,2.0,228.0,1529.0,12632.4,sqft,SINGLE_FAMILY


In [4]:
california_data = real_state_data[real_state_data['state'] == "CA"]
california_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
15712,124738796,"11647 Henness Rd, Truckee, CA 96161",Henness Rd,Truckee,CA,39.34353,-120.16769,96161.0,2350000.0,4.0,3.0,896.0,2622.0,26136.0,sqft,SINGLE_FAMILY
15713,19456203,"12337 Saint Bernard Dr, Truckee, CA 96161",Saint Bernard Dr,Truckee,CA,39.353893,-120.21115,96161.0,995000.0,3.0,3.0,577.0,1722.0,14374.8,sqft,SINGLE_FAMILY
15714,205278378,"12721 Granite Dr, Truckee, CA 96161",Granite Dr,Truckee,CA,39.339485,-120.190155,96161.0,2800000.0,4.0,6.0,821.0,3410.0,28749.6,sqft,SINGLE_FAMILY
15715,114451084,"10615 Sara Bear Ln, Truckee, CA 96161",Sara Bear Ln,Truckee,CA,39.32298,-120.174446,96161.0,1275000.0,3.0,3.0,747.0,1706.0,5227.2,sqft,SINGLE_FAMILY
15716,19434659,"11553 E Ridge Rd, Truckee, CA 96161",E Ridge Rd,Truckee,CA,39.33503,-120.15787,96161.0,949900.0,5.0,3.0,443.0,2142.0,10018.8,sqft,SINGLE_FAMILY


In [5]:
# Plot your data to see what's in your DataFrame
#texas_df.hvplot.line(width=800, height=400,rot=90)

In [6]:
# Set index to property_id
california_data = california_data.set_index('property_id')

In [7]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
california_data_scaled = StandardScaler().fit_transform(california_data[['latitude',
       'longitude', 'price', 'bedroom_number', 'bathroom_number', 'price_per_unit', 'living_space', 'land_space']])

In [8]:
# Create a DataFrame with the scaled data

# Copy the property id from the original data

df_california_scaled = pd.DataFrame(
    california_data_scaled,
    columns=['latitude',
       'longitude','price', 'bedroom_number', 'bathroom_number', 'price_per_unit', 'living_space', 'land_space']
)

# Set the property_id column as index
df_california_scaled["property_id"] = california_data.index
df_california_scaled = df_california_scaled.set_index("property_id")

# Display sample data
df_california_scaled.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
124738796,1.8506,-0.493047,0.94907,0.488156,0.06957,1.073751,0.652785,2.845521
19456203,1.855596,-0.514899,-0.028808,-0.350978,0.06957,0.196275,-0.241806,0.934327
205278378,1.84865,-0.504343,1.273826,0.488156,0.605676,0.867448,1.43605,3.270231
114451084,1.840694,-0.496444,0.173262,-0.350978,0.06957,0.663895,-0.25771,-0.552157
19434659,1.846503,-0.488109,-0.061356,1.32729,0.06957,-0.172321,0.17567,0.226478


## Initialize the K-Means model

In [9]:
# Create a list with the number of k-values from 1 to 11
k = list(range(1, 11))

In [10]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data
# 3. Append the model.inertia_ to the inertia list

for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(df_california_scaled)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,433336.0
1,2,341081.509834
2,3,282229.849908
3,4,230941.859144
4,5,199549.607882


In [12]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_plot = df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)
elbow_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [13]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=4)

In [14]:
# Fit the K-Means model using the scaled data
model.fit(df_california_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


In [15]:
# Predict the clusters to group the cryptocurrencies using the scaled data
clusters = model.predict(df_california_scaled)

# Print the resulting array of cluster values.
print(clusters)

[0 0 2 ... 1 1 1]


In [16]:
# Create a copy of the DataFrame
df_california_predictions = df_california_scaled.copy()

In [17]:
# Add a new column to the DataFrame with the predicted clusters
df_california_predictions["cluster"] = clusters

# Display sample data
df_california_predictions.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,cluster
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
124738796,1.8506,-0.493047,0.94907,0.488156,0.06957,1.073751,0.652785,2.845521,0
19456203,1.855596,-0.514899,-0.028808,-0.350978,0.06957,0.196275,-0.241806,0.934327,0
205278378,1.84865,-0.504343,1.273826,0.488156,0.605676,0.867448,1.43605,3.270231,2
114451084,1.840694,-0.496444,0.173262,-0.350978,0.06957,0.663895,-0.25771,-0.552157,0
19434659,1.846503,-0.488109,-0.061356,1.32729,0.06957,-0.172321,0.17567,0.226478,0


In [18]:
# Create a scatter plot using hvPlot
california_predictions_plot = df_california_predictions.hvplot.scatter(
    x="price",
    y="living_space",
    by="cluster",
    hover_cols = "property_id",
    xlabel = "Price",
    ylabel = "Living Space",
    title = "Texas"
)
california_predictions_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [19]:
# Create a scatter plot using hvPlot
california_predictions_plot = df_california_predictions.hvplot.scatter(
    x="price",
    y="bedroom_number",
    by="cluster",
    hover_cols = "property_id",
    xlabel = "Price",
    ylabel = "Bedroom Number",
    title = "California"
)
california_predictions_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


## Optimize Clusters with Principal Component Analysis

In [20]:
# Create a PCA model instance and set `n_components=3`.
pca=PCA(n_components=3)

In [21]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
california_pca = pca.fit_transform(df_california_scaled)

# View the first five rows of the DataFrame. 
california_pca[:5]

array([[ 1.42193634,  2.26439749,  0.18999731],
       [-0.59303569,  1.58367898,  0.03841823],
       [ 2.10984591,  2.48203598,  0.65242796],
       [-0.6431152 ,  1.56800252, -0.67496918],
       [ 0.02910942,  1.6748833 ,  1.07317728]])

In [22]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
pca.explained_variance_ratio_

array([0.30039775, 0.23283483, 0.17016009])

In [23]:
# Create a new DataFrame with the PCA data.
df_california_pca = pd.DataFrame(california_pca,
                            columns=["PC1", "PC2", "PC3"])

# Creating a DataFrame with the PCA data

# Copy the crypto names from the original data
df_california_pca["property_id"] = california_data.index

# Set the property_id column as index

df_california_pca = df_california_pca.set_index("property_id")

# Display sample data
df_california_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
124738796,1.421936,2.264397,0.189997
19456203,-0.593036,1.583679,0.038418
205278378,2.109846,2.482036,0.652428
114451084,-0.643115,1.568003,-0.674969
19434659,0.029109,1.674883,1.073177


In [24]:
# Create a list with the number of k-values from 1 to 11
k_2 = list(range(1, 11))

In [25]:
# Create an empty list to store the inertia values
inertia_2=[]

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data
# 3. Append the model.inertia_ to the inertia list

for i in k_2:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(df_california_pca)
    inertia_2.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [26]:
# Create a dictionary with the data to plot the Elbow curve
elbow_pca = {"k": k_2, "inertia": inertia_2}
# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_pca)
df_elbow_pca.head()

Unnamed: 0,k,inertia
0,1,304805.365485
1,2,212617.658353
2,3,154030.009573
3,4,123991.254049
4,5,108573.657125


In [27]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_pca_plot = df_elbow_pca.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve using PCA data", 
    xticks=k_2
)
elbow_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [34]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=4)

In [35]:
# Fit the K-Means model using the PCA data
model.fit(df_california_pca)

  super()._check_params_vs_input(X, default_n_init=10)


In [36]:
# Predict the clusters using the PCA data
k_3 = model.predict(df_california_pca)
# Print the resulting array of cluster values.
k_3

array([3, 3, 1, ..., 0, 1, 0], dtype=int32)

In [37]:
# Create a copy of the DataFrame with the PCA data
df_california_predictions_pca = df_california_pca.copy()

# Add a new column to the DataFrame with the predicted clusters
df_california_predictions_pca["predicted_clusters"] = k_3

# Display sample data
df_california_predictions_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3,predicted_clusters
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
124738796,1.421936,2.264397,0.189997,3
19456203,-0.593036,1.583679,0.038418,3
205278378,2.109846,2.482036,0.652428,1
114451084,-0.643115,1.568003,-0.674969,3
19434659,0.029109,1.674883,1.073177,3


In [38]:
# Create a scatter plot using hvPlot
california_predictions_pca_plot = df_california_predictions_pca.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="predicted_clusters",
    hover_cols="property_id"
)
california_predictions_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [39]:
from sklearn.decomposition import PCA

# Perform PCA on your dataset
pca = PCA(n_components=3)
principal_components = pca.fit_transform(df_california_scaled)

# Get the loadings (component loadings) associated with each principal component
loadings = pca.components_

# Create a DataFrame to display the loadings
loadings_df = pd.DataFrame(loadings, columns=['latitude',
       'longitude', 'price', 'bedroom_number', 'bathroom_number', 'price_per_unit', 'living_space', 'land_space'])

# Display the top contributing features for each principal component
for i, pc in enumerate(loadings_df.index):
    print(f"Principal Component {i + 1}:")
    features = loadings_df.iloc[i].abs().sort_values(ascending=False).head(3)
    print(features)
    print()

Principal Component 1:
price             0.527907
living_space      0.524783
bedroom_number    0.405158
Name: 0, dtype: float64

Principal Component 2:
longitude    0.682264
latitude     0.652703
price        0.206121
Name: 1, dtype: float64

Principal Component 3:
price_per_unit    0.674332
bedroom_number    0.472950
price             0.357857
Name: 2, dtype: float64

