In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Read the data

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
real_state_data = pd.read_csv("raw_data_copy.csv")

# Review the DataFrame
real_state_data.head()

Unnamed: 0.1,Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,1,249518113,"117 3rd St, Wrangell, AK 99929",3rd St,Wrangell,AK,56.474518,-132.386,99929.0,589500.0,3.0,3.0,237.0,2478.0,7492.0,sqft,SINGLE_FAMILY
1,4,249518139,"335 Cassiar St, Wrangell, AK 99929",Cassiar St,Wrangell,AK,56.475697,-132.38905,99929.0,405000.0,5.0,3.0,194.0,2080.0,10436.0,sqft,SINGLE_FAMILY
2,7,2069574197,"532.5 Front St, Wrangell, AK 99929",5 Front St,Wrangell,AK,56.468544,-132.37965,99929.0,130000.0,3.0,1.0,104.0,1250.0,3483.0,sqft,SINGLE_FAMILY
3,44,243013558,"501 Noseeum St, Petersburg, AK 99833",Noseeum St,Petersburg,AK,56.80682,-132.9618,99833.0,165000.0,1.0,1.0,235.0,700.0,15367.968,sqft,SINGLE_FAMILY
4,48,243013641,"905 Odin St, Petersburg, AK 99833",Odin St,Petersburg,AK,56.80623,-132.97113,99833.0,375000.0,3.0,3.0,226.0,1652.0,8000.0,sqft,SINGLE_FAMILY


In [3]:
# Delete the unnamed column
real_state_data = real_state_data.drop('Unnamed: 0', axis=1)

# Display
real_state_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
0,249518113,"117 3rd St, Wrangell, AK 99929",3rd St,Wrangell,AK,56.474518,-132.386,99929.0,589500.0,3.0,3.0,237.0,2478.0,7492.0,sqft,SINGLE_FAMILY
1,249518139,"335 Cassiar St, Wrangell, AK 99929",Cassiar St,Wrangell,AK,56.475697,-132.38905,99929.0,405000.0,5.0,3.0,194.0,2080.0,10436.0,sqft,SINGLE_FAMILY
2,2069574197,"532.5 Front St, Wrangell, AK 99929",5 Front St,Wrangell,AK,56.468544,-132.37965,99929.0,130000.0,3.0,1.0,104.0,1250.0,3483.0,sqft,SINGLE_FAMILY
3,243013558,"501 Noseeum St, Petersburg, AK 99833",Noseeum St,Petersburg,AK,56.80682,-132.9618,99833.0,165000.0,1.0,1.0,235.0,700.0,15367.968,sqft,SINGLE_FAMILY
4,243013641,"905 Odin St, Petersburg, AK 99833",Odin St,Petersburg,AK,56.80623,-132.97113,99833.0,375000.0,3.0,3.0,226.0,1652.0,8000.0,sqft,SINGLE_FAMILY


In [14]:
texas_data = real_state_data[real_state_data['state'] == "TX"]
texas_data.head()

Unnamed: 0,property_id,address,street_name,city,state,latitude,longitude,postcode,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,land_space_unit,property_type
113190,27399441,"1116 Saint Johns Dr, El Paso, TX 79903",Saint Johns Dr,El Paso,TX,31.786737,-106.42802,79903.0,239500.0,5.0,3.0,141.0,1692.0,6969.6,sqft,SINGLE_FAMILY
113192,27498318,"1101 Apache St, El Paso, TX 79925",Apache St,El Paso,TX,31.784021,-106.40445,79925.0,165000.0,4.0,2.0,100.0,1650.0,12632.4,sqft,SINGLE_FAMILY
113193,27497333,"452 Val Verde St, El Paso, TX 79905",Val Verde St,El Paso,TX,31.761911,-106.43233,79905.0,118000.0,4.0,1.0,61.0,1918.0,11325.6,sqft,SINGLE_FAMILY
113195,27416021,"4600 Cumberland Cir, El Paso, TX 79903",Cumberland Cir,El Paso,TX,31.790165,-106.43596,79903.0,414700.0,4.0,3.0,132.0,3119.0,15246.0,sqft,SINGLE_FAMILY
113197,27397349,"6028 Aztec Rd, El Paso, TX 79925",Aztec Rd,El Paso,TX,31.785316,-106.41115,79925.0,174950.0,4.0,2.0,97.0,1800.0,6098.4,sqft,SINGLE_FAMILY


In [6]:
# Plot your data to see what's in your DataFrame
#texas_df.hvplot.line(width=800, height=400,rot=90)

In [15]:
# Verify the columns in data
texas_data.columns

Index(['property_id', 'address', 'street_name', 'city', 'state', 'latitude',
       'longitude', 'postcode', 'price', 'bedroom_number', 'bathroom_number',
       'price_per_unit', 'living_space', 'land_space', 'land_space_unit',
       'property_type'],
      dtype='object')

In [16]:
# Set index to property_id
texas_data = texas_data.set_index('property_id')

In [17]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
texas_data_scaled = StandardScaler().fit_transform(texas_data[['latitude',
       'longitude', 'price', 'bedroom_number', 'bathroom_number', 'price_per_unit', 'living_space', 'land_space']])

# should I have excluded the 'longitude' & 'latitude' columns or not?

In [18]:
# Create a DataFrame with the scaled data

# Copy the property id from the original data

df_texas_scaled = pd.DataFrame(
    texas_data_scaled,
    columns=['latitude',
       'longitude','price', 'bedroom_number', 'bathroom_number', 'price_per_unit', 'living_space', 'land_space']
)

# Set the property_id column as index
df_texas_scaled["property_id"] = texas_data.index
df_texas_scaled = df_texas_scaled.set_index("property_id")

# Display sample data
df_texas_scaled.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
27399441,0.551979,-4.73947,-0.386688,1.513535,0.246022,-0.070161,-0.019173,-0.022991
27498318,0.550422,-4.727441,-0.490839,0.549106,-0.580371,-0.105452,-0.020137,-0.022428
27497333,0.537746,-4.741669,-0.556546,0.549106,-1.406764,-0.139022,-0.013988,-0.022558
27416021,0.553944,-4.743522,-0.141757,0.549106,0.246022,-0.077908,0.013569,-0.022169
27397349,0.551164,-4.73086,-0.476929,0.549106,-0.580371,-0.108035,-0.016695,-0.023078


## Initialize the K-Means model

In [19]:
# Create a list with the number of k-values from 1 to 11
k = list(range(1, 11))

In [20]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data
# 3. Append the model.inertia_ to the inertia list

for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(df_texas_scaled)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [21]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head()

Unnamed: 0,k,inertia
0,1,640144.0
1,2,564941.171223
2,3,489386.465857
3,4,420434.127772
4,5,366289.361798


In [22]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_plot = df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)
elbow_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [76]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=6)

In [77]:
# Fit the K-Means model using the scaled data
model.fit(df_texas_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


In [78]:
# Predict the clusters to group the cryptocurrencies using the scaled data
clusters = model.predict(df_texas_scaled)

# Print the resulting array of cluster values.
print(clusters)

[1 1 1 ... 0 0 0]


In [79]:
# Create a copy of the DataFrame
df_texas_predictions = df_texas_scaled.copy()

In [80]:
# Add a new column to the DataFrame with the predicted clusters
df_texas_predictions["cluster"] = clusters

# Display sample data
df_texas_predictions.head()

Unnamed: 0_level_0,latitude,longitude,price,bedroom_number,bathroom_number,price_per_unit,living_space,land_space,cluster
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27399441,0.551979,-4.73947,-0.386688,1.513535,0.246022,-0.070161,-0.019173,-0.022991,1
27498318,0.550422,-4.727441,-0.490839,0.549106,-0.580371,-0.105452,-0.020137,-0.022428,1
27497333,0.537746,-4.741669,-0.556546,0.549106,-1.406764,-0.139022,-0.013988,-0.022558,1
27416021,0.553944,-4.743522,-0.141757,0.549106,0.246022,-0.077908,0.013569,-0.022169,1
27397349,0.551164,-4.73086,-0.476929,0.549106,-0.580371,-0.108035,-0.016695,-0.023078,1


In [49]:
# Create a scatter plot using hvPlot
texas_predictions_plot = df_texas_predictions.hvplot.scatter(
    x="price",
    y="land_space",
    by="cluster",
    hover_cols = "property_id"
)
texas_predictions_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


## Optimize Clusters with Principal Component Analysis

In [50]:
# Create a PCA model instance and set `n_components=3`.
pca=PCA(n_components=3)

In [51]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
texas_pca = pca.fit_transform(df_texas_scaled)

# View the first five rows of the DataFrame. 
texas_pca[:5]

array([[ 0.45974428,  3.80193403, -0.55356567],
       [-0.66448664,  3.73464495, -0.33894277],
       [-1.22930173,  3.75321085, -0.30564299],
       [ 0.04503059,  3.73754991, -0.31661583],
       [-0.65762434,  3.7377452 , -0.33839918]])

In [52]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
pca.explained_variance_ratio_

array([0.24236643, 0.14409447, 0.12907044])

In [53]:
# Create a new DataFrame with the PCA data.
df_texas_pca = pd.DataFrame(texas_pca,
                            columns=["PC1", "PC2", "PC3"])

# Creating a DataFrame with the PCA data

# Copy the crypto names from the original data
df_texas_pca["property_id"] = texas_data.index

# Set the property_id column as index

df_texas_pca = df_texas_pca.set_index("property_id")

# Display sample data
df_texas_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
27399441,0.459744,3.801934,-0.553566
27498318,-0.664487,3.734645,-0.338943
27497333,-1.229302,3.753211,-0.305643
27416021,0.045031,3.73755,-0.316616
27397349,-0.657624,3.737745,-0.338399


In [54]:
# Create a list with the number of k-values from 1 to 11
k_2 = list(range(1, 11))

In [55]:
# Create an empty list to store the inertia values
inertia_2=[]

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data
# 3. Append the model.inertia_ to the inertia list

for i in k_2:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(df_texas_pca)
    inertia_2.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [56]:
# Create a dictionary with the data to plot the Elbow curve
elbow_pca = {"k": k_2, "inertia": inertia_2}
# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_pca)
df_elbow_pca.head()

Unnamed: 0,k,inertia
0,1,330014.293938
1,2,256299.012806
2,3,198114.314934
3,4,149278.79861
4,5,121546.194475


In [57]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_pca_plot = df_elbow_pca.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve using PCA data", 
    xticks=k_2
)
elbow_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [58]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=4)

In [59]:
# Fit the K-Means model using the PCA data
model.fit(df_texas_pca)

  super()._check_params_vs_input(X, default_n_init=10)


In [60]:
# Predict the clusters using the PCA data
k_3 = model.predict(df_texas_pca)
# Print the resulting array of cluster values.
k_3

array([1, 1, 1, ..., 1, 1, 1], dtype=int32)

In [61]:
# Create a copy of the DataFrame with the PCA data
df_texas_predictions_pca = df_texas_pca.copy()

# Add a new column to the DataFrame with the predicted clusters
df_texas_predictions_pca["predicted_clusters"] = k_3

# Display sample data
df_texas_predictions_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3,predicted_clusters
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
27399441,0.459744,3.801934,-0.553566,1
27498318,-0.664487,3.734645,-0.338943,1
27497333,-1.229302,3.753211,-0.305643,1
27416021,0.045031,3.73755,-0.316616,1
27397349,-0.657624,3.737745,-0.338399,1


In [62]:
# Create a scatter plot using hvPlot
texas_predictions_pca_plot = df_texas_predictions_pca.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="predicted_clusters",
    hover_cols="property_id"
)
texas_predictions_pca_plot

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [64]:
from sklearn.decomposition import PCA

# Perform PCA on your dataset (replace X with your data)
pca = PCA(n_components=3)
principal_components = pca.fit_transform(df_texas_scaled)

# Get the loadings (component loadings) associated with each principal component
loadings = pca.components_

# Create a DataFrame to display the loadings
loadings_df = pd.DataFrame(loadings, columns=['latitude',
       'longitude', 'price', 'bedroom_number', 'bathroom_number', 'price_per_unit', 'living_space', 'land_space'])  # Replace 'X.columns' with your feature names

# Display the top contributing features for each principal component
for i, pc in enumerate(loadings_df.index):
    print(f"Principal Component {i + 1}:")
    top_features = loadings_df.iloc[i].abs().sort_values(ascending=False).head(3)  # Top 3 features
    print(top_features)
    print()

Principal Component 1:
bathroom_number    0.638790
bedroom_number     0.561306
price              0.516519
Name: 0, dtype: float64

Principal Component 2:
latitude          0.707409
longitude         0.699449
bedroom_number    0.077110
Name: 1, dtype: float64

Principal Component 3:
price_per_unit    0.788763
land_space        0.533233
price             0.208419
Name: 2, dtype: float64



In [66]:
real_state_data["state"].value_counts()

state
TX    80018
CA    66189
AZ    21623
IL    19874
WA    19448
MO    17871
CO    15608
OR    12291
OK    11746
NV    10006
LA     9388
UT     7706
KS     7602
AR     6935
ID     6751
NE     5412
NM     5115
HI     3850
AK     2173
WY     1528
IA      183
CT        1
Name: count, dtype: int64